In [None]:
import re
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import string
import nltk
import warnings 
warnings.filterwarnings("ignore", category=DeprecationWarning)

from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten
from keras.layers import GlobalMaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing.text import Tokenizer

%matplotlib inline

# Define train data

In [None]:
# Load the training tweets.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
train = pd.read_csv("/Users/apple/Documents/Train.csv")
# Keep the three columns used downstream and drop rows labelled 'irrelevant'.
# The explicit .copy() makes `train` an independent frame, so adding columns
# to it later does not raise SettingWithCopyWarning.
train = train.loc[train['Sentiment'] != 'irrelevant', ['TweetText', 'Sentiment', 'Topic']].copy()
train.head()

In [None]:
# The TweetText column of the training frame.
X_train = train.loc[:, 'TweetText']

# Define test data

In [None]:
# Load the held-out test tweets.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
test = pd.read_csv("/Users/apple/Documents/Test.csv")
# Keep the three columns used downstream and drop rows labelled 'irrelevant'.
# The explicit .copy() makes `test` an independent frame, so adding columns
# to it later does not raise SettingWithCopyWarning.
test = test.loc[test['Sentiment'] != 'irrelevant', ['TweetText', 'Sentiment', 'Topic']].copy()
test.head()

In [None]:
# The TweetText column of the test frame.
X_test = test.loc[:, 'TweetText']

# Clean data

In [None]:
def remove_pattern(input_txt, pattern):
    """Return ``input_txt`` with every match of the regex ``pattern`` removed.

    Args:
        input_txt: Text to clean.
        pattern: Regular expression whose matches are deleted.

    Returns:
        The text with all non-overlapping matches of ``pattern`` replaced by
        the empty string.
    """
    # Substitute with the pattern itself. Re-using each *matched text* as a
    # new regex (the previous approach) raised re.error or removed the wrong
    # text whenever a match contained metacharacters, and could leave remnants
    # behind when one match was a prefix of another.
    return re.sub(pattern, '', input_txt)

In [None]:
# Strip @handles from the raw tweets. The raw string keeps `\w` a regex escape
# instead of an invalid Python string escape.
train['tidy_tweet'] = np.vectorize(remove_pattern)(train['TweetText'], r"@[\w]*")
# Replace everything except ASCII letters and '#' with a space.
# regex=True is required: recent pandas treats the pattern as a literal string
# by default, which silently turns this step into a no-op. The former class
# "[^a-z,A-z#]" also let commas and, through the A-z range, the characters
# [ \ ] ^ _ and backtick slip through.
train['tidy_tweet'] = train['tidy_tweet'].str.replace(r"[^a-zA-Z#]", " ", regex=True)
# Drop short tokens (3 characters or fewer).
train['tidy_tweet'] = train['tidy_tweet'].apply(
    lambda x: ' '.join(w for w in x.split() if len(w) > 3)
)
train.head()

In [None]:
# Strip @handles from the raw tweets. The raw string keeps `\w` a regex escape
# instead of an invalid Python string escape.
test['tidy_tweet'] = np.vectorize(remove_pattern)(test['TweetText'], r"@[\w]*")
# Replace everything except ASCII letters and '#' with a space.
# regex=True is required: recent pandas treats the pattern as a literal string
# by default, which silently turns this step into a no-op. The former class
# "[^a-z,A-z#]" also let commas and, through the A-z range, the characters
# [ \ ] ^ _ and backtick slip through.
test['tidy_tweet'] = test['tidy_tweet'].str.replace(r"[^a-zA-Z#]", " ", regex=True)
# Drop short tokens (3 characters or fewer).
test['tidy_tweet'] = test['tidy_tweet'].apply(
    lambda x: ' '.join(w for w in x.split() if len(w) > 3)
)
test.head()

# Tokenizer

In [None]:
# Split each cleaned training tweet into whitespace-separated tokens.
tokenized_tweet = train['tidy_tweet'].map(str.split)
tokenized_tweet.head()

In [None]:
# Split each cleaned test tweet into whitespace-separated tokens.
# NOTE(review): this rebinds `tokenized_tweet`, so the tokens built from the
# training tweets in the cell above are no longer reachable — confirm intended.
tokenized_tweet = test['tidy_tweet'].map(str.split)
tokenized_tweet.head()

In [None]:
# Import only the stemmer class; the former wildcard import also pulled every
# other public name of nltk.stem.porter into the notebook namespace.
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
# Reduce every token of each tweet to its Porter stem.
tokenized_tweet = tokenized_tweet.apply(
    lambda tokens: [stemmer.stem(token) for token in tokens]
)
tokenized_tweet.head()

# Wordcloud to explain train data

In [None]:
from wordcloud import WordCloud

# Concatenate all cleaned training tweets into one space-separated string.
train_words = ' '.join(train['tidy_tweet'])

# Render the word cloud from that text.
wordcloud = WordCloud(
    width=1000,
    height=800,
    random_state=21,
    max_font_size=110,
).generate(train_words)

plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

# Wordcloud to explain test data

In [None]:
from wordcloud import WordCloud

# Concatenate all cleaned test tweets into one space-separated string.
test_words = ' '.join(test['tidy_tweet'])

# Render the word cloud from that text.
wordcloud = WordCloud(
    width=1000,
    height=800,
    random_state=21,
    max_font_size=110,
).generate(test_words)

plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

# Show the hashtag trends in train

In [None]:
def hashtag_extract(x):
    """Collect the hashtags of each text in ``x``.

    Returns a list with one entry per item of ``x``; each entry is the list of
    ``#``-prefixed words found in that item, without the leading ``#``. Items
    containing no hashtag contribute an empty list.
    """
    return [re.findall(r"#(\w+)", text) for text in x]

# Hashtags per tweet, grouped by the tweet's sentiment label.
train_positive = hashtag_extract(train.loc[train['Sentiment'] == 'positive', 'tidy_tweet'])
train_negative = hashtag_extract(train.loc[train['Sentiment'] == 'negative', 'tidy_tweet'])
train_neutral = hashtag_extract(train.loc[train['Sentiment'] == 'neutral', 'tidy_tweet'])

# Flatten the per-tweet lists into one list per sentiment. A comprehension is
# linear in the number of hashtags, whereas sum(lists, []) copies the growing
# result for every tweet (quadratic).
train_positive_sum = [tag for tags in train_positive for tag in tags]
train_negative_sum = [tag for tags in train_negative for tag in tags]
train_neutral_sum = [tag for tags in train_neutral for tag in tags]

# Bar chart of the ten most frequent hashtags in positive tweets.
a = nltk.FreqDist(train_positive_sum)
d = pd.DataFrame({'Hashtag': list(a.keys()), 'Count': list(a.values())})
d = d.nlargest(columns="Count", n=10)
plt.figure(figsize=(16, 5))
ax = sns.barplot(data=d, x="Hashtag", y="Count")
plt.show()

In [None]:
# Bar chart of the ten most frequent hashtags in negative training tweets.
b = nltk.FreqDist(train_negative_sum)
e = pd.DataFrame(list(b.items()), columns=['Hashtag', 'Count'])
e = e.nlargest(n=10, columns="Count")
plt.figure(figsize=(16, 5))
ax = sns.barplot(x="Hashtag", y="Count", data=e)
plt.show()

In [None]:
# Bar chart of the ten most frequent hashtags in neutral training tweets.
g = nltk.FreqDist(train_neutral_sum)
h = pd.DataFrame(list(g.items()), columns=['Hashtag', 'Count'])
h = h.nlargest(n=10, columns="Count")
plt.figure(figsize=(16, 5))
ax = sns.barplot(x="Hashtag", y="Count", data=h)
plt.show()

# Vectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts. The vocabulary is fitted on the training tweets only
# and then re-used, unchanged, to encode the test tweets.
vectorizer = CountVectorizer(
    stop_words='english',
    max_features=1000,
    min_df=2,
    max_df=0.90,
)
xtrain_vec = vectorizer.fit_transform(train['tidy_tweet'])
xtest_vec = vectorizer.transform(test['tidy_tweet'])

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features. The vocabulary and IDF weights are fitted on the training
# tweets only and then re-used to encode the test tweets.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=1000,
    min_df=2,
    max_df=0.90,
)
tfidf_train = tfidf_vectorizer.fit_transform(train['tidy_tweet'])
tfidf_test = tfidf_vectorizer.transform(test['tidy_tweet'])

# Get scores

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Targets: sentiment labels (y*) and topic labels (z*) for train and test.
ytrain = train['Sentiment']
ytest = test['Sentiment']
ztrain = train['Topic']
ztest = test['Topic']

# Logistic regression predicting sentiment from the bag-of-words counts.
egg_sen = LogisticRegression(multi_class='auto', solver='lbfgs')
egg_sen.fit(xtrain_vec, ytrain)
egg_pred = egg_sen.predict(xtest_vec)
# accuracy_score is declared as (y_true, y_pred): pass the ground truth first.
# Accuracy is symmetric, so the printed value is the same as before.
print(accuracy_score(ytest, egg_pred))

In [None]:
from sklearn.metrics import precision_score

# Precision of the sentiment logistic regression; average=None requests one
# score per class instead of a single aggregate.
precision_score(y_true=ytest, y_pred=egg_pred, average=None)

In [None]:
# Logistic regression predicting the topic from the bag-of-words counts.
egg_top = LogisticRegression(multi_class='auto', solver='lbfgs')
egg_top.fit(xtrain_vec, ztrain)
egg_pred_1 = egg_top.predict(xtest_vec)
# accuracy_score is declared as (y_true, y_pred): pass the ground truth first.
# Accuracy is symmetric, so the printed value is the same as before.
print(accuracy_score(ztest, egg_pred_1))

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes predicting sentiment from the bag-of-words counts.
ham_sen = MultinomialNB()
ham_sen.fit(xtrain_vec, ytrain)
ham_pre = ham_sen.predict(xtest_vec)
# accuracy_score is declared as (y_true, y_pred): pass the ground truth first.
# Accuracy is symmetric, so the printed value is the same as before.
print(accuracy_score(ytest, ham_pre))

In [None]:
# Multinomial naive Bayes predicting the topic from the bag-of-words counts.
ham_top = MultinomialNB()
ham_top.fit(xtrain_vec, ztrain)
ham_pre_1 = ham_top.predict(xtest_vec)
# accuracy_score is declared as (y_true, y_pred): pass the ground truth first.
# Accuracy is symmetric, so the printed value is the same as before.
print(accuracy_score(ztest, ham_pre_1))

In [None]:
from sklearn.metrics import classification_report

# classification_report is declared as (y_true, y_pred). With the arguments
# reversed, the precision and recall columns trade places and `support`
# counts predictions instead of true labels, so the ground truth goes first.
print(classification_report(ytest, egg_pred))

In [None]:
# First rows of the true test topics next to the naive Bayes topic predictions.
pd.DataFrame(
    {
        'Topic': test['Topic'],
        'NB_topic': ham_pre_1,
    }
).head(n=5)

# Predict

In [None]:
# Ask for the CSV to score; its TweetText, Sentiment and Topic columns are
# read below. Surrounding whitespace from a pasted path is stripped.
# NOTE(review): unlike train/test, rows labelled 'irrelevant' are not filtered
# out here — confirm the supplied file is already filtered.
new = input('File Link').strip()
new1 = pd.read_csv(new)
NS = new1['Sentiment']
NTO = new1['Topic']

# Same cleaning steps as for the train/test tweets: strip @handles, keep only
# ASCII letters and '#', drop tokens of 3 characters or fewer.
# The raw strings keep `\w` a regex escape, and regex=True is required because
# recent pandas treats the pattern as a literal string by default (which made
# the replacement a silent no-op). The former class "[^a-z,A-z#]" also let
# commas and the characters [ \ ] ^ _ and backtick slip through.
new1['tidytweet'] = np.vectorize(remove_pattern)(new1['TweetText'], r"@[\w]*")
new1['tidytweet'] = new1['tidytweet'].str.replace(r"[^a-zA-Z#]", " ", regex=True)
new1['tidytweet'] = new1['tidytweet'].apply(
    lambda x: ' '.join(w for w in x.split() if len(w) > 3)
)

# Encode with the vocabulary fitted on the training data, then predict with
# all four fitted models (logistic regression and naive Bayes, sentiment and topic).
N_vec = vectorizer.transform(new1['tidytweet'])
sen_new = egg_sen.predict(N_vec)
top_new = egg_top.predict(N_vec)
sen_new1 = ham_sen.predict(N_vec)
top_new1 = ham_top.predict(N_vec)

In [None]:
# Score each model on the user-supplied file exactly once and reuse the values
# for both the summary table and the bar chart (they used to be computed twice).
# accuracy_score is declared as (y_true, y_pred), so the true labels go first.
acc = (
    accuracy_score(NS, sen_new),
    accuracy_score(NTO, top_new),
    accuracy_score(NS, sen_new1),
    accuracy_score(NTO, top_new1),
)

accuracy_list = {
    'Name': ['Sentiment by LogisticRegression', 'Topic by LogisticRegression', 'Sentiment by Naive Bayes', 'Topic by Naive Bayes'],
    'Accuracy': list(acc),
}
df = pd.DataFrame(accuracy_list)

# Short tick labels, in the same order as `acc`.
ob = ('senLR', 'topLR', 'senNB', 'topNB')
y_pos = np.arange(len(ob))

plt.bar(y_pos, acc, align='center', alpha=0.5)
plt.xticks(y_pos, ob)
plt.ylabel('score')
plt.title('friendly lab1+2')
plt.show()

In [None]:
# Side-by-side table of the supplied labels and both models' predictions.
n = pd.DataFrame(
    {
        'TweetText': new1['TweetText'],
        'Sentiment': new1['Sentiment'],
        'Log_Sentiment': sen_new,
        'NB_Sentiment': sen_new1,
        'Topic': new1['Topic'],
        'Log_Topic': top_new,
        'NB_Topic': top_new1,
    }
)
# Show the first 15 rows.
n.iloc[:15]