### Import all required libraries

In [212]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display
from nltk.corpus import twitter_samples
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
import re
from nltk.tokenize.casual import EMOTICON_RE
% matplotlib inline

### Setup twitter dataset

In [189]:
twitter_samples.fileids()

['negative_tweets.json', 'positive_tweets.json', 'tweets.20150430-223406.json']

In [190]:
pos_tweets = twitter_samples.strings('positive_tweets.json')
neg_tweets = twitter_samples.strings('negative_tweets.json')

In [191]:
pos_dataset = pd.DataFrame(pos_tweets,columns=['Tweet'])
pos_dataset['Sentiment'] = 1
neg_dataset = pd.DataFrame(neg_tweets,columns=['Tweet'])
neg_dataset['Sentiment'] = 0

dataset = pos_dataset.append(neg_dataset,ignore_index=True)

In [99]:
#dataset = dataset.sample(frac=1)

### Cleanup and tokenize the text data

In [192]:
negation_words = set(['no','not'] + [word for word in stopwords.words('english') if re.search("n'",word)])
unwanted_words = set(stopwords.words('english')).difference(negation_words)
ps = PorterStemmer()

def tokenize(raw_tweet):
    emoticons = set(EMOTICON_RE.findall(raw_tweet))
    raw_tweet = raw_tweet.split()
    tweet = []
    for word in raw_tweet:
        if word in emoticons:
            tweet.append(word)
            continue
        elif word.lower() not in unwanted_words:
            word = re.sub("[^a-zA-Z]"," ",word)
            tweet.append(ps.stem(word.lower()))
    tweet = " ".join(tweet)
    return tweet

In [193]:
dataset['corpus'] = dataset['Tweet'].apply(tokenize)

In [194]:
display(dataset.head())

Unnamed: 0,Tweet,Sentiment,corpus
0,#FollowFriday @France_Inte @PKuchly57 @Milipol...,1,followfriday france int pkuchly milipol ...
1,@Lamb2ja Hey James! How odd :/ Please call our...,1,lamb ja hey james odd :/ pleas call contact ...
2,@DespiteOfficial we had a listen last night :)...,1,despiteoffici listen last night :) bleed amaz...
3,@97sides CONGRATS :),1,side congrat :)
4,yeaaaah yippppy!!! my accnt verified rqst has...,1,yeaaaah yippppy accnt verifi rqst succeed g...


### Setup Matrices for ML model

In [213]:
cv = CountVectorizer(max_features = 1500)
X = pd.DataFrame(cv.fit_transform(dataset['corpus']).toarray())
y = dataset['Sentiment']

In [214]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.15, random_state = 0)

### Fit ML models

In [215]:
classifiers = [GaussianNB(),
               DecisionTreeClassifier(criterion='entropy', random_state=0),
               SVC(kernel = 'linear', random_state = 0),
               KNeighborsClassifier(n_neighbors = 5, p = 2, metric = 'minkowski'),
               RandomForestClassifier(n_estimators=100 , criterion='entropy', random_state=0)
              ]
y_pred = []
for classifier in classifiers:
    classifier.fit(X_train, y_train)
    y_pred.append(classifier.predict(X_test))

### View Results

In [216]:
cm = [confusion_matrix(y_test, y_p) for y_p in y_pred]
for matrix in cm:
    print(matrix,"\n")

[[626 141]
 [387 346]] 

[[585 182]
 [251 482]] 

[[605 162]
 [217 516]] 

[[535 232]
 [253 480]] 

[[616 151]
 [228 505]] 



In [217]:
for array in cm:
    print(array.trace()/array.sum())

0.648
0.711333333333
0.747333333333
0.676666666667
0.747333333333


In [218]:
num_maxes = 10
feature_argmaxes = np.argpartition(-classifier.feature_importances_,num_maxes)[:num_maxes]
for arg in feature_argmaxes:
    print(cv.get_feature_names()[arg])

http
sad
happi
great
thank
love
good
miss
co
can


In [201]:
np.partition(-classifier.feature_importances_,num_maxes)

array([ -1.13683180e-02,  -1.32368792e-02,  -2.27111610e-02, ...,
        -0.00000000e+00,  -3.08188247e-05,  -0.00000000e+00])

### Evaluate incorrect tweets

In [219]:
incorrect = y_test != y_pred[-1]

In [221]:
dataset.loc[y_test[incorrect].index,:].head()

Unnamed: 0,Tweet,Sentiment,corpus
5906,Twitter Help Center | Why can&amp;#39;t I foll...,0,twitter help center can amp t follow peo...
2670,i drew @JustinNFJK :))\n#WIP #SWS #crush @SWSt...,1,drew justinnfjk wip sw crush swstheba...
6084,#Showbox the only thing that works is that nar...,0,showbox thing work naruto cartoon let keep fi...
3724,if u hardcore and stay up dis late favorite :),1,u hardcor stay di late favorit :)
3184,"Best ATP players (Wimbledon 2015): Federer, Na...",1,best atp player wimbledon federer nad...


### Implement VADER method for comparison

In [205]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [206]:
sid = SentimentIntensityAnalyzer()

In [207]:
keys = ['pos','neg','neu','compound']
vader = pd.DataFrame(dataset['Tweet'],index=dataset.index)
for key in keys:
    vader[key] = vader['Tweet'].apply(lambda tweet: sid.polarity_scores(tweet)[key])

In [208]:
vader.head()

Unnamed: 0,Tweet,pos,neg,neu,compound
0,#FollowFriday @France_Inte @PKuchly57 @Milipol...,0.385,0.0,0.615,0.7579
1,@Lamb2ja Hey James! How odd :/ Please call our...,0.27,0.145,0.585,0.6229
2,@DespiteOfficial we had a listen last night :)...,0.294,0.0,0.706,0.7959
3,@97sides CONGRATS :),0.877,0.0,0.123,0.7983
4,yeaaaah yippppy!!! my accnt verified rqst has...,0.282,0.0,0.718,0.795


In [209]:
vader['Sentiment'] = vader['compound'].apply(lambda x: 1 if x>0 else 0)

In [210]:
vader_cm = confusion_matrix(dataset['Sentiment'],vader['Sentiment'])
print(vader_cm, '\n')

print(vader_cm.trace()/vader_cm.sum())

[[3970 1030]
 [ 390 4610]] 

0.858


The NLTK VADER method is much more successful at predicting tweets than the simple ML models implemented before; 85% accuracy vs 74%.