### Import all required libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display
from nltk.corpus import twitter_samples
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
import re
% matplotlib inline



### Setup twitter dataset

In [2]:
twitter_samples.fileids()

['negative_tweets.json', 'positive_tweets.json', 'tweets.20150430-223406.json']

In [3]:
pos_tweets = twitter_samples.strings('positive_tweets.json')
neg_tweets = twitter_samples.strings('negative_tweets.json')

In [4]:
pos_dataset = pd.DataFrame(pos_tweets,columns=['Tweet'])
pos_dataset['Sentiment'] = 1
neg_dataset = pd.DataFrame(neg_tweets,columns=['Tweet'])
neg_dataset['Sentiment'] = 0

dataset = pos_dataset.append(neg_dataset)

In [99]:
#dataset = dataset.sample(frac=1)

### Cleanup and tokenize the text data

In [5]:
negation_words = set(['no','not'] + [word for word in stopwords.words('english') if re.search("n'",word)])
unwanted_words = set(stopwords.words('english')).difference(negation_words)
ps = PorterStemmer()

def tokenize(raw_tweet):
    tweet = raw_tweet.lower()
    tweet = tweet.split()
    tweet = [ps.stem(word) for word in tweet if word not in unwanted_words]
    tweet = " ".join(tweet)
    tweet = re.sub("[^a-zA-Z]"," ",tweet)
    return tweet

In [32]:
dataset['corpus'] = dataset['Tweet'].apply(tokenize)

In [33]:
display(dataset.head())

Unnamed: 0,Tweet,Sentiment,corpus
0,#FollowFriday @France_Inte @PKuchly57 @Milipol...,1,followfriday france int pkuchly milipol ...
1,@Lamb2ja Hey James! How odd :/ Please call our...,1,lamb ja hey james odd pleas call contact ...
2,@DespiteOfficial we had a listen last night :)...,1,despiteoffici listen last night bleed amaz...
3,@97sides CONGRATS :),1,side congrat
4,yeaaaah yippppy!!! my accnt verified rqst has...,1,yeaaaah yippppy accnt verifi rqst succeed g...


### Setup Matrices for ML model

In [44]:
cv = CountVectorizer(max_features = 1500)
X = pd.DataFrame(cv.fit_transform(dataset['corpus']).toarray())
y = dataset['Sentiment']

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.15, random_state = 0)

### Fit ML models

In [48]:
classifiers = [GaussianNB(),
              DecisionTreeClassifier(criterion='entropy', random_state=0)
              ]
y_pred = []
for classifier in classifiers:
    classifier.fit(X_train, y_train)
    y_pred.append(classifier.predict(X_test))

### View Results

In [49]:
cm = [confusion_matrix(y_test, y_p) for y_p in y_pred]
for matrix in cm:
    print(matrix,"\n")

[[629 138]
 [388 345]] 

[[598 169]
 [246 487]] 



In [50]:
for array in cm:
    print(array.trace()/array.sum())

0.649333333333
0.723333333333


### Evaluate incorrect tweets

In [56]:
incorrect = y_test != y_pred[-1]

In [60]:
dataset.loc[y_test[incorrect].index,:].head()

Unnamed: 0,Tweet,Sentiment,corpus
906,Hi happys!!:) http://t.co/AGiLlxJdbi,1,hi happys http t co agillxjdbi
906,Twitter Help Center | Why can&amp;#39;t I foll...,0,twitter help center can amp t follow peo...
2343,"My mister is the best mister :) supportive, ki...",1,mister best mister supportive kind absolu...
2343,OH MY GAAAWD :( https://t.co/ZAd3jg0jzF,0,oh gaaawd https t co zad jg jzf
2670,i drew @JustinNFJK :))\n#WIP #SWS #crush @SWSt...,1,drew justinnfjk wip sw crush swstheba...


### Implement VADER method for comparison

In [103]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [72]:
sid = SentimentIntensityAnalyzer()

In [109]:
keys = ['pos','neg','neu','compound']
vader = pd.DataFrame(dataset['Tweet'],index=dataset.index)
for key in keys:
    vader[key] = vader['Tweet'].apply(lambda tweet: sid.polarity_scores(tweet)[key])

In [110]:
vader.head()

Unnamed: 0,Tweet,pos,neg,neu,compound
0,#FollowFriday @France_Inte @PKuchly57 @Milipol...,0.385,0.0,0.615,0.7579
1,@Lamb2ja Hey James! How odd :/ Please call our...,0.27,0.145,0.585,0.6229
2,@DespiteOfficial we had a listen last night :)...,0.294,0.0,0.706,0.7959
3,@97sides CONGRATS :),0.877,0.0,0.123,0.7983
4,yeaaaah yippppy!!! my accnt verified rqst has...,0.282,0.0,0.718,0.795


In [111]:
vader['Sentiment'] = vader['compound'].apply(lambda x: 1 if x>0 else 0)

In [112]:
vader_cm = confusion_matrix(dataset['Sentiment'],vader['Sentiment'])
print(vader_cm, '\n')

print(vader_cm.trace()/vader_cm.sum())

[[3970 1030]
 [ 390 4610]] 

0.858


The NLTK VADER method is much more successful at predicting tweets than the simple ML models implemented before; 85% accuracy vs 72%.