### Import all required libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display
from nltk.corpus import twitter_samples
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
import re
% matplotlib inline



### Setup twitter dataset

In [3]:
twitter_samples.fileids()

['negative_tweets.json', 'positive_tweets.json', 'tweets.20150430-223406.json']

In [4]:
pos_tweets = twitter_samples.strings('positive_tweets.json')
neg_tweets = twitter_samples.strings('negative_tweets.json')

In [98]:
pos_dataset = pd.DataFrame(pos_tweets,columns=['Tweet'])
pos_dataset['Sentiment'] = 1
neg_dataset = pd.DataFrame(neg_tweets,columns=['Tweet'])
neg_dataset['Sentiment'] = 0

dataset = pos_dataset.append(neg_dataset)

In [99]:
#dataset = dataset.sample(frac=1)

### Cleanup and tokenize the text data

In [100]:
negation_words = set(['no','not'] + [word for word in stopwords.words('english') if re.search("n'",word)])
unwanted_words = set(stopwords.words('english')).difference(negation_words)
ps = PorterStemmer()

def tokenize(raw_tweet):
    tweet = raw_tweet.lower()
    tweet = tweet.split()
    tweet = [ps.stem(word) for word in tweet if word not in unwanted_words]
    tweet = " ".join(tweet)
    tweet = re.sub("[^a-zA-Z]"," ",tweet)
    return tweet

In [101]:
corpus = dataset['Tweet'].apply(tokenize)

In [102]:
display(corpus.head())
display(dataset.head())

0     followfriday  france int  pkuchly    milipol ...
1     lamb ja hey james  odd    pleas call contact ...
2     despiteoffici listen last night    bleed amaz...
3                                      side congrat   
4    yeaaaah yippppy    accnt verifi rqst succeed g...
Name: Tweet, dtype: object

Unnamed: 0,Tweet,Sentiment
0,#FollowFriday @France_Inte @PKuchly57 @Milipol...,1
1,@Lamb2ja Hey James! How odd :/ Please call our...,1
2,@DespiteOfficial we had a listen last night :)...,1
3,@97sides CONGRATS :),1
4,yeaaaah yippppy!!! my accnt verified rqst has...,1


### Setup Matrices for ML model

In [103]:
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus).toarray()
y = dataset['Sentiment'].values

In [104]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.15, random_state = 0)

### Fit ML models

In [105]:
classifiers = [GaussianNB(),
              DecisionTreeClassifier(criterion='entropy', random_state=0)
              ]
y_pred = []
for classifier in classifiers:
    classifier.fit(X_train, y_train)
    y_pred.append(classifier.predict(X_test))

### View Results

In [106]:
cm = [confusion_matrix(y_test, y_p) for y_p in y_pred]
for matrix in cm:
    print(matrix,"\n")

[[629 138]
 [388 345]] 

[[598 169]
 [246 487]] 



In [107]:
for array in cm:
    print(array.trace()/array.sum())

0.649333333333
0.723333333333


### Evaluate incorrect tweets

In [112]:
incorrect = y_test != y_pred[-1]
incorrect

array([False, False, False, ...,  True,  True, False], dtype=bool)

In [113]:
cv.inverse_transform(X_test[incorrect])

[array(['amp', 'can', 'co', 'follow', 'help', 'https', 'iq', 'one',
        'people', 'realli', 'sorri', 'support', 'twitter', 'via'], 
       dtype='<U27'), array(['absolut', 'amaz', 'best', 'co', 'http', 'kind'], 
       dtype='<U27'), array(['co', 'http'], 
       dtype='<U27'), array(['done', 'first', 'give', 'go', 'll', 'tweet'], 
       dtype='<U27'), array(['cross', 'end', 'favourit', 'finger', 'isnt', 'keep', 'let',
        'thing', 'work'], 
       dtype='<U27'), array(['di', 'favorit', 'late', 'stay'], 
       dtype='<U27'), array(['best', 'player'], 
       dtype='<U27'), array(['day', 'got', 'need', 'someth', 'tuesday', 'whole'], 
       dtype='<U27'), array(['couldn', 'go', 'last', 'love', 'night', 'parti', 'seen', 'shame',
        'would'], 
       dtype='<U27'), array(['tri', 'yeah'], 
       dtype='<U27'), array(['bed', 'got', 'yet'], 
       dtype='<U27'), array(['hope', 'next', 'ye', 'year'], 
       dtype='<U27'), array(['bring', 'made', 'mouth'], 
       dtype='<U27