### Import all required libraries

In [82]:
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display
from nltk.corpus import twitter_samples
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
try:
    # Current location (scikit-learn >= 0.18).
    from sklearn.model_selection import train_test_split
except ImportError:
    # Legacy location, removed in scikit-learn 0.20.
    from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix

%matplotlib inline

### Set up the Twitter dataset

In [5]:
# List the file ids available in the NLTK twitter_samples corpus.
twitter_samples.fileids()

['negative_tweets.json', 'positive_tweets.json', 'tweets.20150430-223406.json']

In [6]:
# Read the tweet texts from the positive and the negative sample files.
pos_tweets, neg_tweets = (
    twitter_samples.strings(fileid)
    for fileid in ('positive_tweets.json', 'negative_tweets.json')
)

In [22]:
# Label every tweet with its sentiment class: 1 = positive, 0 = negative.
pos_dataset = pd.DataFrame(pos_tweets, columns=['Tweet'])
pos_dataset['Sentiment'] = 1
neg_dataset = pd.DataFrame(neg_tweets, columns=['Tweet'])
neg_dataset['Sentiment'] = 0

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is
# the supported equivalent. As with append, each frame keeps its own index, so
# index labels repeat between the positive and negative rows.
dataset = pd.concat([pos_dataset, neg_dataset])

In [26]:
# Shuffle all rows so the two sentiment classes are interleaved. The fixed seed
# makes the shuffle (and therefore the train/test split and the reported scores)
# repeatable across kernel restarts; outputs recorded from an unseeded run will
# differ from a fresh run.
dataset = dataset.sample(frac=1, random_state=0)

### Clean up and tokenize the text data

In [39]:
ps = PorterStemmer()

# Build the English stop-word set once. Previously it was rebuilt inside the
# comprehension for every single word of every tweet, which dominated the
# runtime of the cleaning step without changing the result.
STOP_WORDS = set(stopwords.words('english'))


def tokenize(raw_tweet):
    """Normalise a raw tweet into a space-separated string of word stems.

    Steps, in order:
      1. replace every character that is not an ASCII letter with a space,
      2. lower-case the text and split it on whitespace,
      3. drop words found in the English stop-word list,
      4. stem the remaining words with the Porter stemmer,
      5. join the stems back together with single spaces.

    Parameters
    ----------
    raw_tweet : str
        The original tweet text.

    Returns
    -------
    str
        The cleaned tweet; an empty string if no word survives filtering.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", raw_tweet)
    words = letters_only.lower().split()
    # The stop-word check runs on the un-stemmed word, then the word is stemmed.
    stems = [ps.stem(word) for word in words if word not in STOP_WORDS]
    return " ".join(stems)

In [40]:
# Clean every tweet; the result is a Series of stemmed, space-joined tokens
# aligned row-for-row with `dataset`.
corpus = dataset.loc[:, 'Tweet'].apply(tokenize)

In [44]:
# Show the first cleaned tweets next to the corresponding raw rows.
display(corpus.head(), dataset.head())

1547                                  hous scari af night
2818    nickiepedia parent na plan usual work huhu cou...
473                                smileformeacc hug mani
3921                   gvmba shoulda let borrow one knive
1780    sophielbradshaw realli sorri hear sophi certai...
Name: Tweet, dtype: object

Unnamed: 0,Tweet,Sentiment
1547,My house scary AF at night :(,0
2818,@nickiepedia :( I can't. My parents are here n...,0
473,@Smileformeacc *hug* you have many :),1
3921,@Gvmba shoulda let me borrow one of those kniv...,1
1780,@sophielbradshaw We are really sorry to hear t...,0


### Set up matrices for the ML models

In [78]:
# Bag-of-words encoding of the cleaned tweets, capped at 1500 vocabulary terms.
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()  # document-term counts as a dense array

# Target vector: the 0/1 sentiment label of each tweet, in the same row order as X.
y = dataset.loc[:, 'Sentiment'].values

In [79]:
# Hold out 15% of the samples for evaluation; the fixed seed keeps the split
# repeatable for a given X and y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=0,
)

### Fit ML models

In [86]:
# Candidate models: Gaussian naive Bayes and an entropy-criterion decision tree.
classifiers = [
    GaussianNB(),
    DecisionTreeClassifier(criterion='entropy', random_state=0),
]

# Train each model on the training split and collect its predictions for the
# held-out split, in the same order as `classifiers`.
y_pred = []
for classifier in classifiers:
    # fit() returns the fitted estimator, so the calls can be chained.
    y_pred.append(classifier.fit(X_train, y_train).predict(X_test))

### View Results

In [87]:
# One confusion matrix per model, computed against the true test labels and
# kept in the same order as the predictions in `y_pred`.
cm = [confusion_matrix(y_test, predictions) for predictions in y_pred]
cm

[array([[592, 162],
        [348, 398]], dtype=int64), array([[558, 196],
        [252, 494]], dtype=int64)]

In [90]:
# Accuracy per model: correct predictions (the matrix diagonal) divided by the
# total number of test samples.
for matrix in cm:
    print(np.trace(matrix) / matrix.sum())

0.66
0.701333333333
