In [54]:
#importing libraries
import re
import warnings 
import nltk
from nltk.tokenize import TweetTokenizer #for tokenize text 
from nltk.stem.snowball import SnowballStemmer # for Stemming word 
#from nltk.stem.lancaster import LancasterStemmer 
import pandas as pd
from sklearn.cross_validation import train_test_split #for splitting data into train and test
from sklearn.feature_extraction.text import CountVectorizer #for vectorize text into sparse matrix 
from sklearn import metrics # for findin the accuracy of model 
import collections #for finding each class true prediction frequency 
from sklearn.naive_bayes import MultinomialNB # import and instantiate MultinomialNB
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier

In [20]:
%matplotlib inline
warnings.filterwarnings("ignore")

In [21]:
tweet = pd.read_csv("Tweets.csv") #reading tweet.csv file using pandas
df=tweet.iloc[:,(10,1)]  
df.columns = ['text', 'sentiment'] #only two column from data text and sentiment 
data= df
print data.head() 

                                                text sentiment
0                @VirginAmerica What @dhepburn said.   neutral
1  @VirginAmerica plus you've added commercials t...  positive
2  @VirginAmerica I didn't today... Must mean I n...   neutral
3  @VirginAmerica it's really aggressive to blast...  negative
4  @VirginAmerica and it's a really big bad thing...  negative

[5 rows x 2 columns]


In [22]:
# removes every thing except text 
data['text']=data['text'].str.replace("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|([0-9])","")
# now tokenize text
data['text']=data['text'].apply(nltk.word_tokenize)
print data.head() #first five row is printing after tokenized

                                                text sentiment
0                                       [What, said]   neutral
1  [plus, youve, added, commercials, to, the, exp...  positive
2  [I, didnt, today, Must, mean, I, need, to, tak...   neutral
3  [its, really, aggressive, to, blast, obnoxious...  negative
4  [and, its, a, really, big, bad, thing, about, it]  negative

[5 rows x 2 columns]


In [23]:
#Stemming each word 
stemmer = SnowballStemmer('english')
data['text']=data['text'].apply(lambda x: [stemmer.stem(y) for y in x])
print data.head()

                                                text sentiment
0                                       [what, said]   neutral
1  [plus, youv, ad, commerci, to, the, experi, ta...  positive
2  [i, didnt, today, must, mean, i, need, to, tak...   neutral
3  [it, realli, aggress, to, blast, obnoxi, enter...  negative
4   [and, it, a, realli, big, bad, thing, about, it]  negative

[5 rows x 2 columns]


In [24]:
# removing stopword 
stopwords = nltk.corpus.stopwords.words('english')
data['text']=data['text'].apply(lambda x: [y for y in x if y not in stopwords])
print data.head()

                                                text sentiment
0                                             [said]   neutral
1          [plus, youv, ad, commerci, experi, tacki]  positive
2  [didnt, today, must, mean, need, take, anoth, ...   neutral
3  [realli, aggress, blast, obnoxi, entertain, gu...  negative
4                          [realli, big, bad, thing]  negative

[5 rows x 2 columns]


In [25]:
# Detokenize cleaned dataframe for vectorizing
data['text'] = data['text'].str.join(" ")
print data.head()
print "data shape = ", data.shape

                                                text sentiment
0                                               said   neutral
1                 plus youv ad commerci experi tacki  positive
2         didnt today must mean need take anoth trip   neutral
3  realli aggress blast obnoxi entertain guest fa...  negative
4                               realli big bad thing  negative

[5 rows x 2 columns]
data shape =  (14640, 2)


In [26]:
#transforming postive to 2, netural to 1, negative to 0
sentiment = sorted(data['sentiment'].unique())
sentiment_mapping = dict(zip(sentiment, range(0, len(sentiment) + 1)))
data['sentiment']  = data['sentiment'].map(sentiment_mapping).astype(int)
print data.head()

                                                text  sentiment
0                                               said          1
1                 plus youv ad commerci experi tacki          2
2         didnt today must mean need take anoth trip          1
3  realli aggress blast obnoxi entertain guest fa...          0
4                               realli big bad thing          0

[5 rows x 2 columns]


In [27]:
X = data['text']
y = data['sentiment']


In [28]:
#splitting data into train and test sets  
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print "size of train data = ", X_train.shape,"size of test data = ", X_test.shape
print "size of target train data = ", y_train.shape, "size of target test data = ",y_test.shape

size of train data =  (10980,) size of test data =  (3660,)
size of target train data =  (10980,) size of target test data =  (3660,)


In [29]:
vect = CountVectorizer()
#  fit and transform X_train into X_tr
X_tr =  vect.fit_transform(X_train)
# transform X_test into X_te
X_te = vect.transform(X_test)
print "After vectorizing size of x_train = ", X_tr.shape
print "After vectorizing size of x_test = ", X_te.shape 


After vectorizing size of x_train =  (10980, 8360)
After vectorizing size of x_test =  (3660, 8360)


In [30]:
nb = MultinomialNB()
# train the model using X_train_dtm
nb.fit(X_tr, y_train)
# make class predictions for X_te
y_pred = nb.predict(X_te)
# calculate accuracy of class predictions
score = metrics.accuracy_score(y_test, y_pred)
print score

0.769672131148


In [31]:
# print the confusion matrix
metrics.confusion_matrix(y_test, y_pred)

array([[2184,   77,   30],
       [ 425,  295,   54],
       [ 208,   49,  338]])

In [32]:
#postive = 2, netural = 1, negative = 0

In [33]:
#Precision: When a given class is predicted, how often are those predictions correct?

In [34]:
# manually calculate the precision for class 0
precision0 = 2184 / float(2184 + 425 + 208 )
print(precision0)

0.77529286475


In [35]:
# manually calculate the precision for class 1
precision1 = 295 / float(77 + 295 + 49 )
print(precision1)

0.700712589074


In [36]:
# manually calculate the precision for class 2
precision2 = 338 / float(30 + 54 + 338 )
print(precision2)

0.800947867299


In [37]:
#Recall: "When a given class is the true class, how often is that class predicted?"

In [38]:
# manually calculate the recall for class 0
recall0 = 2184 / float(2184 + 77 + 30 )
print(recall0)

0.953295504147


In [39]:
# manually calculate the recall for class 1
recall1 = 295 / float(425 + 295 + 54 )
print(recall1)

0.381136950904


In [40]:
# manually calculate the recall for class 2
recall2 = 338 / float(208 + 49 + 338 )
print(recall2)

0.568067226891


In [41]:
#F1 score:  is a weighted average of precision and recall.

In [42]:
# manually calculate the F1 score for class 0
f1 = 2 * (precision0 * recall0) / (precision0 + recall0)
print(f1)

0.855129209084


In [43]:
# manually calculate the F1 score for class 1
f1 = 2 * (precision1 * recall1) / (precision1 + recall1)
print(f1)

0.493723849372


In [44]:
# manually calculate the F1 score for class 2
f1 = 2 * (precision2 * recall2) / (precision2 + recall2)
print(f1)

0.664700098328


In [45]:
#Support: "How many observations exist for which a given class is the true class?"

In [46]:
support = collections.Counter(y_test)
print (support)

Counter({0: 2291, 1: 774, 2: 595})


In [47]:
# print obove result using classification report 
print(metrics.classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.78      0.95      0.86      2291
          1       0.70      0.38      0.49       774
          2       0.80      0.57      0.66       595

avg / total       0.76      0.77      0.75      3660

