# Trying different Algorithm for Sentiment analysis

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import cross_val_score, cross_val_predict
from sklearn.metrics import classification_report,confusion_matrix
from sklearn import metrics



In [2]:
import os
os.chdir('/home/paperspace/fastai/courses/dl1/')

In [11]:
df_train = pd.read_csv('./data/datamining/train.csv')
df_test = pd.read_csv('./data/datamining/test.csv')

In [29]:
df_train.head()

Unnamed: 0,id,text,airline,tweet_location,user_timezone,sentiment
0,0,@JetBlue great flight! Great view! :-) http://...,Delta,,,positive
1,1,"@united they're not, actually. gate agent was ...",United,chicago,,negative
2,2,@AmericanAir No worries they called back 4 hrs...,American,"Dallas, Texas",,negative
3,3,@united thank you. There was one here a few mo...,United,"New York, NY",America/New_York,positive
4,4,@united Brothers luggage was lost on Copa Airl...,United,"Kearney, Nebraska",Central Time (US & Canada),negative


In [34]:
vectorizer = CountVectorizer(stop_words='english')
train_features = vectorizer.fit_transform([r for r in df_train['text']])

In [57]:
X_train, X_test, y_train, y_test = train_test_split(train_features, 
                 df_train['sentiment'].values,        
                 test_size=0.2)

## Logistis Regression

Cross Validation: https://towardsdatascience.com/train-test-split-and-cross-validation-in-python-80b61beca4b6 <br>

In [58]:
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression()
LR.fit(X_train, y_train)
print("Logistic Regression classification rate:", LR.score(X_test, y_test))

Logistic Regression classification rate: 0.9167468719923003


In [59]:
scores = cross_val_score(LR, X_train, y_train, cv=10)
print("Cross-validated scores:", scores)

Cross-validated scores: [0.92307692 0.91937425 0.90373045 0.92418773 0.90854392 0.90734055
 0.90493381 0.90854392 0.92409639 0.91325301]


In [80]:
predictions = LR.predict(X_test)
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))

[[1587   57]
 [ 116  318]]
             precision    recall  f1-score   support

   negative       0.93      0.97      0.95      1644
   positive       0.85      0.73      0.79       434

avg / total       0.91      0.92      0.91      2078



## Naive Bayes

In [61]:
from sklearn.naive_bayes import MultinomialNB

In [63]:
NB = MultinomialNB()
NB.fit(X_train, y_train)
print("Naive Bayes classification rate:", NB.score(X_test, y_test))

Naive Bayes classification rate: 0.9013474494706448


In [64]:
scores = cross_val_score(NB, X_train, y_train, cv=10)
print("Cross-validated scores:", scores)

Cross-validated scores: [0.87860577 0.89651023 0.86883273 0.8856799  0.89169675 0.88929001
 0.87725632 0.88808664 0.9        0.90120482]


In [79]:
predictions = NB.predict(X_test)
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))

[[1576   68]
 [ 137  297]]
             precision    recall  f1-score   support

   negative       0.92      0.96      0.94      1644
   positive       0.81      0.68      0.74       434

avg / total       0.90      0.90      0.90      2078



## Decision Tree

In [65]:
from sklearn import tree

In [70]:
DT = tree.DecisionTreeClassifier()
DT.fit(X_train, y_train)
print("Decision Tree classification rate:", DT.score(X_test, y_test))

Decision Tree classification rate: 0.8604427333974976


In [68]:
scores = cross_val_score(DT, X_train, y_train, cv=10)
print("Cross-validated scores:", scores)

Cross-validated scores: [0.86778846 0.86642599 0.8423586  0.84356197 0.84717208 0.82912154
 0.86281588 0.84115523 0.86506024 0.84457831]


In [77]:
predictions = DT.predict(X_test)
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))

[[1490  154]
 [ 136  298]]
             precision    recall  f1-score   support

   negative       0.92      0.91      0.91      1644
   positive       0.66      0.69      0.67       434

avg / total       0.86      0.86      0.86      2078



## Neural Network 

In [71]:
from sklearn.neural_network import MLPClassifier

In [81]:
mlp = MLPClassifier(hidden_layer_sizes=(20,20,20),max_iter=500)
mlp.fit(X_train,y_train)
print("Neural Network classification rate:", DT.score(X_test, y_test))

Neural Network classification rate: 0.8604427333974976


In [82]:
predictions = mlp.predict(X_test)
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))

[[1559   85]
 [ 123  311]]
             precision    recall  f1-score   support

   negative       0.93      0.95      0.94      1644
   positive       0.79      0.72      0.75       434

avg / total       0.90      0.90      0.90      2078



# Using Bigger data

In [3]:
df_train = pd.read_csv('./data/datamining/Tweets.csv')

In [11]:
df_train = df_train.rename(columns={'airline_sentiment':'sentiment'})

In [12]:
vectorizer = CountVectorizer(stop_words='english')
train_features = vectorizer.fit_transform([r for r in df_train['text']])

In [13]:
X_train, X_test, y_train, y_test = train_test_split(train_features, 
                 df_train['sentiment'].values,        
                 test_size=0.2)

In [14]:
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression()
LR.fit(X_train, y_train)
print("Logistic Regression classification rate:", LR.score(X_test, y_test))

Logistic Regression classification rate: 0.7756147540983607


In [15]:
scores = cross_val_score(LR, X_train, y_train, cv=10)
print("Cross-validated scores:", scores)

Cross-validated scores: [0.78327645 0.76450512 0.79863481 0.79010239 0.77796755 0.78650726
 0.78992314 0.78479932 0.78736123 0.80325064]


In [17]:
# Logistic Regression
predictions = LR.predict(X_test)
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))

[[1610  136   48]
 [ 245  337   58]
 [ 102   68  324]]
             precision    recall  f1-score   support

   negative       0.82      0.90      0.86      1794
    neutral       0.62      0.53      0.57       640
   positive       0.75      0.66      0.70       494

avg / total       0.77      0.78      0.77      2928



In [19]:
# Bayesian
from sklearn.naive_bayes import MultinomialNB
NB = MultinomialNB()
NB.fit(X_train, y_train)
print("Naive Bayes classification rate:", NB.score(X_test, y_test))

Naive Bayes classification rate: 0.7592213114754098


In [20]:
scores = cross_val_score(NB, X_train, y_train, cv=10)
print("Cross-validated scores:", scores)

Cross-validated scores: [0.74488055 0.75426621 0.75426621 0.75426621 0.76345004 0.74978651
 0.72758326 0.72843723 0.74807857 0.77331052]


In [21]:
predictions = NB.predict(X_test)
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))

[[1677   79   38]
 [ 328  264   48]
 [ 157   55  282]]
             precision    recall  f1-score   support

   negative       0.78      0.93      0.85      1794
    neutral       0.66      0.41      0.51       640
   positive       0.77      0.57      0.65       494

avg / total       0.75      0.76      0.74      2928



In [22]:
# Decision Tree
from sklearn import tree
DT = tree.DecisionTreeClassifier()
DT.fit(X_train, y_train)
print("Decision Tree classification rate:", DT.score(X_test, y_test))

Decision Tree classification rate: 0.6796448087431693


In [23]:
scores = cross_val_score(DT, X_train, y_train, cv=10)
print("Cross-validated scores:", scores)

Cross-validated scores: [0.6612628  0.69027304 0.65955631 0.66211604 0.69000854 0.69086251
 0.66268147 0.67976089 0.68488471 0.679213  ]


In [24]:
predictions = DT.predict(X_test)
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))

[[1389  295  110]
 [ 257  319   64]
 [ 121   91  282]]
             precision    recall  f1-score   support

   negative       0.79      0.77      0.78      1794
    neutral       0.45      0.50      0.47       640
   positive       0.62      0.57      0.59       494

avg / total       0.68      0.68      0.68      2928



In [25]:
# Neural Network
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes=(20,20,20),max_iter=500)
mlp.fit(X_train,y_train)
print("Neural Network classification rate:", DT.score(X_test, y_test))

Neural Network classification rate: 0.6796448087431693


In [26]:
predictions = mlp.predict(X_test)
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))

[[1476  231   87]
 [ 215  353   72]
 [  97   94  303]]
             precision    recall  f1-score   support

   negative       0.83      0.82      0.82      1794
    neutral       0.52      0.55      0.54       640
   positive       0.66      0.61      0.63       494

avg / total       0.73      0.73      0.73      2928

