In [1]:
# Natural Language Processing

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Importing the dataset
# The file is read as tab-separated; header=None tells pandas not to treat
# the first line as column names, so the names are supplied explicitly.
df = pd.read_csv(
    'sentiment.tsv',
    sep='\t',
    header=None,
    names=["sentiment", "text"],
)
df.head()

Unnamed: 0,sentiment,text
0,neg,"@jamielewislewis i cant believe it, it really ..."
1,pos,having a vodka tonic and looking forward to go...
2,pos,@ddlovatofans1neg1 Could you follow me please....
3,pos,@jordanknight for once.................. PLEAS...
4,neg,Had a dream about a walk in fast food resturau...


In [3]:
# Cleaning the texts
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Build the stop-word set and the stemmer once. Both are loop-invariant;
# recreating them for every token/row only slows the cell down.
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

corpus = []
# Iterate over the column values directly rather than looking rows up by
# label, so the loop does not depend on the index being 0..n-1.
for text in df['text']:
    # Keep letters only, lowercase, and tokenize on whitespace.
    review = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    # Drop English stop words and reduce the remaining tokens to their stems.
    review = [ps.stem(word) for word in review if word not in stop_words]
    review = ' '.join(review)
    corpus.append(review)
    

[nltk_data] Downloading package stopwords to /home/max/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
# Display the cleaned, stemmed documents collected in `corpus`
corpus

['jamielewislewi cant believ realli doesnt belong hope doesnt ruin im bad thank how life treatin x',
 'vodka tonic look forward go saddl ranch westgat husband san kiddo',
 'ddlovatofan neg could follow pleas would realli appreci',
 'jordanknight pleas tell us u think person',
 'dream walk fast food resturaunt sold ice cream guitar string sold gaug',
 'troybrownbbnew ye quot friend quot',
 'pack work expier',
 'hate internet explrer angri angri choic http plurk com p rqqi',
 'federalcas said go eat negneg p disappoint',
 'babyk yeah surgeri till cold gone poor thing still runni eye',
 'britruxpin say answer pant',
 'place call negut take somebodi http myloc xri',
 'still fli la ltr hayyy sleep depriv',
 'go look like wont get earli',
 'increibl someon forgot u soo fast',
 'littl girl holli find time aw get load vote bet',
 'aapko huyi asuvidha ke liy hume khed hay train neg minut late',
 'brightondol ugh po go strong spent like hour play rollercoast tycoon bore',
 'back think everyon',


In [5]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer

# Token-count features, limited by max_features to 1500 vocabulary terms,
# converted to a dense array.
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus).toarray()

# Boolean target: True where the label is 'pos', False otherwise.
# It is derived without overwriting df["sentiment"], so the cell can be
# re-run safely; mutating the column in place would make a second run compare
# booleans against 'pos' and silently turn every label into False.
y = (df["sentiment"] == 'pos').values
y.shape

(2001,)

In [6]:
# Show the target vector (True where the sentiment label is 'pos')
y

array([False,  True,  True, ...,  True,  True,  True], dtype=bool)

In [8]:
# Show the dense bag-of-words count matrix produced by CountVectorizer
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [7]:
from sklearn.metrics import accuracy_score as accuracy
from sklearn import linear_model
from sklearn import tree
from sklearn import svm
from sklearn import ensemble
from sklearn import neighbors
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
# sklearn.cross_validation was removed in scikit-learn 0.20; the splitters
# now live in sklearn.model_selection and take n_splits instead of the
# data length / labels.
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

# Fixed seed so the shuffled folds are the same on every run.
SEED = 42

# 10-fold splitters: plain shuffled folds and class-stratified folds.
kf = KFold(n_splits=10, shuffle=True, random_state=SEED)
kf2 = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)

# Baseline: accuracy of always predicting 0 (the negative class).
print('Dump Classifier:               {:.3f}'.format(accuracy(y, [0] * len(y))))

# (label, constructor) pairs; a fresh default-configured estimator is built
# for every evaluation.
model_factories = [
    ("GaussianNB", GaussianNB),
    ("RandomForest", ensemble.RandomForestClassifier),
    ("GradientBoosting", ensemble.GradientBoostingClassifier),
    ("XGB", XGBClassifier),
]

# Score every model first on the plain folds, then on the stratified folds,
# printing mean accuracy and its standard deviation across folds.
for splitter in (kf, kf2):
    for label, make_model in model_factories:
        results = cross_val_score(make_model(), X = X, y = y, cv = splitter)
        print("%s Accuracy: %.3f (%.3f)" % (label, results.mean(), results.std()))



Dump Classifier:               0.492
GaussianNB Accuracy: 0.619 (0.035)
RandomForest Accuracy: 0.643 (0.021)
GradientBoosting Accuracy: 0.644 (0.021)
XGB Accuracy: 0.634 (0.029)
GaussianNB Accuracy: 0.610 (0.035)
RandomForest Accuracy: 0.650 (0.030)
GradientBoosting Accuracy: 0.641 (0.047)
XGB Accuracy: 0.648 (0.041)


In [11]:
from sklearn.svm import SVC

# Cross-validate a default-configured SVC on the stratified folds (kf2) and
# report the mean accuracy with its standard deviation across folds.
results = cross_val_score(estimator=SVC(), X=X, y=y, cv=kf2)
mean_acc, std_acc = results.mean(), results.std()
print("SVC Accuracy: %.3f (%.3f)" % (mean_acc, std_acc))

XGB Accuracy: 0.508 (0.001)
