# IMPORTING PACKAGES

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import re
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

In [2]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Mohit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mohit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# READING THE DATA

In [3]:
train = pd.read_csv('train.csv',encoding='latin1')

In [4]:
train.drop('ItemID',axis=1,inplace=True)

In [5]:
train.head()

Unnamed: 0,Sentiment,SentimentText
0,0,is so sad for my APL frie...
1,0,I missed the New Moon trail...
2,1,omg its already 7:30 :O
3,0,.. Omgaga. Im sooo im gunna CRy. I'...
4,0,i think mi bf is cheating on me!!! ...


In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99989 entries, 0 to 99988
Data columns (total 2 columns):
Sentiment        99989 non-null int64
SentimentText    99989 non-null object
dtypes: int64(1), object(1)
memory usage: 1.5+ MB


In [7]:
train['Sentiment'].value_counts()

1    56457
0    43532
Name: Sentiment, dtype: int64

# DATA CLEANING

In [8]:
#removal of urls
def rem_url(col):
    text=col[0]
    text=re.sub(r'http\S+',"",text)
    return text
train[['SentimentText']]=train[['SentimentText']].apply(rem_url,axis=1)

In [9]:
#removal of usernames
def rem_user(col):
    text=col[0]
    text=re.sub(r'@[A-Za-z0-9_]{1,15}',"",text)
    return text
train[['SentimentText']]=train[['SentimentText']].apply(rem_user,axis=1)

In [10]:
#removal of special characters
def rem_special(col):
    text=col[0]
    text=re.sub(r'[^A-Za-z0-9]+'," ",text)
    return text
train[['SentimentText']]=train[['SentimentText']].apply(rem_special,axis=1)

In [11]:
#removal of spaces from start and end
def rem_space(col):
    text=col[0]
    text.strip()
    return text
train[['SentimentText']]=train[['SentimentText']].apply(rem_space,axis=1)

In [12]:
#removal of digits
def rem_dig(col):
    text=col[0]
    text=re.sub(r'\d+',"",text)
    return text
train[['SentimentText']]=train[['SentimentText']].apply(rem_dig,axis=1)

In [13]:
#conversion to lowercase
def con_tolower(col):
    text=col[0]
    text=text.lower()
    return text
train[['SentimentText']]=train[['SentimentText']].apply(con_tolower,axis=1)

In [14]:
train.head(50)

Unnamed: 0,Sentiment,SentimentText
0,0,is so sad for my apl friend
1,0,i missed the new moon trailer
2,1,omg its already o
3,0,omgaga im sooo im gunna cry i ve been at this...
4,0,i think mi bf is cheating on me t t
5,0,or i just worry too much
6,1,juuuuuuuuuuuuuuuuussssst chillin
7,0,sunny again work tomorrow tv tonight
8,1,handed in my uniform today i miss you already
9,1,hmmmm i wonder how she my number


# TOKENIZATION AND STEMMING

In [15]:
ps=PorterStemmer()
from nltk.corpus import stopwords

In [16]:
train.shape[0]

99989

In [17]:
corpus = []
for i in range(0,99989):
    tweet = train['SentimentText'][i]
    tweet = tweet.split()
    tweet = [ps.stem(word) for word in tweet if not word in set(stopwords.words('english'))]
    tweet = ' '.join(tweet)
    corpus.append(tweet)

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

In [19]:
cv=CountVectorizer(max_features=2500)

In [20]:
X=cv.fit_transform(corpus).toarray()

In [21]:
y=np.asarray(train['Sentiment'])

In [22]:
y

array([0, 0, 1, ..., 0, 1, 1], dtype=int64)

# SPLITTING TRAIN TEST DATA

In [23]:
from sklearn.cross_validation import train_test_split



In [24]:
X_train , X_test , y_train , y_test = train_test_split(X,y,test_size=0.3,random_state=0)

# FITTING CLASSIFICATION MODELS

In [25]:
from sklearn.naive_bayes import GaussianNB

In [26]:
model = GaussianNB()

In [27]:
model.fit(X_train,y_train)

GaussianNB(priors=None)

In [28]:
predictions=model.predict(X_test)

In [29]:
from sklearn.metrics import confusion_matrix,classification_report

In [30]:
cm=confusion_matrix(y_test,predictions)

In [32]:
print(cm)

[[ 7872  5288]
 [ 4507 12330]]


In [33]:
cr=classification_report(y_test,predictions)

In [34]:
print(cr)

             precision    recall  f1-score   support

          0       0.64      0.60      0.62     13160
          1       0.70      0.73      0.72     16837

avg / total       0.67      0.67      0.67     29997



In [35]:
#using randomforest
from sklearn.ensemble import RandomForestClassifier

In [43]:
model1=RandomForestClassifier(n_estimators=100)

In [44]:
model1.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [45]:
predictions1=model1.predict(X_test)

In [46]:
cm1=confusion_matrix(y_test,predictions1)

In [47]:
print(cm1)

[[ 8705  4455]
 [ 3839 12998]]


In [48]:
cr1=classification_report(y_test,predictions1)

In [49]:
print(cr1)

             precision    recall  f1-score   support

          0       0.69      0.66      0.68     13160
          1       0.74      0.77      0.76     16837

avg / total       0.72      0.72      0.72     29997

