In [36]:
# import the required libraries
%matplotlib inline
import pandas as pd
import numpy as np


import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import text_normalizer as tn


from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [37]:
# One estimator per model family, otherwise left at scikit-learn defaults.
# The decision tree and the random forest are randomized learners: without a fixed
# random_state their test scores change on every kernel restart, so the reported
# comparison could not be reproduced.
SEED = 12  # same seed value the train/test split uses

gnb = GaussianNB()
lr = LogisticRegression()
svc = SVC()
dt = DecisionTreeClassifier(random_state=SEED)
rf = RandomForestClassifier(random_state=SEED)

In [2]:
# Amazon product reviews: a tab-separated text file without a header row.
amazon_cells = pd.read_csv(
    'data/sentiment labelled sentences/amazon_cells_labelled.txt',
    sep='\t',
    header=None,
)

In [3]:
# The file has no header, so label the two columns: review text, then its sentiment.
column_names = ['Review', 'Sentiment']
amazon_cells.columns = column_names

In [4]:
# Peek at the first five rows to check the load and the column names.
amazon_cells.head(n=5)

Unnamed: 0,Review,Sentiment
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [15]:
# Summarize the frame: row count, per-column non-null counts, dtypes and memory usage.
amazon_cells.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Review     1000 non-null   object
 1   Sentiment  1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


In [5]:
# Count missing values per column.
amazon_cells.isnull().sum()

Review       0
Sentiment    0
dtype: int64

In [6]:
# Class balance: share of reviews carrying each sentiment label.
amazon_cells.loc[:, 'Sentiment'].value_counts(normalize=True)

0    0.5
1    0.5
Name: Sentiment, dtype: float64

In [7]:
# Hold out 30% of the reviews for testing; the fixed seed keeps the split repeatable.

X = amazon_cells['Review']
y = amazon_cells['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.3, random_state=12
)

In [12]:
# Clean the raw review text before vectorizing. Both splits are passed through
# tn.normalize_corpus with one shared set of switches so they are treated identically.

normalize_options = dict(
    accented_char_removal=True,
    html_stripping=False,
    contraction_expansion=False,
    stopword_removal=True,
    text_lower_case=True,
    remove_digits=False,
)

X_train_norm = tn.normalize_corpus(corpus=X_train, **normalize_options)

X_test_norm = tn.normalize_corpus(corpus=X_test, **normalize_options)

In [19]:
# Set up the two feature extractors (nothing is fitted in this cell): raw term counts
# (bag of words) and TF-IDF weights, both over unigrams and bigrams, with the
# document-frequency bounds left wide open.

ngram_span = (1, 2)
count_vect = CountVectorizer(binary=False, min_df=0.0, max_df=1.0, ngram_range=ngram_span)
tfidf = TfidfVectorizer(use_idf=True, min_df=0.0, max_df=1.0, ngram_range=ngram_span)

In [20]:
# Learn each vocabulary from the training reviews only ...
cv_train_features = count_vect.fit_transform(X_train_norm)
tv_train_features = tfidf.fit_transform(X_train_norm)

# ... then map the test reviews onto those already-fitted vocabularies.
cv_test_features = count_vect.transform(X_test_norm)
tv_test_features = tfidf.transform(X_test_norm)

In [21]:
# bag of words: report (documents, vocabulary size) for each split
for label, features in (('train features: ', cv_train_features),
                        ('test features: ', cv_test_features)):
    print(label, features.shape)

train features:  (700, 4280)
test features:  (300, 4280)


In [22]:
# tfidf: report (documents, vocabulary size) for each split
for label, features in (('train features: ', tv_train_features),
                        ('test features: ', tv_test_features)):
    print(label, features.shape)

train features:  (700, 4280)
test features:  (300, 4280)


In [38]:
# Fit every classifier on the TF-IDF features and record its score on the test split.
clf_list = [gnb, lr, svc, dt, rf]
clf_names = ['Gaussian NB', 'Logistic Regression', 'SVC', 'Decision tree', 'Random forest']
# The two lists are matched by position; fail fast if they drift apart.
assert len(clf_list) == len(clf_names), 'clf_list and clf_names must line up one-to-one'

# Densify once up front: .toarray() allocates a new dense matrix on each call, so
# calling it inside the loop would repeat that work for every classifier.
# Every estimator still receives dense input.
tv_train_dense = tv_train_features.toarray()
tv_test_dense = tv_test_features.toarray()

model_list = []
scores_list = []
for clf in clf_list:
    model = clf.fit(tv_train_dense, y_train)    # fit() hands back the fitted estimator
    scores = clf.score(tv_test_dense, y_test)   # classifier score() = mean accuracy
    model_list.append(model)
    scores_list.append(scores)

In [39]:
# Test-set score of each classifier, labeled with its display name.
accuracy_by_model = pd.Series(data=scores_list, index=clf_names)
accuracy_by_model

Gaussian NB            0.740000
Logistic Regression    0.803333
SVC                    0.803333
Decision tree          0.760000
Random forest          0.790000
dtype: float64