In [None]:
import numpy as np # linear algebra
import pandas as pd 

In [47]:
# graphics imports
import plotly
import plotly.graph_objs as go
import matplotlib.pyplot as plt
from wordcloud import WordCloud,STOPWORDS
import plotly.express as px
# Natural language tool kits
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk.tokenize.toktok import ToktokTokenizer  
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# download stopwords
nltk.download('stopwords')
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
# string operations
import string 
import re

# general imports
import math

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# load data
df = pd.read_csv('/content/IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
lens = df['review'].str.len()

fig = go.Figure()
fig.add_trace(
    go.Histogram(x=lens, xbins=dict(size=200))
    )
fig.update_layout(title='Length of reviews', 
                    xaxis_title="Length",
                    yaxis_title="# of reviews")
plotly.offline.iplot(fig)

In [None]:
poslens = df[df['sentiment']=='positive']['review'].str.len()
neglens = df[df['sentiment']=='negative']['review'].str.len()
fig = go.Figure()
fig.add_trace(
    go.Histogram(x=poslens, xbins=dict(size=200), name='positive'),
    )
fig.add_trace(
    go.Histogram(x=neglens, xbins=dict(size=200), name='negative'),
    )
fig.update_layout(title='Length of reviews', 
                    xaxis_title="Length",
                    yaxis_title="# of reviews",)
plotly.offline.iplot(fig)

In [None]:
# the text mode is enough...
df['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [None]:
# show the reviews again... 
df[['review']].head(20)

Unnamed: 0,review
0,One of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...
2,I thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...
4,"Petter Mattei's ""Love in the Time of Money"" is..."
5,"Probably my all-time favorite movie, a story o..."
6,I sure would like to see a resurrection of a u...
7,"This show was an amazing, fresh & innovative i..."
8,Encouraged by the positive comments about this...
9,If you like original gut wrenching laughter yo...


In [None]:
#Lowercase
df['review_lw'] = df['review'].str.lower()
df[['review','review_lw']].head(5)

Unnamed: 0,review,review_lw
0,One of the other reviewers has mentioned that ...,one of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...,a wonderful little production. <br /><br />the...
2,I thought this was a wonderful way to spend ti...,i thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...,basically there's a family where a little boy ...
4,"Petter Mattei's ""Love in the Time of Money"" is...","petter mattei's ""love in the time of money"" is..."


In [11]:
#Stopwords and punctuations
sw = stopwords.words('english')

print(f'Stopwords sample: {sw[0:10]}')
print(f'Number of stopwords: {len(sw)}')

Stopwords sample: ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]
Number of stopwords: 179


In [12]:
print(f'Punctuation {string.punctuation}')

Punctuation !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [13]:
def transform_text(s):
    
    # remove html
    html=re.compile(r'<.*?>')
    s = html.sub(r'',s)
    
    # remove numbers
    s = re.sub(r'\d+', '', s)
    
    # remove punctuation
    # remove stopwords
    tokens = nltk.word_tokenize(s)
    
    new_string = []
    for w in tokens:
        # remove words with len = 2 AND stopwords
        if len(w) > 2 and w not in sw:
            new_string.append(w)
    
    
    
    s = ' '.join(new_string)
    s = s.strip()

    exclude = set(string.punctuation)
    s = ''.join(ch for ch in s if ch not in exclude)
    
    return s.strip()

In [17]:
#removing the stopwords
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text
#Apply function on review column
df['review']=df['review'].apply(remove_stopwords)

In [16]:
#Tokenization of text
tokenizer=ToktokTokenizer()
#Setting English stopwords
stopword_list=nltk.corpus.stopwords.words('english')

#Removing the html strip
from bs4 import BeautifulSoup
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()


#Removing the square brackets
def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)
#Removing the noisy text
def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    text = remove_stopwords(text)
    return text
#Apply function on review column
df['review']=df['review'].apply(denoise_text)

In [18]:
#Stemming the text
def simple_stemmer(text):
    ps=nltk.porter.PorterStemmer()
    text= ' '.join([ps.stem(word) for word in text.split()])
    return text
#Apply function on review column
df['review']=df['review'].apply(simple_stemmer)

# New Section

# New Section

In [19]:
transform_text('there is a tree near <br/> the river 123! see')

'tree near river see'

In [20]:
df['review_sw'] = df['review_lw'].apply(transform_text)
df[['review','review_lw', 'review_sw']].head(20)

Unnamed: 0,review,review_lw,review_sw
0,one review mention watch 1 Oz episod ' hooked....,one of the other reviewers has mentioned that ...,one reviewers mentioned watching episode ll ho...
1,wonder littl production. film techniqu unassum...,a wonderful little production. <br /><br />the...,wonderful little production filming technique ...
2,thought wonder way spend time hot summer weeke...,i thought this was a wonderful way to spend ti...,thought wonderful way spend time hot summer we...
3,basic ' famili littl boy ( jake ) think ' zomb...,basically there's a family where a little boy ...,basically family little boy jake thinks zombie...
4,"petter mattei ' "" love time money "" visual stu...","petter mattei's ""love in the time of money"" is...",petter mattei love time money visually stunnin...
5,"probabl all-tim favorit movi , stori selfless ...","probably my all-time favorite movie, a story o...",probably alltime favorite movie story selfless...
6,sure would like see resurrect date seahunt ser...,i sure would like to see a resurrection of a u...,sure would like see resurrection dated seahunt...
7,"show amaz , fresh &amp ; innov idea 70 ' first...","this show was an amazing, fresh & innovative i...",show amazing fresh innovative idea first aired...
8,encourag posit comment film look forward watch...,encouraged by the positive comments about this...,encouraged positive comments film looking forw...
9,like origin gut wrench laughter like movie. yo...,if you like original gut wrenching laughter yo...,like original gut wrenching laughter like movi...


In [21]:
#lemmatizer
lemmatizer = WordNetLemmatizer() 

print(lemmatizer.lemmatize("rocks", pos="v"))
print(lemmatizer.lemmatize("gone", pos="v"))

rock
go


In [22]:
def lemmatizer_text(s):
    tokens = nltk.word_tokenize(s)
    
    new_string = []
    for w in tokens:
        lem = lemmatizer.lemmatize(w, pos="v")
        # exclude if lenght of lemma is smaller than 2
        if len(lem) > 2:
            new_string.append(lem)
    
    s = ' '.join(new_string)
    return s.strip()

In [23]:
df['review_lm'] = df['review_sw'].apply(lemmatizer_text)
df[['review','review_lw', 'review_sw', 'review_lm']].head(20)

Unnamed: 0,review,review_lw,review_sw,review_lm
0,one review mention watch 1 Oz episod ' hooked....,one of the other reviewers has mentioned that ...,one reviewers mentioned watching episode ll ho...,one reviewers mention watch episode hook right...
1,wonder littl production. film techniqu unassum...,a wonderful little production. <br /><br />the...,wonderful little production filming technique ...,wonderful little production film technique una...
2,thought wonder way spend time hot summer weeke...,i thought this was a wonderful way to spend ti...,thought wonderful way spend time hot summer we...,think wonderful way spend time hot summer week...
3,basic ' famili littl boy ( jake ) think ' zomb...,basically there's a family where a little boy ...,basically family little boy jake thinks zombie...,basically family little boy jake think zombie ...
4,"petter mattei ' "" love time money "" visual stu...","petter mattei's ""love in the time of money"" is...",petter mattei love time money visually stunnin...,petter mattei love time money visually stun fi...
5,"probabl all-tim favorit movi , stori selfless ...","probably my all-time favorite movie, a story o...",probably alltime favorite movie story selfless...,probably alltime favorite movie story selfless...
6,sure would like see resurrect date seahunt ser...,i sure would like to see a resurrection of a u...,sure would like see resurrection dated seahunt...,sure would like see resurrection date seahunt ...
7,"show amaz , fresh &amp ; innov idea 70 ' first...","this show was an amazing, fresh & innovative i...",show amazing fresh innovative idea first aired...,show amaze fresh innovative idea first air fir...
8,encourag posit comment film look forward watch...,encouraged by the positive comments about this...,encouraged positive comments film looking forw...,encourage positive comment film look forward w...
9,like origin gut wrench laughter like movie. yo...,if you like original gut wrenching laughter yo...,like original gut wrenching laughter like movi...,like original gut wrench laughter like movie y...


In [24]:
def get_top_text_ngrams(corpus, n, g):
    vec = CountVectorizer(ngram_range=(g, g)).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]

In [25]:
most_common_uni = get_top_text_ngrams(df.review,20,1)
most_common_uni = dict(most_common_uni)
temp = pd.DataFrame(columns = ["Common_words" , 'Count'])
temp["Common_words"] = list(most_common_uni.keys())
temp["Count"] = list(most_common_uni.values())
fig = px.bar(temp, x="Count", y="Common_words", title='Commmon Words in Text', orientation='h', 
             width=700, height=700,color='Common_words')
fig.show()

In [26]:
most_common_bi = get_top_text_ngrams(df.review,20,2)
most_common_bi = dict(most_common_bi)
temp = pd.DataFrame(columns = ["Common_words" , 'Count'])
temp["Common_words"] = list(most_common_bi.keys())
temp["Count"] = list(most_common_bi.values())
fig = px.bar(temp, x="Count", y="Common_words", title='Commmon Bigrams in Text', orientation='h', 
             width=700, height=700,color='Common_words')
fig.show()

In [27]:
most_common_tri = get_top_text_ngrams(df.review,20,3)
most_common_tri = dict(most_common_tri)
temp = pd.DataFrame(columns = ["Common_words" , 'Count'])
temp["Common_words"] = list(most_common_tri.keys())
temp["Count"] = list(most_common_tri.values())
fig = px.bar(temp, x="Count", y="Common_words", title='Commmon Trigrams in Text', orientation='h', 
             width=700, height=700,color='Common_words')
fig.show()

In [28]:
#normalized train reviews
norm_train_reviews=df.review[:40000]
norm_train_reviews[0]

"one review mention watch 1 Oz episod ' hooked. right , exactli happen me.th first thing struck Oz brutal unflinch scene violenc , set right word go. trust , show faint heart timid. show pull punch regard drug , sex violence. hardcor , classic use word.it call OZ nicknam given oswald maximum secur state penitentary. focus mainli emerald citi , experiment section prison cell glass front face inward , privaci high agenda. Em citi home mani .. aryan , muslim , gangsta , latino , christian , italian , irish .... scuffl , death stare , dodgi deal shadi agreement never far away.i would say main appeal show due fact goe show ' dare. forget pretti pictur paint mainstream audienc , forget charm , forget romanc ... OZ ' mess around. first episod ever saw struck nasti surreal , ' say readi , watch , develop tast Oz , got accustom high level graphic violence. violenc , injustic ( crook guard ' sold nickel , inmat ' kill order get away , well manner , middl class inmat turn prison bitch due lack st

In [29]:
#Normalized test reviews
norm_test_reviews=df.review[40000:]
norm_test_reviews[45005]

"read review watch piec cinemat garbag took least 2 page find somebodi els ' think appallingli unfunni montag ' acm humour 70 inde era ! ' least funni set sketch *comedy* ' ever seen ' till come along. half skit alreadi done ( infinit better ) act monti python woodi allen ... say nice piec anim last 90 second highlight film would still get close sum mindless drivel-ridden wast 75 minut is. semin comedi ? world semin realli mean semen. scatolog humour ? world scat actual feces. precursor joke ? mean handbook comedy. tit bum odd beaver. nice ... pubesc boy least one hand free ' found playboy exists. give break earli 70 ? way. sketch comedi go back least ten year prior. way could even forgiv film even made gunpoint. retro ? hardly. sketch clown subtli pervert children may cut edg circl ( could actual funni ) come realli quit sad. kept go throughout entir 75 minut ? sheer belief may save genuin funni skit end. gave film 1 lower score ... recommend insomniac coma patient ... perhap peopl su

BAG OF WORDS

In [30]:
#Count vectorizer for bag of words
cv=CountVectorizer(min_df=0,max_df=1,binary=False,ngram_range=(1,3))
#transformed train reviews
cv_train_reviews=cv.fit_transform(norm_train_reviews)
#transformed test reviews
cv_test_reviews=cv.transform(norm_test_reviews)

print('BOW_cv_train:',cv_train_reviews.shape)
print('BOW_cv_test:',cv_test_reviews.shape)

BOW_cv_train: (40000, 6137577)
BOW_cv_test: (10000, 6137577)


TFIDF

In [31]:
#Tfidf vectorizer
tv=TfidfVectorizer(min_df=0,max_df=1,use_idf=True,ngram_range=(1,3))
#transformed train reviews
tv_train_reviews=tv.fit_transform(norm_train_reviews)
#transformed test reviews
tv_test_reviews=tv.transform(norm_test_reviews)
print('Tfidf_train:',tv_train_reviews.shape)
print('Tfidf_test:',tv_test_reviews.shape)

Tfidf_train: (40000, 6137577)
Tfidf_test: (10000, 6137577)


In [32]:
#labeling the sentient data
lb=LabelBinarizer()
#transformed sentiment data
sentiment_data=lb.fit_transform(df['sentiment'])
print(sentiment_data.shape)

(50000, 1)


In [35]:
#Spliting the sentiment data
train_sentiments=sentiment_data[:40000]
test_sentiments=sentiment_data[40000:]
print(train_sentiments)
print(test_sentiments)

[[1]
 [1]
 [1]
 ...
 [1]
 [0]
 [0]]
[[0]
 [0]
 [0]
 ...
 [0]
 [0]
 [0]]


Modelling the dataset


Multinomial Naive Bayes for bag of words and tfidf features

In [36]:
#training the model
mnb=MultinomialNB()
#fitting the svm for bag of words
mnb_bow=mnb.fit(cv_train_reviews,train_sentiments)
print(mnb_bow)
#fitting the svm for tfidf features
mnb_tfidf=mnb.fit(tv_train_reviews,train_sentiments)
print(mnb_tfidf)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


In [37]:
#Predicting the model for bag of words
mnb_bow_predict=mnb.predict(cv_test_reviews)
print(mnb_bow_predict)
#Predicting the model for tfidf features
mnb_tfidf_predict=mnb.predict(tv_test_reviews)
print(mnb_tfidf_predict)

[0 0 0 ... 0 1 1]
[0 0 0 ... 0 1 1]


In [38]:
#Accuracy score for bag of words
mnb_bow_score=accuracy_score(test_sentiments,mnb_bow_predict)
print("mnb_bow_score :",mnb_bow_score)
#Accuracy score for tfidf features
mnb_tfidf_score=accuracy_score(test_sentiments,mnb_tfidf_predict)
print("mnb_tfidf_score :",mnb_tfidf_score)

mnb_bow_score : 0.749
mnb_tfidf_score : 0.7496


In [39]:
#Classification report for bag of words 
mnb_bow_report=classification_report(test_sentiments,mnb_bow_predict,target_names=['Positive','Negative'])
print(mnb_bow_report)
#Classification report for tfidf features
mnb_tfidf_report=classification_report(test_sentiments,mnb_tfidf_predict,target_names=['Positive','Negative'])
print(mnb_tfidf_report)

              precision    recall  f1-score   support

    Positive       0.75      0.75      0.75      4993
    Negative       0.75      0.75      0.75      5007

    accuracy                           0.75     10000
   macro avg       0.75      0.75      0.75     10000
weighted avg       0.75      0.75      0.75     10000

              precision    recall  f1-score   support

    Positive       0.75      0.75      0.75      4993
    Negative       0.75      0.75      0.75      5007

    accuracy                           0.75     10000
   macro avg       0.75      0.75      0.75     10000
weighted avg       0.75      0.75      0.75     10000



In [40]:
#confusion matrix for bag of words
cm_bow=confusion_matrix(test_sentiments,mnb_bow_predict,labels=[1,0])
print(cm_bow)
#confusion matrix for tfidf features
cm_tfidf=confusion_matrix(test_sentiments,mnb_tfidf_predict,labels=[1,0])
print(cm_tfidf)

[[3754 1253]
 [1257 3736]]
[[3747 1260]
 [1244 3749]]


Logistic regression

In [41]:
#training the model
lr=LogisticRegression(penalty='l2',max_iter=500,C=1,random_state=42)
#Fitting the model for Bag of words
lr_bow=lr.fit(cv_train_reviews,train_sentiments)
print(lr_bow)
#Fitting the model for tfidf features
lr_tfidf=lr.fit(tv_train_reviews,train_sentiments)
print(lr_tfidf)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)


In [42]:
#Predicting the model for bag of words
lr_bow_predict=lr.predict(cv_test_reviews)
print(lr_bow_predict)
##Predicting the model for tfidf features
lr_tfidf_predict=lr.predict(tv_test_reviews)
print(lr_tfidf_predict)

[0 0 0 ... 0 1 1]
[0 0 0 ... 0 1 1]


In [49]:
#Accuracy score for bag of words
lr_bow_score=accuracy_score(test_sentiments,lr_bow_predict)
print("lr_bow_score :",lr_bow_score)
#Accuracy score for tfidf features
lr_tfidf_score=accuracy_score(test_sentiments,lr_tfidf_predict)
print("lr_tfidf_score :",lr_tfidf_score)

lr_bow_score : 0.747
lr_tfidf_score : 0.7485


In [50]:
#Classification report for bag of words 
lr_bow_report=classification_report(test_sentiments,lr_bow_predict,target_names=['Positive','Negative'])
print(lr_bow_report)

#Classification report for tfidf features
lr_tfidf_report=classification_report(test_sentiments,lr_tfidf_predict,target_names=['Positive','Negative'])
print(lr_tfidf_report)

              precision    recall  f1-score   support

    Positive       0.75      0.74      0.75      4993
    Negative       0.75      0.75      0.75      5007

    accuracy                           0.75     10000
   macro avg       0.75      0.75      0.75     10000
weighted avg       0.75      0.75      0.75     10000

              precision    recall  f1-score   support

    Positive       0.74      0.76      0.75      4993
    Negative       0.76      0.73      0.74      5007

    accuracy                           0.75     10000
   macro avg       0.75      0.75      0.75     10000
weighted avg       0.75      0.75      0.75     10000



In [53]:
#confusion matrix for bag of words
cm_bow=confusion_matrix(test_sentiments,lr_bow_predict,labels=[1,0])
print(cm_bow)
#confusion matrix for tfidf features
cm_tfidf=confusion_matrix(test_sentiments,lr_tfidf_predict,labels=[1,0])
print(cm_tfidf)

[[3764 1243]
 [1287 3706]]
[[3672 1335]
 [1180 3813]]


In [54]:
#training the linear svm
svm=SGDClassifier(loss='hinge',max_iter=500,random_state=42)
#fitting the svm for bag of words
svm_bow=svm.fit(cv_train_reviews,train_sentiments)
print(svm_bow)
#fitting the svm for tfidf features
svm_tfidf=svm.fit(tv_train_reviews,train_sentiments)
print(svm_tfidf)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=500, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=42, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)
SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=500, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=42, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)


In [55]:
#Predicting the model for bag of words
svm_bow_predict=svm.predict(cv_test_reviews)
print(svm_bow_predict)
#Predicting the model for tfidf features
svm_tfidf_predict=svm.predict(tv_test_reviews)
print(svm_tfidf_predict)

[1 0 1 ... 1 1 1]
[1 1 1 ... 1 1 1]


In [56]:
#Accuracy score for bag of words
svm_bow_score=accuracy_score(test_sentiments,svm_bow_predict)
print("svm_bow_score :",svm_bow_score)
#Accuracy score for tfidf features
svm_tfidf_score=accuracy_score(test_sentiments,svm_tfidf_predict)
print("svm_tfidf_score :",svm_tfidf_score)

svm_bow_score : 0.58
svm_tfidf_score : 0.5112


In [57]:
#Classification report for bag of words 
svm_bow_report=classification_report(test_sentiments,svm_bow_predict,target_names=['Positive','Negative'])
print(svm_bow_report)
#Classification report for tfidf features
svm_tfidf_report=classification_report(test_sentiments,svm_tfidf_predict,target_names=['Positive','Negative'])
print(svm_tfidf_report)

              precision    recall  f1-score   support

    Positive       0.94      0.17      0.29      4993
    Negative       0.54      0.99      0.70      5007

    accuracy                           0.58     10000
   macro avg       0.74      0.58      0.49     10000
weighted avg       0.74      0.58      0.50     10000

              precision    recall  f1-score   support

    Positive       1.00      0.02      0.04      4993
    Negative       0.51      1.00      0.67      5007

    accuracy                           0.51     10000
   macro avg       0.75      0.51      0.36     10000
weighted avg       0.75      0.51      0.36     10000



In [58]:
#confusion matrix for bag of words
cm_bow=confusion_matrix(test_sentiments,svm_bow_predict,labels=[1,0])
print(cm_bow)
#confusion matrix for tfidf features
cm_tfidf=confusion_matrix(test_sentiments,svm_tfidf_predict,labels=[1,0])
print(cm_tfidf)

[[4954   53]
 [4147  846]]
[[5007    0]
 [4888  105]]
