In [1]:
# Load Required Libraries

import pandas as pd
import numpy as np
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
from textblob.classifiers import NaiveBayesClassifier
from textblob import Blobber
from sklearn.model_selection import train_test_split #to split the training and testing data
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
import pickle
import string
import re

# 1) Import the movie review data as a data frame and ensure that the data is loaded properly.

In [2]:
# Read movie data tsv file 
movie_data=pd.read_csv('labeledTrainData.tsv',sep='\t')

In [3]:
movie_data.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


# 2) How many of each positive and negative reviews are there?

In [4]:
movie_data.groupby('sentiment')['sentiment'].count()

sentiment
0    12500
1    12500
Name: sentiment, dtype: int64

In [5]:
movie_data.sentiment.value_counts()

1    12500
0    12500
Name: sentiment, dtype: int64

# 3) Use TextBlob to classify each movie review as positive or negative. Assume that a polarity score greater than or equal to zero is a positive sentiment and less than 0 is a negative sentiment.

In [6]:
movie_data['textblob_senti_score'] = movie_data['review'].apply(lambda data: TextBlob(data).polarity)

In [7]:
movie_data.head(10)

Unnamed: 0,id,sentiment,review,textblob_senti_score
0,5814_8,1,With all this stuff going down at the moment w...,0.001277
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",0.256349
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,-0.053941
3,3630_4,0,It must be assumed that those who praised this...,0.134753
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,-0.024842
5,8196_8,1,I dont know why people think this is such a ba...,0.105882
6,7166_2,0,"This movie could have been very good, but come...",-0.027054
7,10633_1,0,I watched this video at a friend's house. I'm ...,0.06875
8,319_1,0,"A friend of mine bought this film for £1, and ...",0.09881
9,8713_10,1,<br /><br />This movie is full of references. ...,0.258333


In [8]:
movie_data['reviewtextblob_sentiment'] = movie_data['textblob_senti_score'].apply(lambda snti: 1 if snti >= 0 else 0)

In [9]:
movie_data.head(10)

Unnamed: 0,id,sentiment,review,textblob_senti_score,reviewtextblob_sentiment
0,5814_8,1,With all this stuff going down at the moment w...,0.001277,1
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",0.256349,1
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,-0.053941,0
3,3630_4,0,It must be assumed that those who praised this...,0.134753,1
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,-0.024842,0
5,8196_8,1,I dont know why people think this is such a ba...,0.105882,1
6,7166_2,0,"This movie could have been very good, but come...",-0.027054,0
7,10633_1,0,I watched this video at a friend's house. I'm ...,0.06875,1
8,319_1,0,"A friend of mine bought this film for £1, and ...",0.09881,1
9,8713_10,1,<br /><br />This movie is full of references. ...,0.258333,1


# 4) Check the accuracy of this model. Is this model better than random guessing?

In [10]:
movie_data.groupby('reviewtextblob_sentiment')['reviewtextblob_sentiment'].count()

reviewtextblob_sentiment
0     5983
1    19017
Name: reviewtextblob_sentiment, dtype: int64

In [11]:
movie_data.groupby('sentiment')['sentiment'].count()

sentiment
0    12500
1    12500
Name: sentiment, dtype: int64

if we see the "Sentiment" column, the data has 12500(50%) positive and 12500(50%) negitive reviews out of all the data(25000)

but when we see the "reviewtextblob_sentiment" it has total 1907(76.08%) positive and 5983(23.938%) 
negitive reviews


In [12]:
# Find the positive and negitive percentage of sentiment and reviewtextblob_sentiscore

textblog_accuracy_positive= sum((movie_data['sentiment'] > 0) & (movie_data['textblob_senti_score'] >= 0))
textblog_accuracy_negitive= sum((movie_data['sentiment'] <= 0) & (movie_data['textblob_senti_score'] < 0))



In [13]:
total_reviews = textblog_accuracy_positive+textblog_accuracy_negitive
total_reviews

17131

17131/25000=0.6924*100 = 69.24%,
The percentage of accuracy of the model is 69.24%, so the model 
we choose is accurate rather than going with the assumption.

# 5) For up to five points extra credit, use another prebuilt text sentiment analyzer, e.g., VADER, and repeat steps (3) and (4).

In [14]:
# import SentimentIntensityAnalyzer class
# from vaderSentiment.vaderSentiment module.
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
 

In [15]:
# init the sentiment analyzer
sia = SentimentIntensityAnalyzer()

In [16]:
movie_data['vader_senti1'] = movie_data['review'].apply(lambda snti : sia.polarity_scores(snti)["compound"])


In [17]:
movie_data

Unnamed: 0,id,sentiment,review,textblob_senti_score,reviewtextblob_sentiment,vader_senti1
0,5814_8,1,With all this stuff going down at the moment w...,0.001277,1,-0.8879
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",0.256349,1,0.9736
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,-0.053941,0,-0.9883
3,3630_4,0,It must be assumed that those who praised this...,0.134753,1,-0.1202
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,-0.024842,0,0.6115
...,...,...,...,...,...,...
24995,3453_3,0,It seems like more consideration has gone into...,0.102083,1,0.8750
24996,5064_1,0,I don't believe they made this film. Completel...,0.090813,1,0.9861
24997,10905_3,0,"Guy is a loser. Can't get girls, needs to buil...",0.145256,1,0.9252
24998,10194_3,0,This 30 minute documentary Buñuel made in the ...,0.065625,1,-0.9598


repeat steps (3)

We can also calculate the percentage of each sentiment present in that sentence using "pos", "neu" and "neg" keys after computing the polarity score.

In [18]:
movie_data['vader_Polarity'] = movie_data['review'].apply(lambda snti : sia.polarity_scores(snti))

In [19]:
movie_data['Neg'] = movie_data['vader_Polarity'].apply(lambda snti : snti['neg'])

In [20]:
movie_data['pos'] = movie_data['vader_Polarity'].apply(lambda snti : snti['pos'])

In [21]:
movie_data['neu'] = movie_data['vader_Polarity'].apply(lambda snti : snti['neu'])

In [22]:
movie_data

Unnamed: 0,id,sentiment,review,textblob_senti_score,reviewtextblob_sentiment,vader_senti1,vader_Polarity,Neg,pos,neu
0,5814_8,1,With all this stuff going down at the moment w...,0.001277,1,-0.8879,"{'neg': 0.128, 'neu': 0.751, 'pos': 0.121, 'co...",0.128,0.121,0.751
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",0.256349,1,0.9736,"{'neg': 0.08, 'neu': 0.713, 'pos': 0.207, 'com...",0.080,0.207,0.713
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,-0.053941,0,-0.9883,"{'neg': 0.135, 'neu': 0.809, 'pos': 0.055, 'co...",0.135,0.055,0.809
3,3630_4,0,It must be assumed that those who praised this...,0.134753,1,-0.1202,"{'neg': 0.062, 'neu': 0.884, 'pos': 0.054, 'co...",0.062,0.054,0.884
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,-0.024842,0,0.6115,"{'neg': 0.122, 'neu': 0.743, 'pos': 0.135, 'co...",0.122,0.135,0.743
...,...,...,...,...,...,...,...,...,...,...
24995,3453_3,0,It seems like more consideration has gone into...,0.102083,1,0.8750,"{'neg': 0.026, 'neu': 0.825, 'pos': 0.149, 'co...",0.026,0.149,0.825
24996,5064_1,0,I don't believe they made this film. Completel...,0.090813,1,0.9861,"{'neg': 0.082, 'neu': 0.68, 'pos': 0.238, 'com...",0.082,0.238,0.680
24997,10905_3,0,"Guy is a loser. Can't get girls, needs to buil...",0.145256,1,0.9252,"{'neg': 0.053, 'neu': 0.8, 'pos': 0.147, 'comp...",0.053,0.147,0.800
24998,10194_3,0,This 30 minute documentary Buñuel made in the ...,0.065625,1,-0.9598,"{'neg': 0.154, 'neu': 0.753, 'pos': 0.093, 'co...",0.154,0.093,0.753


In [23]:
print(f"The percententage of Positive sentiment is :",round((sum(movie_data["pos"])/movie_data["pos"].count()) * 100,2),"%")
print(f"The percententage of Negitive sentiment is :" ,round((sum(movie_data["Neg"])/movie_data["Neg"].count()) * 100,2),"%")
print(f"The percententage of Neutral sentimentis :" ,round((sum(movie_data["neu"])/movie_data["neu"].count()) * 100,2),"%")

The percententage of Positive sentiment is : 13.82 %
The percententage of Negitive sentiment is : 9.22 %
The percententage of Neutral sentimentis : 76.96 %


Repeat setp (4)

In [24]:
 
movie_data.groupby('vader_senti1')['vader_senti1'].count()

vader_senti1
-0.9996     1
-0.9995     4
-0.9994     3
-0.9993     3
-0.9992     5
           ..
 0.9995    14
 0.9996     8
 0.9997     3
 0.9998     8
 0.9999     1
Name: vader_senti1, Length: 8073, dtype: int64

In [25]:
Vader_nof_positive_reviews =sum((movie_data['vader_senti1'] >= 0))
Vader_nof_negative_reviews =sum((movie_data['vader_senti1'] < 0))

In [26]:
Vader_nof_positive_reviews

16611

In [27]:
Vader_nof_negative_reviews

8389

In [28]:
# Find the positive and negitive percentage of sentiment and reviewtextblob_sentiscore

vador_accuracy_positive= sum((movie_data['sentiment'] > 0) & (movie_data['vader_senti1'] >= 0))
vador_accuracy_negitive= sum((movie_data['sentiment'] <= 0) & (movie_data['vader_senti1'] < 0))

In [29]:
vador_total_reviews = vador_accuracy_positive+vador_accuracy_negitive
vador_total_reviews

17351

17351/25000=0.69404*100 = 69.40%,
The percentage of accuracy of the model is 69.40%, so the model 
we choose is accurate rather than going with the assumption of 50%.

# Part 2: Prepping Text for a Custom Model

# 1) Convert all text to lowercase letters.

In [30]:
movie_data['review'] = movie_data['review'].str.lower()

In [31]:
movie_data.head(10)

Unnamed: 0,id,sentiment,review,textblob_senti_score,reviewtextblob_sentiment,vader_senti1,vader_Polarity,Neg,pos,neu
0,5814_8,1,with all this stuff going down at the moment w...,0.001277,1,-0.8879,"{'neg': 0.128, 'neu': 0.751, 'pos': 0.121, 'co...",0.128,0.121,0.751
1,2381_9,1,"\the classic war of the worlds\"" by timothy hi...",0.256349,1,0.9736,"{'neg': 0.08, 'neu': 0.713, 'pos': 0.207, 'com...",0.08,0.207,0.713
2,7759_3,0,the film starts with a manager (nicholas bell)...,-0.053941,0,-0.9883,"{'neg': 0.135, 'neu': 0.809, 'pos': 0.055, 'co...",0.135,0.055,0.809
3,3630_4,0,it must be assumed that those who praised this...,0.134753,1,-0.1202,"{'neg': 0.062, 'neu': 0.884, 'pos': 0.054, 'co...",0.062,0.054,0.884
4,9495_8,1,superbly trashy and wondrously unpretentious 8...,-0.024842,0,0.6115,"{'neg': 0.122, 'neu': 0.743, 'pos': 0.135, 'co...",0.122,0.135,0.743
5,8196_8,1,i dont know why people think this is such a ba...,0.105882,1,0.3935,"{'neg': 0.177, 'neu': 0.607, 'pos': 0.215, 'co...",0.177,0.215,0.607
6,7166_2,0,"this movie could have been very good, but come...",-0.027054,0,-0.6863,"{'neg': 0.158, 'neu': 0.717, 'pos': 0.125, 'co...",0.158,0.125,0.717
7,10633_1,0,i watched this video at a friend's house. i'm ...,0.06875,1,-0.4517,"{'neg': 0.059, 'neu': 0.903, 'pos': 0.038, 'co...",0.059,0.038,0.903
8,319_1,0,"a friend of mine bought this film for £1, and ...",0.09881,1,0.9707,"{'neg': 0.069, 'neu': 0.746, 'pos': 0.185, 'co...",0.069,0.185,0.746
9,8713_10,1,<br /><br />this movie is full of references. ...,0.258333,1,0.7184,"{'neg': 0.062, 'neu': 0.759, 'pos': 0.179, 'co...",0.062,0.179,0.759


# 2) Remove punctuation and special characters from the text.

In [32]:
# Removing punctuations

movie_data['review'] = movie_data['review'].apply(lambda snti: snti.translate(str.maketrans('', '', string.punctuation)))       
                                            

In [33]:
# Display data after removing punctuations
movie_data

Unnamed: 0,id,sentiment,review,textblob_senti_score,reviewtextblob_sentiment,vader_senti1,vader_Polarity,Neg,pos,neu
0,5814_8,1,with all this stuff going down at the moment w...,0.001277,1,-0.8879,"{'neg': 0.128, 'neu': 0.751, 'pos': 0.121, 'co...",0.128,0.121,0.751
1,2381_9,1,the classic war of the worlds by timothy hines...,0.256349,1,0.9736,"{'neg': 0.08, 'neu': 0.713, 'pos': 0.207, 'com...",0.080,0.207,0.713
2,7759_3,0,the film starts with a manager nicholas bell g...,-0.053941,0,-0.9883,"{'neg': 0.135, 'neu': 0.809, 'pos': 0.055, 'co...",0.135,0.055,0.809
3,3630_4,0,it must be assumed that those who praised this...,0.134753,1,-0.1202,"{'neg': 0.062, 'neu': 0.884, 'pos': 0.054, 'co...",0.062,0.054,0.884
4,9495_8,1,superbly trashy and wondrously unpretentious 8...,-0.024842,0,0.6115,"{'neg': 0.122, 'neu': 0.743, 'pos': 0.135, 'co...",0.122,0.135,0.743
...,...,...,...,...,...,...,...,...,...,...
24995,3453_3,0,it seems like more consideration has gone into...,0.102083,1,0.8750,"{'neg': 0.026, 'neu': 0.825, 'pos': 0.149, 'co...",0.026,0.149,0.825
24996,5064_1,0,i dont believe they made this film completely ...,0.090813,1,0.9861,"{'neg': 0.082, 'neu': 0.68, 'pos': 0.238, 'com...",0.082,0.238,0.680
24997,10905_3,0,guy is a loser cant get girls needs to build u...,0.145256,1,0.9252,"{'neg': 0.053, 'neu': 0.8, 'pos': 0.147, 'comp...",0.053,0.147,0.800
24998,10194_3,0,this 30 minute documentary buñuel made in the ...,0.065625,1,-0.9598,"{'neg': 0.154, 'neu': 0.753, 'pos': 0.093, 'co...",0.154,0.093,0.753


In [34]:
# Remove special characters from the string
pattern = r'[^A-Za-z0-9]+'
movie_data['review'] = movie_data['review'].apply(lambda snti: re.sub(pattern, ' ', snti))  

In [35]:
movie_data

Unnamed: 0,id,sentiment,review,textblob_senti_score,reviewtextblob_sentiment,vader_senti1,vader_Polarity,Neg,pos,neu
0,5814_8,1,with all this stuff going down at the moment w...,0.001277,1,-0.8879,"{'neg': 0.128, 'neu': 0.751, 'pos': 0.121, 'co...",0.128,0.121,0.751
1,2381_9,1,the classic war of the worlds by timothy hines...,0.256349,1,0.9736,"{'neg': 0.08, 'neu': 0.713, 'pos': 0.207, 'com...",0.080,0.207,0.713
2,7759_3,0,the film starts with a manager nicholas bell g...,-0.053941,0,-0.9883,"{'neg': 0.135, 'neu': 0.809, 'pos': 0.055, 'co...",0.135,0.055,0.809
3,3630_4,0,it must be assumed that those who praised this...,0.134753,1,-0.1202,"{'neg': 0.062, 'neu': 0.884, 'pos': 0.054, 'co...",0.062,0.054,0.884
4,9495_8,1,superbly trashy and wondrously unpretentious 8...,-0.024842,0,0.6115,"{'neg': 0.122, 'neu': 0.743, 'pos': 0.135, 'co...",0.122,0.135,0.743
...,...,...,...,...,...,...,...,...,...,...
24995,3453_3,0,it seems like more consideration has gone into...,0.102083,1,0.8750,"{'neg': 0.026, 'neu': 0.825, 'pos': 0.149, 'co...",0.026,0.149,0.825
24996,5064_1,0,i dont believe they made this film completely ...,0.090813,1,0.9861,"{'neg': 0.082, 'neu': 0.68, 'pos': 0.238, 'com...",0.082,0.238,0.680
24997,10905_3,0,guy is a loser cant get girls needs to build u...,0.145256,1,0.9252,"{'neg': 0.053, 'neu': 0.8, 'pos': 0.147, 'comp...",0.053,0.147,0.800
24998,10194_3,0,this 30 minute documentary bu uel made in the ...,0.065625,1,-0.9598,"{'neg': 0.154, 'neu': 0.753, 'pos': 0.093, 'co...",0.154,0.093,0.753


# 3) Remove stop words

In [37]:
# Remove stop words

print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [38]:
# Exclude stopwords with Python's list comprehension and pandas.DataFrame.apply.
stop = stopwords.words('english')
movie_data['review_without_stopwords'] = movie_data['review'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))


In [40]:
movie_data.head(5)

Unnamed: 0,id,sentiment,review,textblob_senti_score,reviewtextblob_sentiment,vader_senti1,vader_Polarity,Neg,pos,neu,review_without_stopwords
0,5814_8,1,with all this stuff going down at the moment w...,0.001277,1,-0.8879,"{'neg': 0.128, 'neu': 0.751, 'pos': 0.121, 'co...",0.128,0.121,0.751,stuff going moment mj ive started listening mu...
1,2381_9,1,the classic war of the worlds by timothy hines...,0.256349,1,0.9736,"{'neg': 0.08, 'neu': 0.713, 'pos': 0.207, 'com...",0.08,0.207,0.713,classic war worlds timothy hines entertaining ...
2,7759_3,0,the film starts with a manager nicholas bell g...,-0.053941,0,-0.9883,"{'neg': 0.135, 'neu': 0.809, 'pos': 0.055, 'co...",0.135,0.055,0.809,film starts manager nicholas bell giving welco...
3,3630_4,0,it must be assumed that those who praised this...,0.134753,1,-0.1202,"{'neg': 0.062, 'neu': 0.884, 'pos': 0.054, 'co...",0.062,0.054,0.884,must assumed praised film greatest filmed oper...
4,9495_8,1,superbly trashy and wondrously unpretentious 8...,-0.024842,0,0.6115,"{'neg': 0.122, 'neu': 0.743, 'pos': 0.135, 'co...",0.122,0.135,0.743,superbly trashy wondrously unpretentious 80s e...


# 4) Apply NLTK’s PorterStemmer.

In [41]:
ps = PorterStemmer()

In [42]:
porter_stemmer = PorterStemmer()
def stem_sentences(sentence):
    tokens = sentence.split()
    stemmed_tokens = [porter_stemmer.stem(token) for token in tokens]
    return ' '.join(stemmed_tokens)


In [45]:
movie_data["review_without_stopwords"].columns = movie_data["review_without_stopwords"].str.lower().map(ps.stem)

In [46]:
movie_data.head(5)

Unnamed: 0,id,sentiment,review,textblob_senti_score,reviewtextblob_sentiment,vader_senti1,vader_Polarity,Neg,pos,neu,review_without_stopwords
0,5814_8,1,with all this stuff going down at the moment w...,0.001277,1,-0.8879,"{'neg': 0.128, 'neu': 0.751, 'pos': 0.121, 'co...",0.128,0.121,0.751,stuff go moment mj ive start listen music watc...
1,2381_9,1,the classic war of the worlds by timothy hines...,0.256349,1,0.9736,"{'neg': 0.08, 'neu': 0.713, 'pos': 0.207, 'com...",0.08,0.207,0.713,classic war world timothi hine entertain film ...
2,7759_3,0,the film starts with a manager nicholas bell g...,-0.053941,0,-0.9883,"{'neg': 0.135, 'neu': 0.809, 'pos': 0.055, 'co...",0.135,0.055,0.809,film start manag nichola bell give welcom inve...
3,3630_4,0,it must be assumed that those who praised this...,0.134753,1,-0.1202,"{'neg': 0.062, 'neu': 0.884, 'pos': 0.054, 'co...",0.062,0.054,0.884,must assum prais film greatest film opera ever...
4,9495_8,1,superbly trashy and wondrously unpretentious 8...,-0.024842,0,0.6115,"{'neg': 0.122, 'neu': 0.743, 'pos': 0.135, 'co...",0.122,0.135,0.743,superbl trashi wondrous unpretenti 80 exploit ...


In [43]:
movie_data["review_without_stopwords"] = movie_data["review_without_stopwords"].apply(stem_sentences)

In [44]:
movie_data.head(5)

Unnamed: 0,id,sentiment,review,textblob_senti_score,reviewtextblob_sentiment,vader_senti1,vader_Polarity,Neg,pos,neu,review_without_stopwords
0,5814_8,1,with all this stuff going down at the moment w...,0.001277,1,-0.8879,"{'neg': 0.128, 'neu': 0.751, 'pos': 0.121, 'co...",0.128,0.121,0.751,stuff go moment mj ive start listen music watc...
1,2381_9,1,the classic war of the worlds by timothy hines...,0.256349,1,0.9736,"{'neg': 0.08, 'neu': 0.713, 'pos': 0.207, 'com...",0.08,0.207,0.713,classic war world timothi hine entertain film ...
2,7759_3,0,the film starts with a manager nicholas bell g...,-0.053941,0,-0.9883,"{'neg': 0.135, 'neu': 0.809, 'pos': 0.055, 'co...",0.135,0.055,0.809,film start manag nichola bell give welcom inve...
3,3630_4,0,it must be assumed that those who praised this...,0.134753,1,-0.1202,"{'neg': 0.062, 'neu': 0.884, 'pos': 0.054, 'co...",0.062,0.054,0.884,must assum prais film greatest film opera ever...
4,9495_8,1,superbly trashy and wondrously unpretentious 8...,-0.024842,0,0.6115,"{'neg': 0.122, 'neu': 0.743, 'pos': 0.135, 'co...",0.122,0.135,0.743,superbl trashi wondrous unpretenti 80 exploit ...


# 5) Create a bag-of-words matrix from your stemmed text (output from (4)), where each row is a word-count vector for a single movie review (see sections 5.3 & 6.8 in the Machine Learning with Python Cookbook).Display the dimensions of your bag-of-words matrix. The number of rows in this matrix should be the same as the number of rows in your original data frame.

In [47]:
print ("Creating the bag of words...\n")


# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.  
vectorizer = CountVectorizer() 

# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. 

train_data_review = vectorizer.fit_transform(movie_data["review_without_stopwords"])


Creating the bag of words...



In [48]:
print(train_data_review)

  (0, 77888)	1
  (0, 33711)	3
  (0, 52832)	1
  (0, 52606)	11
  (0, 42173)	2
  (0, 76735)	2
  (0, 47296)	1
  (0, 54427)	1
  (0, 88063)	3
  (0, 57487)	1
  (0, 23379)	1
  (0, 89827)	1
  (0, 53151)	2
  (0, 50395)	3
  (0, 87812)	3
  (0, 33084)	1
  (0, 14835)	1
  (0, 41100)	1
  (0, 35396)	2
  (0, 81392)	1
  (0, 66251)	2
  (0, 18413)	2
  (0, 25631)	1
  (0, 49128)	2
  (0, 52047)	1
  :	:
  (24999, 26171)	1
  (24999, 65876)	1
  (24999, 53467)	1
  (24999, 55854)	1
  (24999, 89888)	3
  (24999, 15674)	3
  (24999, 19447)	1
  (24999, 12317)	1
  (24999, 36849)	1
  (24999, 82355)	1
  (24999, 28987)	1
  (24999, 15845)	1
  (24999, 34782)	1
  (24999, 79768)	1
  (24999, 85103)	1
  (24999, 12313)	1
  (24999, 41490)	1
  (24999, 21774)	1
  (24999, 21384)	1
  (24999, 19766)	1
  (24999, 85084)	1
  (24999, 45469)	1
  (24999, 75202)	1
  (24999, 85004)	1
  (24999, 15812)	2


In [49]:
train_data_review

<25000x91908 sparse matrix of type '<class 'numpy.int64'>'
	with 2439277 stored elements in Compressed Sparse Row format>

The number of rows in this matrix same as the number of rows in your original data frame.

6) Create a term frequency-inverse document frequency (tf-idf) matrix from your stemmed text,
for your movie reviews (see section 6.9 in the Machine Learning with Python Cookbook).
Display the dimensions of your tf-idf matrix.These dimensions should be the same as your bag-of-words matrix.

In [50]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(
    min_df=1, norm='l2', smooth_idf=True, use_idf=True, ngram_range=(1, 1))
#Use the TF-IDF class to train the same corpus above
b = vectorizer.fit_transform(movie_data['review_without_stopwords'])

In [51]:
b

<25000x91908 sparse matrix of type '<class 'numpy.float64'>'
	with 2439277 stored elements in Compressed Sparse Row format>

These dimensions is same as your bag-of-words matrix.

# Tokenization

In [52]:
corpora = movie_data['review_without_stopwords'].values
tokenized = [word_tokenize(corpus) for corpus in corpora]


In [53]:
tokenized = [list(filter(lambda x: len(x) > 1, document)) \
             for document in tokenized]

print(tokenized[2222])

['go', 'immedi', 'rent', 'movi', 'bottom', 'shelf', 'local', 'video', 'store', 'cover', 'dust', 'one', 'touch', 'year', 'may', 'even', '50', 'special', 'worth', 'ten', 'buck', 'swear', 'buy', 'arent', 'mani', 'film', 'compar', 'celluloid', 'version', 'goo', 'form', 'bottom', 'trash', 'year', 'ye', 'gave', 'realli', 'deserv', 'much', 'lower', '110', 'scale', 'design', 'stuff', 'like', 'mind']
