In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import sklearn.naive_bayes as nb
from sklearn import linear_model
from sklearn.datasets import load_boston

In [28]:
#read dataframe
df =pd.read_csv("rotten-tomatoes.csv.bz2")
df.head(5)
df.sample(n=3, random_state=1)
df

Unnamed: 0,critic,fresh,imdb,link,publication,quote,review_date,rtid,title
0,Derek Adams,fresh,114709,http://www.timeout.com/film/reviews/87745/toy-...,Time Out,"So ingenious in concept, design and execution ...",2009-10-04 00:00:00,9559,Toy Story
1,Richard Corliss,fresh,114709,"http://www.time.com/time/magazine/article/0,91...",TIME Magazine,The year's most inventive comedy.,2008-08-31 00:00:00,9559,Toy Story
2,David Ansen,fresh,114709,http://www.newsweek.com/id/104199,Newsweek,A winning animated feature that has something ...,2008-08-18 00:00:00,9559,Toy Story
3,Leonard Klady,fresh,114709,http://www.variety.com/review/VE1117941294.htm...,Variety,The film sports a provocative and appealing st...,2008-06-09 00:00:00,9559,Toy Story
4,Jonathan Rosenbaum,fresh,114709,http://onfilm.chicagoreader.com/movies/capsule...,Chicago Reader,"An entertaining computer-generated, hyperreali...",2008-03-10 00:00:00,9559,Toy Story
...,...,...,...,...,...,...,...,...,...
13437,Gene Siskel,rotten,88683,http://articles.chicagotribune.com/1985-09-13/...,Chicago Tribune,Agnes of God plays with some challenging ideas...,2013-05-08 00:00:00,11917,Agnes of God
13438,Variety Staff,rotten,88683,http://www.variety.com/review/VE1117796703.htm...,Variety,"Fonda's relentless interrogating, mannered cha...",2008-10-18 00:00:00,11917,Agnes of God
13439,,fresh,88683,http://www.timeout.com/film/reviews/77605/agne...,Time Out,Splendidly shot by Sven Nykvist and with excel...,2006-06-24 00:00:00,11917,Agnes of God
13440,Janet Maslin,rotten,88683,http://movies.nytimes.com/movie/review?res=950...,New York Times,"Miss Tilly makes a radiant Agnes, and Miss Ban...",2003-05-20 00:00:00,11917,Agnes of God


In [3]:
list(df)

['critic',
 'fresh',
 'imdb',
 'link',
 'publication',
 'quote',
 'review_date',
 'rtid',
 'title']

In [4]:
missing_f = df['fresh'].isnull()
print("No of missing values for fresh are",len(missing_f))
missing_q = df['quote'].isnull()
print("No of missing values for quote are",len(missing_q))


No of missing values for fresh are 13442
No of missing values for quote are 13442


In [5]:
df1 = df[(df.fresh == 'fresh')]
x = (df1['rtid'].unique())
print("These are the different values for fresh evaluations",x)
len(x)

These are the different values for fresh evaluations [ 9559 10498 16697 ... 10624 14013 11917]


1531

In [6]:
df2 = df[(df.fresh == 'rotten')]
y = (df2['rtid'].unique())
print("These are the different values for rotten evaluations",y)
len(y)

These are the different values for rotten evaluations [12436 10498 16697 ... 10215 14013 11917]


1373

In [7]:
z = df['rtid'].unique()
len(z)

1720

In [8]:
print("percentage of values for fresh are",(1531/1720 *100))
print("percentage of values for rotten are",(1373/1720 *100))

percentage of values for fresh are 89.01162790697674
percentage of values for rotten are 79.82558139534883


In [9]:
(df['quote'].values == '').sum()          
len(df[df['quote'] == ''])                
df.loc[df['quote'] == ''].count().iloc[0]  
print(" There are no zero-length or only whitespace quote-s in the data")

 There are no zero-length or only whitespace quote-s in the data


In [15]:
print("Length of longest string in quotes column of data is",df.quote.str.len().max())
print("Length of shortest string in quotes column of data is",df.quote.str.len().min())
print("Length of average string in quotes column of data is",df.quote.str.len().mean())

Length of longest string in quotes column of data is 256
Length of shortest string in quotes column of data is 4
Length of average string in quotes column of data is 121.23128998660914


In [23]:
x = df.pivot_table(index=['title'], aggfunc='size')
x

title
'Til There Was You                  8
10 Things I Hate About You         11
101 Dalmatians                     12
12 Angry Men (Twelve Angry Men)     6
2 Days in Paris                    15
                                   ..
Young and Innocent                  3
Your Friends & Neighbors           12
Zero Effect                         7
eXistenZ                            9
sex, lies, and videotape            8
Length: 1720, dtype: int64

In [29]:
df = df[~df.quote.isnull()]
df =df[df.fresh != 'none']
df = df[df.quote.str.len() > 0]
df.dropna(subset=['fresh','quote'],inplace=True)
df

Unnamed: 0,critic,fresh,imdb,link,publication,quote,review_date,rtid,title
0,Derek Adams,fresh,114709,http://www.timeout.com/film/reviews/87745/toy-...,Time Out,"So ingenious in concept, design and execution ...",2009-10-04 00:00:00,9559,Toy Story
1,Richard Corliss,fresh,114709,"http://www.time.com/time/magazine/article/0,91...",TIME Magazine,The year's most inventive comedy.,2008-08-31 00:00:00,9559,Toy Story
2,David Ansen,fresh,114709,http://www.newsweek.com/id/104199,Newsweek,A winning animated feature that has something ...,2008-08-18 00:00:00,9559,Toy Story
3,Leonard Klady,fresh,114709,http://www.variety.com/review/VE1117941294.htm...,Variety,The film sports a provocative and appealing st...,2008-06-09 00:00:00,9559,Toy Story
4,Jonathan Rosenbaum,fresh,114709,http://onfilm.chicagoreader.com/movies/capsule...,Chicago Reader,"An entertaining computer-generated, hyperreali...",2008-03-10 00:00:00,9559,Toy Story
...,...,...,...,...,...,...,...,...,...
13437,Gene Siskel,rotten,88683,http://articles.chicagotribune.com/1985-09-13/...,Chicago Tribune,Agnes of God plays with some challenging ideas...,2013-05-08 00:00:00,11917,Agnes of God
13438,Variety Staff,rotten,88683,http://www.variety.com/review/VE1117796703.htm...,Variety,"Fonda's relentless interrogating, mannered cha...",2008-10-18 00:00:00,11917,Agnes of God
13439,,fresh,88683,http://www.timeout.com/film/reviews/77605/agne...,Time Out,Splendidly shot by Sven Nykvist and with excel...,2006-06-24 00:00:00,11917,Agnes of God
13440,Janet Maslin,rotten,88683,http://movies.nytimes.com/movie/review?res=950...,New York Times,"Miss Tilly makes a radiant Agnes, and Miss Ban...",2003-05-20 00:00:00,11917,Agnes of God


In [31]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(binary=True)
# define vectorizer
X = vectorizer.fit_transform(df.quote.values)
# vectorize your data. Note: this creates a sparce matrix,
# use .toarray() if you want a dense matrix.
words = vectorizer.get_feature_names()
# in case you want to see what are the actual words

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=5)

In [38]:
import sklearn.naive_bayes as nb

#fit and train the dataset on Naive Bayes classifier
Mult_NB=nb.MultinomialNB(alpha=0.065,fit_prior=True)
Mult_NB.fit(X_train,y_train)
print("The accuracy for test data:",Mult_NB.score(X_test,y_test))
print("The accuracy for train data:",Mult_NB.score(X_train,y_train))

The accuracy for test data: 0.7447839046199702
The accuracy for train data: 0.9516534699580811


In [39]:
def log_likelihood(model, x, y):
    prob = model.predict_log_proba(x)
    # returns Nx2 array.  Columns are log(Pr(y == 0|x)), log(Pr(y==1|x))
    ll = prob[y == 0, 0].sum() + prob[y == 1, 1].sum()
    return ll

#print log likelihood of training and testing dataset
print('Log-Linkelihood of training dataset',log_likelihood(Mult_NB,X_train,y_train))
print('Log-Linkelihood of testing dataset',log_likelihood(Mult_NB,X_test,y_test))

Log-Linkelihood of training dataset -1245.613629414669
Log-Linkelihood of testing dataset -2293.82101868052
