# Text Classification of Movie Reviews

Data Source : http://www.cs.cornell.edu/people/pabo/movie-review-data/

Path: ./TextFiles/moviereviews.tsv

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the tab-separated review corpus into a DataFrame and preview the first rows.
df = pd.read_csv('moviereviews.tsv', delimiter='\t')
df.head(5)

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [3]:
# Dimensions of the freshly loaded DataFrame as (rows, columns).
df.shape

(2000, 2)

In [4]:
# Raw text of the review stored at index label 0 (escape sequences shown as-is).
df.loc[0, 'review']

'how do films like mouse hunt get into theatres ? \r\nisn\'t there a law or something ? \r\nthis diabolical load of claptrap from steven speilberg\'s dreamworks studio is hollywood family fare at its deadly worst . \r\nmouse hunt takes the bare threads of a plot and tries to prop it up with overacting and flat-out stupid slapstick that makes comedies like jingle all the way look decent by comparison . \r\nwriter adam rifkin and director gore verbinski are the names chiefly responsible for this swill . \r\nthe plot , for what its worth , concerns two brothers ( nathan lane and an appalling lee evens ) who inherit a poorly run string factory and a seemingly worthless house from their eccentric father . \r\ndeciding to check out the long-abandoned house , they soon learn that it\'s worth a fortune and set about selling it in auction to the highest bidder . \r\nbut battling them at every turn is a very smart mouse , happy with his run-down little abode and wanting it to stay that way . \r\

> this is an indentation

In [5]:
from IPython.display import Markdown, display

# Render the first review as a level-4 Markdown heading so line breaks are readable.
first_review = df.loc[0, 'review']
display(Markdown('#### ' + first_review))

#### how do films like mouse hunt get into theatres ? 
isn't there a law or something ? 
this diabolical load of claptrap from steven speilberg's dreamworks studio is hollywood family fare at its deadly worst . 
mouse hunt takes the bare threads of a plot and tries to prop it up with overacting and flat-out stupid slapstick that makes comedies like jingle all the way look decent by comparison . 
writer adam rifkin and director gore verbinski are the names chiefly responsible for this swill . 
the plot , for what its worth , concerns two brothers ( nathan lane and an appalling lee evens ) who inherit a poorly run string factory and a seemingly worthless house from their eccentric father . 
deciding to check out the long-abandoned house , they soon learn that it's worth a fortune and set about selling it in auction to the highest bidder . 
but battling them at every turn is a very smart mouse , happy with his run-down little abode and wanting it to stay that way . 
the story alternates between unfunny scenes of the brothers bickering over what to do with their inheritance and endless action sequences as the two take on their increasingly determined furry foe . 
whatever promise the film starts with soon deteriorates into boring dialogue , terrible overacting , and increasingly uninspired slapstick that becomes all sound and fury , signifying nothing . 
the script becomes so unspeakably bad that the best line poor lee evens can utter after another run in with the rodent is : " i hate that mouse " . 
oh cringe ! 
this is home alone all over again , and ten times worse . 
one touching scene early on is worth mentioning . 
we follow the mouse through a maze of walls and pipes until he arrives at his makeshift abode somewhere in a wall . 
he jumps into a tiny bed , pulls up a makeshift sheet and snuggles up to sleep , seemingly happy and just wanting to be left alone . 
it's a magical little moment in an otherwise soulless film . 
a message to speilberg : if you want dreamworks to be associated with some kind of artistic credibility , then either give all concerned in mouse hunt a swift kick up the arse or hire yourself some decent writers and directors . 
this kind of rubbish will just not do at all . 


In [6]:
# Summary statistics per column (count / unique / top / freq for these text columns).
df.describe()

Unnamed: 0,label,review
count,2000,1965.0
unique,2,1939.0
top,neg,
freq,1000,27.0


In [7]:
# Column dtypes, non-null counts and memory usage of the DataFrame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
label     2000 non-null object
review    1965 non-null object
dtypes: object(2)
memory usage: 31.4+ KB


In [8]:
# Class balance: number of reviews per label.
df['label'].value_counts()

neg    1000
pos    1000
Name: label, dtype: int64

In [9]:
# Check for null values: count the missing (NaN) entries in each column

df.isnull().sum()

label      0
review    35
dtype: int64

In [10]:
# Discard every row that has a missing value in any column (mutates df in place),
# then show how many rows remain.
df.dropna(axis=0, how='any', inplace=True)

df.shape[0]

1965

In [11]:
# Display the DataFrame after dropping NaN rows (pandas elides the middle rows when rendering).
df

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...
...,...,...
1995,pos,"i like movies with albert brooks , and i reall..."
1996,pos,it might surprise some to know that joel and e...
1997,pos,the verdict : spine-chilling drama from horror...
1998,pos,i want to correct what i wrote in a former ret...


In [12]:
# Collect the index labels of reviews that consist solely of whitespace.
# dropna() cannot catch these: a whitespace-only string is not a missing value.
blanks = [
    idx
    for idx, _label, review in df.itertuples()   # each row unpacks as (index, label, review)
    if isinstance(review, str) and review.isspace()   # isinstance skips any non-string cell (e.g. NaN)
]

print(len(blanks))
print(blanks)

27
[57, 71, 147, 151, 283, 307, 313, 323, 343, 351, 427, 501, 633, 675, 815, 851, 977, 1079, 1299, 1455, 1493, 1525, 1531, 1763, 1851, 1905, 1993]


In [13]:
# Preview the first 72 rows -- presumably to eyeball the whitespace-only reviews
# flagged above (labels 57 and 71 are in `blanks`); confirm the intent of 72.
df.head(72)

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...
...,...,...
67,neg,please don't mind this windbag letting off a b...
68,neg,"when i originally saw the trailer for "" analyz..."
69,neg,i'm a dedicated fan of writer kevin williamson...
70,neg,it's now the anniversary of the slayings of ju...


In [14]:
# Remove the whitespace-only reviews found above (mutates df in place).
# errors='ignore' keeps this cell safe to re-run: without it, a second execution
# raises KeyError because the labels in `blanks` have already been dropped.
df.drop(blanks, inplace=True, errors='ignore')

In [15]:
# Shape after removing the NaN rows and the whitespace-only reviews.
df.shape

(1938, 2)

In [16]:
# Re-check the class balance after cleaning.
df['label'].value_counts()

neg    969
pos    969
Name: label, dtype: int64

# Splitting the data into training and test datasets

In [17]:
from sklearn.model_selection import train_test_split

# Features are the raw review texts; targets are the 'pos'/'neg' labels.
x, y = df['review'], df['label']

In [18]:
# Hold out 33% of the reviews for testing; the fixed seed makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=42,
)

# Building sklearn pipelines to vectorize the text and fit a model

In [19]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC


# Naive Bayes classifier pipeline: TF-IDF vectorizer feeding MultinomialNB
text_clf_nb = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])

# Linear Support Vector Classifier pipeline: same TF-IDF step feeding LinearSVC
text_clf_svc = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC()),
])



# Feed training data to the first pipeline -- Naive Bayes

In [20]:
# Fit the TF-IDF + MultinomialNB pipeline on the training split.
text_clf_nb.fit(x_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

# Run predictions and analyze the results of NB

In [21]:
# Predict labels for the held-out test reviews with the Naive Bayes pipeline.
predictions = text_clf_nb.predict(x_test)

In [22]:
# Import the metrics module and report the Naive Bayes results

from sklearn import metrics

nb_confusion = metrics.confusion_matrix(y_test, predictions)
nb_report = metrics.classification_report(y_test, predictions)

print(nb_confusion)
print('\n\n')
print(nb_report)

[[287  21]
 [130 202]]



              precision    recall  f1-score   support

         neg       0.69      0.93      0.79       308
         pos       0.91      0.61      0.73       332

    accuracy                           0.76       640
   macro avg       0.80      0.77      0.76       640
weighted avg       0.80      0.76      0.76       640



In [23]:
# Overall accuracy of the Naive Bayes predictions on the test set.
print(metrics.accuracy_score(y_test,predictions))

0.7640625


# Feeding data through second pipeline

In [24]:
# Fit the TF-IDF + LinearSVC pipeline on the same training split.
text_clf_svc.fit(x_train,y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
               

# Running predictions and analyzing results of LinearSVC

In [25]:
# Predict labels for the test reviews with the LinearSVC pipeline.
predictions2 = text_clf_svc.predict(x_test)

In [26]:
# NOTE: this reprints the Naive Bayes matrix (`predictions`, not `predictions2`) --
# presumably as a baseline to compare against the LinearSVC matrix in the next cell.
print(metrics.confusion_matrix(y_test,predictions))

[[287  21]
 [130 202]]


In [27]:
# Confusion matrix for the LinearSVC predictions.
print(metrics.confusion_matrix(y_test,predictions2))

[[259  49]
 [ 49 283]]


In [29]:
# Per-class precision, recall and F1 for the LinearSVC predictions.
print(metrics.classification_report(y_test, predictions2))

              precision    recall  f1-score   support

         neg       0.84      0.84      0.84       308
         pos       0.85      0.85      0.85       332

    accuracy                           0.85       640
   macro avg       0.85      0.85      0.85       640
weighted avg       0.85      0.85      0.85       640



In [30]:
# Overall accuracy of the LinearSVC predictions on the test set.
print(metrics.accuracy_score(y_test,predictions2))

0.846875


Based on the text alone, we've been able to achieve an accuracy of 84.7% with the LinearSVC pipeline.

# Remove stopwords

### Sklearn comes with its own stopwords list

In [33]:
# Print scikit-learn's built-in English stop-word collection (a frozenset).
from sklearn.feature_extraction import text
print(text.ENGLISH_STOP_WORDS)

frozenset({'whose', 'and', 'how', 'themselves', 'latter', 'nor', 'of', 'am', 'are', 'well', 'besides', 'these', 'she', 'together', 'herself', 'against', 'might', 'somehow', 'un', 'another', 'itself', 'mill', 'others', 'by', 'thick', 'empty', 'here', 'see', 'cannot', 'side', 'though', 'everywhere', 'several', 'sometimes', 'than', 'almost', 'thus', 'who', 'for', 'again', 'sometime', 'found', 'go', 'whereupon', 'during', 'must', 'whence', 'any', 'what', 'beforehand', 'being', 'may', 'about', 'back', 'mostly', 'least', 'seem', 'whether', 'top', 'an', 'too', 'first', 'now', 'around', 'except', 'formerly', 'should', 'becoming', 'more', 'somewhere', 'because', 'if', 'bottom', 'anywhere', 'afterwards', 'hasnt', 'same', 'even', 'our', 'upon', 'all', 'else', 'can', 'when', 'system', 'becomes', 'her', 'not', 'elsewhere', 'anything', 'a', 'each', 'keep', 'bill', 'interest', 'move', 'amount', 'amongst', 'toward', 'ie', 'along', 'call', 'perhaps', 'name', 'him', 'whoever', 'is', 'beyond', 'moreover'

In [34]:
# Reload the raw tab-separated file so the cleaning below starts from unfiltered data.
df = pd.read_csv('moviereviews.tsv', delimiter='\t')
df.head(5)

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [35]:
# Drop rows with missing values, then remove reviews that contain only whitespace.
df.dropna(inplace=True)

# Index labels of whitespace-only reviews; each row unpacks as (index, label, review).
blanks = [
    idx
    for idx, _label, review in df.itertuples()
    if isinstance(review, str) and review.isspace()   # isinstance skips any non-string cell
]

df.drop(blanks,inplace=True)

In [38]:
# Hand-picked stop words passed to TfidfVectorizer(stop_words=...) below.
# Written as one whitespace-separated string and split into a list of words.
stopwords = (
    "a about an and are as at be been but by can "
    "even ever for from get had has have he her hers his "
    "how i if in into is it its just me my of on or "
    "see seen she so than that the their there they this "
    "to was we were what when which who will with you"
).split()

In [51]:
# Linear Support Vector Classifier pipeline whose TF-IDF step filters the custom stop words
text_clf_svc3 = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words=stopwords)),
    ('clf', LinearSVC()),
])

In [53]:
# Fit the stop-word-filtered LinearSVC pipeline.
# NOTE(review): x_train/y_train come from the train_test_split cell earlier in the
# notebook; the DataFrame reloaded above is not re-split before this fit.
text_clf_svc3.fit(x_train,y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=['a', 'about', 'an', 'and', 'are',
                                             'as', 'at', 'be', 'been', 'but',...
                                             'how', 'i', 'if', 'in', 'into',
                                             'is', ...],
                                 strip_accents=None, sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer

In [55]:
# Predict labels for the test reviews with the stop-word-filtered LinearSVC pipeline.
predictions3 = text_clf_svc3.predict(x_test)

In [56]:
# Confusion matrix for the stop-word-filtered LinearSVC predictions.
print(metrics.confusion_matrix(y_test,predictions3))

[[256  52]
 [ 48 284]]


In [57]:
# Per-class precision, recall and F1 for the stop-word-filtered LinearSVC predictions.
print(metrics.classification_report(y_test,predictions3))

              precision    recall  f1-score   support

         neg       0.84      0.83      0.84       308
         pos       0.85      0.86      0.85       332

    accuracy                           0.84       640
   macro avg       0.84      0.84      0.84       640
weighted avg       0.84      0.84      0.84       640



In [58]:
# Overall accuracy of the stop-word-filtered LinearSVC predictions on the test set.
print(metrics.accuracy_score(y_test,predictions3))

0.84375


In [59]:
# Hand-written sample review used to try the trained classifier on unseen text.
myreview = 'Its a great movie. I liked the way the pivot has given the message, this movie is going to change the way people look at comedies. great movie!!'

In [63]:
# Classify the sample review; it is wrapped in a list because predict() takes a collection of documents.
print(text_clf_svc3.predict([myreview]))

['pos']
