# Movie Review Sentiment Prediction

### Imports & Path Definitions

Data can be downloaded from: https://www.dropbox.com/s/6a4c3p7v4pkwtq0/Advanced_Data_Analysis_Assignment.tar?dl=0

In [1]:
# Imports
import spacy
from os import listdir
import pandas as pd
from bs4 import BeautifulSoup
import random
import numpy as np
spacy.__version__

'3.0.6'

In [3]:
#!python3 -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')

In [4]:
# Path definitions
train_neg = 'data/train/neg/'
train_pos = 'data/train/pos/'
test_neg = 'data/test/neg/'
test_pos = 'data/test/pos/'

# Verify sizes (should all be equal):
print('No. files in each directory:')
for d in [train_neg, train_pos, test_neg, test_pos]:
    print(len(listdir(d)))

No. files in each directory:
12501
12501
12500
12500


### Preprocessing: removing markup language.
Removing HTML tags using Beautiful Soup <br> (from this article: https://himanshulohiya.medium.com/cleaning-and-pre-processing-textual-data-d88036a1f4b8)

In [5]:
sample = open(train_neg + listdir(train_neg)[0], 'r').read() 
sample[:400]

'With its companion piece MASTERS OF HORROR, NIGHTMARES AND DREAMSCAPES can only be seen as the absolute nadir of the genre that began so auspiciously with THE TWILIGHT ZONE and THE OUTER LIMITS.<br /><br />Of course, part of the problem is that it does nothing to be of any interest to a comparatively adult audience, instead aiming at TEN-YEAR-OLDS, who are only able to count body-bags, and scarcel'

In [6]:
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    stripped_text = soup.get_text().strip()
    return stripped_text

sample = strip_html(
    open(train_neg + listdir(train_neg)[0], 'r').read() 
)
sample[:400]

'With its companion piece MASTERS OF HORROR, NIGHTMARES AND DREAMSCAPES can only be seen as the absolute nadir of the genre that began so auspiciously with THE TWILIGHT ZONE and THE OUTER LIMITS.Of course, part of the problem is that it does nothing to be of any interest to a comparatively adult audience, instead aiming at TEN-YEAR-OLDS, who are only able to count body-bags, and scarcely that. And '

### Dataframe construction

In [17]:
# Opens each file in the positive/negative paths, preprocesses with the functions above, and returns shuffled set.
def build_list(pos_path, neg_path):
    
    output = []
    
    for path in [pos_path, neg_path]:
        for val in listdir(path):
            if val.endswith('.txt'):
                file = path + val
                file = open(file,'r').read()
                file = strip_html(file)
                
                # Append prepped data w/ label = 1 for positive, label = 0 for negative
                if path == pos_path:
                    label = 1
                    
                elif path == neg_path:
                    label = 0
                    
                output.append((file, label))
    
    # Shuffle
    random.Random(42).shuffle(output)
    output = pd.DataFrame(output)
    output.columns = ['text','label']
    return output
        

In [21]:
# NOTE: This takes a while to run.
# Will build a singular 'data' DF of length 2500, [:2000] of which will later become the Train data.
data = build_list(train_pos[:1250], train_neg[:1250])

In [23]:
print(data.shape)
data.head()


(25000, 2)


Unnamed: 0,text,label
0,"I was looking for a cute, simple comedy to pas...",0
1,"Ossessione, adapted loosely (or if it is as lo...",1
2,"Before the regular comments, my main curiosity...",1
3,I feel the movie did not portray Smith histori...,0
4,I have to admit I have always found it difficu...,1


### Lematizing & removing stops & punctuation

In [25]:
def punct_space(token):
    return token.is_punct or token.is_space

def lemmatize(doc):
    parsed_doc=nlp(doc)
    lemm_doc = [token.lemma_ for token in papreviousrsed_doc
                      if not punct_space(token) and (token.lemma_!= '-PRON-') and not(nlp.vocab[token.text].is_stop)]
      
    # write the transformed text
    clean_text = u' '.join(lemm_doc)
    return clean_text

data['text_lemmatized']=data.text.map(lemmatize)

data.head()

Unnamed: 0,text,label,text_lemmatized
0,"I was looking for a cute, simple comedy to pas...",0,look cute simple comedy pass time choose film ...
1,"Ossessione, adapted loosely (or if it is as lo...",1,Ossessione adapt loosely loose close version s...
2,"Before the regular comments, my main curiosity...",1,regular comment main curiosity love SONG runni...
3,I feel the movie did not portray Smith histori...,0,feel movie portray Smith historically goal mov...
4,I have to admit I have always found it difficu...,1,admit find difficult watch antonioni film star...


In [26]:
# Saving for later
data.to_csv('data/prepped_data.csv')

### Model setup

In [28]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split

In [29]:
X_train, X_test, y_train, y_test = train_test_split(data['text_lemmatized'], data['label'], test_size=0.4, random_state=42)

In [30]:
# Count vectorizer setup

cvec = CountVectorizer(stop_words='english', min_df=3)

# we create a matrix for the training set
cvec.fit(X_train)
cvec_counts_train = cvec.transform(X_train)
# we create a matrix for the test set
cvec_counts_test=cvec.transform(X_test)

In [31]:
# TFIDF transformer

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(cvec_counts_train)
X_test_tfidf = tfidf_transformer.fit_transform(cvec_counts_test)

In [32]:
# Random forest classifier
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn import metrics

# training
clf = RandomForestClassifier().fit(X_train_tfidf, y_train)

# testing
predicted=clf.predict(X_test_tfidf)

In [33]:
# Accuracy score
metrics.accuracy_score(y_test, predicted)

0.8367

### Evaluation

In [35]:
print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.83      0.85      0.84      4953
           1       0.85      0.82      0.84      5047

    accuracy                           0.84     10000
   macro avg       0.84      0.84      0.84     10000
weighted avg       0.84      0.84      0.84     10000



### Testing a sample

In [46]:
sample = data[data.index == 3212] # randomly selected index
sample

Unnamed: 0,text,label,text_lemmatized
3212,Fire And Ice is an animated film set in a fant...,1,Fire Ice animated film set fantasy world film ...


In [52]:
sample_cvec = cvec.transform(sample.text_lemmatized)
sample_tfidf = tfidf_transformer.fit_transform(sample_cvec)
sample_pred = clf.predict(sample_tfidf)
print(sample_pred == sample.label)

3212    True
Name: label, dtype: bool
