# Movie trailer sentiment analysis project

The following is my python program for a sentiment analysis model for testing the sentiment of a movie trailer on youtube. 

In [2]:
#import all libraries
import os

import pandas as pd
import numpy as np
import Utils 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix
import pickle

In [3]:
# load the dataset. Movie review dataset found on Kaggle
fileName = '../youtube comments sentiment program/movie_reviews.csv'
reviews = pd.read_csv(fileName)

In [4]:
#print the head of dataframe
reviews.head()

Unnamed: 0,Reviews,Sentiment
0,Alan Rickman & Emma Thompson give good perform...,0
1,I have seen this movie and I did not care for ...,0
2,"In Los Angeles, the alcoholic and lazy Hank Ch...",0
3,"This film is bundled along with ""Gli fumavano ...",0
4,I only comment on really very good films and o...,0


In [5]:
# preprocess the corpus and replace reviews with processed reviews
reviews['Reviews'] = Utils.preprocess(reviews['Reviews'])

In [6]:
reviews.head()

Unnamed: 0,Reviews,Sentiment
0,alan rickman emma thompson give good performan...,0
1,seen movie care movie anyhow would think going...,0
2,los angeles alcoholic lazy hank chinaski matt ...,0
3,film bundled along gli fumavano le colt lo chi...,0
4,comment really good film utter rubbish aim hel...,0


In [7]:
# use methods from custom Util file to build matrix and find corpus word count
matrix = Utils.tdm(reviews)
word_count = Utils.word_count(matrix)

In [8]:
print(word_count)

13604


In [9]:
#declare tfid vectorizer

vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,max_features=word_count,stop_words='english')

#fit the vecotrizer over the dataset
cleaned_reviews = reviews['Reviews']
tf_idf_reviews = vectorizer.fit_transform(cleaned_reviews)

# get the sentiment target values
y_targets = reviews['Sentiment']


In [10]:
# split the data
X_train, X_test, y_train, y_test = train_test_split(tf_idf_reviews,
                                                    y_targets,
                                                    test_size=8200,
                                                    random_state=42)

In [11]:
# initialize logistic regression model
lr = LogisticRegression()
lr.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [12]:
# find the test and train accuracy for the model
train_probs = lr.predict_proba(X_train)
train_results = np.argmax(train_probs, axis=1)

test_probs = lr.predict_proba(X_test)
test_results = np.argmax(test_probs, axis=1)

train_logical_correct = [pred == actual for pred, actual in zip(train_results, y_train)]
train_acc = np.mean(train_logical_correct)

test_logical_correct = [pred == actual for pred, actual in zip(test_results, y_test)]
test_acc = np.mean(test_logical_correct)

print('Train accuracy: ', train_acc)
print('Test accuracy: ', test_acc)

Train accuracy:  0.937797619047619
Test accuracy:  0.8871951219512195


Using this dataset I was able to achieve a 94% training accuracy and a 89% test accuracy. 

In [13]:
precision, recall, f1, support = precision_recall_fscore_support(y_test,test_results)

tn, fp, fn, tp = confusion_matrix(y_test, test_results).ravel()

print(confusion_matrix(y_test, test_results))
print('='*35)
print('Precision: ', precision)
print('Recall: ', recall)
print('F1: ', f1)
print('Support: ', support)

[[3584  510]
 [ 415 3691]]
Precision:  [0.89622406 0.87860033]
Recall:  [0.87542745 0.8989284 ]
F1:  [0.88570369 0.88864813]
Support:  [4094 4106]


In [14]:
pickle.dump(vectorizer, open(os.path.join('Data', 'vectorizer.pkl'),'wb'),protocol=4)
pickle.dump(lr, open(os.path.join('Data','logisticRegression.pkl'),'wb'), protocol=4)