In [0]:
import numpy as np
import pandas as pd

In [0]:
import matplotlib.pyplot as plt

import warnings # This is just to hide the warnings, you don't have to worry about this
warnings.simplefilter(action='ignore', category=FutureWarning)

%matplotlib inline

In [138]:
from google.colab import drive
drive.mount('gdrive')

Drive already mounted at gdrive; to attempt to forcibly remount, call drive.mount("gdrive", force_remount=True).


In [0]:
# Read data
df  = pd.read_csv('gdrive/My Drive/FTMLE - Tonga/Data/movie_review.csv', encoding='utf-8', sep='\t')

In [0]:
data = pd.read_csv('/content/gdrive/My Drive/FTMLE - Tonga/Data/movie_review_evaluation.csv', encoding='utf-8', sep='\t')

In [0]:
from collections import Counter
vocab = Counter()
for document in df['review']:
  for word in document.split(' '):
    vocab[word] += 1

In [141]:
vocab.most_common(20)

[('the', 258519),
 ('a', 139707),
 ('and', 137397),
 ('of', 128750),
 ('to', 119278),
 ('is', 92935),
 ('in', 77245),
 ('I', 59255),
 ('that', 57991),
 ('this', 51379),
 ('it', 48865),
 ('/><br', 45851),
 ('was', 42004),
 ('as', 38288),
 ('with', 37496),
 ('for', 36919),
 ('The', 30399),
 ('but', 30350),
 ('on', 27738),
 ('movie', 27342)]

In [0]:
from nltk.corpus import stopwords
stop = stopwords.words('english')

In [142]:
import nltk

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [144]:
vocab_reduced = Counter()
for word, count in vocab.items():
  if not word in stop:
    vocab_reduced[word] = count

vocab_reduced.most_common(20)

[('I', 59255),
 ('/><br', 45851),
 ('The', 30399),
 ('movie', 27342),
 ('film', 24768),
 ('one', 18704),
 ('like', 16278),
 ('This', 11074),
 ('would', 10720),
 ('good', 10243),
 ('It', 9853),
 ('really', 9773),
 ('even', 9530),
 ('see', 9077),
 ('-', 8181),
 ('get', 7857),
 ('story', 7652),
 ('much', 7634),
 ('time', 7028),
 ('make', 6719)]

In [0]:
import re

def preprocessor(text):
    """ Return a cleaned version of text
    """
    # Remove HTML markup
    text = re.sub('<[^>]*>', '', text)
    # Save emoticons for later appending
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    # Remove any non-word character and append the emoticons,
    # removing the nose character for standarization. Convert to lower case
    text = (re.sub('[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-', ''))
    
    return text

# Create some random texts for testing the function preprocessor()
df['review_fix'] = df['review'].apply(preprocessor)

In [151]:
df['review_fix'][0]

'with all this stuff going down at the moment with mj i ve started listening to his music watching the odd documentary here and there watched the wiz and watched moonwalker again maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent moonwalker is part biography part feature film which i remember going to see at the cinema when it was originally released some of it has subtle messages about mj s feeling towards the press and also the obvious message of drugs are bad m kay visually impressive but of course this is all about michael jackson so unless you remotely like mj in anyway then you are going to hate this and find it boring some may call mj an egotist for consenting to the making of this movie but mj and most of his fans would say that he made it for the fans which if true is really nice of him the actual feature film bit when it finally starts is only on for 20 minutes o

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import PorterStemmer

porter = PorterStemmer()

stop = stopwords.words('english')

def tokenizer_porter(text):
    # Your code here
    return [porter.stem(word) for word in text.split()]

def preprocessor(text):
    """ Return a cleaned version of text
    """
    # Remove HTML markup
    text = re.sub('<[^>]*>', '', text)
    # Save emoticons for later appending
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    # Remove any non-word character and append the emoticons,
    # removing the nose character for standarization. Convert to lower case
    text = (re.sub('[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-', ''))
    
    return text

tfidf = TfidfVectorizer(stop_words=stop,
                        tokenizer=tokenizer_porter,
                        preprocessor=preprocessor)


In [153]:
tfidf

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2',
                preprocessor=<function preprocessor at 0x7fad36044f28>,
                smooth_idf=True,
                stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                            'ourselves', 'you', "you're", "you've", "you'll",
                            "you'd", 'your', 'yours', 'yourself', 'yourselves',
                            'he', 'him', 'his', 'himself', 'she', "she's",
                            'her', 'hers', 'herself', 'it', "it's", 'its',
                            'itself', ...],
                strip_accents=None, sublinear_tf=False,
                token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=<function tokenizer_porter at 0x7fad36044048>,
               

In [0]:
from sklearn.model_selection import train_test_split

X = df['review_fix']

y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y , test_size = 0.2)

In [186]:
X_test

11507    i really really love this show i have always l...
17229    this is the first american film to successfull...
7085     i watched this last night with low expectation...
21417    i absolutely love this game to death ever sinc...
15018    i am terribly sorry i know that faã binder sti...
                               ...                        
20377    ok my girlfriend and i rented the dvd and abou...
11616    this movie was extremely boring it should leas...
3736     last night i finished re watching jane eyre 19...
14833    i greatly enjoyed margaret atwood s novel the ...
3560     this is not really a zombie film if we re defi...
Name: review_fix, Length: 4500, dtype: object

In [157]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# A pipeline is what chains several steps together, once the initial exploration is done. 
# For example, some codes are meant to transform features — normalise numericals, or turn text into vectors, 
# or fill up missing data, they are transformers; other codes are meant to predict variables by fitting an algorithm,
# they are estimators. Pipeline chains all these together which can then be applied to training data
clf = Pipeline([('vect', tfidf),
                ('clf', LogisticRegression(random_state=0))])

clf.fit(X_train, y_train)

  'stop_words.' % sorted(inconsistent))


Pipeline(memory=None,
         steps=[('vect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=<function preprocessor at 0x7fad36044488>,
                                 smooth_idf=True,
                                 stop_words=['i', 'me', 'my', 'myself', '...
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenizer_porter at 0x7fad360442f0>,
                                 use_idf=True, vocabulary=None)),
                ('clf',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
         

In [158]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Now apply those above metrics to evaluate your model
# Your code here
predictions = clf.predict(X_test)
accuracy_score(y_test, predictions)


0.8895555555555555

In [176]:
predictions

array([1, 1, 1, ..., 1, 0, 0])

In [0]:
pre_X_test = data['review']

In [182]:
pre_predictions = clf.predict(pre_X_test)
accuracy_score(y_test, predictions)

0.8895555555555555

In [0]:
predictions_values = pre_predictions.tolist()

In [0]:
data['predictions'] = predictions_values

In [185]:
data

Unnamed: 0,id,review,predictions
0,10633_1,I watched this video at a friend's house. I'm ...,0
1,4489_1,`The Matrix' was an exciting summer blockbuste...,0
2,3304_10,This movie is one among the very few Indian mo...,1
3,3350_3,The script for this movie was probably found i...,0
4,1119_1,Even if this film was allegedly a joke in resp...,0
...,...,...,...
2495,1065_10,"Dark Angel is a futuristic sci-fi series, set ...",1
2496,7261_3,This British-Spanish co-production is one of t...,1
2497,11075_10,"Having read the reviews for this film, I under...",1
2498,2041_2,Well I'll start with the good points. The movi...,0


In [0]:

data.to_csv('thi.csv', index = False)

In [159]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.90      0.88      0.89      2237
           1       0.88      0.90      0.89      2263

    accuracy                           0.89      4500
   macro avg       0.89      0.89      0.89      4500
weighted avg       0.89      0.89      0.89      4500

