In [18]:
import pandas as pd
import nltk
import numpy as np
import gensim

import re
from gensim.parsing.preprocessing import remove_stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from tqdm import tqdm
import gc

In [3]:
nltk.download('punkt')
nltk.download("stopwords")
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/rootroot/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/rootroot/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/rootroot/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
df = pd.read_csv("../data/IMDB Dataset.csv")
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


## Text-preprocessing:
1. Lowercasing, removal of HTML tags "br>, <br"
2. Stopwords removals
3. Lemmitization

In [5]:
corpus = df["review"].to_list()
len(corpus), corpus[:2]

(50000,
 ["One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due

In [6]:
lemi = WordNetLemmatizer()
def text_process(doc):
    doc = re.sub("[^a-zA-Z0-9]", " ", doc)
    doc = doc.lower()
    doc = doc.split()
    doc = [lemi.lemmatize(remove_stopwords(sentences)) for sentences in doc if remove_stopwords(sentences)]
    doc = " ".join(doc)
    return doc

In [7]:
processed_corpus = []
for doc in tqdm(corpus):
    processed_corpus.append(text_process(doc))

100%|██████████| 50000/50000 [00:27<00:00, 1832.46it/s]


In [8]:
processed_corpus[4]

'petter mattei s love time money visually stunning film watch mr mattei offer vivid portrait human relation movie telling money power success people different situation encounter br br variation arthur schnitzler s play theme director transfer action present time new york different character meet connect connected way person know previous point contact stylishly film sophisticated luxurious look taken people live world live habitat br br thing get soul picture different stage loneliness inhabits big city exactly best place human relation fulfillment discerns case people encounter br br acting good mr mattei s direction steve buscemi rosario dawson carol kane michael imperioli adrian grenier rest talented cast character come alive br br wish mr mattei good luck await anxiously work'

In [9]:
def label_sentiment(sentiment):
    if sentiment == "positive":
        return 1
    elif sentiment == "negative": 
        return 0


df["sentiment_labelled"] = df["sentiment"].apply(label_sentiment)

In [10]:
X = np.array(processed_corpus)
y = df["sentiment_labelled"].values

# Word2vec 
## 1. Use Google Word2Vec Model

In [11]:
from gensim.models import Word2Vec, KeyedVectors
import gensim.downloader as api

from concurrent.futures import ProcessPoolExecutor

In [12]:
wv = api.load("word2vec-google-news-300")

In [13]:
def vectorize_avg_doc(doc):
    return np.mean(np.array([wv[word] for word in doc if word in wv]), axis = 0)

X_word2vec = []
with ProcessPoolExecutor() as pool:
    X_word2vec = list(tqdm(pool.map(vectorize_avg_doc, X), total = 50000))

X_word2vec = np.array(X_word2vec)

100%|██████████| 50000/50000 [00:16<00:00, 3001.18it/s]


In [19]:
X_train, X_test, y_train, y_test = train_test_split(X_word2vec, y, test_size=0.15, random_state=42)

print(X_train.shape)
print(X_test.shape)

gc.collect()

(42500, 300)
(7500, 300)


1788

In [20]:
rf = RandomForestClassifier()
rf.fit(X_train,y_train)

rf.score(X_test,y_test)

0.6182666666666666

In [21]:
processed_corpus[0]

'reviewer mentioned watching 1 oz episode ll hooked right exactly happened br br thing struck oz brutality unflinching scene violence set right word trust faint hearted timid pull punch regard drug sex violence hardcore classic use word br br called oz nickname given oswald maximum security state penitentary focus mainly emerald city experimental section prison cell glass front face inwards privacy high agenda em city home aryan muslim gangsta latino christian italian irish scuffle death stare dodgy dealing shady agreement far away br br main appeal fact go show wouldn t dare forget pretty picture painted mainstream audience forget charm forget romance oz t mess episode saw struck nasty surreal couldn t ready watched developed taste oz got accustomed high level graphic violence violence injustice crooked guard ll sold nickel inmate ll kill order away mannered middle class inmate turned prison bitch lack street skill prison experience watching oz comfortable uncomfortable viewing thats 

# 2. Train Word2Vec model

In [22]:
def preprocess_text(text):
    doc = re.sub("[^a-zA-Z0-9]", " ", text)
    doc = doc.split()
    doc = [lemi.lemmatize(remove_stopwords(sentences)) for sentences in doc if remove_stopwords(sentences)]
    doc = " ".join(doc)
    sent_token = nltk.sent_tokenize(doc)
    processed_words = []
    for sent in sent_token:
        processed_words.append(gensim.utils.simple_preprocess(sent))
    return processed_words

words = []
for docs in tqdm(processed_corpus):
    words.extend(preprocess_text(docs))

100%|██████████| 50000/50000 [00:21<00:00, 2304.40it/s]


In [23]:
model = Word2Vec(words, window=7,min_count=3, vector_size=200)

In [24]:
def vectorize_avg_doc_my(doc):
    return np.mean(np.array([model.wv[word] for word in doc if word in model.wv]), axis = 0)

X_word2vec_my = []
for i in tqdm(words):
    X_word2vec_my.append(vectorize_avg_doc_my(i))

X_word2vec_my = np.array(X_word2vec_my)
X_word2vec_my.shape, y.shape

100%|██████████| 50000/50000 [00:07<00:00, 6846.38it/s]


((50000, 200), (50000,))

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X_word2vec_my, y, test_size=0.15, random_state=42)

lr = LogisticRegression(max_iter=2000)
lr.fit(X_train,y_train)

lr.score(X_test,y_test)

0.8741333333333333

In [26]:
rf = RandomForestClassifier()
rf.fit(X_train,y_train)

rf.score(X_test,y_test)

0.8494666666666667

In [27]:
# # Save the model
# import pickle

# with open("../model/my_word2vec.pkl", "wb") as f:
#     pickle.dump(model, f)


# with open("../model/lr_my_word2vec.pkl", "wb") as f:
#     pickle.dump(lr, f)

# # # Load the model
# # with open("my_model.pkl", "rb") as f:
# #     loaded_rf = pickle.load(f)


In [28]:
def generate_prediction(test_sentence, score = False):
    preprocessed_text = preprocess_text(test_sentence)
    test_sentence_X = vectorize_avg_doc_my(preprocessed_text[0])
    if score:
        lr.predict_proba([test_sentence_X])[0]
    else:
        lr.predict([test_sentence_X])[0]

In [29]:
test_sentence = "the movie was okay. I liked the apple scene"
generate_prediction(test_sentence)