In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [2]:
filename = "imdb_master.csv"
import chardet
with open(filename, "rb") as f:
    result = chardet.detect(f.read(100000))

result


{'encoding': 'CP949', 'confidence': 0.99, 'language': 'Korean'}

In [3]:
full_dataset = pd.read_csv(filename, encoding="ISO-8859-1")

In [4]:
full_dataset

Unnamed: 0.1,Unnamed: 0,type,review,label,file
0,0,test,Once again Mr. Costner has dragged out a movie...,neg,0_2.txt
1,1,test,This is an example of why the majority of acti...,neg,10000_4.txt
2,2,test,"First of all I hate those moronic rappers, who...",neg,10001_1.txt
3,3,test,Not even the Beatles could write songs everyon...,neg,10002_3.txt
4,4,test,Brass pictures (movies is not a fitting word f...,neg,10003_3.txt
...,...,...,...,...,...
99995,99995,train,"Delightfully awful! Made by David Giancola, a ...",unsup,9998_0.txt
99996,99996,train,"Watching Time Chasers, it obvious that it was ...",unsup,9999_0.txt
99997,99997,train,At the beginning we can see members of Troma t...,unsup,999_0.txt
99998,99998,train,"The movie was incredible, ever since I saw it ...",unsup,99_0.txt


In [5]:
# Removing unnamed column from dataset
full_dataset = full_dataset.iloc[:, 1:]
full_dataset

Unnamed: 0,type,review,label,file
0,test,Once again Mr. Costner has dragged out a movie...,neg,0_2.txt
1,test,This is an example of why the majority of acti...,neg,10000_4.txt
2,test,"First of all I hate those moronic rappers, who...",neg,10001_1.txt
3,test,Not even the Beatles could write songs everyon...,neg,10002_3.txt
4,test,Brass pictures (movies is not a fitting word f...,neg,10003_3.txt
...,...,...,...,...
99995,train,"Delightfully awful! Made by David Giancola, a ...",unsup,9998_0.txt
99996,train,"Watching Time Chasers, it obvious that it was ...",unsup,9999_0.txt
99997,train,At the beginning we can see members of Troma t...,unsup,999_0.txt
99998,train,"The movie was incredible, ever since I saw it ...",unsup,99_0.txt


In [6]:
train_dataset = full_dataset[full_dataset.type == "train"]
test_dataset = full_dataset[full_dataset.type == "test"]

In [7]:
train_dataset

Unnamed: 0,type,review,label,file
25000,train,Story of a man who has unnatural feelings for ...,neg,0_3.txt
25001,train,Airport '77 starts as a brand new luxury 747 p...,neg,10000_4.txt
25002,train,This film lacked something I couldn't put my f...,neg,10001_4.txt
25003,train,"Sorry everyone,,, I know this is supposed to b...",neg,10002_1.txt
25004,train,When I was little my parents took me along to ...,neg,10003_1.txt
...,...,...,...,...
99995,train,"Delightfully awful! Made by David Giancola, a ...",unsup,9998_0.txt
99996,train,"Watching Time Chasers, it obvious that it was ...",unsup,9999_0.txt
99997,train,At the beginning we can see members of Troma t...,unsup,999_0.txt
99998,train,"The movie was incredible, ever since I saw it ...",unsup,99_0.txt


In [8]:
train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 75000 entries, 25000 to 99999
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   type    75000 non-null  object
 1   review  75000 non-null  object
 2   label   75000 non-null  object
 3   file    75000 non-null  object
dtypes: object(4)
memory usage: 2.9+ MB


In [9]:
train_dataset.describe()

Unnamed: 0,type,review,label,file
count,75000,75000,75000,75000
unique,1,74057,3,75000
top,train,How has this piece of crap stayed on TV this l...,unsup,0_3.txt
freq,75000,5,50000,1


In [10]:
train_dataset.head()

Unnamed: 0,type,review,label,file
25000,train,Story of a man who has unnatural feelings for ...,neg,0_3.txt
25001,train,Airport '77 starts as a brand new luxury 747 p...,neg,10000_4.txt
25002,train,This film lacked something I couldn't put my f...,neg,10001_4.txt
25003,train,"Sorry everyone,,, I know this is supposed to b...",neg,10002_1.txt
25004,train,When I was little my parents took me along to ...,neg,10003_1.txt


In [11]:
train_dataset.label.value_counts()

unsup    50000
neg      12500
pos      12500
Name: label, dtype: int64

In [12]:
train_dataset = train_dataset[train_dataset.label != "unsup"]

In [13]:
train_dataset.label.value_counts()

neg    12500
pos    12500
Name: label, dtype: int64

In [14]:
train_dataset

Unnamed: 0,type,review,label,file
25000,train,Story of a man who has unnatural feelings for ...,neg,0_3.txt
25001,train,Airport '77 starts as a brand new luxury 747 p...,neg,10000_4.txt
25002,train,This film lacked something I couldn't put my f...,neg,10001_4.txt
25003,train,"Sorry everyone,,, I know this is supposed to b...",neg,10002_1.txt
25004,train,When I was little my parents took me along to ...,neg,10003_1.txt
...,...,...,...,...
49995,train,"Seeing as the vote average was pretty low, and...",pos,9998_9.txt
49996,train,"The plot had some wretched, unbelievable twist...",pos,9999_8.txt
49997,train,I am amazed at how this movie(and most others ...,pos,999_10.txt
49998,train,A Christmas Together actually came before my t...,pos,99_8.txt


In [15]:
train_dataset.reset_index(drop=True, inplace=True)

In [16]:
train_dataset

Unnamed: 0,type,review,label,file
0,train,Story of a man who has unnatural feelings for ...,neg,0_3.txt
1,train,Airport '77 starts as a brand new luxury 747 p...,neg,10000_4.txt
2,train,This film lacked something I couldn't put my f...,neg,10001_4.txt
3,train,"Sorry everyone,,, I know this is supposed to b...",neg,10002_1.txt
4,train,When I was little my parents took me along to ...,neg,10003_1.txt
...,...,...,...,...
24995,train,"Seeing as the vote average was pretty low, and...",pos,9998_9.txt
24996,train,"The plot had some wretched, unbelievable twist...",pos,9999_8.txt
24997,train,I am amazed at how this movie(and most others ...,pos,999_10.txt
24998,train,A Christmas Together actually came before my t...,pos,99_8.txt


In [17]:
train_dataset["Sentiment"] = np.where(train_dataset["label"] == "pos", 1, 0)

In [18]:
train_dataset

Unnamed: 0,type,review,label,file,Sentiment
0,train,Story of a man who has unnatural feelings for ...,neg,0_3.txt,0
1,train,Airport '77 starts as a brand new luxury 747 p...,neg,10000_4.txt,0
2,train,This film lacked something I couldn't put my f...,neg,10001_4.txt,0
3,train,"Sorry everyone,,, I know this is supposed to b...",neg,10002_1.txt,0
4,train,When I was little my parents took me along to ...,neg,10003_1.txt,0
...,...,...,...,...,...
24995,train,"Seeing as the vote average was pretty low, and...",pos,9998_9.txt,1
24996,train,"The plot had some wretched, unbelievable twist...",pos,9999_8.txt,1
24997,train,I am amazed at how this movie(and most others ...,pos,999_10.txt,1
24998,train,A Christmas Together actually came before my t...,pos,99_8.txt,1


In [19]:
train_dataset["Sentiment_2"] = [1 if i == "pos" else 0 for i in train_dataset.label]

In [20]:
train_dataset

Unnamed: 0,type,review,label,file,Sentiment,Sentiment_2
0,train,Story of a man who has unnatural feelings for ...,neg,0_3.txt,0,0
1,train,Airport '77 starts as a brand new luxury 747 p...,neg,10000_4.txt,0,0
2,train,This film lacked something I couldn't put my f...,neg,10001_4.txt,0,0
3,train,"Sorry everyone,,, I know this is supposed to b...",neg,10002_1.txt,0,0
4,train,When I was little my parents took me along to ...,neg,10003_1.txt,0,0
...,...,...,...,...,...,...
24995,train,"Seeing as the vote average was pretty low, and...",pos,9998_9.txt,1,1
24996,train,"The plot had some wretched, unbelievable twist...",pos,9999_8.txt,1,1
24997,train,I am amazed at how this movie(and most others ...,pos,999_10.txt,1,1
24998,train,A Christmas Together actually came before my t...,pos,99_8.txt,1,1


In [21]:
def get_sentiment(label):
    if label == "pos":
        return 1
    else:
        return 0

In [22]:
train_dataset["Sentiment_3"] = train_dataset.label.apply(get_sentiment)

In [23]:
train_dataset

Unnamed: 0,type,review,label,file,Sentiment,Sentiment_2,Sentiment_3
0,train,Story of a man who has unnatural feelings for ...,neg,0_3.txt,0,0,0
1,train,Airport '77 starts as a brand new luxury 747 p...,neg,10000_4.txt,0,0,0
2,train,This film lacked something I couldn't put my f...,neg,10001_4.txt,0,0,0
3,train,"Sorry everyone,,, I know this is supposed to b...",neg,10002_1.txt,0,0,0
4,train,When I was little my parents took me along to ...,neg,10003_1.txt,0,0,0
...,...,...,...,...,...,...,...
24995,train,"Seeing as the vote average was pretty low, and...",pos,9998_9.txt,1,1,1
24996,train,"The plot had some wretched, unbelievable twist...",pos,9999_8.txt,1,1,1
24997,train,I am amazed at how this movie(and most others ...,pos,999_10.txt,1,1,1
24998,train,A Christmas Together actually came before my t...,pos,99_8.txt,1,1,1


In [24]:
train_dataset["Sentiment_4"] = train_dataset.label.map({"pos": 1, "neg":0})

In [25]:
train_dataset

Unnamed: 0,type,review,label,file,Sentiment,Sentiment_2,Sentiment_3,Sentiment_4
0,train,Story of a man who has unnatural feelings for ...,neg,0_3.txt,0,0,0,0
1,train,Airport '77 starts as a brand new luxury 747 p...,neg,10000_4.txt,0,0,0,0
2,train,This film lacked something I couldn't put my f...,neg,10001_4.txt,0,0,0,0
3,train,"Sorry everyone,,, I know this is supposed to b...",neg,10002_1.txt,0,0,0,0
4,train,When I was little my parents took me along to ...,neg,10003_1.txt,0,0,0,0
...,...,...,...,...,...,...,...,...
24995,train,"Seeing as the vote average was pretty low, and...",pos,9998_9.txt,1,1,1,1
24996,train,"The plot had some wretched, unbelievable twist...",pos,9999_8.txt,1,1,1,1
24997,train,I am amazed at how this movie(and most others ...,pos,999_10.txt,1,1,1,1
24998,train,A Christmas Together actually came before my t...,pos,99_8.txt,1,1,1,1


# Data Preprocessing
- Cleaning
- Tokenization
- Stop Words Removal
- Lemmatization
- Stemming
- Byte Pair Encoding

## Cleaning
- Removing punctuation symbols
- Converting it to lower case
- Removing numbers

In [26]:
import re
def remove_punctuation(review):
    return re.sub(r'[^\w\s]', "", review)

In [27]:
train_dataset["review_processed"] = train_dataset.review.apply(remove_punctuation)

In [28]:
train_dataset

Unnamed: 0,type,review,label,file,Sentiment,Sentiment_2,Sentiment_3,Sentiment_4,review_processed
0,train,Story of a man who has unnatural feelings for ...,neg,0_3.txt,0,0,0,0,Story of a man who has unnatural feelings for ...
1,train,Airport '77 starts as a brand new luxury 747 p...,neg,10000_4.txt,0,0,0,0,Airport 77 starts as a brand new luxury 747 pl...
2,train,This film lacked something I couldn't put my f...,neg,10001_4.txt,0,0,0,0,This film lacked something I couldnt put my fi...
3,train,"Sorry everyone,,, I know this is supposed to b...",neg,10002_1.txt,0,0,0,0,Sorry everyone I know this is supposed to be a...
4,train,When I was little my parents took me along to ...,neg,10003_1.txt,0,0,0,0,When I was little my parents took me along to ...
...,...,...,...,...,...,...,...,...,...
24995,train,"Seeing as the vote average was pretty low, and...",pos,9998_9.txt,1,1,1,1,Seeing as the vote average was pretty low and ...
24996,train,"The plot had some wretched, unbelievable twist...",pos,9999_8.txt,1,1,1,1,The plot had some wretched unbelievable twists...
24997,train,I am amazed at how this movie(and most others ...,pos,999_10.txt,1,1,1,1,I am amazed at how this movieand most others h...
24998,train,A Christmas Together actually came before my t...,pos,99_8.txt,1,1,1,1,A Christmas Together actually came before my t...


In [29]:
def convert_to_lowercase(review):
    return review.lower()

In [30]:
train_dataset["review_processed"] = train_dataset.review_processed.apply(convert_to_lowercase)

In [31]:
train_dataset

Unnamed: 0,type,review,label,file,Sentiment,Sentiment_2,Sentiment_3,Sentiment_4,review_processed
0,train,Story of a man who has unnatural feelings for ...,neg,0_3.txt,0,0,0,0,story of a man who has unnatural feelings for ...
1,train,Airport '77 starts as a brand new luxury 747 p...,neg,10000_4.txt,0,0,0,0,airport 77 starts as a brand new luxury 747 pl...
2,train,This film lacked something I couldn't put my f...,neg,10001_4.txt,0,0,0,0,this film lacked something i couldnt put my fi...
3,train,"Sorry everyone,,, I know this is supposed to b...",neg,10002_1.txt,0,0,0,0,sorry everyone i know this is supposed to be a...
4,train,When I was little my parents took me along to ...,neg,10003_1.txt,0,0,0,0,when i was little my parents took me along to ...
...,...,...,...,...,...,...,...,...,...
24995,train,"Seeing as the vote average was pretty low, and...",pos,9998_9.txt,1,1,1,1,seeing as the vote average was pretty low and ...
24996,train,"The plot had some wretched, unbelievable twist...",pos,9999_8.txt,1,1,1,1,the plot had some wretched unbelievable twists...
24997,train,I am amazed at how this movie(and most others ...,pos,999_10.txt,1,1,1,1,i am amazed at how this movieand most others h...
24998,train,A Christmas Together actually came before my t...,pos,99_8.txt,1,1,1,1,a christmas together actually came before my t...


In [32]:
def remove_numbers(review):
    return re.sub(r'[\d+]', "", review)

In [33]:
train_dataset["review_processed"] = train_dataset.review_processed.apply(remove_numbers)

In [34]:
train_dataset

Unnamed: 0,type,review,label,file,Sentiment,Sentiment_2,Sentiment_3,Sentiment_4,review_processed
0,train,Story of a man who has unnatural feelings for ...,neg,0_3.txt,0,0,0,0,story of a man who has unnatural feelings for ...
1,train,Airport '77 starts as a brand new luxury 747 p...,neg,10000_4.txt,0,0,0,0,airport starts as a brand new luxury plane i...
2,train,This film lacked something I couldn't put my f...,neg,10001_4.txt,0,0,0,0,this film lacked something i couldnt put my fi...
3,train,"Sorry everyone,,, I know this is supposed to b...",neg,10002_1.txt,0,0,0,0,sorry everyone i know this is supposed to be a...
4,train,When I was little my parents took me along to ...,neg,10003_1.txt,0,0,0,0,when i was little my parents took me along to ...
...,...,...,...,...,...,...,...,...,...
24995,train,"Seeing as the vote average was pretty low, and...",pos,9998_9.txt,1,1,1,1,seeing as the vote average was pretty low and ...
24996,train,"The plot had some wretched, unbelievable twist...",pos,9999_8.txt,1,1,1,1,the plot had some wretched unbelievable twists...
24997,train,I am amazed at how this movie(and most others ...,pos,999_10.txt,1,1,1,1,i am amazed at how this movieand most others h...
24998,train,A Christmas Together actually came before my t...,pos,99_8.txt,1,1,1,1,a christmas together actually came before my t...


## Tokenization and Stopwords Removal

In [35]:
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/niketan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [36]:
# Function that does tokenization
def tokenize_words(review):
    tokenizer = ToktokTokenizer()
    tokens = tokenizer.tokenize(review)
    tokens = [token.strip() for token in tokens]
    return " ".join(tokens)

In [37]:
stopwords_list = stopwords.words("english")
def apply_tokenization_and_remove_stopwords(review):
    # Applying tokenization
    tokenizer = ToktokTokenizer()
    tokens = tokenizer.tokenize(review)
    tokens = [token.strip() for token in tokens]
    # applying removal of stopwords
    review_no_stopwords = [word for word in tokens if word not in stopwords_list]
    return " ".join(review_no_stopwords)
    

In [38]:
train_dataset["review_processed"] = train_dataset.review_processed.apply(apply_tokenization_and_remove_stopwords)

In [39]:
train_dataset

Unnamed: 0,type,review,label,file,Sentiment,Sentiment_2,Sentiment_3,Sentiment_4,review_processed
0,train,Story of a man who has unnatural feelings for ...,neg,0_3.txt,0,0,0,0,story man unnatural feelings pig starts openin...
1,train,Airport '77 starts as a brand new luxury 747 p...,neg,10000_4.txt,0,0,0,0,airport starts brand new luxury plane loaded v...
2,train,This film lacked something I couldn't put my f...,neg,10001_4.txt,0,0,0,0,film lacked something couldnt put finger first...
3,train,"Sorry everyone,,, I know this is supposed to b...",neg,10002_1.txt,0,0,0,0,sorry everyone know supposed art film wow hand...
4,train,When I was little my parents took me along to ...,neg,10003_1.txt,0,0,0,0,little parents took along theater see interior...
...,...,...,...,...,...,...,...,...,...
24995,train,"Seeing as the vote average was pretty low, and...",pos,9998_9.txt,1,1,1,1,seeing vote average pretty low fact clerk vide...
24996,train,"The plot had some wretched, unbelievable twist...",pos,9999_8.txt,1,1,1,1,plot wretched unbelievable twists however chem...
24997,train,I am amazed at how this movie(and most others ...,pos,999_10.txt,1,1,1,1,amazed movieand others average stars lower cra...
24998,train,A Christmas Together actually came before my t...,pos,99_8.txt,1,1,1,1,christmas together actually came time ive rais...


## Lemmatization

In [46]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')
def apply_lemmatization(review):
    lemmatized_review = []
    lemmatizer = WordNetLemmatizer()
    tokenizer = ToktokTokenizer()
    tokens = tokenizer.tokenize(review)
    tokens = [token.strip() for token in tokens]
    for w in tokens:
        lemmatized_review.append(lemmatizer.lemmatize(w))
    return " ".join(lemmatized_review)

[nltk_data] Downloading package wordnet to /Users/niketan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/niketan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [47]:
s = "Story of a man who has unnatural instincts"
apply_lemmatization(s)

'Story of a man who ha unnatural instinct'

In [48]:
train_dataset["review_processed"] = train_dataset.review_processed.apply(apply_lemmatization)

In [49]:
train_dataset

Unnamed: 0,type,review,label,file,Sentiment,Sentiment_2,Sentiment_3,Sentiment_4,review_processed
0,train,Story of a man who has unnatural feelings for ...,neg,0_3.txt,0,0,0,0,story man unnatural feeling pig start opening ...
1,train,Airport '77 starts as a brand new luxury 747 p...,neg,10000_4.txt,0,0,0,0,airport start brand new luxury plane loaded va...
2,train,This film lacked something I couldn't put my f...,neg,10001_4.txt,0,0,0,0,film lacked something couldnt put finger first...
3,train,"Sorry everyone,,, I know this is supposed to b...",neg,10002_1.txt,0,0,0,0,sorry everyone know supposed art film wow hand...
4,train,When I was little my parents took me along to ...,neg,10003_1.txt,0,0,0,0,little parent took along theater see interior ...
...,...,...,...,...,...,...,...,...,...
24995,train,"Seeing as the vote average was pretty low, and...",pos,9998_9.txt,1,1,1,1,seeing vote average pretty low fact clerk vide...
24996,train,"The plot had some wretched, unbelievable twist...",pos,9999_8.txt,1,1,1,1,plot wretched unbelievable twist however chemi...
24997,train,I am amazed at how this movie(and most others ...,pos,999_10.txt,1,1,1,1,amazed movieand others average star lower crap...
24998,train,A Christmas Together actually came before my t...,pos,99_8.txt,1,1,1,1,christmas together actually came time ive rais...
