# Imports

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize,sent_tokenize
from transformers import BertTokenizer
# import torch
# from torch import nn
# from transformers import BertModel
# from torch.optim import Adam
# from tqdm import tqdm

# Data Reading

In [2]:
dataset = pd.read_csv("IMDB Dataset.csv", sep=',')
# dataset = pd.read_csv("fruits.csv", sep=',')
display(dataset)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


# Data Split

In [3]:
# split data to positive and negative
p_samples = dataset[dataset['sentiment'] == 'positive']
n_samples = dataset[dataset['sentiment'] == 'negative']
# display(p_samples)
# display(n_samples)

# split the positive class samples to (70%, 10%, 20%) for (training, validation, testing) respectively 
p_testing = p_samples.sample(frac = 0.2)
p_validation = p_samples.drop(p_testing.index).sample(frac = 0.125)
p_training = p_samples.drop(p_validation.index).drop(p_testing.index)

# split the negative class samples to (70%, 10%, 20%) for (training, validation, testing) respectively 
n_testing = n_samples.sample(frac = 0.2)
n_validation = n_samples.drop(n_testing.index).sample(frac = 0.125)
n_training = n_samples.drop(n_validation.index).drop(n_testing.index)

# concatenating the 70% of p-class and n-class to form the training set
training_set = pd.concat([p_training, n_training], axis=0, ignore_index=True)

# concatenating the 10% of p-class and n-class to form the validation set
validation_set = pd.concat([p_validation, n_validation], axis=0, ignore_index=True)

# concatenating the 20% of p-class and n-class to form the testing set
testing_set = pd.concat([p_testing, n_testing], axis=0, ignore_index=True)

display(training_set)
# display(validation_set)
# display(testing_set)


# training_review = training_set.iloc[:,:-1]
# training_class = training_set.iloc[:,-1]
# display(training_review)
# display(training_class)

# validation_review = validation_set.iloc[:,:-1]
# validation_class = validation_set.iloc[:,-1]
# display(testing_review)
# display(testing_class)

# testing_review = testing_set.iloc[:,:-1]
# testing_class = testing_set.iloc[:,-1]
# display(testing_review)
# display(testing_class)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,I thought this was a wonderful way to spend ti...,positive
2,"Petter Mattei's ""Love in the Time of Money"" is...",positive
3,"Probably my all-time favorite movie, a story o...",positive
4,I sure would like to see a resurrection of a u...,positive
...,...,...
34995,A remake of Alejandro Amenabar's Abre los Ojos...,negative
34996,"Lame, lame, lame!!! A 90-minute cringe-fest th...",negative
34997,"Les Visiteurs, the first movie about the medie...",negative
34998,I'm going to have to disagree with the previou...,negative


# Text Pre-processing

### Remove punctuation & lowercase all characters

In [12]:
import string
string.punctuation

def remove_punctuation(text):
    return "".join([i.lower() for i in text if i not in string.punctuation])

# storing the puntuation free text
training_set['free_punc_review']= training_set['review'].apply(lambda x:remove_punctuation(x))
validation_set['free_punc_review']= validation_set['review'].apply(lambda x:remove_punctuation(x))
testing_set['free_punc_review']= testing_set['review'].apply(lambda x:remove_punctuation(x))

### Lowercase all characters

In [8]:
training_set['review_lower']= training_set['free_punc_review'].apply(lambda x: x.lower())
validation_set['review_lower']= validation_set['free_punc_review'].apply(lambda x: x.lower())
testing_set['review_lower']= testing_set['free_punc_review'].apply(lambda x: x.lower())

### Tokenization

In [None]:
import re
def tokenization(text):
    tokens = word_tokenize(text)
    return tokens
training_set['review_tokenied'] = training_set['review_lower'].apply(lambda x: tokenization(x))
validation_set['review_tokenied'] = validation_set['review_lower'].apply(lambda x: tokenization(x))
testing_set['review_tokenied'] = testing_set['review_lower'].apply(lambda x: tokenization(x))

### Remove stop words

In [None]:
from nltk.corpus import stopwords
stop_words = nltk.corpus.stopwords.words('english')
stop_words[0:10]
['i', 'the', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

def remove_stopwords(text):
    output= [i for i in text if i not in stop_words]
    return output

training_set['no_stopwords'] = training_set['review_tokenied'].apply(lambda x:remove_stopwords(x))
validation_set['no_stopwords'] = validation_set['review_tokenied'].apply(lambda x:remove_stopwords(x))
testing_set['no_stopwords'] = testing_set['review_tokenied'].apply(lambda x:remove_stopwords(x))

### Lemmatization

In [11]:
from nltk.stem import WordNetLemmatizer
#defining the object for Lemmatization
wordnet_lemmatizer = WordNetLemmatizer()

def lemmatizer(text):
    if type(text) is float : 
        return []
    lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
    return lemm_text
training_set['review_lemmatized'] = training_set['no_stopwords'].apply(lambda x:lemmatizer(x))

display(training_set)
display(validation_set)
display(testing_set)

Unnamed: 0,review,sentiment,free_punc_review,review_lower,review_tokenied,no_stopwords,review_lemmatized
0,One of the other reviewers has mentioned that ...,positive,One of the other reviewers has mentioned that ...,one of the other reviewers has mentioned that ...,"[one, of, the, other, reviewers, has, mentione...","[one, reviewers, mentioned, watching, 1, oz, e...","[one, reviewer, mentioned, watching, 1, oz, ep..."
1,I thought this was a wonderful way to spend ti...,positive,I thought this was a wonderful way to spend ti...,i thought this was a wonderful way to spend ti...,"[i, thought, this, was, a, wonderful, way, to,...","[thought, wonderful, way, spend, time, hot, su...","[thought, wonderful, way, spend, time, hot, su..."
2,"Petter Mattei's ""Love in the Time of Money"" is...",positive,Petter Matteis Love in the Time of Money is a ...,petter matteis love in the time of money is a ...,"[petter, matteis, love, in, the, time, of, mon...","[petter, matteis, love, time, money, visually,...","[petter, matteis, love, time, money, visually,..."
3,"Probably my all-time favorite movie, a story o...",positive,Probably my alltime favorite movie a story of ...,probably my alltime favorite movie a story of ...,"[probably, my, alltime, favorite, movie, a, st...","[probably, alltime, favorite, movie, story, se...","[probably, alltime, favorite, movie, story, se..."
4,I sure would like to see a resurrection of a u...,positive,I sure would like to see a resurrection of a u...,i sure would like to see a resurrection of a u...,"[i, sure, would, like, to, see, a, resurrectio...","[sure, would, like, see, resurrection, dated, ...","[sure, would, like, see, resurrection, dated, ..."
...,...,...,...,...,...,...,...
34995,A remake of Alejandro Amenabar's Abre los Ojos...,negative,A remake of Alejandro Amenabars Abre los Ojos ...,a remake of alejandro amenabars abre los ojos ...,"[a, remake, of, alejandro, amenabars, abre, lo...","[remake, alejandro, amenabars, abre, los, ojos...","[remake, alejandro, amenabars, abre, los, ojos..."
34996,"Lame, lame, lame!!! A 90-minute cringe-fest th...",negative,Lame lame lame A 90minute cringefest thats 89 ...,lame lame lame a 90minute cringefest thats 89 ...,"[lame, lame, lame, a, 90minute, cringefest, th...","[lame, lame, lame, 90minute, cringefest, thats...","[lame, lame, lame, 90minute, cringefest, thats..."
34997,"Les Visiteurs, the first movie about the medie...",negative,Les Visiteurs the first movie about the mediev...,les visiteurs the first movie about the mediev...,"[les, visiteurs, the, first, movie, about, the...","[les, visiteurs, first, movie, medieval, time,...","[le, visiteurs, first, movie, medieval, time, ..."
34998,I'm going to have to disagree with the previou...,negative,Im going to have to disagree with the previous...,im going to have to disagree with the previous...,"[im, going, to, have, to, disagree, with, the,...","[im, going, disagree, previous, comment, side,...","[im, going, disagree, previous, comment, side,..."


Unnamed: 0,review,sentiment,free_punc_review,review_lower,review_tokenied,no_stopwords
0,I've seen a fair few films from the Far East r...,positive,Ive seen a fair few films from the Far East re...,ive seen a fair few films from the far east re...,"[ive, seen, a, fair, few, films, from, the, fa...","[ive, seen, fair, films, far, east, recentlyso..."
1,LOL!!! delirious was so funny.. i was in tears...,positive,LOL delirious was so funny i was in tears Eddi...,lol delirious was so funny i was in tears eddi...,"[lol, delirious, was, so, funny, i, was, in, t...","[lol, delirious, funny, tears, eddie, murphys,..."
2,Although many audio recordings of great musici...,positive,Although many audio recordings of great musici...,although many audio recordings of great musici...,"[although, many, audio, recordings, of, great,...","[although, many, audio, recordings, great, mus..."
3,What makes watching and reviewing films a plea...,positive,What makes watching and reviewing films a plea...,what makes watching and reviewing films a plea...,"[what, makes, watching, and, reviewing, films,...","[makes, watching, reviewing, films, pleasure, ..."
4,Wagon Master (1950) Dir: John Ford <br /><br /...,positive,Wagon Master 1950 Dir John Ford br br Producti...,wagon master 1950 dir john ford br br producti...,"[wagon, master, 1950, dir, john, ford, br, br,...","[wagon, master, 1950, dir, john, ford, br, br,..."
...,...,...,...,...,...,...
4995,I never actually thought that a film could be ...,negative,I never actually thought that a film could be ...,i never actually thought that a film could be ...,"[i, never, actually, thought, that, a, film, c...","[never, actually, thought, film, could, atroci..."
4996,Little Mosque is one of the most boring CBC co...,negative,Little Mosque is one of the most boring CBC co...,little mosque is one of the most boring cbc co...,"[little, mosque, is, one, of, the, most, borin...","[little, mosque, one, boring, cbc, comedies, e..."
4997,This was shown on the biography channel and wa...,negative,This was shown on the biography channel and wa...,this was shown on the biography channel and wa...,"[this, was, shown, on, the, biography, channel...","[shown, biography, channel, informative, child..."
4998,this is by far the most pathetic movie Indian ...,negative,this is by far the most pathetic movie Indian ...,this is by far the most pathetic movie indian ...,"[this, is, by, far, the, most, pathetic, movie...","[far, pathetic, movie, indian, cinema, cinema,..."


Unnamed: 0,review,sentiment,free_punc_review,review_lower,review_tokenied,no_stopwords
0,For a movie that gets no respect there sure ar...,positive,For a movie that gets no respect there sure ar...,for a movie that gets no respect there sure ar...,"[for, a, movie, that, gets, no, respect, there...","[movie, gets, respect, sure, lot, memorable, q..."
1,This is one of the greatest sports movies ever...,positive,This is one of the greatest sports movies ever...,this is one of the greatest sports movies ever...,"[this, is, one, of, the, greatest, sports, mov...","[one, greatest, sports, movies, ever, made, ho..."
2,This neo-film noir is one of a genre of late t...,positive,This neofilm noir is one of a genre of late tw...,this neofilm noir is one of a genre of late tw...,"[this, neofilm, noir, is, one, of, a, genre, o...","[neofilm, noir, one, genre, late, twentieth, c..."
3,I havent seen that movie in 20 or more years b...,positive,I havent seen that movie in 20 or more years b...,i havent seen that movie in 20 or more years b...,"[i, havent, seen, that, movie, in, 20, or, mor...","[havent, seen, movie, 20, years, remember, att..."
4,"I've read reviews of Kerching on IMDb, and fra...",positive,Ive read reviews of Kerching on IMDb and frank...,ive read reviews of kerching on imdb and frank...,"[ive, read, reviews, of, kerching, on, imdb, a...","[ive, read, reviews, kerching, imdb, franklyiv..."
...,...,...,...,...,...,...
9995,I watched this film because I noticed that it ...,negative,I watched this film because I noticed that it ...,i watched this film because i noticed that it ...,"[i, watched, this, film, because, i, noticed, ...","[watched, film, noticed, kari, wuhrer, cast, l..."
9996,Really bad movie. Maybe the worst I've ever se...,negative,Really bad movie Maybe the worst Ive ever seen...,really bad movie maybe the worst ive ever seen...,"[really, bad, movie, maybe, the, worst, ive, e...","[really, bad, movie, maybe, worst, ive, ever, ..."
9997,Now we know where they got the idea of Snakes ...,negative,Now we know where they got the idea of Snakes ...,now we know where they got the idea of snakes ...,"[now, we, know, where, they, got, the, idea, o...","[know, got, idea, snakes, plane, put, bluntly,..."
9998,In the aftermath of September 11th in New York...,negative,In the aftermath of September 11th in New York...,in the aftermath of september 11th in new york...,"[in, the, aftermath, of, september, 11th, in, ...","[aftermath, september, 11th, new, york, drama,..."


# Classification using BERT