# Imports

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize,sent_tokenize
from transformers import BertTokenizer
# import torch
# from torch import nn
# from transformers import BertModel
# from torch.optim import Adam
# from tqdm import tqdm

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


# Data Reading

In [2]:
dataset = pd.read_csv("IMDB Dataset.csv", sep=',')
# dataset = pd.read_csv("fruits.csv", sep=',')
display(dataset)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


# Data Split

In [3]:
# split data to positive and negative
p_samples = dataset[dataset['sentiment'] == 'positive']
n_samples = dataset[dataset['sentiment'] == 'negative']
# display(p_samples)
# display(n_samples)

# split the positive class samples to (70%, 10%, 20%) for (training, validation, testing) respectively 
p_testing = p_samples.sample(frac = 0.2)
p_validation = p_samples.drop(p_testing.index).sample(frac = 0.125)
p_training = p_samples.drop(p_validation.index).drop(p_testing.index)

# split the negative class samples to (70%, 10%, 20%) for (training, validation, testing) respectively 
n_testing = n_samples.sample(frac = 0.2)
n_validation = n_samples.drop(n_testing.index).sample(frac = 0.125)
n_training = n_samples.drop(n_validation.index).drop(n_testing.index)

# concatenating the 70% of p-class and n-class to form the training set
training_set = pd.concat([p_training, n_training], axis=0, ignore_index=True)

# concatenating the 10% of p-class and n-class to form the validation set
validation_set = pd.concat([p_validation, n_validation], axis=0, ignore_index=True)

# concatenating the 20% of p-class and n-class to form the testing set
testing_set = pd.concat([p_testing, n_testing], axis=0, ignore_index=True)

display(training_set)
# display(validation_set)
# display(testing_set)


# training_review = training_set.iloc[:,:-1]
# training_class = training_set.iloc[:,-1]
# display(training_review)
# display(training_class)

# validation_review = validation_set.iloc[:,:-1]
# validation_class = validation_set.iloc[:,-1]
# display(testing_review)
# display(testing_class)

# testing_review = testing_set.iloc[:,:-1]
# testing_class = testing_set.iloc[:,-1]
# display(testing_review)
# display(testing_class)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,"Petter Mattei's ""Love in the Time of Money"" is...",positive
3,I sure would like to see a resurrection of a u...,positive
4,If you like original gut wrenching laughter yo...,positive
...,...,...
34995,"Les Visiteurs, the first movie about the medie...",negative
34996,Robert Colomb has two full-time jobs. He's kno...,negative
34997,This is your typical junk comedy.<br /><br />T...,negative
34998,"Bad plot, bad dialogue, bad acting, idiotic di...",negative


# Text Pre-processing

In [4]:
# Remove punctuation
import string
string.punctuation

def remove_punctuation(text):
    punctuationfree="".join([i for i in text if i not in string.punctuation])
    return punctuationfree

#storing the puntuation free text
training_set['free_punc_review']= training_set['review'].apply(lambda x:remove_punctuation(x))
validation_set['free_punc_review']= validation_set['review'].apply(lambda x:remove_punctuation(x))
testing_set['free_punc_review']= testing_set['review'].apply(lambda x:remove_punctuation(x))

# Lowercase all characters
training_set['review_lower']= training_set['free_punc_review'].apply(lambda x: x.lower())
validation_set['review_lower']= validation_set['free_punc_review'].apply(lambda x: x.lower())
testing_set['review_lower']= testing_set['free_punc_review'].apply(lambda x: x.lower())

# Tokenization
import re
def tokenization(text):
    tokens = word_tokenize(text)
    return tokens
training_set['review_tokenied'] = training_set['review_lower'].apply(lambda x: tokenization(x))
validation_set['review_tokenied'] = validation_set['review_lower'].apply(lambda x: tokenization(x))
testing_set['review_tokenied'] = testing_set['review_lower'].apply(lambda x: tokenization(x))

# Remove stop words
from nltk.corpus import stopwords
stop_words = nltk.corpus.stopwords.words('english')
stop_words[0:10]
['i', 'the', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

def remove_stopwords(text):
    output= [i for i in text if i not in stop_words]
    return output

training_set['no_stopwords'] = training_set['review_tokenied'].apply(lambda x:remove_stopwords(x))
validation_set['no_stopwords'] = validation_set['review_tokenied'].apply(lambda x:remove_stopwords(x))
testing_set['no_stopwords'] = testing_set['review_tokenied'].apply(lambda x:remove_stopwords(x))

#Lemmatization of words
from nltk.stem import WordNetLemmatizer
#defining the object for Lemmatization
wordnet_lemmatizer = WordNetLemmatizer()

def lemmatizer(text):
    if type(text) is float : 
        return []
    lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
    return lemm_text
training_set['review_lemmatized'] = training_set['no_stopwords'].apply(lambda x:lemmatizer(x))
    

display(training_set)
display(validation_set)
display(testing_set)

Unnamed: 0,review,sentiment,free_punc_review,review_lower,review_tokenied,no_stopwords,review_lemmatized
0,One of the other reviewers has mentioned that ...,positive,One of the other reviewers has mentioned that ...,one of the other reviewers has mentioned that ...,"[one, of, the, other, reviewers, has, mentione...","[one, reviewers, mentioned, watching, 1, oz, e...","[one, reviewer, mentioned, watching, 1, oz, ep..."
1,A wonderful little production. <br /><br />The...,positive,A wonderful little production br br The filmin...,a wonderful little production br br the filmin...,"[a, wonderful, little, production, br, br, the...","[wonderful, little, production, br, br, filmin...","[wonderful, little, production, br, br, filmin..."
2,"Petter Mattei's ""Love in the Time of Money"" is...",positive,Petter Matteis Love in the Time of Money is a ...,petter matteis love in the time of money is a ...,"[petter, matteis, love, in, the, time, of, mon...","[petter, matteis, love, time, money, visually,...","[petter, matteis, love, time, money, visually,..."
3,I sure would like to see a resurrection of a u...,positive,I sure would like to see a resurrection of a u...,i sure would like to see a resurrection of a u...,"[i, sure, would, like, to, see, a, resurrectio...","[sure, would, like, see, resurrection, dated, ...","[sure, would, like, see, resurrection, dated, ..."
4,If you like original gut wrenching laughter yo...,positive,If you like original gut wrenching laughter yo...,if you like original gut wrenching laughter yo...,"[if, you, like, original, gut, wrenching, laug...","[like, original, gut, wrenching, laughter, lik...","[like, original, gut, wrenching, laughter, lik..."
...,...,...,...,...,...,...,...
34995,"Les Visiteurs, the first movie about the medie...",negative,Les Visiteurs the first movie about the mediev...,les visiteurs the first movie about the mediev...,"[les, visiteurs, the, first, movie, about, the...","[les, visiteurs, first, movie, medieval, time,...","[le, visiteurs, first, movie, medieval, time, ..."
34996,Robert Colomb has two full-time jobs. He's kno...,negative,Robert Colomb has two fulltime jobs Hes known ...,robert colomb has two fulltime jobs hes known ...,"[robert, colomb, has, two, fulltime, jobs, hes...","[robert, colomb, two, fulltime, jobs, hes, kno...","[robert, colomb, two, fulltime, job, he, known..."
34997,This is your typical junk comedy.<br /><br />T...,negative,This is your typical junk comedybr br There ar...,this is your typical junk comedybr br there ar...,"[this, is, your, typical, junk, comedybr, br, ...","[typical, junk, comedybr, br, almost, laughs, ...","[typical, junk, comedybr, br, almost, laugh, g..."
34998,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,Bad plot bad dialogue bad acting idiotic direc...,bad plot bad dialogue bad acting idiotic direc...,"[bad, plot, bad, dialogue, bad, acting, idioti...","[bad, plot, bad, dialogue, bad, acting, idioti...","[bad, plot, bad, dialogue, bad, acting, idioti..."


Unnamed: 0,review,sentiment,free_punc_review,review_lower,review_tokenied,no_stopwords
0,I went into Deathtrap expecting a well orchest...,positive,I went into Deathtrap expecting a well orchest...,i went into deathtrap expecting a well orchest...,"[i, went, into, deathtrap, expecting, a, well,...","[went, deathtrap, expecting, well, orchestrate..."
1,Zero Day is a film few people have gotten to s...,positive,Zero Day is a film few people have gotten to s...,zero day is a film few people have gotten to s...,"[zero, day, is, a, film, few, people, have, go...","[zero, day, film, people, gotten, see, shame, ..."
2,A touching movie. It is full of emotions and w...,positive,A touching movie It is full of emotions and wo...,a touching movie it is full of emotions and wo...,"[a, touching, movie, it, is, full, of, emotion...","[touching, movie, full, emotions, wonderful, a..."
3,"This is a top finnish film this year,although ...",positive,This is a top finnish film this yearalthough T...,this is a top finnish film this yearalthough t...,"[this, is, a, top, finnish, film, this, yearal...","[top, finnish, film, yearalthough, tango, kaba..."
4,ABC's version of the life of the late Pope: Th...,positive,ABCs version of the life of the late Pope They...,abcs version of the life of the late pope they...,"[abcs, version, of, the, life, of, the, late, ...","[abcs, version, life, late, pope, put, slightl..."
...,...,...,...,...,...,...
4995,My favorite memory of this show and the band w...,negative,My favorite memory of this show and the band w...,my favorite memory of this show and the band w...,"[my, favorite, memory, of, this, show, and, th...","[favorite, memory, show, band, got, together, ..."
4996,This is the kind of movie that my enemies cont...,negative,This is the kind of movie that my enemies cont...,this is the kind of movie that my enemies cont...,"[this, is, the, kind, of, movie, that, my, ene...","[kind, movie, enemies, content, watch, time, b..."
4997,It's not often I feel compelled to give negati...,negative,Its not often I feel compelled to give negativ...,its not often i feel compelled to give negativ...,"[its, not, often, i, feel, compelled, to, give...","[often, feel, compelled, give, negative, criti..."
4998,"Well, if you like pop/punk, punk, ska, and a t...",negative,Well if you like poppunk punk ska and a tad bi...,well if you like poppunk punk ska and a tad bi...,"[well, if, you, like, poppunk, punk, ska, and,...","[well, like, poppunk, punk, ska, tad, bit, mod..."


Unnamed: 0,review,sentiment,free_punc_review,review_lower,review_tokenied,no_stopwords
0,Dolemite is one of the best movies featuring a...,positive,Dolemite is one of the best movies featuring a...,dolemite is one of the best movies featuring a...,"[dolemite, is, one, of, the, best, movies, fea...","[dolemite, one, best, movies, featuring, pimp,..."
1,I will not comment on the story as such. I agr...,positive,I will not comment on the story as such I agre...,i will not comment on the story as such i agre...,"[i, will, not, comment, on, the, story, as, su...","[comment, story, agree, peoples, comments, goo..."
2,I had the pleasure of seeing Saltimbanco live ...,positive,I had the pleasure of seeing Saltimbanco live ...,i had the pleasure of seeing saltimbanco live ...,"[i, had, the, pleasure, of, seeing, saltimbanc...","[pleasure, seeing, saltimbanco, live, seeing, ..."
3,"'It's easy to kill a monster, but it's hard to...",positive,Its easy to kill a monster but its hard to kil...,its easy to kill a monster but its hard to kil...,"[its, easy, to, kill, a, monster, but, its, ha...","[easy, kill, monster, hard, kill, human, being..."
4,This movie deserves a 20/10 if I could give on...,positive,This movie deserves a 2010 if I could give one...,this movie deserves a 2010 if i could give one...,"[this, movie, deserves, a, 2010, if, i, could,...","[movie, deserves, 2010, could, give, one, holl..."
...,...,...,...,...,...,...
9995,I watched the version with pathetic American o...,negative,I watched the version with pathetic American o...,i watched the version with pathetic american o...,"[i, watched, the, version, with, pathetic, ame...","[watched, version, pathetic, american, overdub..."
9996,"Like many others, I counted on the appearance ...",negative,Like many others I counted on the appearance o...,like many others i counted on the appearance o...,"[like, many, others, i, counted, on, the, appe...","[like, many, others, counted, appearance, denn..."
9997,Awesomely improbable and foolish potboiler tha...,negative,Awesomely improbable and foolish potboiler tha...,awesomely improbable and foolish potboiler tha...,"[awesomely, improbable, and, foolish, potboile...","[awesomely, improbable, foolish, potboiler, le..."
9998,"I tried. I really, really tried to think of so...",negative,I tried I really really tried to think of some...,i tried i really really tried to think of some...,"[i, tried, i, really, really, tried, to, think...","[tried, really, really, tried, think, somethin..."


# Classification using BERT