# Imports

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize,sent_tokenize
from transformers import BertTokenizer
# import torch
# from torch import nn
# from transformers import BertModel
# from torch.optim import Adam
# from tqdm import tqdm

# Data Reading

In [2]:
# Load the IMDB reviews; a comma is already read_csv's default separator.
dataset = pd.read_csv("IMDB Dataset.csv")
display(dataset)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


# Text Pre-processing

### Remove punctuation & lowercase all characters

In [3]:
import string
string.punctuation

def remove_punctuation(text):
    """Return `text` lowercased with every `string.punctuation` character removed.

    Characters are filtered one at a time, so markup such as ``<br />`` only
    loses its brackets and slash; the letters ``br`` stay in the result.
    """
    kept = []
    for ch in text:
        if ch in string.punctuation:
            continue
        kept.append(ch.lower())
    return "".join(kept)

# Store the punctuation-free, lowercased text in its own column.
dataset['free_punc_review'] = dataset['review'].apply(remove_punctuation)

display(dataset)

Unnamed: 0,review,sentiment,free_punc_review
0,One of the other reviewers has mentioned that ...,positive,one of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...,positive,a wonderful little production br br the filmin...
2,I thought this was a wonderful way to spend ti...,positive,i thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...,negative,basically theres a family where a little boy j...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter matteis love in the time of money is a ...
...,...,...,...
49995,I thought this movie did a down right good job...,positive,i thought this movie did a down right good job...
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,bad plot bad dialogue bad acting idiotic direc...
49997,I am a Catholic taught in parochial elementary...,negative,i am a catholic taught in parochial elementary...
49998,I'm going to have to disagree with the previou...,negative,im going to have to disagree with the previous...


### Tokenization

In [4]:
import re
def tokenization(text):
    """Split `text` into a list of word tokens with NLTK's `word_tokenize`."""
    return word_tokenize(text)
# Tokenize the punctuation-free text; each cell becomes a list of tokens.
dataset['review_tokenied'] = dataset['free_punc_review'].apply(tokenization)

display(dataset)

Unnamed: 0,review,sentiment,free_punc_review,review_tokenied
0,One of the other reviewers has mentioned that ...,positive,one of the other reviewers has mentioned that ...,"[one, of, the, other, reviewers, has, mentione..."
1,A wonderful little production. <br /><br />The...,positive,a wonderful little production br br the filmin...,"[a, wonderful, little, production, br, br, the..."
2,I thought this was a wonderful way to spend ti...,positive,i thought this was a wonderful way to spend ti...,"[i, thought, this, was, a, wonderful, way, to,..."
3,Basically there's a family where a little boy ...,negative,basically theres a family where a little boy j...,"[basically, theres, a, family, where, a, littl..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter matteis love in the time of money is a ...,"[petter, matteis, love, in, the, time, of, mon..."
...,...,...,...,...
49995,I thought this movie did a down right good job...,positive,i thought this movie did a down right good job...,"[i, thought, this, movie, did, a, down, right,..."
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,bad plot bad dialogue bad acting idiotic direc...,"[bad, plot, bad, dialogue, bad, acting, idioti..."
49997,I am a Catholic taught in parochial elementary...,negative,i am a catholic taught in parochial elementary...,"[i, am, a, catholic, taught, in, parochial, el..."
49998,I'm going to have to disagree with the previou...,negative,im going to have to disagree with the previous...,"[im, going, to, have, to, disagree, with, the,..."


### Remove stop words

In [5]:
from nltk.corpus import stopwords
# English stop-word list from the NLTK corpus (via the name imported above).
stop_words = stopwords.words('english')
stop_words[:10]
['i', 'the', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

def remove_stopwords(text, stop_list=None):
    """Return the tokens of `text` that are not stop words.

    Args:
        text: Iterable of string tokens.
        stop_list: Optional iterable of words to exclude. When None, the
            module-level `stop_words` list is used, as before.

    Returns:
        A new list with the surviving tokens in their original order.
    """
    if stop_list is None:
        stop_list = stop_words
    # A set makes each membership test O(1) instead of scanning the whole list
    # once per token.
    excluded = frozenset(stop_list)
    return [token for token in text if token not in excluded]

# Filter the stop words out of every token list.
dataset['no_stopwords'] = dataset['review_tokenied'].apply(remove_stopwords)

display(dataset)

Unnamed: 0,review,sentiment,free_punc_review,review_tokenied,no_stopwords
0,One of the other reviewers has mentioned that ...,positive,one of the other reviewers has mentioned that ...,"[one, of, the, other, reviewers, has, mentione...","[one, reviewers, mentioned, watching, 1, oz, e..."
1,A wonderful little production. <br /><br />The...,positive,a wonderful little production br br the filmin...,"[a, wonderful, little, production, br, br, the...","[wonderful, little, production, br, br, filmin..."
2,I thought this was a wonderful way to spend ti...,positive,i thought this was a wonderful way to spend ti...,"[i, thought, this, was, a, wonderful, way, to,...","[thought, wonderful, way, spend, time, hot, su..."
3,Basically there's a family where a little boy ...,negative,basically theres a family where a little boy j...,"[basically, theres, a, family, where, a, littl...","[basically, theres, family, little, boy, jake,..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter matteis love in the time of money is a ...,"[petter, matteis, love, in, the, time, of, mon...","[petter, matteis, love, time, money, visually,..."
...,...,...,...,...,...
49995,I thought this movie did a down right good job...,positive,i thought this movie did a down right good job...,"[i, thought, this, movie, did, a, down, right,...","[thought, movie, right, good, job, wasnt, crea..."
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,bad plot bad dialogue bad acting idiotic direc...,"[bad, plot, bad, dialogue, bad, acting, idioti...","[bad, plot, bad, dialogue, bad, acting, idioti..."
49997,I am a Catholic taught in parochial elementary...,negative,i am a catholic taught in parochial elementary...,"[i, am, a, catholic, taught, in, parochial, el...","[catholic, taught, parochial, elementary, scho..."
49998,I'm going to have to disagree with the previou...,negative,im going to have to disagree with the previous...,"[im, going, to, have, to, disagree, with, the,...","[im, going, disagree, previous, comment, side,..."


### Lemmatization

In [6]:
from nltk.stem import WordNetLemmatizer

# Single WordNetLemmatizer instance, reused for every token by lemmatizer() below.
wordnet_lemmatizer = WordNetLemmatizer()

def lemmatizer(text):
    """Lemmatize every token in `text` with the shared `wordnet_lemmatizer`.

    Args:
        text: Iterable of string tokens, or a float.

    Returns:
        A list of lemmatized tokens; an empty list when `text` is a float.
    """
    # Presumably guards against NaN cells (missing values) - confirm.
    # isinstance also catches float subclasses such as numpy.float64, which
    # the previous `type(text) is float` check let through to the loop.
    if isinstance(text, float):
        return []
    return [wordnet_lemmatizer.lemmatize(word) for word in text]
# Lemmatize the stop-word-free token lists.
dataset['review_lemmatized'] = dataset['no_stopwords'].apply(lemmatizer)

display(dataset)

Unnamed: 0,review,sentiment,free_punc_review,review_tokenied,no_stopwords,review_lemmatized
0,One of the other reviewers has mentioned that ...,positive,one of the other reviewers has mentioned that ...,"[one, of, the, other, reviewers, has, mentione...","[one, reviewers, mentioned, watching, 1, oz, e...","[one, reviewer, mentioned, watching, 1, oz, ep..."
1,A wonderful little production. <br /><br />The...,positive,a wonderful little production br br the filmin...,"[a, wonderful, little, production, br, br, the...","[wonderful, little, production, br, br, filmin...","[wonderful, little, production, br, br, filmin..."
2,I thought this was a wonderful way to spend ti...,positive,i thought this was a wonderful way to spend ti...,"[i, thought, this, was, a, wonderful, way, to,...","[thought, wonderful, way, spend, time, hot, su...","[thought, wonderful, way, spend, time, hot, su..."
3,Basically there's a family where a little boy ...,negative,basically theres a family where a little boy j...,"[basically, theres, a, family, where, a, littl...","[basically, theres, family, little, boy, jake,...","[basically, there, family, little, boy, jake, ..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter matteis love in the time of money is a ...,"[petter, matteis, love, in, the, time, of, mon...","[petter, matteis, love, time, money, visually,...","[petter, matteis, love, time, money, visually,..."
...,...,...,...,...,...,...
49995,I thought this movie did a down right good job...,positive,i thought this movie did a down right good job...,"[i, thought, this, movie, did, a, down, right,...","[thought, movie, right, good, job, wasnt, crea...","[thought, movie, right, good, job, wasnt, crea..."
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,bad plot bad dialogue bad acting idiotic direc...,"[bad, plot, bad, dialogue, bad, acting, idioti...","[bad, plot, bad, dialogue, bad, acting, idioti...","[bad, plot, bad, dialogue, bad, acting, idioti..."
49997,I am a Catholic taught in parochial elementary...,negative,i am a catholic taught in parochial elementary...,"[i, am, a, catholic, taught, in, parochial, el...","[catholic, taught, parochial, elementary, scho...","[catholic, taught, parochial, elementary, scho..."
49998,I'm going to have to disagree with the previou...,negative,im going to have to disagree with the previous...,"[im, going, to, have, to, disagree, with, the,...","[im, going, disagree, previous, comment, side,...","[im, going, disagree, previous, comment, side,..."


# Data Preparation

In [7]:
# Overwrite the raw review with its lemmatized tokens joined by single spaces.
dataset['review'] = dataset['review_lemmatized'].apply(' '.join)

# Drop the intermediate pre-processing columns in one call.
dataset.drop(
    columns=['free_punc_review', 'review_tokenied', 'no_stopwords', 'review_lemmatized'],
    inplace=True,
)

display(dataset)

Unnamed: 0,review,sentiment
0,one reviewer mentioned watching 1 oz episode y...,positive
1,wonderful little production br br filming tech...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically there family little boy jake think t...,negative
4,petter matteis love time money visually stunni...,positive
...,...,...
49995,thought movie right good job wasnt creative or...,positive
49996,bad plot bad dialogue bad acting idiotic direc...,negative
49997,catholic taught parochial elementary school nu...,negative
49998,im going disagree previous comment side maltin...,negative


# Data Split

In [10]:
# Fixed seed so every Restart & Run All produces the same split.
SPLIT_SEED = 42


def split_class_samples(samples, seed=SPLIT_SEED):
    """Split the rows of one class into training, validation and testing frames.

    Args:
        samples: DataFrame holding the rows of a single class.
        seed: Value passed as `random_state` to both `sample` calls.

    Returns:
        Tuple `(training, validation, testing)` with 70%, 10% and 20% of
        `samples`. Rows are removed by index label, so the three parts are
        disjoint as long as the index labels are unique.
    """
    testing = samples.sample(frac=0.2, random_state=seed)
    remaining = samples.drop(testing.index)
    # 0.125 of the remaining 80% equals 10% of the whole class.
    validation = remaining.sample(frac=0.125, random_state=seed)
    training = remaining.drop(validation.index)
    return training, validation, testing


# split data to positive and negative
p_samples = dataset[dataset['sentiment'] == 'positive']
n_samples = dataset[dataset['sentiment'] == 'negative']

# split each class to (70%, 10%, 20%) for (training, validation, testing) respectively
p_training, p_validation, p_testing = split_class_samples(p_samples)
n_training, n_validation, n_testing = split_class_samples(n_samples)

# concatenating the 70% of p-class and n-class to form the training set
training_set = pd.concat([p_training, n_training], axis=0, ignore_index=True)

# concatenating the 10% of p-class and n-class to form the validation set
validation_set = pd.concat([p_validation, n_validation], axis=0, ignore_index=True)

# concatenating the 20% of p-class and n-class to form the testing set
testing_set = pd.concat([p_testing, n_testing], axis=0, ignore_index=True)

# display(training_set)
# display(validation_set)
# display(testing_set)


# training_review = training_set.iloc[:,:-1]
# training_class = training_set.iloc[:,-1]
# display(training_review)
# display(training_class)

# validation_review = validation_set.iloc[:,:-1]
# validation_class = validation_set.iloc[:,-1]
# display(testing_review)
# display(testing_class)

# testing_review = testing_set.iloc[:,:-1]
# testing_class = testing_set.iloc[:,-1]
# display(testing_review)
# display(testing_class)

# Classification using BERT

### Bert Tokenization

In [None]:
import torch
import numpy as np
from transformers import BertTokenizer

# Tokenizer loaded from the 'bert-base-cased' pretrained checkpoint.
# NOTE(review): the reviews are lowercased during pre-processing in this
# notebook, while this checkpoint is the cased one - confirm this is intended.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# Integer class id for each sentiment string.
labels = dict(negative=0, positive=1)

class Dataset(torch.utils.data.Dataset):
    """Tokenized texts paired with integer class ids.

    Uses the module-level `tokenizer` to encode each text and the module-level
    `labels` mapping to turn each label string into an integer.
    """

    def __init__(self, df, text_column=None, label_column=None):
        """Tokenize every text in `df` and encode every label.

        Args:
            df: Frame (or mapping of columns) with one text column and one
                label column.
            text_column: Name of the text column. When None, 'text' is used
                if present, otherwise 'review' (the name used by the frames
                built earlier in this notebook).
            label_column: Name of the label column. When None, 'category' is
                used if present, otherwise 'sentiment'.

        Raises:
            KeyError: If no candidate column exists in `df`, or a label value
                is not a key of `labels`.
        """
        text_column = self._resolve_column(df, text_column, ('text', 'review'))
        label_column = self._resolve_column(df, label_column, ('category', 'sentiment'))

        self.labels = [labels[label] for label in df[label_column]]
        # All texts are tokenized eagerly here; each one is padded or truncated
        # to exactly 512 tokens and returned as PyTorch tensors.
        self.texts = [tokenizer(text,
                                padding='max_length', max_length=512, truncation=True,
                                return_tensors="pt") for text in df[text_column]]

    @staticmethod
    def _resolve_column(df, requested, candidates):
        """Return `requested` if given, else the first of `candidates` found in `df`."""
        if requested is not None:
            return requested
        for name in candidates:
            if name in df:
                return name
        raise KeyError(f"none of the columns {candidates!r} found in the frame")

    def classes(self):
        """Return the list of integer class ids."""
        return self.labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the label(s) selected by `idx` as a NumPy array."""
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        """Return the tokenizer output(s) selected by `idx`."""
        return self.texts[idx]

    def __getitem__(self, idx):
        """Return the `(tokenized_text, label)` pair for `idx`."""
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y



### Step 2