# Imports

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize,sent_tokenize
from transformers import BertTokenizer
# import torch
# from torch import nn
# from transformers import BertModel
# from torch.optim import Adam
# from tqdm import tqdm

# Data Reading

In [2]:
# Load the IMDB reviews CSV; a comma is already read_csv's default separator.
dataset = pd.read_csv("IMDB Dataset.csv")
# dataset = pd.read_csv("fruits.csv", sep=',')
display(dataset)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


# Data Split

In [3]:
# Split the reviews by class so that every subset keeps the same
# positive/negative balance as the full dataset.
p_samples = dataset[dataset['sentiment'] == 'positive']
n_samples = dataset[dataset['sentiment'] == 'negative']
# display(p_samples)
# display(n_samples)

# Fixed seed for DataFrame.sample: without it every kernel restart draws a
# different split, so results cannot be reproduced or compared between runs.
SPLIT_SEED = 42


def _split_class(samples, seed=SPLIT_SEED):
    """Split one class into (training, validation, testing) = (70%, 10%, 20%).

    Args:
        samples: DataFrame holding the rows of a single class.
        seed: Value passed as ``random_state`` to ``DataFrame.sample``.

    Returns:
        Tuple ``(training, validation, testing)`` of disjoint DataFrames that
        together contain every row of ``samples`` (assuming a unique index).
    """
    testing = samples.sample(frac=0.2, random_state=seed)
    remaining = samples.drop(testing.index)
    # 0.125 of the remaining 80% equals 10% of the whole class.
    validation = remaining.sample(frac=0.125, random_state=seed)
    training = remaining.drop(validation.index)
    return training, validation, testing


# split the positive class samples to (70%, 10%, 20%) for (training, validation, testing) respectively
p_training, p_validation, p_testing = _split_class(p_samples)

# split the negative class samples to (70%, 10%, 20%) for (training, validation, testing) respectively
n_training, n_validation, n_testing = _split_class(n_samples)

# concatenating the 70% of p-class and n-class to form the training set
training_set = pd.concat([p_training, n_training], axis=0, ignore_index=True)

# concatenating the 10% of p-class and n-class to form the validation set
validation_set = pd.concat([p_validation, n_validation], axis=0, ignore_index=True)

# concatenating the 20% of p-class and n-class to form the testing set
testing_set = pd.concat([p_testing, n_testing], axis=0, ignore_index=True)

# Sanity check: the three subsets must account for every labelled row exactly once.
assert (
    len(training_set) + len(validation_set) + len(testing_set)
    == len(p_samples) + len(n_samples)
), "train/validation/test sizes do not add up to the labelled dataset"

display(training_set)
# display(validation_set)
# display(testing_set)


# training_review = training_set.iloc[:,:-1]
# training_class = training_set.iloc[:,-1]
# display(training_review)
# display(training_class)

# validation_review = validation_set.iloc[:,:-1]
# validation_class = validation_set.iloc[:,-1]
# display(validation_review)
# display(validation_class)

# testing_review = testing_set.iloc[:,:-1]
# testing_class = testing_set.iloc[:,-1]
# display(testing_review)
# display(testing_class)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,I thought this was a wonderful way to spend ti...,positive
2,"Petter Mattei's ""Love in the Time of Money"" is...",positive
3,"Probably my all-time favorite movie, a story o...",positive
4,I sure would like to see a resurrection of a u...,positive
...,...,...
34995,A remake of Alejandro Amenabar's Abre los Ojos...,negative
34996,"Lame, lame, lame!!! A 90-minute cringe-fest th...",negative
34997,"Les Visiteurs, the first movie about the medie...",negative
34998,I'm going to have to disagree with the previou...,negative


# Text Pre-processing

### Remove punctuation

In [7]:
import string
string.punctuation

def remove_punctuation(text):
    """Return ``text`` with every item found in ``string.punctuation`` dropped.

    The remaining items are concatenated, in their original order, into a
    single string.
    """
    kept = []
    for ch in text:
        if ch in string.punctuation:
            continue
        kept.append(ch)
    return "".join(kept)

# Store the punctuation-free text of each split in a new 'free_punc_review' column.
for split_frame in (training_set, validation_set, testing_set):
    split_frame['free_punc_review'] = split_frame['review'].apply(remove_punctuation)

### Lowercase all characters

In [8]:
# Lowercase the punctuation-free text of each split into a 'review_lower' column.
for split_frame in (training_set, validation_set, testing_set):
    split_frame['review_lower'] = split_frame['free_punc_review'].apply(lambda review: review.lower())

### Tokenization

In [None]:
import re
def tokenization(text):
    """Tokenize ``text`` with NLTK's ``word_tokenize`` and return the result."""
    return word_tokenize(text)
# Tokenize the lowercased text of each split; the column key is kept exactly
# as spelled because the stop-word step reads it under this name.
for split_frame in (training_set, validation_set, testing_set):
    split_frame['review_tokenied'] = split_frame['review_lower'].apply(tokenization)

### Remove stop words

In [None]:
from nltk.corpus import stopwords
# English stop words from NLTK, kept as the original list for any code that
# indexes or slices it.
stop_words = nltk.corpus.stopwords.words('english')

# Hashed copy used for membership tests: checking a token against the list is
# a linear scan repeated for every token of every review, while a frozenset
# lookup is constant time.
_STOP_WORD_SET = frozenset(stop_words)


def remove_stopwords(text):
    """Return the tokens of ``text`` that are not English stop words.

    Args:
        text: Iterable of hashable tokens (the tokenized review).

    Returns:
        A new list with the tokens that are absent from ``stop_words``,
        in their original order.
    """
    return [token for token in text if token not in _STOP_WORD_SET]

# Drop stop words from the token list of each split into a 'no_stopwords' column.
for split_frame in (training_set, validation_set, testing_set):
    split_frame['no_stopwords'] = split_frame['review_tokenied'].apply(remove_stopwords)

### Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer
# Lemmatizer instance, created once here and reused by `lemmatizer` for every token.
wordnet_lemmatizer = WordNetLemmatizer()

def lemmatizer(text):
    """Lemmatize every token in ``text`` and return the results as a list.

    A value whose type is exactly ``float`` yields an empty list instead of
    being iterated (presumably guarding against NaN cells; confirm upstream).
    """
    if type(text) is not float:
        return [wordnet_lemmatizer.lemmatize(token) for token in text]
    return []
# Lemmatize all three splits. Every earlier preprocessing step is applied to
# training, validation and testing alike, so the held-out sets need the
# 'review_lemmatized' column too.
for split_frame in (training_set, validation_set, testing_set):
    split_frame['review_lemmatized'] = split_frame['no_stopwords'].apply(lemmatizer)

display(training_set)
display(validation_set)
display(testing_set)

# Classification using BERT