# Preparing data

## 1. Setup dataset and libraries

### import libraries

In [21]:
import pandas as pd
import re
import ast
import string
import numpy as np
from bs4 import BeautifulSoup

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

folder_path = "..../dataset/"

## 2. Read dataset

In [2]:
df_ori = pd.read_csv( folder_path+"IMDB Dataset of 50K Movie Reviews/IMDB Dataset.csv" )
df_ori.sample( 5 )

Unnamed: 0,review,sentiment
21115,"<br /><br />Artisticly shot, actors portray ex...",positive
7079,I've seen the 1973 movie Lost Horizons and rea...,positive
11220,Second movie in the boxset. Originally titled ...,negative
49501,"Nick Cage is Randall Raines, a retired car thi...",positive
38153,This film rocks...so hard...<br /><br />The ca...,positive


In [3]:
df_ori.dtypes

review       object
sentiment    object
dtype: object

In [4]:
df_ori.sentiment.value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [5]:
print( "total data reviews:",len( df_ori ) )

total data reviews: 50000


## 3. Data Sampling

### function for sampling data

In [6]:
# to do sampling data for the selected dataset

def stratified_sampling( dtframe, column_category, number_data_each_class ):
  # get selected column values
  classes = dtframe[ column_category ].unique()

  # vessel for collected data
  collected_data = []

  # for each class get their sample data randomly
  for cls in classes:
    # sampling the data from a class
    chosen_data = dtframe[ dtframe[ column_category ] == cls ].sample( n=number_data_each_class, random_state=42 )

    # save selected data into the temporary vessel
    collected_data.append( chosen_data )
  # end loop

  # concate all the collected data
  merged_data = pd.concat( collected_data, ignore_index=True )

  # randomize the order
  merged_data = merged_data.sample(frac=1).reset_index( drop=True )

  # return the final data
  return merged_data
# end func

### do data sampling with previous function

In [7]:
# do data sampling
sampled_data = stratified_sampling( df_ori, 'sentiment', 5000 )

# check the collected sampled data
sampled_data.head( 5 )

Unnamed: 0,review,sentiment
0,"The trouble with the book, ""Memoirs of a Geish...",negative
1,Take someone you love or want to love and go s...,positive
2,"""Cut"" is a film about some film students makin...",negative
3,History and experience over the past couple of...,negative
4,"""Death Bed:The Bed That Eats"" is a supremely b...",positive


### Saving current sampled data

In [8]:
# # save seledcted data into the csv file
# sampled_data.to_csv( folder_path+"sampled_movie_dataset.csv", index=False )

## 4. Preprocessing Step

### read sampled dataset

In [9]:
# read the saved sampled data from previous data sampling step
df_sampled = pd.read_csv( folder_path+"sampled_movie_dataset.csv" )

# check the number of selected data
print( "number of data:",len( df_sampled ) )

# create new column to store preprocessed text reviews
df_sampled['preprocessed_review'] = None

# check current dataframe conditions
df_sampled.head( 5 )

number of data: 10000


Unnamed: 0,review,sentiment,preprocessed_review
0,"I know little or nothing about astronomy, but ...",positive,
1,"(As a note, I'd like to say that I saw this mo...",negative,
2,The year 2000 had been a bad year for indian f...,positive,
3,If you pack all the clichés about city firefig...,negative,
4,How many of us have read a book or seen a play...,positive,


### setups objects, list, and functions to be used on preprocessing data

In [10]:
# create regex pattern for numerical and punctuation removal
word_pattern = re.compile(r"[^A-Za-z]+")

# download tokenizer and stopwords
nltk.download( 'punkt' )
nltk.download( 'stopwords' )

# get list of stopwords and punctuation chars
list_stopwords = set( stopwords.words( 'english' ) )

# create a Porter Stemmer instance
porter_stemmer = PorterStemmer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\medin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\medin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### additional functions to help preprocessing

In [11]:
# function to remove any html tags
def remove_html_tags( review_text ):
  # only remove the tags when a review text contains html tags
  if '<' in review_text and '>' in review_text:
    # replace a <br> tags with a space
    text_without_br = review_text.replace( "<br />", " " )

    # remove other html tags
    return BeautifulSoup( text_without_br, "html.parser" ).get_text()
  else:
    return review_text
  # end if
# end function

# merge all pre processing functions into a function
def preprocessing_text( input_text ):
  # 1. remove the html tags from input text
  removal_html_result = remove_html_tags( input_text )

  # 2. Sentence segmentation, breaking every document into sentences
  segmentation_result = sent_tokenize( removal_html_result )

  # 3. Tokenisasi; case folding; removal of punctuation and numerical character; stopword removal; word stemming
  # loop all sentences for pre processing on each words
  final_result = []
  unique_words = []
  for sentence in segmentation_result:
    # default tokenized sentence
    tokenized_words = []

    # 3. Tokenizing
    # loop all tokenized sentence
    for token in word_tokenize( sentence ):
      # 4. Case folding
      word_1 = token.lower()

      # 5. Punctuation and numerical character removal
      # ref: https://stackoverflow.com/questions/5843518/remove-all-special-characters-punctuation-and-spaces-from-string
      word_2 = word_pattern.sub( '', word_1 )

      # 6. stopwords removal, also remove any empty word
      # ref: https://www.geeksforgeeks.org/removing-stop-words-nltk-python/
      if not word_2 in list_stopwords and len( word_2 ) > 0:

        # 7. word stemming (Porter’s Stemming)
        # (?) maybe also try lemmatization
        # ref: https://www.geeksforgeeks.org/introduction-to-stemming/
        word_3 = porter_stemmer.stem( word_2 )

        # store the pre processed word
        tokenized_words.append( word_3 )
      # end if
    # end loop tokenized sentence

    # store all tokenized sentences into the final word list
    final_result.append( tokenized_words )
  # end loop sentence list

  # return final result of preprocessed input text
  return final_result
# end if

### do preprocessing and convert sentiment value from string into number

In [12]:
# preprocess all data
df_sampled['preprocessed_review'] = df_sampled['review'].apply( lambda review_text: preprocessing_text( review_text ) )

# convert sentiment from string into number
df_sampled['sentiment_number'] = df_sampled['sentiment'].map( { 'negative': 0, 'positive': 1 } )

# check data
df_sampled.head( 5 )

  return BeautifulSoup( text_without_br, "html.parser" ).get_text()


Unnamed: 0,review,sentiment,preprocessed_review,sentiment_number
0,"I know little or nothing about astronomy, but ...",positive,"[[know, littl, noth, astronomi, nevertheless, ...",1
1,"(As a note, I'd like to say that I saw this mo...",negative,"[[note, like, say, saw, movi, annual, church, ...",0
2,The year 2000 had been a bad year for indian f...,positive,"[[year, bad, year, indian, film, due, lack, qu...",1
3,If you pack all the clichés about city firefig...,negative,"[[pack, clich, citi, firefight, minut, ladder]...",0
4,How many of us have read a book or seen a play...,positive,"[[mani, us, read, book, seen, play, movi, vers...",1


## 5. Feature Extraction Step

ref:
- https://www.analyticsvidhya.com/blog/2021/09/what-are-n-grams-and-how-to-implement-them-in-python/
- https://www.geeksforgeeks.org/using-countvectorizer-to-extracting-features-from-text/
- ChatGPT

## convert tokens

### functions to convert tokens into form tat able to be processed in CountVector function

In [13]:
# to merge all tokens from a review into one long textual value with divider of each sentence is `.`
def merge_tokens_into_sentences( input_sentences, separate_sentence=False ):
  # default vessel for joined tokens
  joined_tokens_list = []

  # loop all sentence
  for tokens in input_sentences:
    if separate_sentence:
      # merge tokens into one sentence; and add it into the merged token list
      joined_tokens_list.append( " ".join( tokens ) + "." )
    else:
      # merge tokens into one sentence; and add it into the merged token list
      joined_tokens_list.append( " ".join( tokens ) )
    # end if
  # end loop

  # return final merged tokens
  return " ".join( joined_tokens_list )
# end func

### apply the previous function into preprocessed data review and store it into different column

In [14]:
# merge reviews in two different ways

# a) create tokens from each sentence from one review
df_sampled['merged_tokens_many'] = df_sampled['preprocessed_review'].apply( lambda review_text: merge_tokens_into_sentences( review_text, separate_sentence=True ) )

# b) create tokens from one review (not separated based on their sentence)
df_sampled['merged_tokens_one']  = df_sampled['preprocessed_review'].apply( lambda review_text: merge_tokens_into_sentences( review_text, separate_sentence=False ) )

# c) re-arrange columns order
df_sampled = df_sampled[ [ 'review', 'merged_tokens_one', 'merged_tokens_many', 'sentiment', 'sentiment_number' ] ]

# save selected data into the csv file
df_sampled.to_csv( folder_path+"processed_movie_dataset.csv", index=False )

# check result
df_sampled.head( 5 )

Unnamed: 0,review,merged_tokens_one,merged_tokens_many,sentiment,sentiment_number
0,"I know little or nothing about astronomy, but ...",know littl noth astronomi nevertheless first l...,know littl noth astronomi nevertheless first l...,positive,1
1,"(As a note, I'd like to say that I saw this mo...",note like say saw movi annual church camp enti...,note like say saw movi annual church camp enti...,negative,0
2,The year 2000 had been a bad year for indian f...,year bad year indian film due lack qualiti ima...,year bad year indian film due lack qualiti ima...,positive,1
3,If you pack all the clichés about city firefig...,pack clich citi firefight minut ladder stori h...,pack clich citi firefight minut ladder. stori ...,negative,0
4,How many of us have read a book or seen a play...,mani us read book seen play movi version came ...,mani us read book seen play movi version came ...,positive,1


### split data into train and test set

In [15]:
# set into y and X variables
y_data = df_sampled['sentiment_number'].copy()
X_data = df_sampled.copy().drop( 'sentiment_number', axis=1 )

# split into train and test set
X_train, X_test, y_train, y_test = train_test_split( X_data, y_data, test_size=0.2, random_state=42, stratify=y_data )

# merge the X and y variables back into separate DataFrames
df_train = pd.concat( [ X_train, y_train ], axis=1 )
df_test  = pd.concat( [ X_test, y_test ], axis=1 )

# save each train and test set into separate csv file
df_train.to_csv( folder_path+"processed_movie_dataset_train.csv", index=False )
df_test.to_csv( folder_path+"processed_movie_dataset_test.csv", index=False )

# check the class distribution to verify stratification
print( "Class distribution in the original sampled dataset:" )
print( df_sampled['sentiment_number'].value_counts( normalize=True ) )
print( "" )
print( "Class distribution in the training set:" )
print( df_train['sentiment_number'].value_counts( normalize=True ) )
print( "" )
print( "Class distribution in the test set:" )
print( df_test['sentiment_number'].value_counts( normalize=True ) )

Class distribution in the original sampled dataset:
sentiment_number
1    0.5
0    0.5
Name: proportion, dtype: float64

Class distribution in the training set:
sentiment_number
0    0.5
1    0.5
Name: proportion, dtype: float64

Class distribution in the test set:
sentiment_number
0    0.5
1    0.5
Name: proportion, dtype: float64


### try to read original preprocessed data

In [16]:
read_ori = pd.read_csv( folder_path+"processed_movie_dataset.csv" )
read_ori.head( 5 )

Unnamed: 0,review,merged_tokens_one,merged_tokens_many,sentiment,sentiment_number
0,"I know little or nothing about astronomy, but ...",know littl noth astronomi nevertheless first l...,know littl noth astronomi nevertheless first l...,positive,1
1,"(As a note, I'd like to say that I saw this mo...",note like say saw movi annual church camp enti...,note like say saw movi annual church camp enti...,negative,0
2,The year 2000 had been a bad year for indian f...,year bad year indian film due lack qualiti ima...,year bad year indian film due lack qualiti ima...,positive,1
3,If you pack all the clichés about city firefig...,pack clich citi firefight minut ladder stori h...,pack clich citi firefight minut ladder. stori ...,negative,0
4,How many of us have read a book or seen a play...,mani us read book seen play movi version came ...,mani us read book seen play movi version came ...,positive,1


### try to read preprocessed data train

In [17]:
read_train = pd.read_csv( folder_path+"processed_movie_dataset_train.csv" )
read_train.head( 5 )

Unnamed: 0,review,merged_tokens_one,merged_tokens_many,sentiment,sentiment_number
0,"There have been many movies featuring Bigfoot,...",mani movi featur bigfoot major good least goof...,mani movi featur bigfoot major good least goof...,negative,0
1,The plot: A crime lord is uniting 3 different ...,plot crime lord unit differ mafia entrepris bu...,plot crime lord unit differ mafia entrepris bu...,positive,1
2,"It's great to hear the 3 or so comments, that ...",great hear comment point footbal wive signifi ...,great hear comment point footbal wive signifi ...,negative,0
3,"A great story, based on a true story about a y...",great stori base true stori young black man di...,great stori base true stori young black man di...,positive,1
4,This film is the proof that a good actor is no...,film proof good actor noth without good direct...,film proof good actor noth without good direct...,negative,0


### try to read preprocessed data test

In [19]:
read_test = pd.read_csv( folder_path+"processed_movie_dataset_test.csv" )
read_test.head( 5 )

Unnamed: 0,review,merged_tokens_one,merged_tokens_many,sentiment,sentiment_number
0,I gave this movie a chance only because it had...,gave movi chanc good review see trailer though...,gave movi chanc good review. see trailer thoug...,negative,0
1,"This is full of major spoilers, so beware.<br ...",full major spoiler bewar prix de beaut alway s...,full major spoiler bewar. prix de beaut alway ...,positive,1
2,Never saw the original movie in the series...I...,never saw origin movi seri hope much better mo...,never saw origin movi seri hope much better mo...,negative,0
3,This is a film that is far more enjoyable than...,film far enjoy rate would suggest mani way lik...,film far enjoy rate would suggest. mani way li...,positive,1
4,This was intolerable. (SPOILER #1) Protagonist...,intoler spoiler protagonist avoid pointless di...,intoler. spoiler protagonist avoid pointless d...,negative,0


## (optional) try to create features

In [22]:
# to create a vectorizer with sklearn countvectorizer function
def create_vectorizer( input_docs, ngram_start=1, ngram_end=1 ):
  # create a corpus from input documents
  corpus = " ".join( input_docs )

  # create a CountVectorizer object with desired n-gram range
  vectorizer = CountVectorizer( ngram_range=( ngram_start, ngram_end ), binary=True )

  # fit the vectorizer to the concatenated corpus
  vectorizer.fit( [ corpus ] )

  # get the shared vocabulary of n-grams
  vocabulary = vectorizer.get_feature_names_out()

  # return vocab list and the vectrizer obj
  return vocabulary, vectorizer
# end func

### create features from sampled data only

In [24]:
# create ngrams for models from original sampled data
unigram_vocab, unigram_vectorizer           = create_vectorizer( read_ori['merged_tokens_one'], 1, 1 )
bigram_vocab, bigram_vectorizer             = create_vectorizer( read_ori['merged_tokens_one'], 2, 2 )
trigram_vocab, trigram_vectorizer           = create_vectorizer( read_ori['merged_tokens_one'], 3, 3 )
unibigram_vocab, unibigram_vectorizer       = create_vectorizer( read_ori['merged_tokens_one'], 1, 2 )
unibitrigram_vocab, unibitrigram_vectorizer = create_vectorizer( read_ori['merged_tokens_one'], 1, 3 )

# check current length of vocabulary on each ngram vectorizer
print( "len of unigram_vocab:", len( unigram_vocab ) )
print( "len of bigram_vocab:", len( bigram_vocab ) )
print( "len of trigram_vocab:", len( trigram_vocab ) )
print( "len of unibigram_vocab:", len( unibigram_vocab ) )
print( "len of unibitrigram_vocab:", len( unibitrigram_vocab ) )

len of unigram_vocab: 43363
len of bigram_vocab: 731612
len of trigram_vocab: 1128636
len of unibigram_vocab: 774975
len of unibitrigram_vocab: 1903611


### create features from train set only

In [23]:
# create ngrams for models from train set
unigram_vocab, unigram_vectorizer           = create_vectorizer( read_train['merged_tokens_one'], 1, 1 )
bigram_vocab, bigram_vectorizer             = create_vectorizer( read_train['merged_tokens_one'], 2, 2 )
trigram_vocab, trigram_vectorizer           = create_vectorizer( read_train['merged_tokens_one'], 3, 3 )
unibigram_vocab, unibigram_vectorizer       = create_vectorizer( read_train['merged_tokens_one'], 1, 2 )
unibitrigram_vocab, unibitrigram_vectorizer = create_vectorizer( read_train['merged_tokens_one'], 1, 3 )

# check current length of vocabulary on each ngram vectorizer
print( "len of unigram_vocab:", len( unigram_vocab ) )
print( "len of bigram_vocab:", len( bigram_vocab ) )
print( "len of trigram_vocab:", len( trigram_vocab ) )
print( "len of unibigram_vocab:", len( unibigram_vocab ) )
print( "len of unibitrigram_vocab:", len( unibitrigram_vocab ) )

len of unigram_vocab: 38837
len of bigram_vocab: 609699
len of trigram_vocab: 911622
len of unibigram_vocab: 648536
len of unibitrigram_vocab: 1560158
