# Introduction

Author: Luis Sejas 

Student ID: 8440116

# Before the Model

## Part 1: Loading and Seeing the Data

In [1]:
%%capture
!pip install tensorflow-datasets > /dev/null
!pip install fasttext

In [2]:
import tensorflow_datasets as tfds
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
import fasttext
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
stopwords = set(nltk.corpus.stopwords.words('english'))
from nltk.stem import WordNetLemmatizer
stemmer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
(ds_train,ds_test),ds_info = tfds.load(
    name="imdb_reviews",
    split=["train","test"],
    shuffle_files=True,
    as_supervised=True,
    with_info=True
)

In [4]:
df_train = tfds.as_dataframe(ds_train, ds_info)
df_test = tfds.as_dataframe(ds_test, ds_info)

## Part 2: Pre-processing the data

I have noticed that the reviews start with b' or with b" and ' or " at the end, among other stuff.

The aim here is to clean the data to train an algorithm that will automatically detect the sentiment correctly Ideally, even ambiguous text

Below is a series of formulas to clean the reviews.

Keep in mind this is only the beginning, therefore some deep cleaning will not be employed at this stage and yes on the other ones.

This pre-processing will be preserved for comparison purposes.

In [5]:
def clean_entry(text_list):
  str_list = []
  for text in text_list:
    str_text = str(text)
    str_text_lim = len(str_text)-1
    str_text = str_text[1:str_text_lim]
    str_list.append(str_text)
  return html_term_remover(str_list)

def prepare_for_ai(df_col):
  list_to_return = df_col.tolist()
  return clean_entry(list_to_return)

def html_term_remover(df_list: list):
  return_list = []
  for i in df_list:
    b_soup = BeautifulSoup(i, 'html.parser')
    return_list.append(b_soup.get_text())
  return apply_re(return_list)

def apply_re(str_list):
  re_list = []
  for text in str_list:
    text = re.sub("[^0-9A-Za-z ]", "", text)
    re_list.append(text)
  return remove_integer(re_list)

def remove_integer(str_list):
  re_list = []
  int_list = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
  for text in str_list:
    sentence_list = []
    new_text = text.split()
    for word in new_text:
      if word not in int_list:
        sentence_list.append(word)
    re_list.append(' '.join(sentence_list))
  return return_lower_text(re_list)

def return_lower_text(str_list):
  re_list = []
  for word in str_list:
    re_list.append(word.lower())
  return re_list



In [6]:
# The following variables are base and every model will have its own adaptations

x_train = prepare_for_ai(df_train['text'])
x_test = prepare_for_ai(df_test['text'])
y_train = df_train['label']
y_test = df_test['label']

In [7]:
y_train_fasttext = y_train.values.tolist()
y_test_fasttext = y_test.values.tolist()

In [8]:
y_train_converted = []
y_test_converted = []

for label in y_train_fasttext:
  if label == 1:
    fasttext_label = "__label__positive"
    y_train_converted.append(fasttext_label)
  if label == 0:
    fasttext_label = "__label__negative"
    y_train_converted.append(fasttext_label)

for label in y_test_fasttext:
  if label == 1:
    fasttext_label = "__label__positive"
    y_test_converted.append(fasttext_label)
  if label == 0:
    fasttext_label = "__label__negative"
    y_test_converted.append(fasttext_label)

In [9]:
def apply_nltk(text):
  return_list = []
  for elem in text:
    tokens = word_tokenize(elem)
    working_list = []
    for word in tokens:
      if len(word) > 3 and word not in stopwords:
        working_list.append(stemmer.lemmatize(word))
    return_list.append(' '.join(working_list))
  return return_list


In [10]:
x_train_fasttext = apply_nltk(x_train)
x_test_fasttext = apply_nltk(x_test)

In [11]:
x_train_joined = []
for index in range(0, len(x_train_fasttext)):
  new_text = y_train_converted[index] + " " + x_train_fasttext[index]
  x_train_joined.append(new_text)

In [12]:
x_test_joined = []
for index in range(0, len(x_test_fasttext)):
  new_text = y_test_converted[index] + " " + x_test_fasttext[index]
  x_test_joined.append(new_text)

In [13]:
np.savetxt("x_test_ft.txt", x_test_joined, delimiter="\n", fmt="%s")
np.savetxt("x_train_ft.txt", x_train_joined, delimiter="\n", fmt="%s")

In [14]:
model = fasttext.train_supervised(input="x_train_ft.txt", lr=0.1, epoch=5)

In [15]:
model.test("x_test_ft.txt")

(25000, 0.87096, 0.87096)