<a href="https://colab.research.google.com/github/luca2618/COMP550_project/blob/main/data/preprocess_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
### importing libraries

# basic libraries
import pandas as pd
import math
import regex as re
import ast

# nltk
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


# Importing dataset

In [2]:
# cloning github repo (the zipped dataset is read from COMP550_project/data/ in the next cell)
!git clone https://github.com/luca2618/COMP550_project

Cloning into 'COMP550_project'...
remote: Enumerating objects: 79, done.[K
remote: Counting objects: 100% (20/20), done.[K
remote: Compressing objects: 100% (14/14), done.[K
remote: Total 79 (delta 8), reused 18 (delta 6), pack-reused 59[K
Receiving objects: 100% (79/79), 376.51 MiB | 33.86 MiB/s, done.
Resolving deltas: 100% (31/31), done.


In [3]:
# unzipping dataset and saving it in a dataframe
!unzip COMP550_project/data/reviews_dataset.csv.zip

Archive:  COMP550_project/data/reviews_dataset.csv.zip
  inflating: reviews_dataset.csv     


In [4]:
# loading the unzipped reviews csv into a dataframe
reviews_df = pd.read_csv('reviews_dataset.csv')

# Pre-processing

In [5]:
### converts column of lists as string to list
def convert_string_list_col(df):
  """Parse the stringified lists in 'PREPROCESSED_REVIEW' back into Python lists.

  Each string cell is parsed with ast.literal_eval. Cells that are not strings
  (e.g. lists that were already parsed) are left untouched, so the function can
  be applied twice without raising.

  Args:
    df: dataframe with a 'PREPROCESSED_REVIEW' column.

  Returns:
    The same dataframe object; the column is replaced in place.

  Raises:
    KeyError: if the 'PREPROCESSED_REVIEW' column is missing.
    ValueError, SyntaxError: if a string cell is not a valid Python literal.
  """
  # whole-column map instead of iterrows + df.at: faster, and it does not
  # depend on the index labels being unique
  df['PREPROCESSED_REVIEW'] = df['PREPROCESSED_REVIEW'].map(
      lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
  )

  return df

In [6]:
### returns text as tokens
def preprocess_text(text):
  """Strip digits/punctuation from a review, lowercase it and tokenize it.

  Args:
    text: raw review text. None or a float NaN is treated as an empty review.

  Returns:
    List of lowercase tokens produced by word_tokenize; [] for a missing review.
  """
  # a missing review (None / NaN) has no tokens; without this guard re.sub raises TypeError
  if text is None or (isinstance(text, float) and math.isnan(text)):
    return []

  # removes punctuation and numbers
  # (raw string so that \[ \] \( \) reach the regex engine as written instead of
  # being invalid Python string escapes)
  text = re.sub(r'[0-9_;:!?.,*@#\[\]\(\)<>"\']', '', text)
  lowercase_text = text.lower()

  # tokenizes the text
  tokens = word_tokenize(lowercase_text)

  ### for now, dont remove stopwords as we need "she", "her", etc.
  # removes stopwords
  # tokens = [word for word in tokens if word not in stopwords.words('english')]

  ### for now, dont lemmatize words as our axes seem to focus on adjectives
  # lemmatizes words
  # tokens = lemmatize_text_tokens(tokens)

  return tokens

In [7]:
def lemmatize_text_tokens(text_tokens):
  """Lemmatize a list of tokens, using each token's POS tag to guide the lemmatizer.

  Args:
    text_tokens: list of word tokens.

  Returns:
    List with one lemmatized token per input token, in the same order.
  """
  lemmatizer = WordNetLemmatizer()

  # tag the tokens, then translate every NLTK tag into the code WordNetLemmatizer accepts
  tagged_tokens = [
      (token, lemmatizer_pos(nltk_tag))
      for token, nltk_tag in nltk.pos_tag(text_tokens)
  ]

  lemmas = []
  for token, wordnet_tag in tagged_tokens:
    lemmas.append(lemmatizer.lemmatize(token, wordnet_tag))

  return lemmas

In [8]:
def lemmatizer_pos(pos):
  """Translate a POS tag into the one-letter code passed to WordNetLemmatizer.

  Only the first character of the tag is inspected:
  'V' -> 'v' (verb), 'J' -> 'a' (adjective), 'R' -> 'r' (adverb);
  every other tag falls back to 'n' (noun).

  Args:
    pos: non-empty POS tag string, as returned by nltk.pos_tag.

  Returns:
    One of 'v', 'a', 'r' or 'n'.

  Raises:
    IndexError: if pos is empty.
  """
  codes_by_first_letter = {
      'V': 'v',  # verbs
      'J': 'a',  # adjectives
      'R': 'r',  # adverbs
  }

  # anything that is not a verb, adjective or adverb is treated as a noun
  return codes_by_first_letter.get(pos[0], 'n')

In [9]:
### adds a new column for the preprocessed review and populates it
def preprocess_reviews_text(df, verbose = True):
  """Tokenize every review and store the result in 'PREPROCESSED_REVIEW'.

  Args:
    df: dataframe with a 'REVIEW' column holding the raw review text.
    verbose: when True, prints the number of reviews processed so far every
      10000 reviews (starting with 0).

  Returns:
    The same dataframe object, with 'PREPROCESSED_REVIEW' created or
    overwritten. The input dataframe is modified in place.

  Raises:
    KeyError: if the 'REVIEW' column is missing.
  """
  preprocessed_reviews = []

  for count, review_text in enumerate(df['REVIEW']):

    # for tracking progress
    if verbose and count % 10000 == 0:
      print(count)

    preprocessed_reviews.append(preprocess_text(review_text))

  # assign the whole column at once instead of writing cell by cell with df.at,
  # which requires unique index labels; dtype=object keeps each token list as
  # one cell, and reusing df.index keeps the rows aligned positionally
  df['PREPROCESSED_REVIEW'] = pd.Series(preprocessed_reviews, index = df.index, dtype = object)

  return df

# Preprocessing the review text

In [10]:
### preprocesses reviews
# NOTE: preprocess_reviews_text adds the column to the dataframe it is given and returns
# that same object, so cleaned_df and reviews_df refer to the same dataframe here
cleaned_df = preprocess_reviews_text(reviews_df)

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000


In [34]:
### removing raw text from dataframe and serializing it to a json string
# errors='ignore' keeps this cell re-runnable: once 'REVIEW' has been dropped,
# running the cell again would otherwise raise KeyError
cleaned_df = cleaned_df.drop(columns = ['REVIEW'], errors = 'ignore')
# to_json without a path returns the json document as a str; nothing is written to disk here
json_cleaned_reviews_dataset = cleaned_df.to_json(orient = "records")

# Saving files

In [31]:
### downloading cleaned dataset (google colab)

from google.colab import files
import json

# NOTE(review): json_cleaned_reviews_dataset is already a json string (output of
# DataFrame.to_json), so json.dump encodes it a second time and the file holds one
# json string literal; readers have to decode twice, e.g. json.loads(json.load(f)).
# Left unchanged so the file format stays the same for existing consumers - confirm
# whether f.write(json_cleaned_reviews_dataset) was intended.
with open('preprocessed_reviews_dataset.json', 'w') as f:
  json.dump(json_cleaned_reviews_dataset, f)

# start the download only after the with-block has closed (and flushed) the file;
# downloading while it is still open can ship a truncated file
files.download('preprocessed_reviews_dataset.json')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>