<a href="https://colab.research.google.com/github/noahnguyen2004/Scotiabank-Customer-App-Review-Datathon/blob/main/customer_review_LLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import model_selection, preprocessing, linear_model, metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import ensemble
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from xgboost import XGBClassifier

from IPython.display import display   # more user-friendly dataframe display


import nltk                           # tagging (e.g. positive, neutral, negative) classification
nltk.download('stopwords')
from nltk.corpus import stopwords     # stopwords to eliminate words that don't convey important information
from textblob import Word             # textblob for sentiment analysis
nltk.download('wordnet')

from termcolor import colored
from warnings import filterwarnings
filterwarnings('ignore')

from sklearn import set_config
set_config(print_changed_only = False)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


## Loading data

In [62]:
# Raw review export: semicolon-separated, UTF-8 encoded.
DATA_PATH = '/content/drive/MyDrive/Scotiabank customer review datathon/Winter 2024 Scotia DSD Data Set.csv'

# NOTE: on_bad_lines='skip' silently drops rows that cannot be parsed.
df = pd.read_csv(
    DATA_PATH,
    sep=';',
    encoding='utf-8',
    on_bad_lines='skip',
)

In [63]:
# Make an independent copy of the loaded data. From now on we work with df_copy.
# `df[:]` only slices the frame and the result can share its data with `df`,
# so an explicit copy is needed to keep the raw frame untouched by later edits.
df_copy = df.copy()
df_copy

Unnamed: 0,Review_ID,Date,Rating,Review_Language,Version,Review_Likes,Review
0,0,2022-04-20 11:38:29,1,en,20.37.2,0,"Worst bank on the planet. Liars, cheats, and t..."
1,1,2023-03-25 19:10:42,5,en,20.47.0,0,App is great.
2,2,2022-05-31 00:54:40,1,en,20.38.1,0,Barely works. Barely. Stopped giving out notif...
3,3,2021-06-18 13:16:44,5,en,20.27.0,0,Really easy for a technophobe
4,4,2023-11-29 13:54:17,1,en,2310.0.1,0,Sucks
...,...,...,...,...,...,...,...
9171,9171,2021-04-20 10:16:28,5,en,20.25.1,0,Great app. Easy to use!
9172,9172,2023-03-05 10:27:12,5,en,20.47.0,0,Great App Top Notch Top Notch!
9173,9173,2023-03-14 15:28:08,5,en,20.47.0,0,It works like this to should
9174,9174,2022-10-08 15:08:03,1,en,,0,This bank insists on barriers that prevents di...


## Preprocessing data

### Drop the ID column

In [64]:
# Remove the Review_ID column from the working frame.
df_copy = df_copy.drop(columns=['Review_ID'])

In [65]:
# Display the working frame to confirm that Review_ID is gone.
df_copy

Unnamed: 0,Date,Rating,Review_Language,Version,Review_Likes,Review
0,2022-04-20 11:38:29,1,en,20.37.2,0,"Worst bank on the planet. Liars, cheats, and t..."
1,2023-03-25 19:10:42,5,en,20.47.0,0,App is great.
2,2022-05-31 00:54:40,1,en,20.38.1,0,Barely works. Barely. Stopped giving out notif...
3,2021-06-18 13:16:44,5,en,20.27.0,0,Really easy for a technophobe
4,2023-11-29 13:54:17,1,en,2310.0.1,0,Sucks
...,...,...,...,...,...,...
9171,2021-04-20 10:16:28,5,en,20.25.1,0,Great app. Easy to use!
9172,2023-03-05 10:27:12,5,en,20.47.0,0,Great App Top Notch Top Notch!
9173,2023-03-14 15:28:08,5,en,20.47.0,0,It works like this to should
9174,2022-10-08 15:08:03,1,en,,0,This bank insists on barriers that prevents di...


### Check the language of each customer's review

In [66]:
# Count how many reviews there are for each language code.
language_counts = df_copy['Review_Language'].value_counts()
language_counts

en    9176
Name: Review_Language, dtype: int64

All reviews are in English (`en`), so we can proceed to filter English stopwords.

### Convert customer reviews to lowercase

In [67]:
def lowercase(col):
    """Return a new Series in which every string element of ``col`` is lower-cased.

    Elements that are not ``str`` instances (e.g. NaN or numbers) are passed
    through unchanged.
    """
    def _lower_if_text(value):
        if isinstance(value, str):
            return value.lower()
        return value

    return col.apply(_lower_if_text)

In [68]:
# Replace the Review column with its lower-cased version, then show the column.
df_copy['Review'] = lowercase(df_copy['Review'])
df_copy['Review']

0       worst bank on the planet. liars, cheats, and t...
1                                           app is great.
2       barely works. barely. stopped giving out notif...
3                           really easy for a technophobe
4                                                   sucks
                              ...                        
9171                              great app. easy to use!
9172                       great app top notch top notch!
9173                         it works like this to should
9174    this bank insists on barriers that prevents di...
9175                                      very convenient
Name: Review, Length: 9176, dtype: object

In [69]:
# Display the working frame with the lower-cased reviews.
df_copy

Unnamed: 0,Date,Rating,Review_Language,Version,Review_Likes,Review
0,2022-04-20 11:38:29,1,en,20.37.2,0,"worst bank on the planet. liars, cheats, and t..."
1,2023-03-25 19:10:42,5,en,20.47.0,0,app is great.
2,2022-05-31 00:54:40,1,en,20.38.1,0,barely works. barely. stopped giving out notif...
3,2021-06-18 13:16:44,5,en,20.27.0,0,really easy for a technophobe
4,2023-11-29 13:54:17,1,en,2310.0.1,0,sucks
...,...,...,...,...,...,...
9171,2021-04-20 10:16:28,5,en,20.25.1,0,great app. easy to use!
9172,2023-03-05 10:27:12,5,en,20.47.0,0,great app top notch top notch!
9173,2023-03-14 15:28:08,5,en,20.47.0,0,it works like this to should
9174,2022-10-08 15:08:03,1,en,,0,this bank insists on barriers that prevents di...


### Print the list of common English stopwords provided by NLTK

In [70]:
# Make sure the stopwords corpus is available (NLTK reports "already up-to-date"
# when it has been downloaded before), then print NLTK's English stopword list.
nltk.download('stopwords')
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Tokenizing customer reviews

In [71]:
def review_split(col):
  '''
  Split each review into a list of whitespace-separated words.

  Parameters
  ----------
  col : pandas.Series
    Review texts, one string per row.

  Returns
  -------
  pandas.Series
    New Series with the same index as ``col``; every string is replaced by
    the result of ``str.split()``. Non-string elements (e.g. NaN) are
    returned unchanged.

  The input Series is not modified. Splitting is done element-wise with
  ``apply`` instead of assigning ``col[i]`` in a positional loop, so it also
  works when the index is not 0..n-1.
  '''
  return col.apply(lambda text: text.split() if isinstance(text, str) else text)

In [72]:
# Replace each review string with its list of whitespace-separated words.
df_copy['Review'] = review_split(df_copy['Review'])

In [73]:
# Display the working frame with the tokenized reviews.
df_copy

Unnamed: 0,Date,Rating,Review_Language,Version,Review_Likes,Review
0,2022-04-20 11:38:29,1,en,20.37.2,0,"[worst, bank, on, the, planet., liars,, cheats..."
1,2023-03-25 19:10:42,5,en,20.47.0,0,"[app, is, great.]"
2,2022-05-31 00:54:40,1,en,20.38.1,0,"[barely, works., barely., stopped, giving, out..."
3,2021-06-18 13:16:44,5,en,20.27.0,0,"[really, easy, for, a, technophobe]"
4,2023-11-29 13:54:17,1,en,2310.0.1,0,[sucks]
...,...,...,...,...,...,...
9171,2021-04-20 10:16:28,5,en,20.25.1,0,"[great, app., easy, to, use!]"
9172,2023-03-05 10:27:12,5,en,20.47.0,0,"[great, app, top, notch, top, notch!]"
9173,2023-03-14 15:28:08,5,en,20.47.0,0,"[it, works, like, this, to, should]"
9174,2022-10-08 15:08:03,1,en,,0,"[this, bank, insists, on, barriers, that, prev..."


### Filling missing data

In [76]:
def nan_value_count(df):
  '''
    Count the missing entries in every column of ``df``.

    Returns a DataFrame indexed by the column names of ``df`` with a single
    column, 'Number of missing values'.
  '''
  per_column = {name: [df[name].isna().sum()] for name in df.columns}
  # One row (label 0) holding the counts, transposed so each input column becomes a row.
  summary = pd.DataFrame(per_column, index=[0]).T
  return summary.rename(columns={0: 'Number of missing values'})

In [83]:
# Tabulate the number of missing values in each column of df_copy.
nan_value_count(df_copy)

Unnamed: 0,Number of missing values
Date,0
Rating,0
Review_Language,0
Version,693
Review_Likes,0
Review,0


The Version column has 693 missing values; every other column is complete. We fill those missing versions with 0.

In [89]:
# Fill missing app versions with 0. The result is assigned back to the column
# instead of calling fillna(inplace=True) on the `df_copy['Version']` selection:
# that chained in-place form acts on an intermediate object and, under pandas
# Copy-on-Write, leaves df_copy unchanged.
df_copy['Version'] = df_copy['Version'].fillna(0)

### Dealing with dates

The exact time, day, and month are unlikely to carry useful information for this problem, so we keep only the year.

In [91]:
# Keep only the year: the first four characters of each timestamp string
# (e.g. '2022-04-20 11:38:29' -> '2022'). The vectorized .str accessor replaces
# the row-by-row `df_copy['Date'][i] = ...` loop, whose chained assignment is
# slow, assumes a 0..n-1 index and does not update df_copy under pandas
# Copy-on-Write.
df_copy['Date'] = df_copy['Date'].str[:4]

In [92]:
# Display the working frame with Date reduced to the year.
df_copy

Unnamed: 0,Date,Rating,Review_Language,Version,Review_Likes,Review
0,2022,1,en,20.37.2,0,"[worst, bank, on, the, planet., liars,, cheats..."
1,2023,5,en,20.47.0,0,"[app, is, great.]"
2,2022,1,en,20.38.1,0,"[barely, works., barely., stopped, giving, out..."
3,2021,5,en,20.27.0,0,"[really, easy, for, a, technophobe]"
4,2023,1,en,2310.0.1,0,[sucks]
...,...,...,...,...,...,...
9171,2021,5,en,20.25.1,0,"[great, app., easy, to, use!]"
9172,2023,5,en,20.47.0,0,"[great, app, top, notch, top, notch!]"
9173,2023,5,en,20.47.0,0,"[it, works, like, this, to, should]"
9174,2022,1,en,0,0,"[this, bank, insists, on, barriers, that, prev..."
