### Data Preprocessing

In [1]:
import pandas as pd
import warnings
import contractions
import json
import datetime
import gensim
import spacy
import joblib

from textblob import TextBlob
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim.utils import simple_preprocess
import gensim.corpora as corpora

# NOTE(review): this silences every warning category for the rest of the
# session, so deprecation and chained-assignment warnings raised by the cells
# below will not be shown.
warnings.filterwarnings(action="ignore")



---

#### I. Import CSV files

In [2]:
def import_csv_as(path):
    """Load an App Store review export and build the combined review text.

    Parameters
    ----------
    path : str or path-like
        CSV file containing at least the columns ``title``, ``review``,
        ``userName``, ``isEdited``, ``app_id``, ``developerResponse`` and
        ``app_name``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with a new ``title_review`` column (title and review
        body joined by a single space) and the seven columns listed above
        removed.
    """
    reviews = pd.read_csv(path)
    # A missing title or body must not turn the whole combined text into NaN,
    # so missing parts are treated as empty strings before concatenation.
    title = reviews['title'].fillna('').astype(str)
    body = reviews['review'].fillna('').astype(str)
    # strip() removes the dangling separator left when one side is empty.
    reviews['title_review'] = (title + ' ' + body).str.strip()
    return reviews.drop(
        columns=['userName', 'title', 'review', 'isEdited', 'app_id',
                 'developerResponse', 'app_name']
    )

In [3]:
def import_csv_ps(path):
    """Load a Play Store review export into the common review schema.

    Parameters
    ----------
    path : str or path-like
        CSV file containing at least the columns ``content``, ``at`` and
        ``score``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns ``date`` (from ``at``), ``rating`` (from
        ``score``) and ``title_review`` (from ``content``, as text), in that
        order.
    """
    raw = pd.read_csv(path)
    # fillna('') before astype(str): otherwise a missing review body becomes
    # the literal text "nan" and later shows up as a token.
    # Building a fresh frame (rather than slicing `raw`) gives callers an
    # independent object they can safely add columns to.
    reviews = pd.DataFrame({
        'date': raw['at'],
        'rating': raw['score'],
        'title_review': raw['content'].fillna('').astype(str),
    })
    return reviews

In [4]:
# Load the App Store review exports for both apps. The paths are relative, so
# they resolve against the kernel's current working directory.
gcash_reviews_as = import_csv_as('../../Data/Raw/App Store/GCash/gcash_0613-1427.csv')
paymaya_reviews_as = import_csv_as('../../Data/Raw/App Store/PayMaya/paymaya_0613-1402.csv')

In [5]:
# Load the Play Store review exports for both apps (relative paths, resolved
# against the kernel's current working directory).
gcash_reviews_ps = import_csv_ps('../../Data/Raw/Play Store/gcash_reviews_playstore.csv')
paymaya_reviews_ps = import_csv_ps('../../Data/Raw/Play Store/paymaya_reviews_playstore.csv')

In [6]:
# Inspect the imported GCash App Store reviews (date, rating, title_review).
gcash_reviews_as

Unnamed: 0,date,rating,title_review
0,2018-12-29 15:24:59,3,It is starting to become pretty annoying Almos...
1,2020-05-16 07:33:32,5,Actually ok. This app gets so many negative re...
2,2020-02-04 00:25:57,3,Cash In Concern A little bit disappointed when...
3,2019-12-10 09:43:52,1,Frustrating and slow and you’re a bunch of thi...
4,2018-06-26 18:43:31,1,Payments don’t post. My payment via gcash did ...
...,...,...,...
4862,2016-01-16 11:52:50,1,FIX THIS!!! This is one of the crucial apps fo...
4863,2015-11-09 08:20:37,1,Can't access because they can't process my tra...
4864,2015-09-19 14:58:01,1,Waste of time downloading Compatibility says c...
4865,2015-09-09 11:45:00,1,Cant log in Useless app.cant access my gcash w...


In [7]:
# Inspect the imported PayMaya App Store reviews (date, rating, title_review).
paymaya_reviews_as

Unnamed: 0,date,rating,title_review
0,2020-03-30 01:10:54,5,VERY BAD SERVICE I have been contacting them a...
1,2020-08-06 03:55:14,1,Very poor customer service There should be a z...
2,2020-05-02 01:58:12,1,App is great but service is not I really loved...
3,2020-04-09 01:32:02,1,Make your app trustworthy really Comeon this i...
4,2020-12-01 07:59:56,1,The new update... Ok so lets be honest I love ...
...,...,...,...
2564,2016-11-15 05:07:55,1,Verification code - time waster Your verificat...
2565,2015-11-29 18:38:50,4,"Nice One Very nice idea, we can buy now online..."
2566,2016-04-09 15:51:13,2,UPDATE PLEASE! Error occured when i'm trying t...
2567,2016-03-16 10:26:02,1,Am I doing something wrong I can't receive the...


In [8]:
# Inspect the imported GCash Play Store reviews (date, rating, title_review).
gcash_reviews_ps

Unnamed: 0,date,rating,title_review
0,2021-06-02 18:32:37,1,Good for mobile online deals.
1,2021-06-02 18:32:14,1,Ilang beses nang naulit na nag load ako nag ba...
2,2021-06-02 18:32:07,4,Its a great experience and convenient
3,2021-06-02 18:31:31,5,Ok na ok sya para sa mga easy transaction lalo...
4,2021-06-02 18:31:25,5,very helpful and contented
...,...,...,...
176511,2019-04-03 09:46:14,5,I love it
176512,2019-04-03 09:38:52,4,I love the save money and invest feature of th...
176513,2019-04-03 09:38:16,1,Amex does not work and support is basically br...
176514,2019-04-03 09:33:42,5,satisfied


In [9]:
# Inspect the imported PayMaya Play Store reviews (date, rating, title_review).
paymaya_reviews_ps

Unnamed: 0,date,rating,title_review
0,2021-06-15T10:22:11.000Z,5,I rated 1 star yesterday and if there's lower ...
1,2021-06-15T10:08:54.000Z,2,I cant do anything with my money! It always sa...
2,2021-06-15T10:08:14.000Z,5,Masyado na kaming maraming papel dito sa bahay...
3,2021-06-15T10:04:31.000Z,4,upgrading account and changing no is disappoin...
4,2021-06-15T09:56:32.000Z,5,I'm super stressed with my past e-wallet app s...
...,...,...,...
104558,2021-06-12T06:18:19.000Z,5,Very good in service easy to send get cash
104559,2021-06-12T06:08:23.000Z,4,Bakit walang alloutsurf sa load.
104560,2021-06-12T04:54:57.000Z,1,Keep contacting the support about the duplicat...
104561,2021-06-12T04:03:56.000Z,5,I like this apps promise


#### II. Preprocessing

Reference: 
1. https://towardsdatascience.com/evaluate-topic-model-in-python-latent-dirichlet-allocation-lda-7d57484bb5d0
---

##### A. Dropping blank rows, changing to lower case, expanding contractions, spell correction, removing punctuation, and tokenization

In [10]:
def preprocess_a(corpus, column):
    """Clean the text in ``corpus[column]`` in place and tokenize it.

    The frame is modified in place: rows whose ``column`` value is missing are
    dropped, ``column`` is overwritten with the cleaned text, and a new column
    named ``f"{column}_tokenized"`` holding one token list per row is added.

    Parameters
    ----------
    corpus : pandas.DataFrame
        Frame containing the text column; mutated in place.
    column : str
        Name of the text column to clean.

    Returns
    -------
    None
    """
    new_column = f"{column}_tokenized"

    # Drop blank rows from the frame itself. Calling dropna(inplace=True) on
    # the selected Series never removes rows from the DataFrame.
    corpus.dropna(subset=[column], inplace=True)

    text = corpus[column].astype(str).str.lower()  # Lowercase text
    text = text.apply(contractions.fix)  # Expand contractions

    # NOTE(review): str(TextBlob(x)) returns the text unchanged; spelling is
    # only corrected if TextBlob(x).correct() is called. The round trip is
    # kept so behaviour does not change; decide explicitly whether spelling
    # correction is wanted before enabling it.
    text = text.apply(lambda x: str(TextBlob(x)))

    # regex=True must be explicit: pandas >= 2.0 treats the pattern as a
    # literal string by default, which would leave all punctuation in place.
    text = text.str.replace(r'[^\w\s]', '', regex=True)  # Remove punctuation

    corpus[column] = text
    corpus[new_column] = text.apply(word_tokenize)  # Tokenization

In [11]:
# Clean and tokenize the App Store reviews. preprocess_a modifies each frame
# in place and returns None, so nothing is assigned here.
preprocess_a(gcash_reviews_as, 'title_review')
preprocess_a(paymaya_reviews_as, 'title_review')

In [12]:
# Clean and tokenize the Play Store reviews (in place, same as the App Store
# frames above).
preprocess_a(gcash_reviews_ps, 'title_review')
preprocess_a(paymaya_reviews_ps, 'title_review')

In [13]:
# Same frame object as displayed earlier, now with cleaned text and the added
# title_review_tokenized column.
gcash_reviews_as

Unnamed: 0,date,rating,title_review,title_review_tokenized
0,2018-12-29 15:24:59,3,it is starting to become pretty annoying almos...,"[it, is, starting, to, become, pretty, annoyin..."
1,2020-05-16 07:33:32,5,actually ok this app gets so many negative rev...,"[actually, ok, this, app, gets, so, many, nega..."
2,2020-02-04 00:25:57,3,cash in concern a little bit disappointed when...,"[cash, in, concern, a, little, bit, disappoint..."
3,2019-12-10 09:43:52,1,frustrating and slow and you are a bunch of th...,"[frustrating, and, slow, and, you, are, a, bun..."
4,2018-06-26 18:43:31,1,payments do not post my payment via gcash did ...,"[payments, do, not, post, my, payment, via, gc..."
...,...,...,...,...
4862,2016-01-16 11:52:50,1,fix this this is one of the crucial apps for g...,"[fix, this, this, is, one, of, the, crucial, a..."
4863,2015-11-09 08:20:37,1,can not access because they can not process my...,"[can, not, access, because, they, can, not, pr..."
4864,2015-09-19 14:58:01,1,waste of time downloading compatibility says c...,"[waste, of, time, downloading, compatibility, ..."
4865,2015-09-09 11:45:00,1,can not log in useless appcan not access my gc...,"[can, not, log, in, useless, appcan, not, acce..."


In [14]:
# Same frame object as displayed earlier, now with cleaned text and the added
# title_review_tokenized column.
paymaya_reviews_as

Unnamed: 0,date,rating,title_review,title_review_tokenized
0,2020-03-30 01:10:54,5,very bad service i have been contacting them a...,"[very, bad, service, i, have, been, contacting..."
1,2020-08-06 03:55:14,1,very poor customer service there should be a z...,"[very, poor, customer, service, there, should,..."
2,2020-05-02 01:58:12,1,app is great but service is not i really loved...,"[app, is, great, but, service, is, not, i, rea..."
3,2020-04-09 01:32:02,1,make your app trustworthy really comeon this i...,"[make, your, app, trustworthy, really, comeon,..."
4,2020-12-01 07:59:56,1,the new update ok so let us be honest i love m...,"[the, new, update, ok, so, let, us, be, honest..."
...,...,...,...,...
2564,2016-11-15 05:07:55,1,verification code time waster your verificati...,"[verification, code, time, waster, your, verif..."
2565,2015-11-29 18:38:50,4,nice one very nice idea we can buy now online ...,"[nice, one, very, nice, idea, we, can, buy, no..."
2566,2016-04-09 15:51:13,2,update please error occured when I am trying t...,"[update, please, error, occured, when, I, am, ..."
2567,2016-03-16 10:26:02,1,am i doing something wrong i can not receive t...,"[am, i, doing, something, wrong, i, can, not, ..."


In [15]:
# Same frame object as displayed earlier, now with cleaned text and the added
# title_review_tokenized column.
gcash_reviews_ps

Unnamed: 0,date,rating,title_review,title_review_tokenized
0,2021-06-02 18:32:37,1,good for mobile online deals,"[good, for, mobile, online, deals]"
1,2021-06-02 18:32:14,1,ilang beses nang naulit na nag load ako nag ba...,"[ilang, beses, nang, naulit, na, nag, load, ak..."
2,2021-06-02 18:32:07,4,its a great experience and convenient,"[its, a, great, experience, and, convenient]"
3,2021-06-02 18:31:31,5,ok na ok sya para sa mga easy transaction lalo...,"[ok, na, ok, sya, para, sa, mga, easy, transac..."
4,2021-06-02 18:31:25,5,very helpful and contented,"[very, helpful, and, contented]"
...,...,...,...,...
176511,2019-04-03 09:46:14,5,i love it,"[i, love, it]"
176512,2019-04-03 09:38:52,4,i love the save money and invest feature of th...,"[i, love, the, save, money, and, invest, featu..."
176513,2019-04-03 09:38:16,1,amex does not work and support is basically br...,"[amex, does, not, work, and, support, is, basi..."
176514,2019-04-03 09:33:42,5,satisfied,[satisfied]


In [16]:
# Same frame object as displayed earlier, now with cleaned text and the added
# title_review_tokenized column.
paymaya_reviews_ps

Unnamed: 0,date,rating,title_review,title_review_tokenized
0,2021-06-15T10:22:11.000Z,5,i rated 1 star yesterday and if there is lower...,"[i, rated, 1, star, yesterday, and, if, there,..."
1,2021-06-15T10:08:54.000Z,2,i can not do anything with my money it always ...,"[i, can, not, do, anything, with, my, money, i..."
2,2021-06-15T10:08:14.000Z,5,masyado na kaming maraming papel dito sa bahay...,"[masyado, na, kaming, maraming, papel, dito, s..."
3,2021-06-15T10:04:31.000Z,4,upgrading account and changing no is disappoin...,"[upgrading, account, and, changing, no, is, di..."
4,2021-06-15T09:56:32.000Z,5,I am super stressed with my past ewallet app s...,"[I, am, super, stressed, with, my, past, ewall..."
...,...,...,...,...
104558,2021-06-12T06:18:19.000Z,5,very good in service easy to send get cash,"[very, good, in, service, easy, to, send, get,..."
104559,2021-06-12T06:08:23.000Z,4,bakit walang alloutsurf sa load,"[bakit, walang, alloutsurf, sa, load]"
104560,2021-06-12T04:54:57.000Z,1,keep contacting the support about the duplicat...,"[keep, contacting, the, support, about, the, d..."
104561,2021-06-12T04:03:56.000Z,5,i like this apps promise,"[i, like, this, apps, promise]"


##### B. Filtering by date and ratings
---
Keep only 1-star and 5-star reviews (2-, 3-, and 4-star reviews are removed) and limit the review dates to the timeframe defined below.

In [17]:
# Review-date window (ISO date strings) read by preprocess_b when filtering
# reviews by date.
start_date = "2020-01-01"
end_date = "2021-05-31"

In [18]:
def preprocess_b(corpus, start=None, end=None):
    """Filter reviews to a date window and split them into 5-star and 1-star sets.

    Parameters
    ----------
    corpus : pandas.DataFrame
        Frame with a ``date`` column parseable by ``pd.to_datetime`` and a
        numeric ``rating`` column. The frame is not modified.
    start : str or datetime-like, optional
        Earliest review timestamp to keep. Defaults to the notebook-level
        ``start_date``.
    end : str or datetime-like, optional
        Last calendar day to keep (the whole day is included). Defaults to the
        notebook-level ``end_date``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(corpus_good, corpus_bad)``: the rows rated 5 and the rows rated 1,
        each sorted by ``date`` and returned as an independent copy.
    """
    start = start_date if start is None else start
    end = end_date if end is None else end

    # assign() returns a new frame, so the caller's `date` column keeps its
    # original values instead of being converted as a side effect.
    dated = corpus.assign(date=pd.to_datetime(corpus['date']))

    # Compare the end bound on the calendar day: a plain `date <= "2021-05-31"`
    # means "<= midnight" and silently drops every review posted later that day.
    review_day = dated['date'].dt.normalize()
    in_window = (dated['date'] >= start) & (review_day <= end)
    dated = dated.loc[in_window].sort_values(by='date')

    # .copy() so that columns can later be assigned on the results without
    # writing into a slice of `dated`.
    corpus_good = dated.loc[dated['rating'] == 5].copy()
    corpus_bad = dated.loc[dated['rating'] == 1].copy()
    return corpus_good, corpus_bad

In [19]:
# Filter the App Store reviews by date and split them into 5-star ("good") and
# 1-star ("bad") frames.
gcash_good_as_df, gcash_bad_as_df = preprocess_b(gcash_reviews_as)
paymaya_good_as_df, paymaya_bad_as_df = preprocess_b(paymaya_reviews_as)

In [20]:
# Filter the Play Store reviews by date and split them into 5-star ("good")
# and 1-star ("bad") frames.
gcash_good_ps_df, gcash_bad_ps_df = preprocess_b(gcash_reviews_ps)
paymaya_good_ps_df, paymaya_bad_ps_df = preprocess_b(paymaya_reviews_ps)

##### C. Removing stop words
---
Tagalog stopwords list from: https://github.com/stopwords-iso/stopwords-tl/blob/master/stopwords-tl.json

In [21]:
# Project-specific stopwords. The encoding is explicit so the file is decoded
# the same way regardless of the platform's default locale encoding.
with open('../../Data/Stopwords/stopwords-custom.json', encoding='utf-8') as file:
    custom_stopwords = json.load(file)

In [22]:
# Tagalog stopwords (stopwords-iso list). The encoding is explicit so the file
# is decoded the same way regardless of the platform's default locale encoding.
with open('../../Data/Stopwords/stopwords-tl.json', encoding='utf-8') as file:
    tl_stopwords = json.load(file)

Since NLTK does not include Tagalog stopwords, we append the Tagalog stopwords and our custom stopwords to NLTK's built-in English stopwords to create a combined `taglish_stopwords` list.

In [23]:
# Combined stopword list: NLTK English first, then Tagalog, then the
# project-specific additions. The bare name on the last line displays it.
taglish_stopwords = [
    *stopwords.words('english'),
    *tl_stopwords,
    *custom_stopwords,
]
taglish_stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [24]:
# Load the small English spaCy pipeline with the parser and NER components
# disabled.
# NOTE(review): `nlp` is not referenced by any other cell visible in this
# notebook - confirm it is still needed.
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

In [25]:
def df_to_words(df):
    """Return the ``title_review_tokenized`` column of ``df`` as a plain list.

    Each element of the returned list is the token list of one review, in row
    order.
    """
    token_column = df["title_review_tokenized"]
    return token_column.values.tolist()

In [26]:
def remove_stopwords(entry):
    """Re-tokenize each document and drop every word in ``taglish_stopwords``.

    Parameters
    ----------
    entry : iterable
        Documents to filter. Each document is converted with ``str()`` and
        re-tokenized by gensim's ``simple_preprocess`` before filtering.

    Returns
    -------
    list of list of str
        One filtered token list per input document, in input order.
    """
    # Build the lookup set once: testing membership against the list would
    # scan the entire stopword list for every single word.
    stopword_set = set(taglish_stopwords)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stopword_set]
        for doc in entry
    ]

In [27]:
def make_ngrams(data, entry):
    """Learn bigram phrases from ``data`` and apply them to the documents in ``entry``.

    Parameters
    ----------
    data : iterable of list of str
        Token lists used to fit the gensim ``Phrases`` model.
    entry : iterable of list of str
        Token lists to transform with the fitted model.

    Returns
    -------
    list
        One transformed document per element of ``entry``, in input order.
    """
    # A higher threshold yields fewer phrases.
    phrase_model = gensim.models.Phrases(data, min_count=5, threshold=20)
    phraser = gensim.models.phrases.Phraser(phrase_model)

    merged_docs = []
    for tokens in entry:
        merged_docs.append(phraser[tokens])
    return merged_docs

In [28]:
def preprocess_b(corpus):
    """Return stop-word-filtered, bigram-merged token lists for ``corpus``.

    The token lists are read from ``corpus.title_review_tokenized``. The
    bigram model is fitted on the unfiltered tokens and applied to the
    stop-word-filtered ones.

    NOTE(review): this definition reuses the name of the date/rating filter
    defined in section B and replaces it from this cell onward. Re-running the
    section B call cells after this cell will therefore invoke this function
    instead. Consider giving it a distinct name together with its callers.
    """
    raw_tokens = df_to_words(corpus)
    filtered_tokens = remove_stopwords(raw_tokens)
    return make_ngrams(raw_tokens, filtered_tokens)

In [29]:
# At this point `preprocess_b` is the stop-word/bigram version defined in the
# cell above, not the date/rating filter from section B. Each call returns a
# list of token lists for the given App Store frame.
gcash_good_as = preprocess_b(gcash_good_as_df)
gcash_bad_as = preprocess_b(gcash_bad_as_df)
paymaya_good_as = preprocess_b(paymaya_good_as_df)
paymaya_bad_as = preprocess_b(paymaya_bad_as_df)

In [30]:
# Same stop-word removal and bigram merging for the Play Store frames
# (`preprocess_b` here is the stop-word/bigram version, not the section B
# date/rating filter).
gcash_good_ps = preprocess_b(gcash_good_ps_df)
gcash_bad_ps = preprocess_b(gcash_bad_ps_df)
paymaya_good_ps = preprocess_b(paymaya_good_ps_df)
paymaya_bad_ps = preprocess_b(paymaya_bad_ps_df)

In [31]:
# Overwrite the tokenized column of each GCash frame with the
# stop-word-filtered, bigram-merged token lists computed above.
gcash_good_as_df['title_review_tokenized'] = gcash_good_as
gcash_bad_as_df['title_review_tokenized'] = gcash_bad_as
gcash_good_ps_df['title_review_tokenized'] = gcash_good_ps
gcash_bad_ps_df['title_review_tokenized'] = gcash_bad_ps

In [32]:
# Overwrite the tokenized column of each PayMaya frame with the
# stop-word-filtered, bigram-merged token lists computed above.
paymaya_good_as_df['title_review_tokenized'] = paymaya_good_as
paymaya_bad_as_df['title_review_tokenized'] = paymaya_bad_as
paymaya_good_ps_df['title_review_tokenized'] = paymaya_good_ps
paymaya_bad_ps_df['title_review_tokenized'] = paymaya_bad_ps

In [33]:
# Bundle the four frames per app for saving. Order within each list:
# [good App Store, bad App Store, good Play Store, bad Play Store].
gcash_df = [gcash_good_as_df, gcash_bad_as_df, gcash_good_ps_df, gcash_bad_ps_df]
paymaya_df = [paymaya_good_as_df, paymaya_bad_as_df, paymaya_good_ps_df, paymaya_bad_ps_df]

##### D. Data Transformation: Corpus and Dictionary
---

In [34]:
def data_transform(data, store):
    """Build a gensim dictionary and bag-of-words corpus from token lists.

    Parameters
    ----------
    data : list of list of str
        Tokenized documents.
    store : str
        ``'as'`` selects the App Store setting (``no_below=5``); any other
        value selects ``no_below=15``.

    Returns
    -------
    tuple
        ``(id2word, corpus)``: the filtered ``corpora.Dictionary`` and the
        list of ``doc2bow`` vectors, one per document in ``data``.
    """
    # Minimum number of documents a token must appear in to be kept.
    # Presumably lower for the App Store because those review sets are much
    # smaller - confirm with the author.
    min_docs = 5 if store == 'as' else 15

    id2word = corpora.Dictionary(data)
    id2word.filter_extremes(no_below=min_docs, no_above=0.5, keep_n=100000)

    return id2word, [id2word.doc2bow(tokens) for tokens in data]

In [35]:
# Dictionary and bag-of-words corpus for each App Store token set
# (store='as' selects the no_below=5 filter in data_transform).
gcash_good_dict_as, gcash_good_corpus_as = data_transform(gcash_good_as, 'as')
gcash_bad_dict_as, gcash_bad_corpus_as = data_transform(gcash_bad_as, 'as')
paymaya_good_dict_as, paymaya_good_corpus_as = data_transform(paymaya_good_as, 'as')
paymaya_bad_dict_as, paymaya_bad_corpus_as = data_transform(paymaya_bad_as, 'as')

In [36]:
# Dictionary and bag-of-words corpus for each Play Store token set
# (store='ps' selects the no_below=15 filter in data_transform).
gcash_good_dict_ps, gcash_good_corpus_ps = data_transform(gcash_good_ps, 'ps')
gcash_bad_dict_ps, gcash_bad_corpus_ps = data_transform(gcash_bad_ps, 'ps')
paymaya_good_dict_ps, paymaya_good_corpus_ps = data_transform(paymaya_good_ps, 'ps')
paymaya_bad_dict_ps, paymaya_bad_corpus_ps = data_transform(paymaya_bad_ps, 'ps')

##### E. Saving the files
---

In [37]:
# One bundle per app and sentiment. Layout of every list:
# [App Store tokens, App Store dictionary, App Store corpus,
#  Play Store tokens, Play Store dictionary, Play Store corpus]
gcash_good_file = [gcash_good_as, gcash_good_dict_as, gcash_good_corpus_as, 
                   gcash_good_ps, gcash_good_dict_ps, gcash_good_corpus_ps]
gcash_bad_file = [gcash_bad_as, gcash_bad_dict_as, gcash_bad_corpus_as,
                  gcash_bad_ps, gcash_bad_dict_ps, gcash_bad_corpus_ps]
paymaya_good_file = [paymaya_good_as, paymaya_good_dict_as, paymaya_good_corpus_as, 
                   paymaya_good_ps, paymaya_good_dict_ps, paymaya_good_corpus_ps]
paymaya_bad_file = [paymaya_bad_as, paymaya_bad_dict_as, paymaya_bad_corpus_as,
                  paymaya_bad_ps, paymaya_bad_dict_ps, paymaya_bad_corpus_ps]

In [38]:
# Persist the four token/dictionary/corpus bundles and the two per-app frame
# lists with joblib. The value of the last call (a list containing the written
# file name) is what the cell displays.
joblib.dump(gcash_good_file, '../../Data/Preprocessed/Mico/P2/gcash_good.sav')
joblib.dump(gcash_bad_file, '../../Data/Preprocessed/Mico/P2/gcash_bad.sav')
joblib.dump(paymaya_good_file, '../../Data/Preprocessed/Mico/P2/paymaya_good.sav')
joblib.dump(paymaya_bad_file, '../../Data/Preprocessed/Mico/P2/paymaya_bad.sav')
joblib.dump(gcash_df, '../../Data/Preprocessed/Mico/P2/gcash_df.sav')
joblib.dump(paymaya_df, '../../Data/Preprocessed/Mico/P2/paymaya_df.sav')

['../../Data/Preprocessed/Mico/P2/paymaya_df.sav']