### Data Preprocessing

In [1]:
import pandas as pd
import warnings
import contractions
import json
import datetime

from textblob import TextBlob
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from collections import defaultdict

warnings.filterwarnings(action="ignore")

---

#### I. Import CSV files

In [2]:
def import_csv(path):
    reviews = pd.read_csv(path)
    reviews['title_review'] = reviews['title'] + ' ' + reviews['review']
    reviews = reviews.drop(['userName','title','review','isEdited','app_id','developerResponse','app_name'],axis=1)
    return reviews

In [3]:
gcash_reviews = import_csv('../../Data/Raw/App Store/GCash/gcash_0613-1427.csv')
paymaya_reviews = import_csv('../../Data/Raw/App Store/PayMaya/paymaya_0613-1402.csv')

In [4]:
gcash_reviews

Unnamed: 0,date,rating,title_review
0,2018-12-29 15:24:59,3,It is starting to become pretty annoying Almos...
1,2020-05-16 07:33:32,5,Actually ok. This app gets so many negative re...
2,2020-02-04 00:25:57,3,Cash In Concern A little bit disappointed when...
3,2019-12-10 09:43:52,1,Frustrating and slow and you’re a bunch of thi...
4,2018-06-26 18:43:31,1,Payments don’t post. My payment via gcash did ...
...,...,...,...
4862,2016-01-16 11:52:50,1,FIX THIS!!! This is one of the crucial apps fo...
4863,2015-11-09 08:20:37,1,Can't access because they can't process my tra...
4864,2015-09-19 14:58:01,1,Waste of time downloading Compatibility says c...
4865,2015-09-09 11:45:00,1,Cant log in Useless app.cant access my gcash w...


In [5]:
paymaya_reviews

Unnamed: 0,date,rating,title_review
0,2020-03-30 01:10:54,5,VERY BAD SERVICE I have been contacting them a...
1,2020-08-06 03:55:14,1,Very poor customer service There should be a z...
2,2020-05-02 01:58:12,1,App is great but service is not I really loved...
3,2020-04-09 01:32:02,1,Make your app trustworthy really Comeon this i...
4,2020-12-01 07:59:56,1,The new update... Ok so lets be honest I love ...
...,...,...,...
2564,2016-11-15 05:07:55,1,Verification code - time waster Your verificat...
2565,2015-11-29 18:38:50,4,"Nice One Very nice idea, we can buy now online..."
2566,2016-04-09 15:51:13,2,UPDATE PLEASE! Error occured when i'm trying t...
2567,2016-03-16 10:26:02,1,Am I doing something wrong I can't receive the...


#### II. Preprocessing

Reference: 
1. https://stackoverflow.com/questions/45605946/how-to-do-text-pre-processing-using-spacy
2. https://towardsdatascience.com/setting-up-text-preprocessing-pipeline-using-scikit-learn-and-spacy-e09b9b76758f
3. https://medium.com/@bedigunjit/simple-guide-to-text-classification-nlp-using-svm-and-naive-bayes-with-python-421db3a72d34

---

##### A. Dropping blank rows, changing to lower case, expanding contractions, spell correction, and tokenization

In [6]:
def preprocess_a(corpus, column):
    new_column = f"{column}_tokenized"
    corpus[column].dropna(inplace=True) # Drop blank rows
    corpus[column] = [entry.lower() for entry in corpus[column]] # Lowercase text
    corpus[column] = corpus[column].apply(lambda x: contractions.fix(x)) # Expand contractions
    corpus[column] = corpus[column].apply(lambda x: str(TextBlob(x))) # Correct spelling
    corpus[column] = corpus[column].str.replace('[^\w\s]','') # Remove punctuation
    corpus[new_column] = [word_tokenize(entry) for entry in corpus[column]] # Tokenization

In [7]:
preprocess_a(gcash_reviews, 'title_review')
preprocess_a(paymaya_reviews, 'title_review')

In [8]:
gcash_reviews

Unnamed: 0,date,rating,title_review,title_review_tokenized
0,2018-12-29 15:24:59,3,it is starting to become pretty annoying almos...,"[it, is, starting, to, become, pretty, annoyin..."
1,2020-05-16 07:33:32,5,actually ok this app gets so many negative rev...,"[actually, ok, this, app, gets, so, many, nega..."
2,2020-02-04 00:25:57,3,cash in concern a little bit disappointed when...,"[cash, in, concern, a, little, bit, disappoint..."
3,2019-12-10 09:43:52,1,frustrating and slow and you are a bunch of th...,"[frustrating, and, slow, and, you, are, a, bun..."
4,2018-06-26 18:43:31,1,payments do not post my payment via gcash did ...,"[payments, do, not, post, my, payment, via, gc..."
...,...,...,...,...
4862,2016-01-16 11:52:50,1,fix this this is one of the crucial apps for g...,"[fix, this, this, is, one, of, the, crucial, a..."
4863,2015-11-09 08:20:37,1,can not access because they can not process my...,"[can, not, access, because, they, can, not, pr..."
4864,2015-09-19 14:58:01,1,waste of time downloading compatibility says c...,"[waste, of, time, downloading, compatibility, ..."
4865,2015-09-09 11:45:00,1,can not log in useless appcan not access my gc...,"[can, not, log, in, useless, appcan, not, acce..."


In [9]:
paymaya_reviews

Unnamed: 0,date,rating,title_review,title_review_tokenized
0,2020-03-30 01:10:54,5,very bad service i have been contacting them a...,"[very, bad, service, i, have, been, contacting..."
1,2020-08-06 03:55:14,1,very poor customer service there should be a z...,"[very, poor, customer, service, there, should,..."
2,2020-05-02 01:58:12,1,app is great but service is not i really loved...,"[app, is, great, but, service, is, not, i, rea..."
3,2020-04-09 01:32:02,1,make your app trustworthy really comeon this i...,"[make, your, app, trustworthy, really, comeon,..."
4,2020-12-01 07:59:56,1,the new update ok so let us be honest i love m...,"[the, new, update, ok, so, let, us, be, honest..."
...,...,...,...,...
2564,2016-11-15 05:07:55,1,verification code time waster your verificati...,"[verification, code, time, waster, your, verif..."
2565,2015-11-29 18:38:50,4,nice one very nice idea we can buy now online ...,"[nice, one, very, nice, idea, we, can, buy, no..."
2566,2016-04-09 15:51:13,2,update please error occured when I am trying t...,"[update, please, error, occured, when, I, am, ..."
2567,2016-03-16 10:26:02,1,am i doing something wrong i can not receive t...,"[am, i, doing, something, wrong, i, can, not, ..."


##### B. Removing stop words and performing lemmenting
---
Tagalog stopwords list from: https://github.com/stopwords-iso/stopwords-tl/blob/master/stopwords-tl.json

In [10]:
with open('../../Data/Stopwords/stopwords-tl.json') as file:
    tl_stopwords = json.load(file)

Since nltk does not contain Tagalog stopwords, we append the Tagalog stopwords to the built-in English stopwords to create a custom taglish_stopwords list

In [11]:
taglish_stopwords = stopwords.words('english')
taglish_stopwords.extend(tl_stopwords)
taglish_stopwords.extend(['gcash','app','po','talaga', 'nyo', 'yung', 'naman', 'di', 
                          'paymaya','lang', 'thank', 'please'])
taglish_stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [12]:
def preprocess_b(corpus, column):
    tag_map = defaultdict(lambda : wn.NOUN)
    tag_map['J'] = wn.ADJ
    tag_map['V'] = wn.VERB
    tag_map['R'] = wn.ADV
    for index,entry in enumerate(corpus[column]):
        # Declaring Empty List to store the words that follow the rules for this step
        Final_words = []
        # Initializing WordNetLemmatizer()
        word_Lemmatized = WordNetLemmatizer()
        # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
        for word, tag in pos_tag(entry):
            # Below condition is to check for Stop words and consider only alphabets
            if word not in taglish_stopwords and word.isalpha():
                word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
                Final_words.append(word_Final)
        # The final processed set of words for each iteration will be stored in 'title_review_tokenized'
        corpus.loc[index, column] = str(Final_words)

In [13]:
preprocess_b(gcash_reviews,'title_review_tokenized')

In [14]:
preprocess_b(paymaya_reviews,'title_review_tokenized')

In [15]:
gcash_reviews

Unnamed: 0,date,rating,title_review,title_review_tokenized
0,2018-12-29 15:24:59,3,it is starting to become pretty annoying almos...,"['start', 'become', 'pretty', 'annoy', 'almost..."
1,2020-05-16 07:33:32,5,actually ok this app gets so many negative rev...,"['actually', 'ok', 'get', 'many', 'negative', ..."
2,2020-02-04 00:25:57,3,cash in concern a little bit disappointed when...,"['cash', 'concern', 'little', 'bit', 'disappoi..."
3,2019-12-10 09:43:52,1,frustrating and slow and you are a bunch of th...,"['frustrating', 'slow', 'bunch', 'thief', 'I',..."
4,2018-06-26 18:43:31,1,payments do not post my payment via gcash did ...,"['payment', 'post', 'payment', 'via', 'post', ..."
...,...,...,...,...
4862,2016-01-16 11:52:50,1,fix this this is one of the crucial apps for g...,"['fix', 'one', 'crucial', 'apps', 'globe', 'us..."
4863,2015-11-09 08:20:37,1,can not access because they can not process my...,"['access', 'process', 'transaction', 'use', 'w..."
4864,2015-09-19 14:58:01,1,waste of time downloading compatibility says c...,"['waste', 'time', 'download', 'compatibility',..."
4865,2015-09-09 11:45:00,1,can not log in useless appcan not access my gc...,"['log', 'useless', 'appcan', 'access', 'wallet..."


In [16]:
paymaya_reviews

Unnamed: 0,date,rating,title_review,title_review_tokenized
0,2020-03-30 01:10:54,5,very bad service i have been contacting them a...,"['bad', 'service', 'contact', 'payment', 'make..."
1,2020-08-06 03:55:14,1,very poor customer service there should be a z...,"['poor', 'customer', 'service', 'zero', 'ratin..."
2,2020-05-02 01:58:12,1,app is great but service is not i really loved...,"['great', 'service', 'really', 'love', 'conven..."
3,2020-04-09 01:32:02,1,make your app trustworthy really comeon this i...,"['make', 'trustworthy', 'really', 'comeon', 'r..."
4,2020-12-01 07:59:56,1,the new update ok so let us be honest i love m...,"['new', 'update', 'ok', 'let', 'u', 'honest', ..."
...,...,...,...,...
2564,2016-11-15 05:07:55,1,verification code time waster your verificati...,"['verification', 'code', 'time', 'waster', 've..."
2565,2015-11-29 18:38:50,4,nice one very nice idea we can buy now online ...,"['nice', 'one', 'nice', 'idea', 'buy', 'online..."
2566,2016-04-09 15:51:13,2,update please error occured when I am trying t...,"['update', 'error', 'occur', 'I', 'try', 'log'..."
2567,2016-03-16 10:26:02,1,am i doing something wrong i can not receive t...,"['something', 'wrong', 'receive', 'verificatio..."


##### C. Filtering by date and ratings
---

In [17]:
start_date = "2020-01-01"
end_date = "2021-05-31"

In [18]:
def preprocess_c(corpus):
    corpus['date'] = pd.to_datetime(corpus['date'], format='%Y-%m-%d %H:%M:%S')
    corpus = corpus.loc[(corpus['date'] >= start_date) & (corpus['date'] <= end_date)]
    corpus = corpus.sort_values(by='date')
    corpus_good = corpus.loc[corpus['rating'] >= 4]
    corpus_bad = corpus.loc[corpus['rating'] <= 3]
    return corpus_good, corpus_bad

In [19]:
gcash_good_reviews, gcash_bad_reviews = preprocess_c(gcash_reviews)
paymaya_good_reviews, paymaya_bad_reviews = preprocess_c(paymaya_reviews)

In [20]:
gcash_good_reviews

Unnamed: 0,date,rating,title_review,title_review_tokenized
812,2020-01-10 03:29:09,5,nice so far so good convenient no problem enco...,"['nice', 'far', 'good', 'convenient', 'problem..."
3234,2020-01-17 17:59:30,5,good service plz support gcash,"['good', 'service', 'plz', 'support']"
56,2020-01-20 17:32:16,4,incoming outgoing transaction limits hello gc...,"['incoming', 'outgo', 'transaction', 'limit', ..."
489,2020-01-23 00:33:47,5,convenience since the day i installed the apps...,"['convenience', 'since', 'day', 'instal', 'app..."
792,2020-01-25 06:41:20,4,icons the app is ok but maybe stop rearranging...,"['icon', 'ok', 'maybe', 'stop', 'rearrange', '..."
...,...,...,...,...
2703,2021-05-25 02:44:16,4,customer service medyo makulit ako at maraming...,"['customer', 'service', 'medyo', 'makulit', 'm..."
2371,2021-05-26 13:37:55,5,face id pls bring back the face id security fe...,"['face', 'id', 'pls', 'bring', 'back', 'face',..."
186,2021-05-27 14:59:34,4,slightly not safe may i know what are the othe...,"['slightly', 'safe', 'know', 'way', 'access', ..."
619,2021-05-29 04:13:21,5,g cash allow nickname i hope next update of gc...,"['g', 'cash', 'allow', 'nickname', 'hope', 'ne..."


In [21]:
paymaya_good_reviews

Unnamed: 0,date,rating,title_review,title_review_tokenized
876,2020-01-03 03:11:29,4,purchase ali express great experience with the...,"['purchase', 'ali', 'express', 'great', 'exper..."
807,2020-01-05 05:58:26,5,verification error my account is labled as dup...,"['verification', 'error', 'account', 'labled',..."
1283,2020-01-06 03:34:12,5,nice and great app very convenient and very us...,"['nice', 'great', 'convenient', 'useful', 'lot..."
980,2020-01-10 04:52:05,5,sad review poor customer service,"['sad', 'review', 'poor', 'customer', 'service']"
271,2020-01-19 03:38:01,5,inconsistent almost ate all of my money i have...,"['inconsistent', 'almost', 'ate', 'money', 'is..."
...,...,...,...,...
370,2021-05-28 12:04:55,5,nice app i really loved paymaya because it is ...,"['nice', 'really', 'love', 'convenient', 'onli..."
644,2021-05-29 11:51:47,5,review the first time i laid my eyes to this a...,"['review', 'first', 'time', 'lay', 'eye', 'ala..."
960,2021-05-29 11:58:19,5,effectively nitong nababawasan ang intindihin...,"['effectively', 'nitong', 'nababawasan', 'inti..."
404,2021-05-29 14:38:20,5,review the qr gives off an impression of being...,"['review', 'qr', 'give', 'impression', 'short'..."


In [22]:
gcash_bad_reviews

Unnamed: 0,date,rating,title_review,title_review_tokenized
161,2020-01-01 02:10:56,3,needs improvement gcash is a great app but if ...,"['need', 'improvement', 'great', 'try', 'buy',..."
4655,2020-01-01 11:26:33,1,gcash is a total scam andami ng videos sa yout...,"['total', 'scam', 'andami', 'video', 'youtube'..."
793,2020-01-01 15:11:30,3,email registration error i cannot log in my e...,"['email', 'registration', 'error', 'log', 'ema..."
69,2020-01-01 19:14:19,1,customer support during issues is trash been a...,"['customer', 'support', 'issue', 'trash', 'avi..."
3631,2020-01-02 01:43:34,1,2020 outdate please keep up with 2020 it is wo...,"['outdate', 'keep', 'bad', 'dial']"
...,...,...,...,...
3588,2021-05-28 13:04:29,1,scammer is always scammer and tha is globe tel...,"['scammer', 'always', 'scammer', 'tha', 'globe..."
1823,2021-05-29 22:31:59,3,do not reset the dashboard every update i had ...,"['reset', 'dashboard', 'every', 'update', 'rea..."
3852,2021-05-30 06:38:52,1,worst app ever worst app an customer service,"['bad', 'ever', 'worst', 'customer', 'service']"
1272,2021-05-30 08:39:15,2,gcredit application page is buggy i was not ab...,"['gcredit', 'application', 'page', 'buggy', 'a..."


In [23]:
paymaya_bad_reviews

Unnamed: 0,date,rating,title_review,title_review_tokenized
1642,2020-01-01 05:36:26,1,they removed garena and other gaming load this...,"['remove', 'garena', 'game', 'load', 'much', '..."
67,2020-01-01 08:40:34,1,do not download this app i was trying to get s...,"['download', 'try', 'get', 'help', 'via', 'ema..."
583,2020-01-01 11:34:17,1,reversal of 20000 pesos bad customer service i...,"['reversal', 'peso', 'bad', 'customer', 'servi..."
2337,2020-01-01 13:44:46,1,useless na useless na paymaya kailangan na mag...,"['useless', 'useless', 'mag', 'upgrade', 'card..."
1116,2020-01-01 14:01:27,1,where is my money i added money to my account ...,"['money', 'add', 'money', 'account', 'last', '..."
...,...,...,...,...
1217,2021-05-24 06:04:12,1,no sense of urgency bank transfers and online ...,"['sense', 'urgency', 'bank', 'transfer', 'onli..."
1046,2021-05-24 08:02:31,1,can not open the app why i can not open the ap...,"['open', 'open', 'even', 'connection', 'always..."
1107,2021-05-27 00:22:14,3,pay bills only few billers are on the list so ...,"['pay', 'bill', 'billers', 'list', 'little', '..."
275,2021-05-29 03:16:00,1,poor support slow verification I have been str...,"['poor', 'support', 'slow', 'verification', 'I..."


##### D. Saving the files
---

In [24]:
gcash_good_reviews.to_csv('../../Data/Preprocessed/Mico/App Store/gcash_good.csv', index=False)
gcash_bad_reviews.to_csv('../../Data/Preprocessed/Mico/App Store/gcash_bad.csv', index=False)

In [25]:
paymaya_good_reviews.to_csv('../../Data/Preprocessed/Mico/App Store/paymaya_good.csv', index=False)
paymaya_bad_reviews.to_csv('../../Data/Preprocessed/Mico/App Store/paymaya_bad.csv', index=False)