### Data Preprocessing

In [1]:
import pandas as pd
import warnings
import json

from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from collections import defaultdict

warnings.filterwarnings(action="ignore")

---

#### I. Import CSV files

In [2]:
def import_csv(path):
    """Read a review CSV and merge each review's title and body into one column.

    Parameters
    ----------
    path : str
        Location of the CSV file. It must contain the columns ``title`` and
        ``review`` plus the metadata columns listed in ``unused_columns``.

    Returns
    -------
    pandas.DataFrame
        The remaining columns of the file followed by ``title_review``, which
        is ``title`` and ``review`` joined by a single space.
    """
    # Columns that are not needed once the combined text column exists.
    unused_columns = [
        'userName', 'title', 'review', 'isEdited',
        'app_id', 'developerResponse', 'app_name',
    ]
    raw = pd.read_csv(path)
    combined_text = raw['title'] + ' ' + raw['review']
    return raw.assign(title_review=combined_text).drop(columns=unused_columns)

In [3]:
# Raw App Store review exports for GCash and PayMaya.
GCASH_RAW_CSV = '../Data/Raw/App Store/GCash/gcash_0613-1427.csv'
PAYMAYA_RAW_CSV = '../Data/Raw/App Store/PayMaya/paymaya_0613-1402.csv'

gcash_reviews = import_csv(GCASH_RAW_CSV)
paymaya_reviews = import_csv(PAYMAYA_RAW_CSV)

In [4]:
# Inspect the imported GCash reviews; title and review are merged into title_review.
gcash_reviews

Unnamed: 0,date,rating,title_review
0,2018-12-29 15:24:59,3,It is starting to become pretty annoying Almos...
1,2020-05-16 07:33:32,5,Actually ok. This app gets so many negative re...
2,2020-02-04 00:25:57,3,Cash In Concern A little bit disappointed when...
3,2019-12-10 09:43:52,1,Frustrating and slow and you’re a bunch of thi...
4,2018-06-26 18:43:31,1,Payments don’t post. My payment via gcash did ...
...,...,...,...
4862,2016-01-16 11:52:50,1,FIX THIS!!! This is one of the crucial apps fo...
4863,2015-11-09 08:20:37,1,Can't access because they can't process my tra...
4864,2015-09-19 14:58:01,1,Waste of time downloading Compatibility says c...
4865,2015-09-09 11:45:00,1,Cant log in Useless app.cant access my gcash w...


In [5]:
# Inspect the imported PayMaya reviews; title and review are merged into title_review.
paymaya_reviews

Unnamed: 0,date,rating,title_review
0,2020-03-30 01:10:54,5,VERY BAD SERVICE I have been contacting them a...
1,2020-08-06 03:55:14,1,Very poor customer service There should be a z...
2,2020-05-02 01:58:12,1,App is great but service is not I really loved...
3,2020-04-09 01:32:02,1,Make your app trustworthy really Comeon this i...
4,2020-12-01 07:59:56,1,The new update... Ok so lets be honest I love ...
...,...,...,...
2564,2016-11-15 05:07:55,1,Verification code - time waster Your verificat...
2565,2015-11-29 18:38:50,4,"Nice One Very nice idea, we can buy now online..."
2566,2016-04-09 15:51:13,2,UPDATE PLEASE! Error occured when i'm trying t...
2567,2016-03-16 10:26:02,1,Am I doing something wrong I can't receive the...


#### II. Preprocessing (Tokenization, Lemmatization)

References:
1. https://stackoverflow.com/questions/45605946/how-to-do-text-pre-processing-using-spacy
2. https://towardsdatascience.com/setting-up-text-preprocessing-pipeline-using-scikit-learn-and-spacy-e09b9b76758f
3. https://medium.com/@bedigunjit/simple-guide-to-text-classification-nlp-using-svm-and-naive-bayes-with-python-421db3a72d34

---

##### A. Dropping blank rows, changing to lower case, and tokenization

In [6]:
def preprocess_a(corpus, column):
    """Drop rows without text, then lowercase and tokenize the text, in place.

    Parameters
    ----------
    corpus : pandas.DataFrame
        Frame to clean. It is modified in place and nothing is returned.
    column : str
        Name of the column holding the raw text (strings).

    Notes
    -----
    Missing values are removed at the frame level. Calling ``dropna`` on the
    extracted column would leave the frame's rows untouched, and the
    ``.lower()`` call below would then fail on the remaining NaN entries.
    """
    if corpus[column].isna().any():
        corpus.dropna(subset=[column], inplace=True)
        # Close the gaps left by the removed rows so positional and
        # label-based row access keep agreeing for later steps.
        corpus.reset_index(drop=True, inplace=True)

    # Each cell becomes a list of lowercase tokens.
    corpus[column] = [word_tokenize(entry.lower()) for entry in corpus[column]]

In [7]:
# Lowercase and tokenize the combined text of both review sets (in place).
for review_frame in (gcash_reviews, paymaya_reviews):
    preprocess_a(review_frame, 'title_review')

In [8]:
# Check the GCash frame after preprocess_a: title_review now holds lowercase token lists.
gcash_reviews

Unnamed: 0,date,rating,title_review
0,2018-12-29 15:24:59,3,"[it, is, starting, to, become, pretty, annoyin..."
1,2020-05-16 07:33:32,5,"[actually, ok., this, app, gets, so, many, neg..."
2,2020-02-04 00:25:57,3,"[cash, in, concern, a, little, bit, disappoint..."
3,2019-12-10 09:43:52,1,"[frustrating, and, slow, and, you, ’, re, a, b..."
4,2018-06-26 18:43:31,1,"[payments, don, ’, t, post, ., my, payment, vi..."
...,...,...,...
4862,2016-01-16 11:52:50,1,"[fix, this, !, !, !, this, is, one, of, the, c..."
4863,2015-11-09 08:20:37,1,"[ca, n't, access, because, they, ca, n't, proc..."
4864,2015-09-19 14:58:01,1,"[waste, of, time, downloading, compatibility, ..."
4865,2015-09-09 11:45:00,1,"[cant, log, in, useless, app.cant, access, my,..."


In [9]:
# Check the PayMaya frame after preprocess_a: title_review now holds lowercase token lists.
paymaya_reviews

Unnamed: 0,date,rating,title_review
0,2020-03-30 01:10:54,5,"[very, bad, service, i, have, been, contacting..."
1,2020-08-06 03:55:14,1,"[very, poor, customer, service, there, should,..."
2,2020-05-02 01:58:12,1,"[app, is, great, but, service, is, not, i, rea..."
3,2020-04-09 01:32:02,1,"[make, your, app, trustworthy, really, comeon,..."
4,2020-12-01 07:59:56,1,"[the, new, update, ..., ok, so, lets, be, hone..."
...,...,...,...
2564,2016-11-15 05:07:55,1,"[verification, code, -, time, waster, your, ve..."
2565,2015-11-29 18:38:50,4,"[nice, one, very, nice, idea, ,, we, can, buy,..."
2566,2016-04-09 15:51:13,2,"[update, please, !, error, occured, when, i, '..."
2567,2016-03-16 10:26:02,1,"[am, i, doing, something, wrong, i, ca, n't, r..."


##### B. Removing stop words and performing lemmatization
---
Tagalog stopwords list from: https://github.com/stopwords-iso/stopwords-tl/blob/master/stopwords-tl.json

In [10]:
# JSON text is UTF-8; state the encoding explicitly so the read does not
# depend on the platform's default locale encoding.
with open('../Data/Stopwords/stopwords-tl.json', encoding='utf-8') as file:
    tl_stopwords = json.load(file)

Since `nltk` does not ship Tagalog stopwords, we append the Tagalog stopwords to the built-in English stopwords to create a custom `taglish_stopwords` list.

In [11]:
# English stopwords first, followed by the Tagalog ones loaded from JSON.
taglish_stopwords = [*stopwords.words('english'), *tl_stopwords]
taglish_stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [12]:
def preprocess_b(corpus, column, stop_words=None):
    """Remove stop words and non-alphabetic tokens, then lemmatize, in place.

    Parameters
    ----------
    corpus : pandas.DataFrame
        Frame whose ``column`` holds one list of tokens per row. It is
        modified in place and nothing is returned.
    column : str
        Name of the tokenized text column.
    stop_words : iterable of str, optional
        Words to discard. Defaults to the notebook-level ``taglish_stopwords``.

    Notes
    -----
    Each row is replaced by ``str(list_of_lemmas)``, i.e. the list is stored
    as its string representation. The column is assigned in one step rather
    than row by row through ``.loc`` with a running counter, so the result
    stays aligned with the frame even when its index is not 0..n-1.
    """
    if stop_words is None:
        stop_words = taglish_stopwords
    # A set makes the per-token membership test constant-time.
    stop_words = set(stop_words)

    # Map the first letter of the POS tag to a WordNet POS;
    # anything that is not adjective/verb/adverb is treated as a noun.
    tag_map = defaultdict(lambda: wn.NOUN)
    tag_map['J'] = wn.ADJ
    tag_map['V'] = wn.VERB
    tag_map['R'] = wn.ADV

    # One lemmatizer is enough for the whole frame.
    lemmatizer = WordNetLemmatizer()

    cleaned_rows = []
    for tokens in corpus[column]:
        # pos_tag yields (word, tag) pairs; the tag selects the lemmatizer POS.
        lemmas = [
            lemmatizer.lemmatize(word, tag_map[tag[0]])
            for word, tag in pos_tag(tokens)
            if word.isalpha() and word not in stop_words
        ]
        cleaned_rows.append(str(lemmas))

    corpus[column] = cleaned_rows

In [13]:
# Stop-word removal and lemmatization for GCash; rewrites title_review in place.
preprocess_b(gcash_reviews,'title_review')

In [14]:
# Stop-word removal and lemmatization for PayMaya; rewrites title_review in place.
preprocess_b(paymaya_reviews,'title_review')

In [15]:
# Persist the preprocessed GCash reviews; index=False keeps the row index out of the file.
gcash_reviews.to_csv('../Data/Preprocessed/Gcash.csv', index=False)

In [16]:
# Persist the preprocessed PayMaya reviews; index=False keeps the row index out of the file.
paymaya_reviews.to_csv('../Data/Preprocessed/Paymaya.csv', index=False)

In [17]:
# Final GCash frame: title_review holds the stringified list of lemmas.
gcash_reviews

Unnamed: 0,date,rating,title_review
0,2018-12-29 15:24:59,3,"['start', 'become', 'pretty', 'annoy', 'almost..."
1,2020-05-16 07:33:32,5,"['actually', 'app', 'get', 'many', 'negative',..."
2,2020-02-04 00:25:57,3,"['cash', 'concern', 'little', 'bit', 'disappoi..."
3,2019-12-10 09:43:52,1,"['frustrating', 'slow', 'bunch', 'thief', 'avi..."
4,2018-06-26 18:43:31,1,"['payment', 'post', 'payment', 'via', 'gcash',..."
...,...,...,...
4862,2016-01-16 11:52:50,1,"['fix', 'one', 'crucial', 'apps', 'globe', 'us..."
4863,2015-11-09 08:20:37,1,"['ca', 'access', 'ca', 'process', 'transaction..."
4864,2015-09-19 14:58:01,1,"['waste', 'time', 'download', 'compatibility',..."
4865,2015-09-09 11:45:00,1,"['cant', 'log', 'useless', 'access', 'gcash', ..."


In [18]:
# Final PayMaya frame: title_review holds the stringified list of lemmas.
paymaya_reviews

Unnamed: 0,date,rating,title_review
0,2020-03-30 01:10:54,5,"['bad', 'service', 'contact', 'payment', 'make..."
1,2020-08-06 03:55:14,1,"['poor', 'customer', 'service', 'zero', 'ratin..."
2,2020-05-02 01:58:12,1,"['app', 'great', 'service', 'really', 'love', ..."
3,2020-04-09 01:32:02,1,"['make', 'app', 'trustworthy', 'really', 'come..."
4,2020-12-01 07:59:56,1,"['new', 'update', 'ok', 'let', 'honest', 'love..."
...,...,...,...
2564,2016-11-15 05:07:55,1,"['verification', 'code', 'time', 'waster', 've..."
2565,2015-11-29 18:38:50,4,"['nice', 'one', 'nice', 'idea', 'buy', 'online..."
2566,2016-04-09 15:51:13,2,"['update', 'please', 'error', 'occur', 'try', ..."
2567,2016-03-16 10:26:02,1,"['something', 'wrong', 'ca', 'receive', 'verif..."
