# NLP - Cleaning and Preprocessing Text Data of User Reviews in AppStore

### Imports

In [35]:
# pandas
import pandas as pd
# natural language toolkit
import nltk
# string for punctuation list
import string
# to remove links, numbers
import re
# to get stopwords from smart stopword list link
from urllib.request import urlopen
# wordnet for part of the speech
from nltk.corpus import wordnet
from collections import Counter
# Tokenizer
from nltk.tokenize import RegexpTokenizer
# Lemmatizer
from nltk.stem import WordNetLemmatizer
#Stemmers
from nltk.stem.porter import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer

import numpy as np


##  CSV Read and DataFrame Creation

We load a CSV file, create a DataFrame, and verify its shape. Initially, we have a dataset with 3097 rows and 16 columns, where each row represents a distinct reviews posted on AppStore for 10 different apps.

In [2]:
def get_data(file):
    data = pd.read_csv(file)
    print(data.shape)
    return data

In [3]:
file = "final_annotations.csv"
df = get_data(file)
df

(3097, 16)


Unnamed: 0,id,reviewId,userName,userImage,content,wc,score,thumbsUpCount,thumb?,reviewCreatedVersion,at,replyContent,repliedAt,aapp_name,aacat1,inex1
0,972,gp:AOqpTOEtDRJ3jrXbHFtJDkV6M20O2WZMvMoHEprDcCM...,Debbie Peach,https://play-lh.googleusercontent.com/a-/AOh14...,Need more notice when Alexa is going to add co...,18,4,0,FALSE,2.2.342851.0,2020-06-20 3:52:26,,,alexa,accessibility,internal
1,959,gp:AOqpTOGZhKglDGI88-WwwUTIW0lAJ829T3aMFxy3vB3...,Glenn McMillan,https://play-lh.googleusercontent.com/a-/AOh14...,"I have given up on this app, it is constantly ...",138,1,2,1,2.2.280247.0,2019-08-15 1:32:36,,,alexa,accountability,internal
2,1055,gp:AOqpTOEYd-V6GjkM2N14YmvR5HkL9IGR1KrcIP9lCwS...,Holly Baker,https://play-lh.googleusercontent.com/-gQkg1A5...,Im addicted and didnt even want an Alexa. She ...,18,5,1,1,2.2.319280.0,2020-02-25 13:51:42,,,alexa,Addiction,internal
3,1071,gp:AOqpTOEI7VftVQUfMCXv2CeTOIXKmubxMYrO2iKL2V6...,DreLisa Richmond,https://play-lh.googleusercontent.com/-w_zy3dM...,Must repeat request more than I would like. Fr...,15,3,1,1,,2020-05-08 15:07:48,,,alexa,Addiction,internal
4,1074,gp:AOqpTOFxOeKALhwBlDR_N-uMFW7h2So_kBlU8NDxDfN...,James Besel,https://play-lh.googleusercontent.com/a-/AOh14...,I love all of my devices. I am kind of addicte...,56,4,7,1,2.2.372932.0,2020-11-19 23:07:48,,,alexa,Addiction,internal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3092,562,gp:AOqpTOHeODykxpgVuJDCiyWm5MOu8EKeDx5iRRaB6Th...,Chadwick Smith,https://play-lh.googleusercontent.com/-bV_Dipz...,I don't like it at all i feel unsafe using thi...,12,1,0,FALSE,,2020-09-06 9:05:54,,,zoom,safety,internal
3093,630,gp:AOqpTOEUav74SI4pHVyOAkUt1Tm3brQu_EiiBONo9oJ...,Cheryl-Lee Borzsony,https://play-lh.googleusercontent.com/-CfYxSTM...,No experience other than being scammed,6,5,0,FALSE,5.4.1.453,2020-11-07 17:35:09,,,zoom,scam,internal
3094,634,gp:AOqpTOEjhIgP4HN8EYJza2wvtnPtHoD1WBI6H3REetG...,Skye pa,https://play-lh.googleusercontent.com/-eCn8M2B...,Good way to get scammed and credit info stolen...,16,1,0,FALSE,5.4.4.615,2020-11-23 18:05:10,,,zoom,scam,internal
3095,2982,gp:AOqpTOFZ42Ov8J_LzmCJ6RFyKKTtf_C2pBEGpwZzlWL...,Amjad Al Taleb,https://play-lh.googleusercontent.com/a-/AOh14...,Battery killer!,2,2,0,FALSE,5.3.52883.0928,2020-10-03 10:55:05,,,zoom,sustainability,internal


In [4]:
# Get unique values of apps and raised ethical concerns of reviews
apps = df['aapp_name'].unique()
print('Apps:', ', '.join(apps))

concerns = df['aacat1'].unique()
print('\nRaised Ethical Concerns: ', ', '.join(concerns))


Apps: alexa, facebook, googlehome, instagram, linkedin, tiktok, uber, vinted, youtube, zoom

Raised Ethical Concerns:  accessibility, accountability, Addiction, discrimination, none, privacy, safety, scam, sustainability, censorship, Cyberbullying/toxicity, harmful advertising, identity theft, inappropriate content, Noise, spreading false information, transparency, Unethical company, Content theft, cyberbullying, content theft


## Remove links

In [5]:
def removeLink(text):
    no_link = ' '.join(re.sub("(w+://S+)", " ", text).split())
    return no_link

In [6]:
df['clean_content'] = df['content'].apply(lambda x: removeLink(x))
df['clean_content']

0       Need more notice when Alexa is going to add co...
1       I have given up on this app, it is constantly ...
2       Im addicted and didnt even want an Alexa. She ...
3       Must repeat request more than I would like. Fr...
4       I love all of my devices. I am kind of addicte...
                              ...                        
3092    I don't like it at all i feel unsafe using thi...
3093               No experience other than being scammed
3094    Good way to get scammed and credit info stolen...
3095                                      Battery killer!
3096    I don't recommend this app to anyone. Upon sig...
Name: clean_content, Length: 3097, dtype: object

## Remove numbers

In [7]:
def removeNumber(text):
    return ' '.join(re.sub(r'[0-9]',' ', text).split())

In [8]:
df['clean_content'] = df['clean_content'].apply(lambda x: removeNumber(x))

df['clean_content']

0       Need more notice when Alexa is going to add co...
1       I have given up on this app, it is constantly ...
2       Im addicted and didnt even want an Alexa. She ...
3       Must repeat request more than I would like. Fr...
4       I love all of my devices. I am kind of addicte...
                              ...                        
3092    I don't like it at all i feel unsafe using thi...
3093               No experience other than being scammed
3094    Good way to get scammed and credit info stolen...
3095                                      Battery killer!
3096    I don't recommend this app to anyone. Upon sig...
Name: clean_content, Length: 3097, dtype: object

## Remove Emojis

In [9]:
def deEmojify(text):
    return text.encode('ascii', 'ignore').decode('ascii')

In [10]:
df['clean_content'] = df['clean_content'].apply(lambda x: deEmojify(x))

#df['clean_content']
print(df.loc[450, ['content','clean_content']].values)

["I can't comment anymore why is it limited I'm so sad I wanna delete it but I'm dead to my uncle if I do that😭😭😭😭😭"
 "I can't comment anymore why is it limited I'm so sad I wanna delete it but I'm dead to my uncle if I do that"]


## Converting all characters to lowercase

In [11]:
df['clean_content'] = df['clean_content'].apply(lambda x: x.lower())
df['clean_content']

0       need more notice when alexa is going to add co...
1       i have given up on this app, it is constantly ...
2       im addicted and didnt even want an alexa. she ...
3       must repeat request more than i would like. fr...
4       i love all of my devices. i am kind of addicte...
                              ...                        
3092    i don't like it at all i feel unsafe using thi...
3093               no experience other than being scammed
3094    good way to get scammed and credit info stolen...
3095                                      battery killer!
3096    i don't recommend this app to anyone. upon sig...
Name: clean_content, Length: 3097, dtype: object

## Remove stopwords
* nltk.corpus.stopwords.words('english') could be also used. However, it contains 179, whereas smart stopword list does 571 words, including ‘i’, ‘me’, ‘my’, ‘myself’, ‘we’, ‘you’, ‘he’, ‘his’, for instance. 
* stpwrd is here extended with app names that are mentioned in the reviews as well since they are going to be included in every reviews that belong to them.

In [12]:
def generate_stopwords():
    stpwrd_url = "http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a11-smart-stop-list/english.stop"
    response = urlopen(stpwrd_url)
    stpwrds = response.read().decode('utf-8').replace("\n", " ").split()
    new_stopwords = ["app", "alexa", "facebook", 
                     "googlehome", "instagram", "linkedin", "tiktok", "tik", "tok", "uber"]
    stpwrds.extend(new_stopwords)
    return stpwrds

In [13]:
def remove_stopwords(text, stpwrds):
    text = text.split(" ")
    words = [w for w in text if w not in stpwrds]
    return ' '.join(words)

In [14]:
stpwrds = generate_stopwords()
df['clean_content'] = df['clean_content'].apply(lambda x: remove_stopwords(x, stpwrds))

In [15]:
#df['clean_content']
print(df.loc[450, ['content','clean_content']].values)

["I can't comment anymore why is it limited I'm so sad I wanna delete it but I'm dead to my uncle if I do that😭😭😭😭😭"
 'comment anymore limited sad wanna delete dead uncle']


In [16]:
df['clean_content']

0               notice add comments command hard hearing.
1       app, constantly offline multiple devices sign ...
2       im addicted didnt alexa. memory companion im a...
3       repeat request like. frustrating times. conven...
4       love devices. kind addicted them. stay organiz...
                              ...                        
3092                                          feel unsafe
3093                                   experience scammed
3094     good scammed credit info stolen. horrible peril.
3095                                      battery killer!
3096    recommend anyone. signing found delete account...
Name: clean_content, Length: 3097, dtype: object

## Remove punctuation
The process of punctuation elimination involves iterating through the series using list comprehension and preserving all elements that do not exist in the __string.punctuation__ list. This list, imported at the beginning using __import string__, comprises all punctuation marks.

In [17]:
def removePunctuation(text):
    no_punc = "".join([c for c in text if c not in string.punctuation])
    return no_punc

In [18]:
df['clean_content'] = df['clean_content'].apply(lambda x: removePunctuation(x))
df['clean_content']

0                notice add comments command hard hearing
1       app constantly offline multiple devices sign s...
2       im addicted didnt alexa memory companion im al...
3       repeat request like frustrating times convenie...
4       love devices kind addicted them stay organized...
                              ...                        
3092                                          feel unsafe
3093                                   experience scammed
3094       good scammed credit info stolen horrible peril
3095                                       battery killer
3096    recommend anyone signing found delete account ...
Name: clean_content, Length: 3097, dtype: object

## Tokenizing words

* __RegexpTokenizer__ is a function that is used to break down a string into smaller substrings based on a specified regular expression pattern. The selected pattern splits up by spaces that are not attached to a digit as numbers are already cleaned from reviews.
* __discard\_empty__ is set to True. It ensures that any empty tokens produced by the tokenizer are removed from the resulting output. 
(see in https://www.nltk.org/_modules/nltk/tokenize/regexp.html) 

In [19]:
tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+', discard_empty=True)
df['clean_content'] = df['clean_content'].apply(lambda x: tokenizer.tokenize(x))

In [20]:
print(df['clean_content'])
print("\nOne particular review:")
print(df.loc[450, ['content','clean_content']].values)

0         [notice, add, comments, command, hard, hearing]
1       [app, constantly, offline, multiple, devices, ...
2       [im, addicted, didnt, alexa, memory, companion...
3       [repeat, request, like, frustrating, times, co...
4       [love, devices, kind, addicted, them, stay, or...
                              ...                        
3092                                       [feel, unsafe]
3093                                [experience, scammed]
3094    [good, scammed, credit, info, stolen, horrible...
3095                                    [battery, killer]
3096    [recommend, anyone, signing, found, delete, ac...
Name: clean_content, Length: 3097, dtype: object

One particular review:
["I can't comment anymore why is it limited I'm so sad I wanna delete it but I'm dead to my uncle if I do that😭😭😭😭😭"
 list(['comment', 'anymore', 'limited', 'sad', 'wanna', 'delete', 'dead', 'uncle'])]


## Stemming and Lemmatizing

The Stemming and Lemmatizing methods are used in natural language processing to simplify words by converting them to their base or root form. During the process, they both are explored and evaluated to determine the most suitable one based on the requirements.

## Stemming

To stem the review texts, them ost common method, PorterStemmer, from nltk library is used.
Stemming is a relatively more proactive approach that involves removing prefixes and/or suffixes from words based on commonly observed patterns. While it can be useful in certain cases, it is not always reliable as the resulting word may lose its original meaning by becoming too generic or root-like.

In [21]:
def word_stemmer(text, stemmer):
    stem_text = [stemmer.stem(i) for i in text]
    return stem_text

### PorterStemmer

In [22]:
porter = PorterStemmer()
df['clean_content_porter'] = df['clean_content'].apply(lambda x: word_stemmer(x, porter))

### SnowballStemmer

In [23]:
snowball = SnowballStemmer(language='english')
df['clean_content_snowball'] = df['clean_content'].apply(lambda x: word_stemmer(x, snowball))

### LancasterStemmer

In [24]:
lancaster = LancasterStemmer()
df['clean_content_lancaster'] = df['clean_content'].apply(lambda x: word_stemmer(x, lancaster))

In [25]:
print("\nOne particular review:")
print(df.loc[450, ['clean_content',
                   'clean_content_porter', 
                   'clean_content_snowball', 
                   'clean_content_lancaster']].values)


One particular review:
[list(['comment', 'anymore', 'limited', 'sad', 'wanna', 'delete', 'dead', 'uncle'])
 list(['comment', 'anymor', 'limit', 'sad', 'wanna', 'delet', 'dead', 'uncl'])
 list(['comment', 'anymor', 'limit', 'sad', 'wanna', 'delet', 'dead', 'uncl'])
 list(['com', 'anym', 'limit', 'sad', 'wann', 'delet', 'dead', 'unc'])]


## Lemmatizing
To lemmatize the review texts, WordNetLemmatizer from nltk library is used.

Lemmatizing maps words to their base or dictionary form by considering their part of speech. Unlike stemming, lemmatization consistently returns valid words that can be found in a dictionary. The goal of lemmatization algorithms is to accurately reduce inflected words, ensuring the association of the base word with the language is preserved.

To normalize a word, it is necessary to determine its part of speech. During this process, the rules for normalization will vary depending on the specific part of speech of the word. The Wordnet Lemmatizer allows you to specify a specific part of speech (POS) tag. Part-of-speech constants are: 
ADJ = a'
ADJ_SAT = 's'
ADV = 'r'
NOUN = 'n'
VERB = 'v'

In this case, we set the tag to 'v' which represents 'verb' and 'a' which represents 'adjective'. Typically, this POS tag is used for lemmatization.

In [26]:
def get_part_of_speech(word):
    probable_part_of_speech = wordnet.synsets(word)
    pos_counts = Counter()
    #pos_counts["n"] = len([item for item in probable_part_of_speech if item.pos() == "n"])
    pos_counts["v"] = len([item for item in probable_part_of_speech if item.pos() == "v"])
    pos_counts["a"] = len([item for item in probable_part_of_speech if item.pos() == "a"])  
    #pos_counts["r"] = len([item for item in probable_part_of_speech if item.pos() == "r"])

    most_likely_part_of_speech = pos_counts.most_common(1)[0][0]
    return most_likely_part_of_speech

In [27]:
def word_lemmatizer(text, lemmatizer):
    lem_text = [lemmatizer.lemmatize(i, get_part_of_speech(i)) for i in text]
    return lem_text
lemmatizer =  WordNetLemmatizer()

#### Test case
In this example, strengths and weaknesses of using pos tags can be tested. 

By considering only verb and adjective tags, 'worst' is transformed to 'bad' and 'addicted' is transformed to 'addict'. However, 'rating' is changed to 'rat'.

On the other hand, if noun tag is used (it can tested by commenting/uncommenting the the tag in __get_part_of_speech()__, the past tense is preserved, but 'rating' is not converted to 'rat'.

This scenario exemplifies the well-known No Free Lunch Theorem, where we must carefully evaluate the pros and cons of each approach and make a decision based on the specific advantages and disadvantages they offer.

Considering the amount of reviews, it is decided to use only verbs and adjectives to reduce the number of unique words.

In [28]:
test = ['worst', 'was', 'bad', 'are', 'addicted', 'scammed', 'hearing', 'swimming', 'rating']
print(test)
word_lemmatizer(test,lemmatizer)

['worst', 'was', 'bad', 'are', 'addicted', 'scammed', 'hearing', 'swimming', 'rating']


['bad', 'be', 'bad', 'be', 'addict', 'scammed', 'hear', 'swim', 'rat']

In [29]:
df['clean_content'] = df['clean_content'].apply(lambda x: word_lemmatizer(x, lemmatizer))

In [30]:
print(df[['content','clean_content']])
print("\nOne particular review:")
print(df.loc[0, ['content','clean_content']].values)

                                                content  \
0     Need more notice when Alexa is going to add co...   
1     I have given up on this app, it is constantly ...   
2     Im addicted and didnt even want an Alexa. She ...   
3     Must repeat request more than I would like. Fr...   
4     I love all of my devices. I am kind of addicte...   
...                                                 ...   
3092  I don't like it at all i feel unsafe using thi...   
3093             No experience other than being scammed   
3094  Good way to get scammed and credit info stolen...   
3095                                    Battery killer!   
3096  I don't recommend this app to anyone. Upon sig...   

                                          clean_content  
0           [notice, add, comment, command, hard, hear]  
1     [app, constantly, offline, multiple, devices, ...  
2     [im, addict, didnt, alexa, memory, companion, ...  
3     [repeat, request, like, frustrate, time, conve...  
4

### Stemmer vs Lemmatizer

In [31]:
print('raw review: ', df.loc[450,'content'])
print('lemmatized: ', df.loc[450,'clean_content'])
print('stemmed with porter: ', df.loc[450,'clean_content_porter'])

raw review:  I can't comment anymore why is it limited I'm so sad I wanna delete it but I'm dead to my uncle if I do that😭😭😭😭😭
lemmatized:  ['comment', 'anymore', 'limit', 'sad', 'wanna', 'delete', 'dead', 'uncle']
stemmed with porter:  ['comment', 'anymor', 'limit', 'sad', 'wanna', 'delet', 'dead', 'uncl']


### Including the app names and etchical concerns in the reviews

In [32]:
concern = True
app = True
if concern:
    for index, row in df.iterrows():
        row["clean_content"].append(row["aacat1"])
if app:
    for index, row in df.iterrows():
        row["clean_content"].append(row["aapp_name"])

In [33]:
corpus_list = df["clean_content"].tolist()
corpus_list[:3]

[['notice',
  'add',
  'comment',
  'command',
  'hard',
  'hear',
  'accessibility',
  'alexa'],
 ['app',
  'constantly',
  'offline',
  'multiple',
  'devices',
  'sign',
  'sign',
  'app',
  'tell',
  'echo',
  'line',
  'devices',
  'connect',
  'work',
  'correctly',
  'reset',
  'echo',
  'multiple',
  'time',
  'create',
  'account',
  'reinstall',
  'multiple',
  'devices',
  'result',
  'account',
  'constantly',
  'line',
  'numerous',
  'email',
  'desk',
  'reply',
  'account',
  'server',
  'issue',
  'amazon',
  'honest',
  'users',
  'echo',
  'device',
  'work',
  'fine',
  'capable',
  'voice',
  'control',
  'light',
  'replace',
  'competitors',
  'product',
  'stage',
  'recommend',
  'product',
  'anyone',
  'accountability',
  'alexa'],
 ['im',
  'addict',
  'didnt',
  'alexa',
  'memory',
  'companion',
  'im',
  'alone',
  'lol',
  'Addiction',
  'alexa']]

In [34]:
%store corpus_list

Stored 'corpus_list' (list)
