# NLP - Cleaning and Preprocessing Text Data of User Reviews in AppStore

### Imports

In [None]:
# pandas
import pandas as pd
# natural language toolkit
import nltk
# string for punctuation list
import string
# to remove links, numbers
import re
# to get stopwords from smart stopword list link
from urllib.request import urlopen
# wordnet for part of the speech
from nltk.corpus import wordnet
from collections import Counter
# Tokenizer
from nltk.tokenize import RegexpTokenizer
# Lemmatizer
from nltk.stem import WordNetLemmatizer
#Stemmers
from nltk.stem.porter import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer
import numpy as np

##  CSV Read and DataFrame Creation

We load a CSV file, create a DataFrame, and verify its shape. Initially, we have a dataset with 3097 rows and 16 columns, where each row represents a distinct review posted on AppStore for one of 10 different apps.

In [None]:
def get_data(file):
    """Read a CSV into a DataFrame, print its shape, and return it.

    Parameters
    ----------
    file : anything accepted by ``pandas.read_csv`` (path or file-like object).

    Returns
    -------
    pandas.DataFrame
        The parsed contents of ``file``.
    """
    frame = pd.read_csv(file)
    # Quick sanity check of (rows, columns) right after loading.
    rows_and_columns = frame.shape
    print(rows_and_columns)
    return frame

In [None]:
# Path of the annotated review dataset, resolved relative to the working directory.
file = "gps_reannotation-full.csv"

# Load the reviews (get_data also prints the shape), then summarise the
# columns, dtypes and non-null counts.
df = get_data(file)
df.info()

In [None]:
# Distinct app names that occur in the dataset.
apps = df['app_name'].unique()
app_listing = ', '.join(apps)
print('Apps:', app_listing)

# Distinct ethical-concern labels (column 'cat1') raised in the reviews.
concerns = df['cat1'].unique()
concern_listing = ', '.join(concerns)
print('\nRaised Ethical Concerns: ', concern_listing)


## Remove links

In [None]:
def removeLink(text):
    """Remove URLs of the form ``scheme://...`` from ``text``.

    Each URL is replaced by a space, after which all runs of whitespace are
    collapsed to a single space and leading/trailing whitespace is dropped.

    Parameters
    ----------
    text : str
        The raw review text.

    Returns
    -------
    str
        ``text`` without links and with normalised whitespace.
    """
    # The pattern must be a raw string with escaped classes: without the
    # backslashes, "w+" and "S+" match the literal letters 'w' and 'S', so
    # real links were never removed.
    without_links = re.sub(r"\w+://\S+", " ", text)
    return ' '.join(without_links.split())

In [None]:
# Start the cleaned column from the raw review text with links removed.
link_free = df['content'].apply(removeLink)
df['clean_content'] = link_free
df['clean_content']

## Remove numbers

In [None]:
def removeNumber(text):
    """Replace every ASCII digit in ``text`` with a space and tidy whitespace.

    Parameters
    ----------
    text : str
        Text that may contain digits.

    Returns
    -------
    str
        The text without digits; runs of whitespace are collapsed to single
        spaces and leading/trailing whitespace is removed.
    """
    digits_blanked = re.sub(r'[0-9]', ' ', text)
    pieces = digits_blanked.split()
    return ' '.join(pieces)

In [None]:
# Drop digits from every review.
digit_free = df['clean_content'].apply(removeNumber)
df['clean_content'] = digit_free

df['clean_content']

## Remove Emojis

In [None]:
def deEmojify(text):
    """Return ``text`` with every non-ASCII character removed.

    Emojis are dropped this way, but so is any other character outside the
    ASCII range (accented letters, typographic quotes, ...).

    Parameters
    ----------
    text : str
        Text that may contain emojis.

    Returns
    -------
    str
        Only the ASCII characters of ``text``, in their original order.
    """
    ascii_only = [ch for ch in text if ord(ch) < 128]
    return ''.join(ascii_only)

In [None]:
# Strip emojis (and any other non-ASCII characters) from every review.
df['clean_content'] = df['clean_content'].apply(deEmojify)

# Spot-check one review: raw text next to the cleaned text.
raw_and_clean = df.loc[450, ['content','clean_content']]
print(raw_and_clean.values)

## Converting all characters to lowercase

In [None]:
# Lower-case every review so that later word comparisons ignore case.
lowered = df['clean_content'].apply(str.lower)
df['clean_content'] = lowered
df['clean_content']

## Remove stopwords
* nltk.corpus.stopwords.words('english') could also be used. However, it contains only 179 words, whereas the SMART stopword list contains 571 words, including ‘i’, ‘me’, ‘my’, ‘myself’, ‘we’, ‘you’, ‘he’, ‘his’, for instance.
* stpwrd is here extended with the app names that are mentioned in the reviews as well, since they are going to appear in every review that belongs to the corresponding app.

In [None]:
def generate_stopwords():
    """Download the SMART stop-word list and return it as a list of words.

    Returns
    -------
    list of str
        The whitespace-separated words of the downloaded file, in file order.

    Raises
    ------
    urllib.error.URLError
        Propagated from ``urlopen`` when the list cannot be fetched.
    """
    stpwrd_url = "http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a11-smart-stop-list/english.stop"
    # Use the response as a context manager so the connection is closed even
    # if reading fails.
    with urlopen(stpwrd_url) as response:
        payload = response.read()
    # str.split() with no argument already splits on newlines as well as
    # spaces, so no explicit newline replacement is needed.
    return payload.decode('utf-8').split()

In [None]:
def remove_stopwords(text, stpwrds):
    """Remove every stop word from a space-separated string.

    Parameters
    ----------
    text : str
        The text to filter. It is split on single spaces, so consecutive
        spaces are preserved in the result.
    stpwrds : iterable of str
        The words to drop. Matching is exact and case-sensitive.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    # Set membership is O(1) per word; testing against the original list
    # scans the whole stop-word list for every word of every review.
    if isinstance(stpwrds, (set, frozenset)):
        stop_set = stpwrds
    else:
        stop_set = set(stpwrds)
    kept = [word for word in text.split(" ") if word not in stop_set]
    return ' '.join(kept)

In [None]:
# Fetch the stop-word list once and reuse it for every review.
stpwrds = generate_stopwords()

without_stopwords = df['clean_content'].apply(lambda review: remove_stopwords(review, stpwrds))
df['clean_content'] = without_stopwords
df['clean_content']

In [None]:
# Spot-check one review: raw text next to the text after stop-word removal.
raw_and_clean = df.loc[400, ['content','clean_content']]
print(raw_and_clean.values)

## Remove punctuation
The process of punctuation elimination involves iterating over the characters of each review with a list comprehension and keeping only those that do not appear in __string.punctuation__. This string, made available at the beginning through __import string__, comprises all ASCII punctuation marks.

In [None]:
def removePunctuation(text):
    """Return ``text`` without the characters listed in ``string.punctuation``.

    Parameters
    ----------
    text : str
        The text to clean.

    Returns
    -------
    str
        ``text`` with ASCII punctuation deleted; every other character,
        including whitespace, is kept in place.
    """
    marks = string.punctuation
    kept_chars = [ch for ch in text if ch not in marks]
    return "".join(kept_chars)

In [None]:
# Delete punctuation from every review.
punctuation_free = df['clean_content'].apply(removePunctuation)
df['clean_content'] = punctuation_free
df['clean_content']

In [None]:
# Second stop-word pass: words that had punctuation attached were not matched
# by the first pass and can be matched now that punctuation is gone.
df['clean_content'] = df['clean_content'].apply(
    lambda review: remove_stopwords(review, stpwrds)
)

## Tokenizing words

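* __RegexpTokenizer__ is a class that breaks a string into smaller substrings based on a specified regular expression pattern. The selected pattern matches, in order of preference, runs of word characters (`\w+`), dollar amounts (`\$[\d\.]+`), or any other run of non-whitespace characters (`\S+`). As numbers and punctuation are already cleaned from the reviews, this effectively splits each review on whitespace.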
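* __discard\_empty__ is set to True. It ensures that any empty tokens produced by the tokenizer are removed from the resulting output. Empty tokens can only be produced when the tokenizer is used with __gaps=True__, so here the setting acts as a safeguard only.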
(see in https://www.nltk.org/_modules/nltk/tokenize/regexp.html) 

In [None]:
# Tokens are runs of word characters, dollar amounts, or any other run of
# non-whitespace characters (first alternative that matches wins).
tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+', discard_empty=True)

# Turn every review string into a list of tokens.
df['clean_content'] = df['clean_content'].apply(tokenizer.tokenize)

In [None]:
# Show the tokenised column, then one review before and after cleaning.
print(df['clean_content'])
print("\nOne particular review:")
sample_review = df.loc[400, ['content','clean_content']]
print(sample_review.values)

## Lemmatizing

#### WordNet

In [None]:
def get_part_of_speech(word):
    """Guess the WordNet part-of-speech tag of ``word``.

    The synsets WordNet returns for ``word`` are counted per tag, and the tag
    with the highest count is returned. Only verbs ("v"), adjectives ("a")
    and nouns ("n") are counted.

    Ties go to the tag inserted first, i.e. "v" before "a" before "n". A word
    without any synsets therefore yields "v".

    Parameters
    ----------
    word : str
        The token to look up.

    Returns
    -------
    str
        One of "v", "a" or "n".
    """
    # Insertion order matters: Counter.most_common keeps first-encountered
    # order among equal counts.
    tally = Counter({"v": 0, "a": 0, "n": 0})
    for synset in wordnet.synsets(word):
        tag = synset.pos()
        # Other tags (e.g. adverbs) are deliberately not counted.
        if tag in tally:
            tally[tag] += 1

    (best_tag, _count), = tally.most_common(1)
    return best_tag

In [None]:
def word_lemmatizer(text, lemmatizer):
    """Lemmatize every token of ``text``.

    Parameters
    ----------
    text : iterable of str
        The tokens of one review.
    lemmatizer : object with a ``lemmatize(word, pos)`` method
        The lemmatizer to apply, e.g. a ``WordNetLemmatizer``.

    Returns
    -------
    list of str
        One lemma per input token, in the same order.
    """
    lemmas = []
    for token in text:
        # Pass the guessed part of speech along with each token.
        pos_tag = get_part_of_speech(token)
        lemmas.append(lemmatizer.lemmatize(token, pos_tag))
    return lemmas


In [None]:
# One shared lemmatizer instance for the whole column.
wordnetlemma = WordNetLemmatizer()

lemmatized = df['clean_content'].apply(lambda tokens: word_lemmatizer(tokens, wordnetlemma))
df['clean_content'] = lemmatized

In [None]:
# Show raw and lemmatized reviews side by side, then the first review alone.
columns_to_show = ['content','clean_content']
print(df[columns_to_show])
print("\nOne particular review:")
print(df.loc[0, columns_to_show].values)

In [None]:
# Turn every token list back into a single space-separated string.
df['clean_content'] = df['clean_content'].apply(' '.join)
df['clean_content']

In [None]:
# Display the raw category labels before they are cleaned.
df['cat1']

In [None]:
# Normalise the category labels: lower-case them and keep only the text that
# precedes the first "(".
df['cat1_clean'] = df['cat1'].apply(lambda x: x.lower())
df['cat1_clean'] = df['cat1_clean'].str.extract(r'^(.*?)\(', expand=False)

# Labels without "(" come back as NaN from extract; fall back to the original
# label. The result is assigned back rather than using fillna(inplace=True)
# on the column selection: that form operates on an intermediate object,
# warns in recent pandas and does not update df under Copy-on-Write.
# NOTE(review): the fallback uses df['cat1'] as-is, so labels without
# parentheses keep their original casing while the others are lower-cased;
# confirm whether that is intended.
df['cat1_clean'] = df['cat1_clean'].fillna(df['cat1'])
df['cat1_clean']

In [None]:
# Trim leading/trailing whitespace from the cleaned labels.
trimmed_labels = df['cat1_clean'].str.strip()
df['cat1_clean'] = trimmed_labels

# Overwrite the raw labels with the cleaned ones and list the distinct values.
df['cat1'] = df['cat1_clean']
df['cat1'].unique()

In [None]:
# Preview the first rows of the processed DataFrame.
df.head()

In [None]:
# Print one review before and after the full cleaning pipeline.
for column in ('content', 'clean_content'):
    print(df.loc[400, column])

In [None]:
# IPython magic: persist df so that other notebooks can load it with `%store -r df`.
%store df