Import necessary dependencies

In [5]:
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
tqdm.pandas()

# For Tokenizing
import re
import string
import itertools
from spellchecker import SpellChecker
from contractions import contractions_dict

from nltk import download, word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import WordNetLemmatizer

# For Text Classification
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.sentiment import SentimentAnalyzer
from nltk.corpus import subjectivity
from nltk.classify import NaiveBayesClassifier

Download Necessary Corpus for tokenizer, and sentiment analyzers to work.

In [6]:
download('punkt')
download('stopwords')
download('vader_lexicon')

[nltk_data] Downloading package punkt to /home/lyqht/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/lyqht/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/lyqht/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

## The Dataset

Import the dataset and rename some columns

In [7]:
df = pd.read_csv("../data/googleplaystore_user_reviews.csv")
df.rename(columns = {'Translated_Review':'Review'}, inplace=True)
df.sample(4)

Unnamed: 0,App,Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
26969,Clash Royale,I really like meet new people talk friends. It...,Positive,0.007273,0.490909
1507,8 Ball Pool,Loved game UNTIL I update. Now open even allow...,Negative,-0.009821,0.607143
22854,Camera360 Lite - Selfie Camera,,,,
60142,Harry Potter: Hogwarts Mystery,,,,


## Terminology
Source of definitions: Bing Liu. Sentiment Analysis and Opinion Mining, Morgan & Claypool Publishers, May 2012
- Sentiment Polarity: Can be split into rational or emotional sentiment. Hence resulting in the common design of 5 sentiment ratings.
    - emotional negative (-2)
    - rational negative (-1)
    - neutral (0): In practice, neutral often means no opinion or sentiment expressed.
    - rational positive (+1)
    - emotional positive (+2)
- Sentiment Subjectivity: Subjective sentences expresses some personal feelings, views, or beliefs. They may not express any sentiment. 




We see that the original dataset also has labels of the sentiment level, polarity and subjectivity by the original author. 
We will build our own model then compare the values to the values in the original dataset for comparison.

Getting the reviews 

In [8]:
raw_reviews = df["Review"].copy()
raw_reviews.sample(5)

60037                                    Most time respond
11065                                                  NaN
10015    I solved problem know login app. I know hard i...
10395                                                  NaN
13682                                                  NaN
Name: Review, dtype: object

# Text Mining

## Preprocessing

We will drop NA types in the reviews data, since they do not contain any text. They probably got into the dataset because of imperfect data scraping.

In [9]:
reviews = raw_reviews.dropna(axis=0)
print("Old data frame length:", len(raw_reviews), "\nNew data frame length:",  
       len(reviews), "\nNumber of rows with at least 1 NA value: ", 
       (len(raw_reviews)-len(reviews))) 

Old data frame length: 64295 
New data frame length: 37427 
Number of rows with at least 1 NA value:  26868


These are the preprocessing steps that we will be doing for every review.
1. Converting text to lowercase
2. Remove html tags
3. Expand Contractions
    - Dictionary source from : https://github.com/pemagrg1/Text-Pre-Processing-in-Python/blob/master/individual_python_files/contractions.py
4. Remove punctuation
5. Using NLKT's `PunktSentenceTokenizer` to create tokens
6. Word Normalization for Tokens
    - Correcting mispelled words
    - Stop word removal
    - Lemmatization 

In [10]:
lemmatizer = WordNetLemmatizer()
stopword = set(stopwords.words('english'))
spell = SpellChecker()

In [11]:
# Regex Patterns
html_pattern = re.compile('<[^<]+?>')
lengthening_pattern = re.compile(r"(.)\1{2,}")
contractions_pattern = re.compile('({})'.format('|'.join(contractions_dict.keys())),
                                      flags=re.IGNORECASE | re.DOTALL)

In [12]:
def remove_html_tags(s):
    global html_pattern
    return html_pattern.sub('', s)
    
def remove_punctuation(s):
    return s.translate(str.maketrans('', '', string.punctuation))    

def expand_contractions(text):
    global contractions_pattern
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contractions_dict.get(match) \
            if contractions_dict.get(match) \
            else contractions_dict.get(match.lower())
        expanded_contraction = expanded_contraction
        return expanded_contraction

    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

def reduce_lengthening(s):
    global lengthening_pattern
    return lengthening_pattern.sub(r"\1\1", s)

def lemmatize(w):
    global lemmatizer
    return lemmatizer.lemmatize(w)

def normalize(s):
    global spell
    tokens = word_tokenize(s)
    tokens = [lemmatize(word) for word in tokens]
    return tokens

def preprocess(s):
    s = s.lower()
    s = remove_html_tags(s)
    s = expand_contractions(s)
    s = remove_punctuation(s)
    s = reduce_lengthening(s)
    words = normalize(s)
    
    return words

Testing preprocessing on some reviews

In [13]:
random_sample = reviews.sample(5)
print("Random Sample")
print(random_sample, "\n")
print("Normalized Tokens produced")
random_sample.progress_apply(preprocess)

  0%|          | 0/5 [00:00<?, ?it/s]

Random Sample
5061     Great app, please make picture Airbnb trip/fla...
49977    If encounter technical issues addressed. Very ...
1473     Why I wait opponent I'm ready shoot???????? Al...
36004    Why permissions for? Why would child's need th...
54725                            Good product. Good value.
Name: Review, dtype: object 

Normalized Tokens produced


100%|██████████| 5/5 [00:02<00:00,  1.68it/s]


5061     [great, app, please, make, picture, airbnb, tr...
49977    [if, encounter, technical, issue, addressed, v...
1473     [why, i, wait, opponent, ready, shoot, also, n...
36004    [why, permission, for, why, would, child, need...
54725                         [good, product, good, value]
Name: Review, dtype: object

Applying preprocessing to all the reviews

In [None]:
tokens = reviews.apply(preprocess)
processed_df = pd.DataFrame(reviews)
processed_df["Tokens"] = tokens
processed_df.sample(5)

Saving this data for reuse

In [None]:
processed_df.to_csv("../data/tokenized.csv")

Visualizing the most common tokens and saving the plot

In [None]:
fig = plt.figure(figsize = (10,4))
plt.gcf().subplots_adjust(bottom=0.15)

word_counts = list(itertools.chain(*tokens))
freq_dist = FreqDist(word_counts)
freq_dist.plot(30, cumulative=False)

plt.show()
fig.savefig('images/freqDist.png', bbox_inches = "tight")

Some steps that we did not include for preprocessing that could affect our model:
- Removing numbers: sometimes in reviews we see that users explain their rationale of giving a specific number of star rating, so we did not want to remove them.
- Removing chinese reviews and apps: We assumed that China does not have access to Google Playstore, so there will be lesser chinese reviews and apps made in Chinese, but this is certainly not true. However, filtering them out is a hassle atm so we did not do this step.
- Spell Checking every token: We tried this, but it causes the computation to be taking too long! So we ignore it for now...

# Modelling

We will use `VADER SentimentIntensityAnalyzer` for calculating polarity scores for each review. The scoring is computed as following from https://github.com/cjhutto/vaderSentiment:

- The `compound` score is computed by summing the valence scores of each word in the lexicon, adjusted according to the rules, and then normalized to be between -1 (most extreme negative) and +1 (most extreme positive). This is the most useful metric if you want a single unidimensional measure of sentiment for a given sentence. Calling it a 'normalized, weighted composite score' is accurate.
    - positive sentiment: compound score >= 0.05
    - neutral sentiment: (compound score > -0.05) and (compound score < 0.05)
    - negative sentiment: compound score <= -0.05

The `pos`, `neu`, and `neg` scores are ratios for proportions of text that fall in each category (so these should all add up to be 1... or close to it with float operation). These are the most useful metrics if you want multidimensional measures of sentiment for a given sentence.

In [None]:
analyzer = SentimentIntensityAnalyzer()
def sentiment_polarity(s):
    global analyzer
    polarity_scores = analyzer.polarity_scores(s)
    compound_score = polarity_scores["compound"]
    if compound_score >= 0.5:
        label = "Positive"
    elif compound_score > -0.05 and compound_score < 0.05:
        label = "Neutral"
    else:
        label = "Negative"
    return label, polarity_scores["neu"], polarity_scores["pos"], polarity_scores["neg"]

In [None]:
df = processed_df
df["Sentiment"], df["Neutral Proportion"], df["Positive Proportion"], df["Negative Proportion"] =  zip(*df["Review"].apply(sentiment_polarity))
df.sample(3)

In [None]:
df.to_csv("../data/polarity.csv")

Now we will classify the reviews based on subjectivity using a `NaiveBayesClassifier`. First, we split the data for testing and training.

In [None]:
test_size = int(0.2*len(reviews))
x_train = df[:test_size]
x_test = df[test_size:]

Next, we build a vocabulary consisting of all the words that the training set of reviews have. From the output, you can see that there is still much to be improved upon for the tokens that were obtained despite the preprocessing.

In [None]:
vocabulary = set(word_counts)
print("Number of words in vocabulary: {}".format(len(vocabulary)))
vocabulary

Then for each data point (tokens with the pos/neg tag), we will build a dictionary that says whether it has particular features/ words from the vocabulary.

In [None]:
def make_features(row):
    return ({i:(i in row["Review"]) for i in vocabulary}, row["Sentiment"])

features = x_train.progress_apply(make_features, axis=1)

In [None]:
classifier = NaiveBayesClassifier.train(features)