# Classification with FastText

FastText uses a simple and efficient baseline for sentence classification( represent sentences as bag of words (BoW) and train a linear classifier). It uses negative sampling, hierarchical softmax and N-gram features to reduce computational cost and improve efficiency.

# Set up

Import packages

In [78]:
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
tqdm.pandas()

# For Tokenizing
import re
import string
import itertools
from contractions import contractions_dict

from nltk import download, word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import WordNetLemmatizer

# For Text Classification
import fasttext

# for help on fasttext, uncomment the line below and run it.
# help(fasttext.FastText)

Import the data, including the polarity values that we have calculated previously in `prelim_naive_model.ipynb`.

In [79]:
reviews = pd.read_csv("../data/googleplaystore_user_reviews.csv")
general_data = pd.read_csv("../data/googleplaystore_cleaned.csv")
calculated_polarity = pd.read_csv("../data/reviews_naive_polarity.csv", index_col=0)

Checking the number of apps that actually have reviews

In [80]:
apps_with_reviews = general_data[(general_data["App"].isin(reviews["App"]))]
print("Number of apps given in dataset", len(general_data["App"]))
print("Number of apps that actually have reviews ", len(apps_with_reviews["App"]))

Number of apps given in dataset 10840
Number of apps that actually have reviews  1445


Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462
2,10 Best Foods for You,,,,
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3


Merge the data together

In [115]:
reviews_with_ratings = pd.merge(reviews,general_data,on="App")

columns_to_drop = ["Sentiment", "Sentiment_Polarity", "Sentiment_Subjectivity"]
reviews_with_ratings = reviews_with_ratings.drop(columns=columns_to_drop)

reviews_with_ratings = reviews_with_ratings.drop_duplicates().dropna()

# renaming columns for clarity
renamed_columns = {'Rating':'Average_Rating', 
                  'Translated_Review': 'Review',
                  'Reviews': 'Num_Reviews'}
reviews_with_ratings.rename(columns = renamed_columns, inplace=True)
# sentiment_ratings = calculated_polarity['Compound Score']
reviews_with_ratings = pd.merge(reviews_with_ratings, calculated_polarity, on="Review")
reviews_with_ratings["Sentiment_Rating"] = reviews_with_ratings["Compound Score"].apply(lambda x: (x-(-1))/(1-(-1))*(5-1)+1)
reviews_with_ratings.head()

Unnamed: 0,App,Review,Category,Average_Rating,Num_Reviews,Size,Installs,Type,Price,Content Rating,Genres,Tokens,Sentiment,Neutral Proportion,Positive Proportion,Negative Proportion,Compound Score,Sentiment_Rating
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,HEALTH_AND_FITNESS,4.0,2490,3800000.0,500000,Free,0,Everyone,Health & Fitness,"['i', 'like', 'eat', 'delicious', 'food', 'tha...",Positive,0.466,0.534,0.0,0.9531,4.9062
1,10 Best Foods for You,This help eating healthy exercise regular basis,HEALTH_AND_FITNESS,4.0,2490,3800000.0,500000,Free,0,Everyone,Health & Fitness,"['this', 'help', 'eating', 'healthy', 'exercis...",Positive,0.481,0.519,0.0,0.6597,4.3194
2,10 Best Foods for You,Works great especially going grocery store,HEALTH_AND_FITNESS,4.0,2490,3800000.0,500000,Free,0,Everyone,Health & Fitness,"['work', 'great', 'especially', 'going', 'groc...",Positive,0.549,0.451,0.0,0.6249,4.2498
3,10 Best Foods for You,Best idea us,HEALTH_AND_FITNESS,4.0,2490,3800000.0,500000,Free,0,Everyone,Health & Fitness,"['best', 'idea', 'u']",Positive,0.323,0.677,0.0,0.6369,4.2738
4,10 Best Foods for You,Best way,HEALTH_AND_FITNESS,4.0,2490,3800000.0,500000,Free,0,Everyone,Health & Fitness,"['best', 'way']",Positive,0.192,0.808,0.0,0.6369,4.2738


Give labels based on predicted rating

In [116]:
def rating_label(sentiment):
    if sentiment < 1.5: 
        return 1
    elif sentiment >= 1.5 and sentiment < 2.5:
        return 2
    elif sentiment >= 2.5 and sentiment < 3.5:
        return 3
    elif sentiment >= 3.5 and sentiment < 4.5:
        return 4
    elif sentiment >= 4.5:
        return 5
    else:
        return 0
rating_labels = reviews_with_ratings["Sentiment_Rating"].apply(rating_label)
reviews_with_ratings.insert(4, "Rating_Label", rating_labels)
reviews_with_ratings.head()

Unnamed: 0,App,Review,Category,Average_Rating,Rating_Label,Num_Reviews,Size,Installs,Type,Price,Content Rating,Genres,Tokens,Sentiment,Neutral Proportion,Positive Proportion,Negative Proportion,Compound Score,Sentiment_Rating
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,HEALTH_AND_FITNESS,4.0,5,2490,3800000.0,500000,Free,0,Everyone,Health & Fitness,"['i', 'like', 'eat', 'delicious', 'food', 'tha...",Positive,0.466,0.534,0.0,0.9531,4.9062
1,10 Best Foods for You,This help eating healthy exercise regular basis,HEALTH_AND_FITNESS,4.0,4,2490,3800000.0,500000,Free,0,Everyone,Health & Fitness,"['this', 'help', 'eating', 'healthy', 'exercis...",Positive,0.481,0.519,0.0,0.6597,4.3194
2,10 Best Foods for You,Works great especially going grocery store,HEALTH_AND_FITNESS,4.0,4,2490,3800000.0,500000,Free,0,Everyone,Health & Fitness,"['work', 'great', 'especially', 'going', 'groc...",Positive,0.549,0.451,0.0,0.6249,4.2498
3,10 Best Foods for You,Best idea us,HEALTH_AND_FITNESS,4.0,4,2490,3800000.0,500000,Free,0,Everyone,Health & Fitness,"['best', 'idea', 'u']",Positive,0.323,0.677,0.0,0.6369,4.2738
4,10 Best Foods for You,Best way,HEALTH_AND_FITNESS,4.0,4,2490,3800000.0,500000,Free,0,Everyone,Health & Fitness,"['best', 'way']",Positive,0.192,0.808,0.0,0.6369,4.2738


# Preprocessing

Performing similar preprocessing in [`prelim_nlp_model.ipynb`](./prelim_nlp_model.ipynb), but now instead of tokens, we will return the entire text. We will explain why later.

We will also not include a SpellChecker anymore, as FastText is able to produce vectors for any words, even made-up ones. 
- fastText word vectors are built from vectors of substrings of characters contained in it. 
- This allows to build vectors even for misspelled words or concatenation of words.

Refer here: https://fasttext.cc/docs/en/faqs.html

In [117]:
lemmatizer = WordNetLemmatizer()
en_stop = set(stopwords.words('english'))

# Regex Patterns
html_pattern = re.compile('<[^<]+?>')
lengthening_pattern = re.compile(r"(.)\1{2,}")
contractions_pattern = re.compile('({})'.format('|'.join(contractions_dict.keys())),
                                      flags=re.IGNORECASE | re.DOTALL)

def remove_html_tags(s):
    global html_pattern
    return html_pattern.sub('', s)
    
def remove_punctuation(s):
    return s.translate(str.maketrans('', '', string.punctuation))    

def expand_contractions(text):
    global contractions_pattern
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contractions_dict.get(match) \
            if contractions_dict.get(match) \
            else contractions_dict.get(match.lower())
        expanded_contraction = expanded_contraction
        return expanded_contraction

    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

def reduce_lengthening(s):
    global lengthening_pattern
    return lengthening_pattern.sub(r"\1\1", s)

def lemmatize(w):
    global lemmatizer
    return lemmatizer.lemmatize(w)

def normalize(s):
    tokens = word_tokenize(s)
    tokens = [lemmatize(word) for word in tokens]
    tokens = [word for word in tokens if word not in en_stop]
    tokens = [word for word in tokens if len(word) > 3]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text


def preprocess(s):
    s = s.lower()
    s = remove_html_tags(s)
    s = expand_contractions(s)
    s = remove_punctuation(s)
    s = reduce_lengthening(s)
    result = normalize(s)
    
    return result

In [118]:
df = reviews_with_ratings["Review"]
reviews_with_ratings.insert(2, "Preprocessed_Review", df.apply(preprocess))
reviews_with_ratings.sample(4)

Unnamed: 0,App,Review,Preprocessed_Review,Category,Average_Rating,Rating_Label,Num_Reviews,Size,Installs,Type,Price,Content Rating,Genres,Tokens,Sentiment,Neutral Proportion,Positive Proportion,Negative Proportion,Compound Score,Sentiment_Rating
16995,EyeCloud,Lost alerts recently phones. Any ideas? We uni...,lost alert recently phone idea uninstalled rei...,LIBRARIES_AND_DEMO,3.1,5,1267,55000000.0,100000,Free,0,Everyone,Libraries & Demo,"['lost', 'alert', 'recently', 'phone', 'any', ...",Positive,0.533,0.395,0.072,0.8834,4.7668
6339,Baby Panda Learns Shapes,you,,PARENTING,4.0,3,3789,42000000.0,1000000,Free,0,Everyone,Parenting;Education,['you'],Neutral,1.0,0.0,0.0,0.0,3.0
4475,Angry Birds Classic,"It much addicted game me, I really enjoy playi...",much addicted game really enjoy playing graphi...,GAME,4.4,5,5566889,97000000.0,100000000,Free,0,Everyone,Arcade,"['it', 'much', 'addicted', 'game', 'me', 'i', ...",Positive,0.539,0.31,0.151,0.8248,4.6496
16180,EatStreet Food Delivery App,I quite often lazy s*** lol extremely useful a...,quite often lazy extremely useful aware driver...,FOOD_AND_DRINK,4.6,4,7690,25000000.0,100000,Free,0,Everyone,Food & Drink,"['i', 'quite', 'often', 'lazy', 's', 'lol', 'e...",Negative,0.597,0.275,0.128,0.4964,3.9928


Save this preprocessed data for reusability

In [119]:
reviews_with_ratings.to_csv("../data/reviews_joined.csv")

FastText provides two models for computing word representations: 
1. skipgram: predict a target word wrt a nearby word
2. cbow ('continuous-bag-of-words'): predict a target word accordin to its context. The context is represented as a bag of the words contained in a fixed size window around the target word.

In practice, we observe that skipgram models works better with subword information than cbow. So we wil try out the skipgram model.

But first thing first, we have to make the input in the following format:

`_label_0 your_text`

In [120]:
import csv

col = ["Rating_Label", "Preprocessed_Review"]
df = reviews_with_ratings[col]
df['Rating_Label'] = df['Rating_Label'].apply(lambda s: '__label__'+ str(s))
df['Preprocessed_Review'] = df['Preprocessed_Review'].replace('\n',' ', regex=True).replace('\t',' ', regex=True)
df.to_csv("FastText_data/reviews_joined.txt", index=False, sep=' ', header=False, quoting=csv.QUOTE_NONE, quotechar="", escapechar=" ")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [121]:
df.head()

Unnamed: 0,Rating_Label,Preprocessed_Review
0,__label__5,like delicious food cooking food case best foo...
1,__label__4,help eating healthy exercise regular basis
2,__label__4,work great especially going grocery store
3,__label__4,best idea
4,__label__4,best


In [122]:
test_size = int(0.2*len(df["Preprocessed_Review"]))
test_size

4987

In [123]:
train_size = int(0.8*len(df["Preprocessed_Review"]))
train_size

19948

In [124]:
!head -n 2149 "FastText_data/reviews_joined.txt" > "FastText_data/reviews_test.txt"
!tail -n 8596 "FastText_data/reviews_joined.txt" > "FastText_data/reviews_train.txt"

In [125]:
%%time
model = fasttext.train_supervised(input="FastText_data/reviews_train.txt")
model.save_model("fasttext_reviews_model.bin")

CPU times: user 359 ms, sys: 125 ms, total: 484 ms
Wall time: 251 ms


Testing model

In [128]:
model.predict("This app sucks horrible just delete")

(('__label__2',), array([0.88790017]))

In [129]:
model.predict("I love this app great features")

(('__label__5',), array([0.99989247]))

In [131]:
reviews_with_ratings["FastText Prediction"] = reviews_with_ratings["Review"].apply(lambda x: model.predict(x)[0])


In [134]:
reviews_with_ratings.sample(4)

Unnamed: 0,App,Review,Preprocessed_Review,Category,Average_Rating,Rating_Label,Num_Reviews,Size,Installs,Type,...,Content Rating,Genres,Tokens,Sentiment,Neutral Proportion,Positive Proportion,Negative Proportion,Compound Score,Sentiment_Rating,FastText Prediction
1320,8 Ball Pool,The game rigged completely. Spin wheel never p...,game rigged completely spin wheel never extra ...,GAME,4.5,3,14198297,52000000.0,100000000,Free,...,Everyone,Sports,"['the', 'game', 'rigged', 'completely', 'spin'...",Negative,0.769,0.13,0.101,-0.0772,2.8456,"(__label__3,)"
18679,Find&Save - Local Shopping,"I reading reviews i'm sure, want waste time pu...",reading review sure want waste time putting re...,SHOPPING,4.0,4,4602,6200000.0,500000,Free,...,Everyone,Shopping,"['i', 'reading', 'review', 'sure', 'want', 'wa...",Negative,0.671,0.23,0.099,0.4019,3.8038,"(__label__4,)"
2059,A+ Mobile,crashes mobile deposit work,crash mobile deposit work,FINANCE,3.9,3,730,6300000.0,10000,Free,...,Everyone,Finance,"['crash', 'mobile', 'deposit', 'work']",Neutral,1.0,0.0,0.0,0.0,3.0,"(__label__3,)"
22573,HD Movie Video Player,I love much. I every thing want .This perfect ...,love much every thing want perfect must like r...,VIDEO_PLAYERS,4.5,5,18699,4100000.0,1000000,Free,...,Everyone,Video Players & Editors,"['i', 'love', 'much', 'i', 'every', 'thing', '...",Positive,0.394,0.606,0.0,0.9612,4.9224,"(__label__5,)"


In [None]:
len(reviews_with_ratings["Rating_Label"] == ["FastText_Pre])