In [1]:
from __future__ import annotations

import json
import warnings

import pandas as pd
import nltk

from nltk.stem.porter import *
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

# warnings.filterwarnings('ignore')
# Fetch the NLTK resources this notebook relies on (a no-op when they are already cached locally).
# 'stopwords' backs the stop-word set used by clean_text; 'punkt' is presumably the tokenizer
# model required by nltk.sent_tokenize / nltk.word_tokenize - verify against your NLTK version.
# NOTE(review): nothing visible in this notebook uses 'wordnet'; confirm it is still needed.
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /home/azalea/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/azalea/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/azalea/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Load Data

In [2]:
from ast import literal_eval

def read_urr_data():
    """
    Load the AR-Miner training set and derive its label columns.

    Reads 'trainset_ar_miner.csv' (ISO-8859-1 encoded) from the working directory,
    parses the stringified 'Class' column with ``literal_eval`` and reduces every
    cell to a list of at most two label terms.

    :return: DataFrame whose 'Class' column holds lists of label terms, plus a
             boolean 'informative' column (True when 'Informative' is among the terms)
    """
    def get_labels(x):
        # Cells that do not parse to a list of {'term': ...} dicts carry no labels.
        # Return [] instead of falling through to None, so the membership test
        # below cannot raise TypeError on such rows.
        if not isinstance(x, list):
            return []
        # Keep at most the first two terms
        return [i['term'] for i in x][:2]

    data = pd.read_csv('trainset_ar_miner.csv', encoding='ISO-8859-1')
    data['Class'] = data['Class'].apply(literal_eval).apply(get_labels)
    data['informative'] = data['Class'].apply(lambda x: 'Informative' in x)

    return data

# Load the AR-Miner training set and preview the first rows.
# NOTE(review): `data` is rebound by a later cell (`data = read_type_data()`), so this frame
# is not the one the model further down is trained on.
data = read_urr_data()
data.head()

Unnamed: 0,appVersion,User Review,Date,Rating,Class,informative
0,2.06,Fundamental flaw Sometimes drops a 4 instead o...,15-01-16,4,[Informative],True
1,2.06,Too easy and too slow Being able to undo just ...,18-01-16,2,[Informative],True
2,2.06,"Slow Since latest update, it's moving very slo...",15-02-16,4,[Informative],True
3,2.06,love it.. good excercise before i go to work..,20-02-16,5,[Non-informative],False
4,2.06,Keeps crashing Since I upgraded my android thi...,26-02-16,1,[Informative],True


In [4]:
from pathlib import Path


def read_type_data():
    """
    Read data from the paper "Bug Report, Feature Request, or Simply Praise?" (Maalej & Nabil 2016)

    Loads 'other_data/Informative_Training_Set.xlsx' (no header row), assigns column
    names, drops the 'blank' column and derives a boolean 'informative' label.

    :return: DataFrame with the named columns (minus 'blank'); 'User Review' is coerced
             to str (missing reviews become ''), and 'informative' is True where type <= 2
    """
    cols = ['app', 'platform', 'pkg', 'id', 'lang', 'blank', 'type', 'date', 'user', 'title', 'User Review']
    data = pd.read_excel(Path('other_data/Informative_Training_Set.xlsx'), names=cols, header=None).drop('blank', axis=1)
    # NOTE(review): the meaning of the `type` codes is not documented here; values <= 2 are labelled informative.
    data['informative'] = data['type'].apply(lambda x: x <= 2)
    # data['Class'] = data['informative'].apply(lambda x: ['Informative'] if x else ['Non-informative'])
    # Coerce to str so text cleaning can call str methods on every row. Missing values map
    # to '' rather than str(nan), which would inject the literal word "nan" into the features.
    data['User Review'] = data['User Review'].apply(lambda x: '' if pd.isna(x) else str(x))
    return data


# Rebind `data` to the Maalej & Nabil set; the cleaning, resampling and training cells below use this frame.
data = read_type_data()
data.head()

Unnamed: 0,app,platform,pkg,id,lang,type,date,user,title,User Review,informative
0,Instagram,Google Play,com.instagram.android,Z3A6QU9xcFRPRUNyR05RcVZVV2t2ZzR0eXNpNTB2QXkwdz...,English,2,2016-08-05,Angie Martinez,New instagram,I hate the new instagram I don't see why gotta...,True
1,Instagram,Google Play,com.instagram.android,Z3A6QU9xcFRPRzhuenVHSUpsNGU3YVI1U3h5RVFETjJ2Y0...,English,5,2016-08-05,Emad Jadalla,So nice,I love it . So easy . Fabulous â™¥,False
2,Instagram,Google Play,com.instagram.android,Z3A6QU9xcFRPR2c3UmI4VWFVWGhCMGI1VEowczQ3cGZtRF...,English,5,2016-08-05,Halie Godesky,Love it,Its great,False
3,Instagram,Google Play,com.instagram.android,Z3A6QU9xcFRPRTltSjhyd09zYk5vaVNFMldURHQwUzNxbU...,English,1,2016-08-05,Tamera Campbell,Update,"Please get rid of ""stories"" the video quality ...",True
4,Instagram,Google Play,com.instagram.android,Z3A6QU9xcFRPRmcwYjA3U1RBZl80MFV4WDFjX1JGeE1hWn...,English,4,2016-08-05,adam haris,nice app,good,False


In [5]:
import re
from nltk.corpus import stopwords


# Lower-cased English stop words, kept as a set for fast membership tests in clean_text
stop_words = {w.lower() for w in stopwords.words('english')}


# Data Cleaning
def clean_text(text: str):
    """
    Normalise a review for vectorisation.

    Lower-cases the text, replaces every character that is not an ASCII letter
    with a space, and drops the words found in `stop_words`.

    :param text: Raw review text
    :return: Remaining words joined by single spaces
    """
    # Anything that is not a-z / A-Z becomes a word separator
    letters_only = re.sub("[^a-zA-Z]", " ", text.lower())

    kept_words = []
    for word in letters_only.split():
        if word not in stop_words:
            kept_words.append(word)

    return ' '.join(kept_words)


# Store the cleaned text in a new 'soup' column (this is the column vectorised further down)
# and eyeball a random sample; the sample is unseeded, so the rows shown differ between runs.
data['soup'] = data['User Review'].apply(clean_text)
data.sample(10)

Unnamed: 0,app,platform,pkg,id,lang,type,date,user,title,User Review,informative,soup
4775,WeChat,Google Play,com.tencent.mm,Z3A6QU9xcFRPSGloNm1MRHU0UFQtRk5HUXJXSWpnUzhxMn...,English,5,2016-07-30,Manjula Jayasekara,,Very good chatting application.,False,good chatting application
2688,Spotify Music,Google Play,com.spotify.music,Z3A6QU9xcFRPRVBCT29TR0ZPMUU3bUFZYXZZV1h1SjZ0UX...,English,5,2016-08-04,Jason L. Cheung,,Love it!,False,love
1934,Snapchat,Google Play,com.snapchat.android,Z3A6QU9xcFRPRVlFNGcyeThnWGhISWlEbmRNYXJTVDdYen...,English,5,2016-08-04,M Newby,ALMOST GOT IT,I love the app and everything it does. Literal...,False,love app everything literally cant find compla...
628,Instagram,Google Play,com.instagram.android,Z3A6QU9xcFRPR2dSZVpielA2QWFFeEppUEQwQi1vUDU1RT...,English,5,2016-08-04,Muhammad Kabeir,Kay.....,E dey sure die,False,e dey sure die
1942,Snapchat,Google Play,com.snapchat.android,Z3A6QU9xcFRPR1B3OG9tVkxzYTA3TlE4QnlmQW9OXzN2M3...,English,4,2016-08-04,Jose Barajas,Awesome,Great for looking at stories,False,great looking stories
635,Instagram,Google Play,com.instagram.android,Z3A6QU9xcFRPR2dndjZvb3Q2SkIxU213dV85Ny1KQ3Q4cT...,English,5,2016-08-04,Joshua Wheeler,,Awesome,False,awesome
891,Instagram,Google Play,com.instagram.android,Z3A6QU9xcFRPRW5qU3ltXzJfcFhWU3ZpVzJVOWZkcE5xZW...,English,5,2016-08-04,Qayom Lala,lovIng iT,sUpErb apP,False,superb app
7906,LinkedIn,Google Play,com.linkedin.android,Z3A6QU9xcFRPRTgzYzR1SnVHTGVLUC1GU0FQWmlBX1FDb1...,English,5,2016-07-21,vijeta hardeep,app,so nice .,False,nice
4224,WeChat,Google Play,com.tencent.mm,Z3A6QU9xcFRPRUJMM3dDblREajJLUlRUNGZFcDJLRHYxSn...,English,5,2016-08-03,Keng Teong,Google good,Faster to use app,False,faster use app
5302,VLC for Android,Google Play,org.videolan.vlc,Z3A6QU9xcFRPR2pxaDZvREtobUUzRkR3dmZBNjRocm1Wa2...,English,4,2016-08-05,PAULDIP AHEIBAM,,Good,False,good


In [6]:
def resample(data, random_state=55):
    """
    Resample data so that it has the same number of informative and non-informative rows.

    The larger class is downsampled (without replacement) to the size of the smaller one.

    :param data: DataFrame with a boolean 'informative' column
    :param random_state: Seed forwarded to ``DataFrame.sample`` so the balanced subset is
                         reproducible across runs; pass None for an unseeded draw
    :return: New DataFrame (fresh 0..n-1 index) with the informative rows first,
             followed by the same number of non-informative rows
    """
    # Explicit comparisons keep the original selection semantics for non-bool dtypes
    d1 = data[data.informative == True]
    d0 = data[data.informative == False]
    final = min(len(d1), len(d0))

    return pd.concat(
        [
            d1.sample(final, random_state=random_state),
            d0.sample(final, random_state=random_state),
        ],
        ignore_index=True,
    )


# Report the share of informative reviews before and after downsampling.
# NOTE(review): `data` is overwritten with the balanced subset, so re-running this cell
# re-samples the already balanced frame instead of the originally loaded one.
print(f'The data has {sum(data["informative"]) / len(data) * 100:.1f}% informative reviews')
data = resample(data)
print(f'The data has {sum(data["informative"]) / len(data) * 100:.1f}% informative reviews after resample')

The data has 17.8% informative reviews
The data has 50.0% informative reviews after resample


In [7]:
# Shared stemmer instance used by tokenize_and_stem.
# PorterStemmer is brought in by the wildcard `from nltk.stem.porter import *` in the first cell.
stem_process = PorterStemmer()

def tokenize_and_stem(text):
    """
    Tokenizer callback for the TF-IDF vectorizer.

    Splits `text` into sentences and then words via NLTK, lower-cases each token,
    discards tokens that contain no ASCII letter (pure punctuation or digits) and
    stems the rest with `stem_process`.

    :param text: Text to tokenize
    :return: List of stemmed tokens, in order of appearance
    """
    stems = []
    for sentence in nltk.sent_tokenize(text):
        for raw_token in nltk.word_tokenize(sentence):
            token = raw_token.lower()
            # Tokenizing first means punctuation arrives as its own token and is dropped here
            if re.search('[a-zA-Z]', token):
                stems.append(stem_process.stem(token))
    return stems

# Defining a TF-IDF Vectorizer
# Unigrams and bigrams over the stemmed tokens from tokenize_and_stem, capped at 10000 features.
tfidf_vec = TfidfVectorizer(ngram_range=(1, 2), tokenizer=tokenize_and_stem, max_features=10000, use_idf=True)

In [8]:
# mb = MultiLabelBinarizer()
# mb.fit(data['Class'])

# y = mb.transform(data['Class'])

# Binary target: the boolean 'informative' column
y = data['informative']

# Basic validation: splitting the data 80-20 train/test
# The split is seeded (random_state=55); the input is the cleaned 'soup' text.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data['soup'], y, test_size=0.2, random_state=55)

# Tf-Idf transformation
# Fit on the training split only, then reuse the fitted vectorizer for the test split.
xtrain_tfidf = tfidf_vec.fit_transform(X_train)
xtest_tfidf = tfidf_vec.transform(X_test)
xtrain_tfidf.shape

(2280, 10000)

# Naive Bayes Classifier (MultinomialNB; the EM step of EMNB is not applied in this notebook)

In [9]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier

# Multinomial Naive Bayes (alpha=0.1) wrapped in a one-vs-rest classifier, fitted on the TF-IDF training matrix.
# NOTE(review): the `_rf` suffix (random forest?) is misleading - the wrapped estimator is MultinomialNB.
# NOTE(review): y_train is a single boolean column, so the one-vs-rest wrapper is presumably a leftover
# from the commented-out multi-label setup - confirm before removing it.
multinomial_clf = MultinomialNB(alpha=0.1)
oneVsRest_rf = OneVsRestClassifier(multinomial_clf)

oneVsRest_rf.fit(xtrain_tfidf, y_train)

In [10]:
from sklearn.metrics import precision_recall_fscore_support

# Predict on the held-out 20% split
y_pred1 = oneVsRest_rf.predict(xtest_tfidf)

# Performance metrics
# average='binary' yields one score per metric for the positive class (presumably informative == True);
# `support` is unpacked but not used below.
precision, recall, f1score, support = precision_recall_fscore_support(y_test, y_pred1, average='binary')
print(f'Precision: {precision*100:.1f}%')
print(f'Recall   : {recall*100:.1f}%')
print(f'F1-score : {f1score*100:.1f}%')
# print(f'Precision : {", ".join(f"{p*100:.1f}%" for p in precision)}')
# print(f'Recall    : {", ".join(f"{p*100:.1f}%" for p in recall)}')
# print(f'F1-score  : {", ".join(f"{p*100:.1f}%" for p in f1score)}')

Precision: 81.6%
Recall   : 81.4%
F1-score : 81.5%


In [11]:
def infer_tags(texts: list[str]) -> list[bool]:
    """
    Infer informative/noninformative

    Each text is cleaned with `clean_text`, vectorised with the fitted `tfidf_vec`
    and classified with the fitted `oneVsRest_rf`.

    :param texts: List of texts
    :return: is informative (Informative: true, noninformative: false), one plain
             Python bool per input text; an empty input yields an empty list
    """
    # Nothing to classify: do not hand an empty batch to the vectorizer/classifier
    if len(texts) == 0:
        return []

    cleaned = [clean_text(t) for t in texts]
    texts_vec = tfidf_vec.transform(cleaned)
    pred = oneVsRest_rf.predict(texts_vec)

    # Convert the classifier's output elements to built-in bools so the result
    # matches the annotation and is JSON-serialisable as-is
    return [bool(p) for p in pred]


# Quick smoke test on a few hand-written reviews
infer_tags(['This is nice', 'This app doesn\'t work', 'This app is good', 'The notepad feature crashes'])

[False, False, False, True]

# Predict informative/noninformative for new data

In [12]:
import random
from hypy_utils.tqdm_utils import pmap, smap
from typing import NamedTuple
from pathlib import Path
import langid
from langdetect import detect


# Root directory of the review files
DIR_REVIEWS = Path('reviews')


def DIR_APP_REVIEWS(pkg):
    """Return the path of the Play Store review JSON for package `pkg`."""
    return DIR_REVIEWS / 'play_store' / f'{pkg}.json'


def predict_app(app: str):
    """
    Predict informativeness and language for every review of one app, cached on disk.

    If DIR_REVIEWS/play_store_pred/<app>.json already exists it is returned as-is.
    Otherwise the reviews are read from DIR_APP_REVIEWS(app), annotated, written to
    that cache file and returned.

    :param app: Package name of the app
    :return: List of review dicts (only those whose 'text' is not None), each extended
             with 'pred_language' (langid language code) and 'pred_informative' (bool)
    """
    out = DIR_REVIEWS / 'play_store_pred' / f'{app}.json'
    # Reuse the predictions from an earlier run
    if out.is_file():
        return json.loads(out.read_text(encoding='utf-8'))

    # Drop reviews without text (the language is only predicted below, not filtered on)
    raw_reviews = json.loads(DIR_APP_REVIEWS(app).read_text(encoding='utf-8'))
    reviews = [r for r in raw_reviews if r['text'] is not None]

    # Skip inference when no review is left, so an empty batch never reaches the model
    tags = infer_tags([r['text'] for r in reviews]) if reviews else []
    for r, t in zip(reviews, tags):
        r['pred_language'] = langid.classify(r['text'])[0]
        r['pred_informative'] = bool(t)

    out.parent.mkdir(exist_ok=True, parents=True)
    # json.dumps escapes non-ASCII by default, so caches written by earlier runs stay readable as UTF-8
    out.write_text(json.dumps(reviews), encoding='utf-8')
    return reviews


# Selected apps; only the keys are used below, each passed to predict_app as the app identifier
apps: dict[str, str] = json.loads(Path('other_data/app_sets/selected_apps.json').read_text())
all_reviews = smap(predict_app, list(apps.keys()), desc='Predicting informativeness', unit='apps')
# Flatten the per-app lists, then draw a reproducible sample of at most 1500 reviews.
# A dedicated seeded RNG keeps the draw stable across runs without touching the global
# `random` state, and capping the size avoids a ValueError when fewer reviews exist.
all_reviews = [i for lst in all_reviews for i in lst]
all_reviews = random.Random(55).sample(all_reviews, min(1500, len(all_reviews)))

100%|██████████| 637/637 [00:31<00:00, 19.97it/s] 


In [14]:
# Tabulate the sampled reviews, keeping only the id, the text and the predicted label.
# Indexing with r[k] raises KeyError if a review lacks one of these keys.
cols = ['id', 'text', 'pred_informative']
all_reviews_df = pd.DataFrame([[r[k] for k in cols] for r in all_reviews], columns=cols)
all_reviews_df.head()


Unnamed: 0,id,text,pred_informative
0,224aabb6-1a20-4ad2-b6f3-fe8fdb902f08,Amazing game with a amazing community,False
1,dae29bd9-18df-4243-ae0f-3e6be40a9ea0,Great app. Helps me stay organized,False
2,8b5d6a89-fd59-43a2-86ed-d9cf6ae0c3cb,Better than Google,True
3,648ee8db-5d41-465b-9e93-28fb4da07093,Good,False
4,929e329d-dbf6-45ba-9786-6c553dd4a5b2,When screen is off In S6E the phisycal back an...,True
