In [1]:
import pandas as pd
import numpy as np
import os
import sys

In [14]:
# Combining reviews

with open('../googleplay-apps.txt') as appfile:
    line = appfile.readline()

    while(line):
        app = line.strip()

        applestoredf = pd.read_csv('../review-files-sentiment/applestore-review-' + app + '_sentiment.csv')
        googleplaydf = pd.read_csv('../review-files-sentiment/googleplay-review-' + app + '_sentiment.csv')
        googleplaydf = googleplaydf.rename(columns={"content": "review", "score": "rating", "at": "date"})
        cdf = pd.concat([applestoredf, googleplaydf], ignore_index=True)

        cdf.to_csv('review-files-combined/combined-review-' + app + '_sentiment.csv', index=False)
        line = appfile.readline()

In [6]:
# Standardize reviews

import string
import re
from unidecode import unidecode
import nltk
from nltk.corpus import stopwords

def standardize(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove leading/trailing whitespace
    text = text.strip()
    
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    
    # Remove diacritics and accents
    text = unidecode(text)

    # Tokenization
    text = text.split()

    # Remove stopwords
    stop_words = set(stopwords.words("english"))
    text = [word for word in text if word not in stop_words]

    # Return the standardized string
    return ' '.join(text)


filelist = [file for file in os.listdir('../review-files-sentiment/')]


for file in filelist:
    print(file)

    df = pd.read_csv('../review-files-sentiment/' + file)
    df = df.rename(columns={"content": "review", "score": "rating", "at": "date"})

    # if 'apple' in file:
    #     df['review'] = df['review'].astype('str')
    #     df['standardizedReview'] = df['review'].apply(standardize)
    # else:
    #     df['content'] = df['content'].astype('str')
    #     df['standardizedReview'] = df['content'].apply(standardize)

    df['review'] = df['review'].astype('str')
    df['standardizedReview'] = df['review'].apply(standardize)

    

    output_file = os.path.splitext(file)[0] + '_standardized.csv'

    df.to_csv('googleplay-review-files-standardized/' + output_file, index=False)
    

applestore-review-programming-hub_sentiment.csv
applestore-review-python-x_sentiment.csv
googleplay-review-sololearn_sentiment.csv
googleplay-review-datacamp_sentiment.csv
googleplay-review-programming-hero_sentiment.csv
applestore-review-sololearn_sentiment.csv
googleplay-review-encode_sentiment.csv
googleplay-review-learn-python-programiz_sentiment.csv
applestore-review-programming-hero_sentiment.csv
googleplay-review-codeacademy_sentiment.csv
applestore-review-learn-python-programiz_sentiment.csv
googleplay-review-mimo_sentiment.csv
applestore-review-mimo_sentiment.csv
applestore-review-codeacademy_sentiment.csv
applestore-review-encode_sentiment.csv
googleplay-review-programming-hub_sentiment.csv
applestore-review-datacamp_sentiment.csv


In [7]:
# Getting stems

from stemming.porter2 import stem
from collections import defaultdict

# with open('features.txt') as file:
#     str = file.read()

# features = [feature.strip() for feature in str.split(',')]

# df = pd.read_csv('features-all.csv')
# print(df)

# def stem_phrase(phrase):
#     words = phrase.split()
#     stemmed_words = [stem(word) for word in words]
#     return ' '.join(stemmed_words)

# df['Stem'] = df['Feature'].apply(stem)

# df['SynonymStem'] = df['Synonym'].apply(stem)
# df['SynonymStem'] = df['SynonymStem'].apply(standardize)

# print(df)

# df.to_csv('feature-list.csv', index=False)
df = pd.read_csv('feature-list.csv')

my_dict = defaultdict(set)
for feature, synonym in zip(df['Feature'], df['SynonymStem']):
    my_dict[feature].add(synonym)

# Print the resulting dictionary
print(my_dict)

defaultdict(<class 'set'>, {'daily streaks': {'daily habit', 'daily remind', 'daily streak', 'daily progress', 'gamified routin', 'consecutive engag', 'motivation engag', 'gamified habit', 'sense progress', 'sense accomplish', 'gamified incent', 'daily routin', 'habitform', 'daily reward', 'daily checkin', 'daily bonus', 'gamified consist', 'daily challeng', 'gamified track', 'daily motiv'}, 'points system': {'leaderboard competit', 'gamified motiv', 'point accumul', 'point reward', 'points system', 'pointbased reward', 'pointbased recognit', 'point track', 'gamified point', 'gamified measur', 'level advanc', 'gamified feedback', 'gamified recognit', 'sense progress', 'sense accomplish', 'gamified incent', 'gamified reward', 'point valu', 'pointbased feedback', 'pointbased motiv'}, 'progress bar': {'goal progress', 'progress incent', 'progress feedback', 'goaloriented feedback', 'progress display', 'progress track', 'progress represent', 'milestone recognit', 'milestone track', 'visual

In [9]:
# Filtering reviews

filelist = [file for file in os.listdir('standardized-reviews/googleplay-review-files-standardized/')]


for file in filelist:
    print(file)

    df = pd.read_csv('standardized-reviews/googleplay-review-files-standardized/' + file)
    df['standardizedReview'] = df['standardizedReview'].astype('str')
    

    df['mentionedFeature'] = df['standardizedReview'].apply(lambda x: ', '.join([feature for feature, synonyms in my_dict.items() if any(syn in x for syn in synonyms)]))
    df['mentionedSynonyms'] = df['standardizedReview'].apply(lambda x: ', '.join([syn for feature, synonyms in my_dict.items() for syn in synonyms if syn in x]))

    df.replace('', np.nan, inplace=True)
    df.dropna(subset=['mentionedFeature'], inplace=True)
    #df['standardizedReview'] = df['review'].apply(standardize)

    output_file = os.path.splitext(file)[0] + '_filtered.csv'

    df.to_csv('./filtered-reviews/googleplay-review-files-filtered/' + output_file, index=False)

googleplay-review-sololearn_sentiment_standardized.csv
googleplay-review-mimo_sentiment_standardized.csv
googleplay-review-programming-hub_sentiment_standardized.csv
googleplay-review-datacamp_sentiment_standardized.csv
googleplay-review-codeacademy_sentiment_standardized.csv
googleplay-review-programming-hero_sentiment_standardized.csv
googleplay-review-learn-python-programiz_sentiment_standardized.csv
googleplay-review-encode_sentiment_standardized.csv


In [52]:
# Aggregating feature sentiments

scores = defaultdict(list)
for feature in my_dict.keys():
    feature = feature.lower()
    scores[feature] = [0, 0]

print(scores)
print(scores['daily streaks'][0])
print(scores['daily streaks'][1])

filelist = [file for file in os.listdir('review-files-combined-filtered/')]

for file in filelist:
    print(file)

    df = pd.read_csv('review-files-combined-filtered/' + file)

    for index, row in df.iterrows():
        mentioned_features = [feature.strip() for feature in row['mentionedFeature'].split(',')]
        #print(mentioned_features)
        for feature in mentioned_features:
            #print(feature)
            if row['sentiment'] == 'POSITIVE':
                scores[feature][0] += 1
            elif row['sentiment'] == 'NEGATIVE':
                scores[feature][1] -= 1

for feature in scores.keys():
    scores[feature].append((scores[feature][0] + scores[feature][1]) / (scores[feature][0] - scores[feature][1]))

print(scores)

overallscoredf = pd.DataFrame(columns = ['feature', 'positiveCount', 'negativeCount', 'score'])

for feature in scores.keys():
    new_row = pd.Series({'feature': feature, 'positiveCount': scores[feature][0], 'negativeCount': scores[feature][1], 'score': scores[feature][2]})
    overallscoredf = pd.concat([overallscoredf, new_row.to_frame().T], ignore_index=False)

overallscoredf.to_csv('feature-scores.csv')

defaultdict(<class 'list'>, {'daily streaks': [0, 0], 'points system': [0, 0], 'progress bar': [0, 0], 'strike habit/competition': [0, 0], 'social discovery': [0, 0], 'challenges': [0, 0], 'leaderboard/competition': [0, 0], 'boosters': [0, 0], 'virtual economy': [0, 0], 'torture breaks': [0, 0], 'visual grave': [0, 0], 'high five': [0, 0], 'last mile drive': [0, 0], 'anticipation parade': [0, 0], 'avatar': [0, 0], 'win states': [0, 0], 'voting': [0, 0], 'crowning': [0, 0], 'status quo sloth': [0, 0]})
0
0
combined-review-mimo_sentiment_standardized_filtered.csv
combined-review-sololearn_sentiment_standardized_filtered.csv
combined-review-datacamp_sentiment_standardized_filtered.csv
combined-review-codeacademy_sentiment_standardized_filtered.csv
combined-review-programming-hero_sentiment_standardized_filtered.csv
combined-review-learn-python-programiz_sentiment_standardized_filtered.csv
combined-review-encode_sentiment_standardized_filtered.csv
combined-review-programming-hub_sentiment_

In [49]:
# Feature sentiment per app

scoredf = pd.DataFrame(columns = ['app', 'feature', 'positiveCount', 'negativeCount', 'score'])

filelist = [file for file in os.listdir('review-files-combined-filtered/')]

with open('../googleplay-apps.txt') as appfile:
    line = appfile.readline()

    while(line):
        app = line.strip()
        
        df = pd.read_csv('review-files-combined-filtered/combined-review-' + app + '_sentiment_standardized_filtered.csv')

        appscores = defaultdict(list)
        for feature in my_dict.keys():
            feature = feature.lower()
            appscores[feature] = [0, 0]

        for index, row in df.iterrows():
            mentioned_features = [feature.strip() for feature in row['mentionedFeature'].split(',')]
            #print(mentioned_features)
            for feature in mentioned_features:
                #print(feature)
                if row['sentiment'] == 'POSITIVE':
                    appscores[feature][0] += 1
                elif row['sentiment'] == 'NEGATIVE':
                    appscores[feature][1] -= 1

        for feature in appscores.keys():
            if appscores[feature][0] - appscores[feature][1] == 0:
                appscores[feature].append(0)
                continue
            appscores[feature].append((appscores[feature][0] + appscores[feature][1]) / (appscores[feature][0] - appscores[feature][1]))

        for feature in appscores.keys():
            new_row = pd.Series({'app': app, 'feature': feature, 'positiveCount': appscores[feature][0], 'negativeCount': appscores[feature][1], 'score': appscores[feature][2]})
            
            scoredf = pd.concat([scoredf, new_row.to_frame().T], ignore_index=False)

        line = appfile.readline()

scoredf.to_csv('feature-scores-by-app.csv')


In [12]:
# Overall sentiment per app
sentimentdf = pd.DataFrame(columns = ['app', 'positiveCount', 'negativeCount', 'score'])

with open('../googleplay-apps.txt') as appfile:
    line = appfile.readline()

    while(line):
        app = line.strip()
        print(app)

        df = pd.read_csv('../review-files-sentiment/applestore-reviews/applestore-review-' + app + '_sentiment.csv')

        positiveCount = 0
        negativeCount = 0

        for index, row in df.iterrows():

            if row['sentiment'] == 'POSITIVE':
                positiveCount += 1
            elif row['sentiment'] == 'NEGATIVE':
                negativeCount -= 1
        
        score = (positiveCount + negativeCount) / (positiveCount - negativeCount)
        new_row = pd.Series({'app': app, 'positiveCount': positiveCount, 'negativeCount': negativeCount, 'score': score})
        sentimentdf = pd.concat([sentimentdf, new_row.to_frame().T], ignore_index=False)

        line = appfile.readline()

sentimentdf.to_csv('applestore-app-scores.csv', index=False)

codeacademy
datacamp
encode
learn-python-programiz
mimo
programming-hero
programming-hub
sololearn


In [10]:
# Renaming score for clarity and adding proportions of positive and negative reviews

files = ['app-sentiment.csv', 
         'feature-sentiment.csv', 
         'feature-sentiment_python.csv', 
         'feature-sentiment-by-app.csv', 
         'feature-sentiment-by-app_python.csv']

for file in files:
    df = pd.read_csv(file)

    df = df.rename(columns={'score': 'aggregatedAverage'})

    df['negativeCount'] = df['negativeCount']*(-1)

    df['positiveProportion'] = df['positiveCount'] / (df['positiveCount'] + df['negativeCount'])
    df['negativeProportion'] = 1 - df['positiveProportion']

    df.to_csv(file, index=False)