In [5]:
import pandas as pd
import numpy as np
import json
import os
import sys

In [None]:
# Read the JSON data from the text file
with open('feature-synonyms_Spanish.txt', 'r') as file:
    json_data = json.load(file)

# Create a DataFrame from the JSON data
df = pd.DataFrame(json_data)

# Print the DataFrame
print(df)

In [14]:
# # Combining reviews

# with open('../googleplay-apps.txt') as appfile:
#     line = appfile.readline()

#     while(line):
#         app = line.strip()

#         applestoredf = pd.read_csv('../review-files-Spanish-sentiment/applestore-review-' + app + '_Spanish_sentiment.csv')
#         googleplaydf = pd.read_csv('../review-files-Spanish-sentiment/googleplay-review-' + app + '_Spanish_sentiment.csv')
#         googleplaydf = googleplaydf.rename(columns={"content": "review", "score": "rating", "at": "date"})
#         cdf = pd.concat([applestoredf, googleplaydf], ignore_index=True)

#         cdf.to_csv('review-files-combined_Spanish/combined-review-' + app + 'Spanish_sentiment.csv', index=False)
#         line = appfile.readline()

In [9]:
# Standardize reviews

import string
import re
from unidecode import unidecode
import nltk
from nltk.corpus import stopwords

def standardize(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove leading/trailing whitespace
    text = text.strip()
    
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    
    # Remove diacritics and accents
    text = unidecode(text)

    # Tokenization
    text = text.split()

    # Remove stopwords
    stop_words = set(stopwords.words("english"))
    text = [word for word in text if word not in stop_words]

    # Return the standardized string
    return ' '.join(text)


filelist = [file for file in os.listdir('review-files-combined_Spanish/')]


for file in filelist:
    print(file)

    df = pd.read_csv('review-files-combined_Spanish/' + file)
    df['content'] = df['content'].astype('str')

    df['standardizedReview'] = df['content'].apply(standardize)

    output_file = os.path.splitext(file)[0] + '_standardized.csv'

    df.to_csv('review-files-combined-standardized_Spanish/' + output_file, index=False)
    

codeacademy_Spanish_sentiment.csv
programming-hero_Spanish_sentiment.csv
mimo_Spanish_sentiment.csv
sololearn_Spanish_sentiment.csv
learn-python-programiz_Spanish_sentiment.csv
datacamp_Spanish_sentiment.csv
programming-hub_Spanish_sentiment.csv


In [12]:
# Getting stems

# from stemming.porter2 import stem
from collections import defaultdict

# with open('features.txt') as file:
#     str = file.read()

# features = [feature.strip() for feature in str.split(',')]

df = pd.read_csv('feature-list_Spanish.csv')
print(df)

# def stem_phrase(phrase):
#     words = phrase.split()
#     stemmed_words = [stem(word) for word in words]
#     return ' '.join(stemmed_words)

# df['Stem'] = df['Feature'].apply(stem)

# df['SynonymStem'] = df['Synonym'].apply(stem)
# df['SynonymStem'] = df['SynonymStem'].apply(standardize)

# print(df)

# df.to_csv('feature-list.csv', index=False)

my_dict = defaultdict(set)
for feature, synonym in zip(df['Feature'], df['SynonymStem']):
    my_dict[feature].add(synonym)

# Print the resulting dictionary
print(my_dict)

              Feature                   Synonym  Stem   SynonymStem
0   días consecutivos         días consecutivos   NaN           dia
1   días consecutivos           cadenas diarias   NaN        cadena
2   días consecutivos             días seguidos   NaN           dia
3   días consecutivos        secuencias diarias   NaN     secuencia
4   días consecutivos  bonificaciones por racha   NaN  bonificacion
..                ...                       ...   ...           ...
85         coronación                coronación   NaN         coron
86         coronación               investidura   NaN    investidur
87         coronación        ceremonia de coron   NaN      ceremoni
88         coronación                 elevación   NaN       elevaci
89         coronación              coronamiento   NaN         coron

[90 rows x 4 columns]
defaultdict(<class 'set'>, {'días consecutivos': {'cadena', 'secuencia', 'bonificacion', 'dia'}, 'sistema de puntos': {'mecanismo', 'sistema', 'asignacion', 'est

In [13]:
# Filtering reviews

filelist = [file for file in os.listdir('review-files-combined-standardized_Spanish/')]


for file in filelist:
    print(file)

    df = pd.read_csv('review-files-combined-standardized_Spanish/' + file)
    df['standardizedReview'] = df['standardizedReview'].astype('str')
    

    df['mentionedFeature'] = df['standardizedReview'].apply(lambda x: ', '.join([feature for feature, synonyms in my_dict.items() if any(syn in x for syn in synonyms)]))
    df['mentionedSynonyms'] = df['standardizedReview'].apply(lambda x: ', '.join([syn for feature, synonyms in my_dict.items() for syn in synonyms if syn in x]))

    df.replace('', np.nan, inplace=True)
    df.dropna(subset=['mentionedFeature'], inplace=True)
    #df['standardizedReview'] = df['review'].apply(standardize)

    output_file = os.path.splitext(file)[0] + '_filtered.csv'

    df.to_csv('review-files-combined-filtered_Spanish/' + output_file, index=False)

learn-python-programiz_Spanish_sentiment_standardized.csv
mimo_Spanish_sentiment_standardized.csv
codeacademy_Spanish_sentiment_standardized.csv
programming-hub_Spanish_sentiment_standardized.csv
datacamp_Spanish_sentiment_standardized.csv
programming-hero_Spanish_sentiment_standardized.csv
sololearn_Spanish_sentiment_standardized.csv


In [16]:
# Aggregating feature sentiments

scores = defaultdict(list)
for feature in my_dict.keys():
    feature = feature.lower()
    scores[feature] = [0, 0]

print(scores)

filelist = [file for file in os.listdir('review-files-combined-filtered_Spanish/')]

for file in filelist:
    print(file)

    df = pd.read_csv('review-files-combined-filtered_Spanish/' + file)

    for index, row in df.iterrows():
        mentioned_features = [feature.strip() for feature in row['mentionedFeature'].split(',')]
        #print(mentioned_features)
        for feature in mentioned_features:
            #print(feature)
            if row['sentiment'] == 'POS':
                scores[feature][0] += 1
            elif row['sentiment'] == 'NEG':
                scores[feature][1] -= 1
            # elif row['sentiment'] == 'NEU':
            #     scores[feature][2] += 1

for feature in scores.keys():
    if scores[feature][0] - scores[feature][1] == 0:
        scores[feature].append(0)
        continue
    scores[feature].append((scores[feature][0] + scores[feature][1]) / (scores[feature][0] - scores[feature][1]))

print(scores)

overallscoredf = pd.DataFrame(columns = ['feature', 'positiveCount', 'negativeCount', 'score'])

for feature in scores.keys():
    new_row = pd.Series({'feature': feature, 'positiveCount': scores[feature][0], 'negativeCount': scores[feature][1], 'score': scores[feature][2]})
    overallscoredf = pd.concat([overallscoredf, new_row.to_frame().T], ignore_index=False)

overallscoredf.to_csv('feature-scores_Spanish.csv')

defaultdict(<class 'list'>, {'días consecutivos': [0, 0], 'sistema de puntos': [0, 0], 'barra de progreso': [0, 0], 'competencia de hábitos/golpe': [0, 0], 'descubrimiento social': [0, 0], 'desafíos': [0, 0], 'tablero de clasificación/competencia': [0, 0], 'potenciadores': [0, 0], 'economía virtual': [0, 0], 'descansos de tortura': [0, 0], 'tumba visual': [0, 0], 'choca esos cinco': [0, 0], 'última milla': [0, 0], 'desfile de anticipación': [0, 0], 'avatar': [0, 0], 'estados de victoria': [0, 0], 'votación': [0, 0], 'coronación': [0, 0]})
learn-python-programiz_Spanish_sentiment_standardized_filtered.csv
programming-hero_Spanish_sentiment_standardized_filtered.csv
sololearn_Spanish_sentiment_standardized_filtered.csv
codeacademy_Spanish_sentiment_standardized_filtered.csv
mimo_Spanish_sentiment_standardized_filtered.csv
programming-hub_Spanish_sentiment_standardized_filtered.csv
datacamp_Spanish_sentiment_standardized_filtered.csv
defaultdict(<class 'list'>, {'días consecutivos': [870,

In [19]:
# Feature sentiment per app

scoredf = pd.DataFrame(columns = ['app', 'feature', 'positiveCount', 'negativeCount', 'score'])

filelist = [file for file in os.listdir('review-files-combined-filtered_Spanish/')]

with open('../googleplay-apps.txt') as appfile:
    line = appfile.readline()

    while(line):
        app = line.strip()

        if app == 'encode':
            line = appfile.readline()
            continue
        
        df = pd.read_csv('review-files-combined-filtered_Spanish/' + app + '_Spanish_sentiment_standardized_filtered.csv')

        appscores = defaultdict(list)
        for feature in my_dict.keys():
            feature = feature.lower()
            appscores[feature] = [0, 0]

        for index, row in df.iterrows():
            mentioned_features = [feature.strip() for feature in row['mentionedFeature'].split(',')]
            #print(mentioned_features)
            for feature in mentioned_features:
                #print(feature)
                if row['sentiment'] == 'POS':
                    appscores[feature][0] += 1
                elif row['sentiment'] == 'NEG':
                    appscores[feature][1] -= 1

        for feature in appscores.keys():
            if appscores[feature][0] - appscores[feature][1] == 0:
                appscores[feature].append(0)
                continue
            appscores[feature].append((appscores[feature][0] + appscores[feature][1]) / (appscores[feature][0] - appscores[feature][1]))

        for feature in appscores.keys():
            new_row = pd.Series({'app': app, 'feature': feature, 'positiveCount': appscores[feature][0], 'negativeCount': appscores[feature][1], 'score': appscores[feature][2]})
            
            scoredf = pd.concat([scoredf, new_row.to_frame().T], ignore_index=False)

        line = appfile.readline()

scoredf.to_csv('feature-scores-by-app-Spanish.csv')


In [21]:
# Overall sentiment per app
sentimentdf = pd.DataFrame(columns = ['app', 'positiveCount', 'negativeCount', 'score'])

with open('../googleplay-apps.txt') as appfile:
    line = appfile.readline()

    while(line):
        app = line.strip()

        if app == 'encode':
            line = appfile.readline()
            continue
        
        print(app)

        df = pd.read_csv('review-files-combined_Spanish/' + app + '_Spanish_sentiment.csv')

        positiveCount = 0
        negativeCount = 0

        for index, row in df.iterrows():

            if row['sentiment'] == 'POS':
                positiveCount += 1
            elif row['sentiment'] == 'NEG':
                negativeCount -= 1
        
        if positiveCount == 0 and negativeCount == 0:
            score = 0
        else:
            score = (positiveCount + negativeCount) / (positiveCount - negativeCount)
            
        new_row = pd.Series({'app': app, 'positiveCount': positiveCount, 'negativeCount': negativeCount, 'score': score})
        sentimentdf = pd.concat([sentimentdf, new_row.to_frame().T], ignore_index=False)

        line = appfile.readline()

sentimentdf.to_csv('app-scores_Spanish.csv')

codeacademy
datacamp
learn-python-programiz
mimo
programming-hero
programming-hub
sololearn
