In [102]:
import pandas as pd
import json
import re
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem.porter import PorterStemmer
nltk.download('vader_lexicon')
from datetime import datetime

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\o0o0o0o\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\o0o0o0o\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [103]:
# Load the whole reviews dump into memory as a single string; it is
# repaired and parsed line by line in the following cells.
with open('src/australian_user_reviews.json', encoding='UTF-8') as reviews_file:
    json_string = reviews_file.read()

In [104]:
# Rewrite the single-quoted records so that every line becomes valid JSON.
# The replacements are order-sensitive and are applied top to bottom:
#   1. every double quote becomes a single quote, so the only double quotes
#      left afterwards are the structural ones inserted below;
#   2. escape sequences are neutralised (JSON would reject or reinterpret them);
#   3. each known key/value boundary is rewritten with double quotes.
_JSON_FIXUPS = (
    ("\"", "'"),
    # Must run BEFORE the generic backslash replacement below: once every
    # backslash has been turned into a quote, the literal text \xa0 no longer
    # exists and this rule could never match (it was previously applied last,
    # which left "'xa0" residue in the text).
    ("\\xa0", ' '),
    ("\\'", "'"),
    # Any remaining backslash becomes a single quote, so other escape
    # sequences (for example a literal \n) turn into "'n" in the text.
    ("\\", "'"),
    ("{'user_id': '", '{"user_id": "'),
    ("', 'user_url': '", '", "user_url": "'),
    ("', 'reviews': [{'funny': '", '", "reviews": [{"funny": "'),
    ("', 'reviews': []}", '", "reviews": []}'),
    ("'}, {'funny': '", '"}, {"funny": "'),
    # The "Posted " prefix is dropped here so only the date text remains.
    ("', 'posted': 'Posted ", '", "posted": "'),
    ("', 'last_edited': '", '", "last_edited": "'),
    ("', 'item_id': '", '", "item_id": "'),
    ("', 'helpful': '", '", "helpful": "'),
    # The bare True/False literals become the strings "True"/"False"; the
    # replacement also inserts a space before the closing quote of the
    # preceding value.
    ("', 'recommend': True, 'review': '", ' ", "recommend": "True", "review": "'),
    ("', 'recommend': False, 'review': '", ' ", "recommend": "False", "review": "'),
    ("'}]}", '"}]}'),
)

for _old, _new in _JSON_FIXUPS:
    json_string = json_string.replace(_old, _new)



In [105]:
# Parse each line of the repaired text as one JSON user record.
# data_array collects the user records; reviews_array collects the
# un-nested reviews, each tagged with the user_id of its owner.
data_array = []
reviews_array = []
for raw_line in json_string.splitlines():
    user_record = json.loads(raw_line)

    # An empty "reviews" list simply contributes nothing.
    for user_review in user_record['reviews']:
        user_review["user_id"] = user_record['user_id']
    reviews_array.extend(user_record['reviews'])

    data_array.append(user_record)

In [106]:
# Build one DataFrame per record list: users (one row per parsed line)
# and reviews (one row per un-nested review).
users_df = pd.DataFrame(data=data_array)
reviews_df = pd.DataFrame(data=reviews_array)

In [107]:
# Reviews: remove the columns that are not used further on.
reviews_df = reviews_df.drop(['funny', 'last_edited', 'helpful'], axis='columns')

# Users: remove the profile URL and the nested reviews (already un-nested
# into reviews_df).
users_df = users_df.drop(['user_url', 'reviews'], axis='columns')

In [108]:
# Parse the "posted" text into a datetime. Entries without a year are assumed
# to be from 2016, because the last date that carries a year is
# December 31, 2015.
def cleanYear(x, default_year=2016):
    """Parse a posted-date string such as 'June 24, 2014.' or 'June 24.'.

    Args:
        x: Date text in the form '<Month> <day>, <year>.' or, when the year
            is missing, '<Month> <day>.'.
        default_year: Year to assume when the text carries no year.

    Returns:
        A ``datetime`` for the parsed date (time of day is midnight).

    Raises:
        ValueError: If the text matches neither of the two formats.
        TypeError: If ``x`` is not a string.
    """
    try:
        return datetime.strptime(x, '%B %d, %Y.')
    except ValueError:
        # Only a format mismatch triggers the fallback; anything else
        # (e.g. a non-string value or a keyboard interrupt) propagates.
        return datetime.strptime(x + str(default_year), '%B %d.%Y')

# Parse every "posted" value and keep only its year.
posted_dates = reviews_df['posted'].apply(cleanYear)
reviews_df['posted'] = posted_dates.dt.year

In [109]:
# Normalise the text of the review column into one cleaned string per review.
# NOTE(review): the result is later scored with VADER; stemming and stop-word
# removal may reduce how many words match its lexicon (and may drop negation
# words) -- confirm this preprocessing is intended.

# Loop-invariant helpers are built once instead of once per review / per word.
ps = PorterStemmer()
all_stopwords = stopwords.words('english')  # words that carry no sentiment
stopword_set = set(all_stopwords)
non_letters = re.compile('[^a-zA-Z]')

corpus = []
# Iterate over the values directly so the loop does not depend on the
# DataFrame having a default 0..n-1 index.
for raw_review in reviews_df['review']:
    # Replace anything that is not a letter with a space, lowercase the
    # text and split it into words.
    words = non_letters.sub(' ', raw_review).lower().split()
    # Drop stop words and reduce the remaining words to their stem.
    stemmed = [ps.stem(word) for word in words if word not in stopword_set]
    corpus.append(' '.join(stemmed))



In [112]:
# Attach the cleaned review text (one string per row) as a new column.
reviews_df = reviews_df.assign(review_tokenized=corpus)

In [114]:
# Score every cleaned review with VADER and store the result in a new column.
sia = SentimentIntensityAnalyzer()


def _sentiment_label(text):
    """Return the rounded compound score of ``text`` shifted up by one.

    Assuming the compound score lies in [-1, 1] (per VADER's documentation),
    the result is 0, 1 or 2. Python's ``round`` rounds halves to the nearest
    even integer, so scores of exactly +/-0.5 land on 1.
    """
    return round(sia.polarity_scores(text)['compound']) + 1


reviews_df['sentiment_analysis'] = reviews_df['review_tokenized'].apply(_sentiment_label)

In [None]:
# Drop both text columns (the cleaned one and the original review text).
# The sentiment column keeps its name, sentiment_analysis; no rename is
# performed here.
reviews_df = reviews_df.drop(columns=['review_tokenized', 'review'])

In [121]:
# Let pandas infer the best nullable dtypes, then force the two numeric
# columns to plain 64-bit integers.
reviews_df = reviews_df.convert_dtypes().astype({'item_id': 'int64', 'posted': 'int64'})

In [125]:

# Load the cleaned games dataset and keep only the reviews whose item_id
# matches a game id present in it.
games_df = pd.read_parquet('src/cleaned/games.parquet')
games_id_list = games_df['id'].tolist()

has_known_game = reviews_df['item_id'].isin(games_id_list)
reviews_df = reviews_df.loc[has_known_game]

In [126]:
# Persist both cleaned DataFrames as parquet files, without the index.
for frame, target_path in (
    (reviews_df, 'src/cleaned/reviews.parquet'),
    (users_df, 'src/cleaned/users.parquet'),
):
    frame.to_parquet(target_path, index=False)
