Abrir Archivo comprimido de datos de reseñas de usuario:

In [1]:
# Importar librerías
import gzip
import pandas as pd
import json
import numpy as np

In [2]:
# Descomprimir el archivo gzip
with gzip.open('DataBase/user_reviews.json.gz', 'rb') as f:
    data = f.readlines()

In [3]:
# Convertir las líneas a registros JSON
records = [eval(line.strip()) for line in data]

In [4]:
# Crear el DataFrame a partir de los registros
df_reviews= pd.DataFrame(records)

In [5]:
df_reviews # 25799 rows x 3 columns

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'posted': 'Posted November 5, 2..."
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014..."
2,evcentric,http://steamcommunity.com/id/evcentric,"[{'funny': '', 'posted': 'Posted February 3.',..."
3,doctr,http://steamcommunity.com/id/doctr,"[{'funny': '', 'posted': 'Posted October 14, 2..."
4,maplemage,http://steamcommunity.com/id/maplemage,"[{'funny': '3 people found this review funny',..."
...,...,...,...
25794,76561198306599751,http://steamcommunity.com/profiles/76561198306...,"[{'funny': '', 'posted': 'Posted May 31.', 'la..."
25795,Ghoustik,http://steamcommunity.com/id/Ghoustik,"[{'funny': '', 'posted': 'Posted June 17.', 'l..."
25796,76561198310819422,http://steamcommunity.com/profiles/76561198310...,"[{'funny': '1 person found this review funny',..."
25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,"[{'funny': '', 'posted': 'Posted July 21.', 'l..."


In [6]:
# Eliminamos las columnas que no serán utilizadas
df_reviews.drop(columns=["user_url"], inplace = True)
df_reviews.columns

Index(['user_id', 'reviews'], dtype='object')

In [7]:
#Desanidar la columna anidada "Reviews"    1,5 seg
data_desanidada = []

for index, row in df_reviews.iterrows():
    user_id = row['user_id']
    reviews = row['reviews']
    
    for i in reviews:   
        new_row = {
        'funny': i.get('funny', ''),
        'posted': i.get('posted', ''),
        'last_edited': i.get('last_edited', ''),
        'item_id': i.get('item_id', ''),
        'helpful': i.get('helpful', ''),
        'recommend': i.get('recommend', bool),
        'review': i.get('review', '')
        }
        
        data_desanidada.append(new_row)

In [8]:
# DataFrame desanidada
df_i_desanidado = pd.DataFrame(data_desanidada)

In [9]:
#Mirar DF desanidada
df_i_desanidado.head(5)

Unnamed: 0,funny,posted,last_edited,item_id,helpful,recommend,review
0,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.
2,,"Posted April 21, 2011.",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...
3,,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...
4,,"Posted September 8, 2013.",,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...


In [10]:
#Revisar NaNs
df_i_desanidado.isna().sum()

funny          0
posted         0
last_edited    0
item_id        0
helpful        0
recommend      0
review         0
dtype: int64

In [11]:
#Analizador de sentimientos, funciona en Python 3.11.4, no en python 3.11.5
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [12]:
analyzer = SentimentIntensityAnalyzer()

In [13]:
# Función para realizar el análisis de sentimiento
def analyze_sentiment(review):
    # Obtener el puntaje de polaridad del sentimiento
    compound_score = analyzer.polarity_scores(str(review))["compound"]

    # Clasificar la polaridad del sentimiento en 3 categorías: negativo, neutral, positivo
    if compound_score < 0:
        return 0  # Sentimiento negativo
    elif compound_score == 0:
        return 1  # Sentimiento neutral
    else:
        return 2  # Sentimiento positivo

In [14]:
# Aplicar el análisis de sentimiento a la columna "review"    40,2 seg
df_i_desanidado["sentiment_analysis"] = df_i_desanidado["review"].apply(analyze_sentiment)

In [17]:
# Mostrar las primeras filas del DataFrame con la nueva columna
df_i_desanidado.tail(100)

Unnamed: 0,funny,posted,last_edited,item_id,helpful,recommend,review,sentiment_analysis
59205,,"Posted December 20, 2015.",,304050,No ratings yet,True,"This game is great. The only thing is,Why cant...",2
59206,,"Posted September 4, 2015.",,620,No ratings yet,True,Portal 2 is a great game created by Valve. You...,0
59207,2 people found this review funny,Posted May 1.,,414390,4 of 5 people (80%) found this review helpful,True,NABOR NAGIBA + GOLD CARD XD,2
59208,,"Posted September 15, 2015.",,440,1 of 2 people (50%) found this review helpful,True,"MEEEEDIC, Medic, MMMEdic, кнопка E лучшая в за...",1
59209,,"Posted September 9, 2015.",,730,1 of 2 people (50%) found this review helpful,True,"поляки увидели что вы убили 4, но пошли сейвит...",2
...,...,...,...,...,...,...,...,...
59300,,Posted July 10.,,70,No ratings yet,True,a must have classic from steam definitely wort...,2
59301,,Posted July 8.,,362890,No ratings yet,True,this game is a perfect remake of the original ...,2
59302,1 person found this review funny,Posted July 3.,,273110,1 of 2 people (50%) found this review helpful,True,had so much fun plaing this and collecting res...,2
59303,,Posted July 20.,,730,No ratings yet,True,:D,2


In [18]:
df_i_desanidado = df_i_desanidado.drop(["funny", "helpful", "review"], axis=1)
df_i_desanidado

Unnamed: 0,posted,last_edited,item_id,recommend,sentiment_analysis
0,"Posted November 5, 2011.",,1250,True,2
1,"Posted July 15, 2011.",,22200,True,2
2,"Posted April 21, 2011.",,43110,True,2
3,"Posted June 24, 2014.",,251610,True,2
4,"Posted September 8, 2013.",,227300,True,2
...,...,...,...,...,...
59300,Posted July 10.,,70,True,2
59301,Posted July 8.,,362890,True,2
59302,Posted July 3.,,273110,True,2
59303,Posted July 20.,,730,True,2


In [19]:
#Fecha de last_edited
df_i_desanidado["year"] = df_i_desanidado["last_edited"].str.extract(r"(\d{4})") #year con last_edited

In [20]:
#Fecha de posted
df_i_desanidado["year2"] = df_i_desanidado["posted"].str.extract(r"(\d{4})") #year2 con posted

In [21]:
#Donde no hay fecha
df_i_desanidado["year3"] = df_i_desanidado["posted"].str.extract(r"(\d{4})").fillna(0) #year3 donde no hay year poner 0

In [22]:
df_i_desanidado.head(5)

Unnamed: 0,posted,last_edited,item_id,recommend,sentiment_analysis,year,year2,year3
0,"Posted November 5, 2011.",,1250,True,2,,2011,2011
1,"Posted July 15, 2011.",,22200,True,2,,2011,2011
2,"Posted April 21, 2011.",,43110,True,2,,2011,2011
3,"Posted June 24, 2014.",,251610,True,2,,2014,2014
4,"Posted September 8, 2013.",,227300,True,2,,2013,2013


In [23]:
# Los valores de la columna year, se reemplazan los NaNs con los valores de la columna year2
df_i_desanidado["year"].fillna(df_i_desanidado["year2"], inplace=True)

In [24]:
df_i_desanidado.head(5)

Unnamed: 0,posted,last_edited,item_id,recommend,sentiment_analysis,year,year2,year3
0,"Posted November 5, 2011.",,1250,True,2,2011,2011,2011
1,"Posted July 15, 2011.",,22200,True,2,2011,2011,2011
2,"Posted April 21, 2011.",,43110,True,2,2011,2011,2011
3,"Posted June 24, 2014.",,251610,True,2,2014,2014,2014
4,"Posted September 8, 2013.",,227300,True,2,2013,2013,2013


In [25]:
# Los valores NaNs de la columna year, se reemplazan los 0 de la columna year3
df_i_desanidado["year"].fillna(df_i_desanidado["year3"], inplace=True)

In [26]:
#Verificar si hay NaNs en columna "year"
nan_sum = df_i_desanidado["year"].isnull().sum()

In [27]:
print(nan_sum)

0


In [28]:
## Haremos una nueva dataframe para hacer el ML
#Crear una nueva dataframe con item_id, recommend, sentiment_analysis, year, sin duplicados
df_Score = df_i_desanidado[["item_id", "recommend", "sentiment_analysis", "year"]].drop_duplicates()

In [29]:
#Concatenamos con user_id
df_concatenated_score = pd.concat([df_reviews["user_id"], df_Score], axis=1)

In [32]:
df_concatenated_score #30478 rows × 5 columns

Unnamed: 0,user_id,item_id,recommend,sentiment_analysis,year
0,76561197970982479,1250,True,2.0,2011
1,js41637,22200,True,2.0,2011
2,evcentric,43110,True,2.0,2011
3,doctr,251610,True,2.0,2014
4,maplemage,227300,True,2.0,2013
...,...,...,...,...,...
59281,,220090,True,1.0,0
59282,,262850,True,1.0,0
59285,,298110,False,1.0,0
59286,,431510,True,2.0,0


In [33]:
#Guardamos en un archivo .csv
df_concatenated_score.to_csv("Second_Reviews.csv", index=False)