In [1]:
import pandas as pd
import ast
import gzip

### Extracción

In [2]:
# Read the gzipped reviews dump line by line and collect one record per line.
rows = []

# Forward slashes keep this relative path valid on Windows, Linux and macOS,
# and match the separator used when the Parquet file is written later on.
with gzip.open("Datasets/user_reviews.json.gz", 'rt', encoding='utf-8') as archivo:
    for linea in archivo:
        # Skip blank lines: ast.literal_eval raises SyntaxError on empty input.
        if not linea.strip():
            continue
        # ast.literal_eval turns the text of the line into a Python object
        # without executing arbitrary code; the result is appended to `rows`.
        # NOTE(review): presumably the lines are Python-literal dicts rather
        # than strict JSON, which is why json.loads is not used — confirm.
        rows.append(ast.literal_eval(linea))

# One DataFrame row per collected record.
df = pd.DataFrame(rows)


In [36]:
df.head(4)

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'posted': 'Posted November 5, 2..."
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014..."
2,evcentric,http://steamcommunity.com/id/evcentric,"[{'funny': '', 'posted': 'Posted February 3.',..."
3,doctr,http://steamcommunity.com/id/doctr,"[{'funny': '', 'posted': 'Posted October 14, 2..."


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25799 entries, 0 to 25798
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   user_id   25799 non-null  object
 1   user_url  25799 non-null  object
 2   reviews   25799 non-null  object
dtypes: object(3)
memory usage: 604.8+ KB


### Transformación de datos

In [3]:
# Flatten the nested 'reviews' list of every record in `rows` into one row per
# review, carrying 'user_id' and 'user_url' along as columns on each row
df_lista = pd.json_normalize(rows, record_path='reviews', meta=['user_id', 'user_url'])

In [38]:
df_lista.shape

(59305, 9)

In [39]:
df_lista.head(3)

Unnamed: 0,funny,posted,last_edited,item_id,helpful,recommend,review,user_id,user_url
0,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...,76561197970982479,http://steamcommunity.com/profiles/76561197970...
1,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.,76561197970982479,http://steamcommunity.com/profiles/76561197970...
2,,"Posted April 21, 2011.",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...,76561197970982479,http://steamcommunity.com/profiles/76561197970...


In [7]:
# Verifica que no haya datos nulos
df_lista.isnull().sum()

funny          0
posted         0
last_edited    0
item_id        0
helpful        0
recommend      0
review         0
user_id        0
user_url       0
dtype: int64

In [80]:
# Helper to count empty strings in a column
def countEmpty_Strings(name, df=None):
    """Print how many rows of column `name` hold an empty string ('').

    Parameters
    ----------
    name : str
        Column label to inspect.
    df : pandas.DataFrame, optional
        Frame to inspect. When omitted, the notebook-level `df_lista` is used,
        so existing single-argument calls keep working unchanged.

    Returns
    -------
    None
        The count is only printed; nothing is returned, so calling this as the
        last line of a cell does not produce an extra cell output.
    """
    # Resolve the frame at call time so the default follows the current df_lista.
    frame = df_lista if df is None else df
    total = (frame[name] == '').sum()
    print(f"total filas con cadenas vacias: {name} = {total}")
    

In [81]:
countEmpty_Strings('funny')
countEmpty_Strings('last_edited')

total filas con cadenas vacias: funny = 51154
total filas con cadenas vacias: last_edited = 53165


#### Eliminación de columnas
Podemos observar que las columnas 'funny' y 'last_edited' contienen muchas filas sin datos (cadenas vacías), por lo tanto las eliminamos.

In [4]:
# 'funny' and 'last_edited' are mostly empty strings, so both columns are removed
df_lista.drop(columns=['funny', 'last_edited'], inplace=True)

Las columnas 'helpful' y 'user_url' también se eliminan, ya que contienen datos irrelevantes para este análisis:
- helpful: cantidad de personas a las que les pareció útil la reseña.
- user_url: enlace (URL) al perfil del usuario.

In [5]:

# 'helpful' and 'user_url' are not used further on, so both columns are removed
df_lista.drop(columns=['helpful', 'user_url'], inplace=True)

In [84]:
df_lista.head(3)

Unnamed: 0,posted,item_id,recommend,review,user_id
0,"Posted November 5, 2011.",1250,True,Simple yet with great replayability. In my opi...,76561197970982479
1,"Posted July 15, 2011.",22200,True,It's unique and worth a playthrough.,76561197970982479
2,"Posted April 21, 2011.",43110,True,Great atmosphere. The gunplay can be a bit chu...,76561197970982479


Transformación de la columna 'posted'

In [6]:
# Clean the raw 'posted' text, e.g. "Posted November 5, 2011." -> "November 5, 2011".
# regex=False is explicit on purpose: with regex matching (the default before
# pandas 2.0) the pattern '.' matches ANY character and would erase the whole string.
posted_text = (
    df_lista['posted']
    .str.replace('Posted', '', regex=False)
    .str.replace('.', '', regex=False)
    .str.strip()  # remove the leading blank left behind by the removed prefix
)

# Parse with an explicit "Month day, year" format so the result does not depend
# on pandas' version-specific format inference. errors='coerce' turns every value
# that does not match (for instance dates written without a year, like
# "February 3") into NaT instead of raising.
df_lista['posted'] = pd.to_datetime(posted_text, format='%B %d, %Y', errors='coerce')

In [86]:
# Las fechas sin año las toma como nulas
fechas_incompletas = df_lista['posted'].isnull().sum()
fechas_incompletas

10119

In [7]:
# Discard the rows whose 'posted' value is null (the dates that came without a year)
df_lista.dropna(axis=0, subset=['posted'], inplace=True)

# Store the year component of 'posted' in its own column
df_lista['posted_year'] = df_lista.posted.dt.year


In [8]:
# 'posted' is removed now that its year lives in 'posted_year'
df_lista.drop(columns='posted', inplace=True)

# Renumber the rows from 0; drop=True discards the previous index
# instead of keeping it as an extra column
df_lista.reset_index(inplace=True, drop=True)

In [40]:

df_lista.head(7)

Unnamed: 0,posted_year,user_id,item_id,recommend,review,sentiment_analysis
0,2011,76561197970982479,1250,True,Simple yet with great replayability. In my opi...,2
1,2011,76561197970982479,22200,True,It's unique and worth a playthrough.,2
2,2011,76561197970982479,43110,True,Great atmosphere. The gunplay can be a bit chu...,2
3,2014,js41637,251610,True,I know what you think when you see this title ...,2
4,2013,js41637,227300,True,For a simple (it's actually not all that simpl...,2
5,2013,js41637,239030,True,Very fun little game to play when your bored o...,2
6,2015,evcentric,370360,True,"""Run for fun? What the hell kind of fun is that?""",2


In [42]:
p = df_lista[df_lista['user_id'] == 'js41637']
p

Unnamed: 0,posted_year,user_id,item_id,recommend,review,sentiment_analysis
3,2014,js41637,251610,True,I know what you think when you see this title ...,2
4,2013,js41637,227300,True,For a simple (it's actually not all that simpl...,2
5,2013,js41637,239030,True,Very fun little game to play when your bored o...,2


### Implementación de función para el análisis de sentimientos

In [16]:
import nltk
nltk.download('vader_lexicon')
nltk.download('punkt')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\pablo\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pablo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [17]:
# nltk.sentiment: es el analizador que da la probabilidad de que sea negativo, neutro o positivo
from nltk.sentiment import SentimentIntensityAnalyzer

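- Codificación del resultado: 0 = Negativo, 1 = Neutral, 2 = Positivo.
- 'compound': es un valor entre -1 y 1 que resume en un único número si la frase es positiva o negativa. Valores próximos a -1 indican que es negativa, valores próximos a 0 que es neutra y valores próximos a 1 que es positiva. En la función siguiente se usa ±0.1 como umbral: compound > 0.1 se clasifica como positivo, compound < -0.1 como negativo y el resto como neutral.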


In [18]:
# Shared analyzer instance; created on first use by _get_analyzer()
_analizador = None


def _get_analyzer():
    """Return the shared SentimentIntensityAnalyzer, building it on first use."""
    global _analizador
    if _analizador is None:
        _analizador = SentimentIntensityAnalyzer()
    return _analizador


# Classifies the sentiment of a text
def analisis(txt):
    """Classify `txt` as negative (0), neutral (1) or positive (2).

    The decision is based on the 'compound' score returned by
    SentimentIntensityAnalyzer.polarity_scores:

    * compound >  0.1  -> 2 (positive)
    * compound < -0.1  -> 0 (negative)
    * otherwise        -> 1 (neutral)

    Parameters
    ----------
    txt : str
        Review text. Any non-string value (None, NaN, ...) is treated as a
        missing review and classified as neutral instead of raising.

    Returns
    -------
    int
        0, 1 or 2 as described above.
    """
    # A missing review carries no sentiment signal: report it as neutral.
    if not isinstance(txt, str):
        return 1

    # Reuse one analyzer for every call; the original built a new instance per
    # review, repeating the analyzer's setup work for each row of the column.
    resultadoSentimiento = _get_analyzer().polarity_scores(txt)['compound']

    if resultadoSentimiento > 0.1:
        return 2  # positive
    elif resultadoSentimiento < -0.1:
        return 0  # negative
    else:
        return 1  # neutral
    

In [19]:
# Add the 'sentiment_analysis' column: analisis() is called once
# per value of the 'review' column and its result is stored
df_lista['sentiment_analysis'] = df_lista['review'].apply(analisis)

In [21]:
# Keep these columns, in this order
columnas_finales = ['posted_year', 'user_id', 'item_id', 'recommend', 'review', 'sentiment_analysis']
df_lista = df_lista[columnas_finales]

In [38]:
df_lista.sample(3)

Unnamed: 0,posted_year,user_id,item_id,recommend,review,sentiment_analysis
10431,2013,76561198095314096,223710,True,this game is so scary,0
2406,2015,psn_jmcgamer12,730,True,"Until they patch out the R8 Revolver, don't ev...",2
16614,2013,yokta,4000,True,Garry's Mod on its own is a dull piece of soft...,2


In [39]:
df_lista['sentiment_analysis'].value_counts()

sentiment_analysis
2    31047
1    10752
0     7387
Name: count, dtype: int64

In [113]:
print(df_lista.isna().sum())

item_id               0
recommend             0
review                0
user_id               0
posted_year           0
sentiment_analysis    0
dtype: int64


Se exporta el dataset en formato parquet

In [114]:
# Write the processed reviews frame to a Parquet file inside the Datasets folder
df_lista.to_parquet('Datasets/user_reviews.parquet')

--------------------------------------------------------------------------------------------------------

Esto es para probar que en el merge aparecen los mismos datos (luego eliminar)

In [45]:
p = df_lista[df_lista['user_id'] == '76561197970982479']
p

Unnamed: 0,posted_year,user_id,item_id,recommend,review,sentiment_analysis
0,2011,76561197970982479,1250,True,Simple yet with great replayability. In my opi...,2
1,2011,76561197970982479,22200,True,It's unique and worth a playthrough.,2
2,2011,76561197970982479,43110,True,Great atmosphere. The gunplay can be a bit chu...,2
