# Librerías

In [2]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import pyarrow.parquet as pq
import ast
from dateutil import parser
import re

# Endpoints

In [3]:
# Se cargan los dataframes de los tres archivos nuevamente

# steam_games
df_steam_games = pd.read_csv('/Users/mlucchesi/Henry/PI/data/csv/procesados/steam_games_procesado.csv')

# user_reviews
df_user_reviews = pd.read_csv('/Users/mlucchesi/Henry/PI/data/csv/procesados/user_reviews_procesado_nlp.csv')

# users_items
df_users_items = pd.read_csv('/Users/mlucchesi/Henry/PI/data/csv/procesados/users_items_procesado.csv')

### Archivo ```steam_games```

In [4]:
# Se crea una función que extraiga el año del lanzamiento del juego de 'release_date'

def get_year(date):
    try:
        date_object = parser.parse(date)
        return date_object.year
    except:
        return None
    
# Creo una columna con el año, usando la función creada

df_steam_games['year_developed'] = df_steam_games['release_date'].apply(get_year).astype('Int64')

In [5]:
# Muestro como quedo el dataframe
df_steam_games.head(1)

Unnamed: 0.1,Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer,year_developed
0,88310,Kotoshiro,"['Action', 'Casual', 'Indie', 'Simulation', 'S...",Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,"['Strategy', 'Action', 'Indie', 'Casual', 'Sim...",http://steamcommunity.com/app/761140/reviews/?...,['Single-player'],4.99,0.0,761140.0,Kotoshiro,2018


In [8]:
# Exporto a CSV el dataframe completo

df_steam_games.to_csv('/Users/mlucchesi/Henry/PI/data/csv/procesados/steam_games_year.csv', index=False)

### Archivo ```user_reviews```

In [10]:
# Creo una función que extraiga el año de la fecha de posteo de la review

def get_year_reviews(posted):
    match = re.search(r'\d{4}', posted)  
    if match:
        return match.group()
    else:
        return None 

df_user_reviews['year_review'] = df_user_reviews['posted'].apply(get_year_reviews)

In [11]:
# Muestro como quedó

df_user_reviews.head()

Unnamed: 0.1,Unnamed: 0,user_id,user_url,posted,item_id,helpful,recommend,sentiment_analysis,year_review
0,0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"Posted November 5, 2011.",1250,No ratings yet,True,2,2011.0
1,1,js41637,http://steamcommunity.com/id/js41637,"Posted June 24, 2014.",251610,15 of 20 people (75%) found this review helpful,True,2,2014.0
2,2,evcentric,http://steamcommunity.com/id/evcentric,Posted February 3.,248820,No ratings yet,True,2,
3,3,doctr,http://steamcommunity.com/id/doctr,"Posted October 14, 2013.",250320,2 of 2 people (100%) found this review helpful,True,2,2013.0
4,4,maplemage,http://steamcommunity.com/id/maplemage,"Posted April 15, 2014.",211420,35 of 43 people (81%) found this review helpful,True,1,2014.0


In [12]:
print(df_user_reviews['recommend'].unique())

[ True]


## Funciones

### ```UsersNotRecommend```
_Devuelve el top 3 de juegos MENOS recomendados por usuarios para el año dado. (reviews.recommend = False y comentarios negativos)_

In [14]:
# Para optimizar, se extraen solo las columnas necesarias para hacer el merge

# Primero, se obtiene el app_name y su respectivo 'id'

df_game_id = df_steam_games[['app_name', 'id']]
df_game_id.head()

Unnamed: 0,app_name,id
0,Lost Summoner Kitty,761140.0
1,Ironbound,643980.0
2,Real Pool 3D - Poolians,670290.0
3,弹炸人2222,767400.0
4,Log Challenge,773570.0


In [15]:
# Luego, se obtiene el 'id' de juego, el año de posteo de la review, y aquellos que fueron negativos (recommend = False y sentiment analysis = 0)

df_reviews_negativas = df_user_reviews[['item_id', 'recommend', 'sentiment_analysis', 'year_review']]
df_reviews_negativas.head()

Unnamed: 0,item_id,recommend,sentiment_analysis,year_review
0,1250,True,2,2011.0
1,251610,True,2,2014.0
2,248820,True,2,
3,250320,True,2,2013.0
4,211420,True,1,2014.0


In [16]:
# Como se van a unir por id de juego, hay que renombrar las columnas para que coincidan

# En el df de los juegos, se cambia 'id' por 'item_id'

df_game_id = df_game_id.rename(columns={'id': 'item_id'})
df_game_id.head()

Unnamed: 0,app_name,item_id
0,Lost Summoner Kitty,761140.0
1,Ironbound,643980.0
2,Real Pool 3D - Poolians,670290.0
3,弹炸人2222,767400.0
4,Log Challenge,773570.0


In [17]:
# Se mergean los dataframes

df_game_negative_review = df_game_id.merge(df_reviews_negativas, on='item_id', how='right')
df_game_negative_review

Unnamed: 0,app_name,item_id,recommend,sentiment_analysis,year_review
0,Killing Floor,1250.0,True,2,2011
1,,251610.0,True,2,2014
2,Risk of Rain,248820.0,True,2,
3,The Wolf Among Us,250320.0,True,2,2013
4,DARK SOULS™: Prepare To Die™ Edition,211420.0,True,1,2014
...,...,...,...,...,...
55827,PlanetSide 2,218230.0,True,2,2015
55828,Millie,294230.0,True,1,2015
55829,Counter-Strike Nexon: Zombies,273110.0,True,1,2015
55830,Who's Your Daddy,427730.0,True,1,


In [18]:
# Se exporta a un CSV

df_game_negative_review.to_csv('/Users/mlucchesi/Henry/PI/data/csv/procesados/game_negative_review.csv')

In [19]:
df_game_negative_review.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55832 entries, 0 to 55831
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   app_name            50207 non-null  object 
 1   item_id             55832 non-null  float64
 2   recommend           55832 non-null  bool   
 3   sentiment_analysis  55832 non-null  int64  
 4   year_review         46101 non-null  object 
dtypes: bool(1), float64(1), int64(1), object(2)
memory usage: 1.8+ MB


In [20]:
# Llena los valores None en la columna 'year_review' con 0
df_game_negative_review['year_review'] = df_game_negative_review['year_review'].fillna(0)

# Convierte la columna 'year_review' a tipo de datos int
df_game_negative_review['year_review'] = df_game_negative_review['year_review'].astype(int)

In [21]:
# Se define la función

def UsersNotRecommend(year: int):
    df_filtrado = df_game_negative_review[(df_game_negative_review['year_review'] == year) &
                                               (df_game_negative_review['sentiment_analysis'] == 0)]

    if df_filtrado.empty:
        return None

    peores_games = df_filtrado.groupby('app_name')['recommend'].sum().sort_values(ascending=False).head(3)

    result = [{"Top {}".format(i + 1): app_name} for i, (app_name, _) in enumerate(peores_games.items())]

    return result

In [22]:
# Probamos la función

UsersNotRecommend(2014)

[{'Top 1': 'Team Fortress 2'},
 {'Top 2': 'Counter-Strike: Global Offensive'},
 {'Top 3': "Garry's Mod"}]

### ```sentiment_analysis```
_Según el año de lanzamiento, se devuelve una lista con la cantidad de registros de reseñas de usuarios que se encuentren categorizados con un análisis de sentimiento._

In [23]:
# Para optimizar, se extraen solo las columnas necesarias para hacer el merge

# Primero, se obtiene el id del juego con 'sentiment_analysis'

df_id_sentiment = df_user_reviews [['item_id', 'sentiment_analysis']]
df_id_sentiment.head()

Unnamed: 0,item_id,sentiment_analysis
0,1250,2
1,251610,2
2,248820,2
3,250320,2
4,211420,1


In [24]:
# Lo mismo, para el dataframe de steam_games

df_id_yeardev = df_steam_games[['id', 'year_developed']]
df_id_yeardev.head()

Unnamed: 0,id,year_developed
0,761140.0,2018.0
1,643980.0,2018.0
2,670290.0,2017.0
3,767400.0,2017.0
4,773570.0,


In [25]:
# Como se van a unir por id de juego, hay que renombrar las columnas para que coincidan

# En el df de los developers, se cambia 'id' por 'item_id'

df_id_yeardev = df_id_yeardev.rename(columns={'id': 'item_id'})
df_id_yeardev.head()

Unnamed: 0,item_id,year_developed
0,761140.0,2018.0
1,643980.0,2018.0
2,670290.0,2017.0
3,767400.0,2017.0
4,773570.0,


In [26]:
# Se mergean los dataframes

df_id_yeardev_sentiment = df_id_yeardev.merge(df_id_sentiment, on='item_id', how='right')
df_id_yeardev_sentiment.head()

Unnamed: 0,item_id,year_developed,sentiment_analysis
0,1250.0,2009.0,2
1,251610.0,,2
2,248820.0,2013.0,2
3,250320.0,2013.0,2
4,211420.0,2012.0,1


In [34]:
# Se define la función

def sentiment_analysis(year: int):
    df_filtrado = df_id_yeardev_sentiment[df_id_yeardev_sentiment['year_developed'] == year] 

    if df_filtrado.empty:
        return None
    
    count_sentiment_analysis = df_filtrado.groupby('sentiment_analysis').size().to_dict()

    resultado = [f'Negative = {count_sentiment_analysis.get(0, 0)}', f'Positive = {count_sentiment_analysis.get(2, 0)}']

    return resultado


In [35]:
# Pruebo la función

sentiment_analysis(2014)

['Negative = 1405', 'Positive = 3352']

### Exportar los dataframes

```df_id_yeardev_sentiment```

In [36]:
# Exporto el CSV

df_id_yeardev_sentiment.to_csv('/Users/mlucchesi/Henry/PI/data/csv/procesados/year_sentiment.csv', index=False)

In [37]:
# Exporto en parquet, para que la API esté optimizada

df_id_yeardev_sentiment.to_parquet('/Users/mlucchesi/Henry/PI/MLOps/project/PI-MLOps/archivos/year_sentiment.parquet', engine='pyarrow')

```df_developer_negative_review```

In [38]:
# Exporto el CSV

df_game_negative_review.to_csv('/Users/mlucchesi/Henry/PI/data/csv/procesados/game_negative_review.csv', index=False)

In [39]:
# Exporto en parquet, para que la API esté optimizada

df_game_negative_review.to_parquet('/Users/mlucchesi/Henry/PI/MLOps/project/PI-MLOps/archivos/game_negative_review.parquet', engine='pyarrow')