In [1]:
import numpy as np
import pandas as pd
from textblob import TextBlob
import ast
import pyarrow as pa
import pyarrow.parquet as pq
from datetime import datetime

import warnings
warnings.filterwarnings(action= 'ignore')

La finalidad de este feature engineer es realizar la modelacion de los datos para dejar nuestros dataframes lo mejor organizados y formateados, ya que posteriormente los utilizaremos tanto para ser consumidos por la api, como para nuestro modelo de recomendacion

Importamos nuestros datasets, para realizar el feature engineer.

In [2]:
df_games = pd.read_parquet('../data/ETL2_steam_games.parquet')


In [4]:
df_items = pd.read_parquet('../data/ETL1_users_items.parquet')

In [6]:
df_reviews = pd.read_parquet('../data/ETL2_users_reviews.parquet')

### Analisis de Sentimiento


Aqui lo que vamos a realizar es un analisis de sentimiento, aplicando la tecnica de NCP de lenguaje 

In [7]:
# tomamos de el dataframe df_games, las columnas 'developer' y 'id'
developer = df_games[['id', 'developer']]

# renombramos la columna 'id' para luego utilizarla como llave para la union con el dataframe de los demas datos que vamos a utilizar
developer = developer.rename(columns={'id':'reviews_item_id'})

In [8]:
# trabajamos con la columna reviews_review de la tabla df_reviews del archivo user_reviews

reviews= TextBlob(df_reviews['reviews_review'][0])
a= reviews.sentiment
a

Sentiment(polarity=0.17444444444444446, subjectivity=0.3796031746031746)

escala: 

* debe tomar el valor '0' si es malo, '1' si es neutral y '2' si es positivo
* De no ser posible este análisis por estar ausente la reseña escrita, debe tomar el valor de 1

In [9]:
def get_analysis(review) -> int:

    if review == None:
        return 1
    
    reviews= TextBlob(review)
    polaridad= reviews.sentiment.polarity

    

    if polaridad > 0.1:
        return 2
    elif polaridad < -0.1:
        return 0
    else:
        return 1

In [10]:
df_reviews['sentiment_analysis'] = df_reviews['reviews_review'].apply(lambda x: get_analysis(x))

In [11]:
for x in range(1,8):
    print(f'Analisis de sentimiento (valor): {df_reviews["sentiment_analysis"].iloc[x]}\tReview valorizada: {df_reviews["reviews_review"].iloc[x]}')

Analisis de sentimiento (valor): 2	Review valorizada: I know what you think when you see this title "Barbie Dreamhouse Party" but do not be intimidated by it's title, this is easily one of my GOTYs. You don't get any of that cliche game mechanics that all the latest games have, this is simply good core gameplay. Yes, you can't 360 noscope your friends, but what you can do is show them up with your bad ♥♥♥ dance moves and put them to shame as you show them what true fashion and color combinations are.I know this game says for kids but, this is easily for any age range and any age will have a blast playing this.8/8
Analisis de sentimiento (valor): 2	Review valorizada: A suitably punishing roguelike platformer.  Winning feels good.  Progressive unlocks mean a good slog ending in failure doesn't feel like a waste.
Analisis de sentimiento (valor): 2	Review valorizada: This game... is so fun. The fight sequences have been improved from walking dead. It also includes more of a Sam and Max puz

In [12]:
# eliminamos la columna 'reviews_review' ya que ya contamos con la columna sentiment_analysis, y las las columnas 'user_url' y 'reviews_helpful', ya que no presentan informacion reelevante para el analisis.

df_reviews.drop(columns= ['user_url', 'reviews_helpful', 'reviews_review'], inplace= True)

In [13]:
# guardamos como archivo .csv y como .parquet el dataframe df_reviews para utilizarlo posteriormente en el EDA y la construccion del modelo de recomendacion
df_reviews.to_csv('../archivos_csv/users_reviews_final.csv', index=False, encoding='utf-8')
pq.write_table(pa.Table.from_pandas(df_reviews), '../data/users_reviews_final.parquet')

In [14]:
# unimos ambos dataframes (developer, df_reviews)

df_reviews = df_reviews.merge(developer, on='reviews_item_id')
df_reviews

Unnamed: 0,user_id,reviews_posted,reviews_item_id,reviews_recommend,sentiment_analysis,developer
0,76561197970982479,2011-11-05,1250,True,2,Tripwire Interactive
1,EndAtHallow,2015-01-15,1250,True,1,Tripwire Interactive
2,76561198107847795,2014-12-12,1250,True,1,Tripwire Interactive
3,usaidwotnow,2013-12-13,1250,True,0,Tripwire Interactive
4,76561198081529182,2014-08-19,1250,True,1,Tripwire Interactive
...,...,...,...,...,...,...
125595,76561198029064257,2012-10-22,16600,True,2,Redlynx
125596,76561198029064257,2012-10-22,16600,True,2,Redlynx
125597,Darkjet15,2014-01-19,232950,True,1,Halycon Media GmbH &amp; Co. KG
125598,themajesticlemon,2013-12-31,205080,False,0,Gaijin Games


In [15]:
df_sentimiento_x_desarrollador = df_reviews[['developer', 'sentiment_analysis']]

In [16]:
df_sentimiento_x_desarrollador

Unnamed: 0,developer,sentiment_analysis
0,Tripwire Interactive,2
1,Tripwire Interactive,1
2,Tripwire Interactive,1
3,Tripwire Interactive,0
4,Tripwire Interactive,1
...,...,...
125595,Redlynx,2
125596,Redlynx,2
125597,Halycon Media GmbH &amp; Co. KG,1
125598,Gaijin Games,0


### Cantidad de items y porcentaje de contenido Free por año según empresa desarrolladora

In [17]:
muestra = df_games.drop_duplicates(subset= 'app_name').reset_index(drop= True)
muestra = muestra[['developer', 'app_name', 'release_date', 'price']]
muestra

Unnamed: 0,developer,app_name,release_date,price
0,Kotoshiro,Lost Summoner Kitty,2018,4.99
1,Secret Level SRL,Ironbound,2018,0.00
2,Poolians.com,Real Pool 3D - Poolians,2017,0.00
3,彼岸领域,弹炸人2222,2017,0.99
4,Trickjump Games Ltd,Battle Royale Trainer,2018,3.99
...,...,...,...,...
28822,Bidoniera Games,Kebab it Up!,2018,1.99
28823,"Nikita ""Ghost_RUS""",Colony On Mars,2018,1.99
28824,Sacada,LOGistICAL: South Africa,2018,4.99
28825,Laush Dmitriy Sergeevich,Russian Roads,2018,1.99


In [18]:
muestra[muestra['developer'] == '2Chance Projects,IIchan Eroge Team']

Unnamed: 0,developer,app_name,release_date,price
21382,"2Chance Projects,IIchan Eroge Team",Frosty Kiss,2015,0.0


In [19]:
muestra['contenido_free'] = muestra['price'] == 0.00
muestra

Unnamed: 0,developer,app_name,release_date,price,contenido_free
0,Kotoshiro,Lost Summoner Kitty,2018,4.99,False
1,Secret Level SRL,Ironbound,2018,0.00,True
2,Poolians.com,Real Pool 3D - Poolians,2017,0.00,True
3,彼岸领域,弹炸人2222,2017,0.99,False
4,Trickjump Games Ltd,Battle Royale Trainer,2018,3.99,False
...,...,...,...,...,...
28822,Bidoniera Games,Kebab it Up!,2018,1.99,False
28823,"Nikita ""Ghost_RUS""",Colony On Mars,2018,1.99,False
28824,Sacada,LOGistICAL: South Africa,2018,4.99,False
28825,Laush Dmitriy Sergeevich,Russian Roads,2018,1.99,False


In [20]:
# agrupamos nuestros datos primero por desarrollador, y luego por año de lanzamiento. A su vez agregamos las columnas 'app_name' que contiene el conteo total de juegos desarrollados por 
# el desarrollador en el año correspondiente, y la columna 'contenido_free', que contiene la sumatoria de sus valores (recordar que el valor booleano True tiene por defecto valor 1 en operaciones matematicas y el False 0)
df_dev_free = muestra.groupby(['developer', 'release_date']).agg({'app_name': 'count','contenido_free': 'sum'}).reset_index()

#renombramos la columna 'app_name' a 'Cantidad_de_items'
df_dev_free = df_dev_free.rename(columns= {'app_name': 'Cantidad_de_items'})

In [21]:
# creamos la columna Contenido_free que va a contener el porcentaje de contenido free del contenido total que saco cada desarrollador en el año correspondiente
df_dev_free['Contenido_free (%)'] = round(df_dev_free['contenido_free'] / df_dev_free['Cantidad_de_items'] * 100).astype(int)

#transformamos el tipo de dato a str para agregar una mejor visualizacion del dato en el momento de la consulta
df_dev_free['Contenido_free (%)'] = df_dev_free['Contenido_free (%)'].astype(str) + '%'

In [22]:
#eliminamos la columna conteindo_free
df_dev_free.drop(columns= 'contenido_free', inplace= True)

In [23]:
df_dev_free

Unnamed: 0,developer,release_date,Cantidad_de_items,Contenido_free (%)
0,+7 Software,2016,1,0%
1,"+Mpact Games, LLC.",2017,1,0%
2,.M.Y.W.,2016,1,0%
3,.ez Games,2017,1,0%
4,07th Expansion,2015,2,0%
...,...,...,...,...
14995,萌石游戏,2017,1,0%
14996,高考恋爱委员会,2015,1,100%
14997,"高考恋爱委员会,Days",2015,1,0%
14998,"高考恋爱委员会,橘子班",2015,1,0%


###  Cantidad de dinero gastado por el usuario, el porcentaje de recomendación y cantidad de items

In [24]:
games = df_games[['id', 'price']].drop_duplicates().reset_index(drop= True)
games = games.rename(columns= {'id': 'item_id'})

In [25]:
muestra_reviews = df_reviews[[ 'user_id','reviews_item_id','reviews_recommend']]
muestra_reviews = muestra_reviews.rename(columns= {'reviews_item_id': 'item_id'})

In [26]:
muestra_reviews

Unnamed: 0,user_id,item_id,reviews_recommend
0,76561197970982479,1250,True
1,EndAtHallow,1250,True
2,76561198107847795,1250,True
3,usaidwotnow,1250,True
4,76561198081529182,1250,True
...,...,...,...
125595,76561198029064257,16600,True
125596,76561198029064257,16600,True
125597,Darkjet15,232950,True
125598,themajesticlemon,205080,False


In [27]:
muestra_agrupada = muestra_reviews.groupby('user_id').agg({'item_id': 'count', 'reviews_recommend': 'sum'}).reset_index().rename(columns= {'item_id': 'Cantidad_items'})
muestra_agrupada = muestra_agrupada.rename(columns= {'item_id': 'Cantidad_items'})
# muestra_agrupada = muestra_agrupada.merge(muestra_reviews[['user_id', 'item_id']], how= 'left', on= 'user_id').reset_index(drop= True)

# muestra_final = muestra_agrupada.groupby(['user_id', 'item_id'])
muestra_agrupada2 = muestra_reviews.merge(muestra_agrupada[['user_id', 'Cantidad_items']], on='user_id', how='left')
# muestra_final[muestra_final['item_id'] != muestra_final['reviews_recommend']]
muestra = muestra_agrupada2.groupby(['user_id','item_id'] ).agg({'Cantidad_items': 'sum', 'reviews_recommend': 'sum'}).reset_index()


In [28]:
games_reviews = muestra.merge(games, on='item_id', how='left')
games_reviews

Unnamed: 0,user_id,item_id,Cantidad_items,reviews_recommend,price
0,--000--,1250,1,1,19.99
1,--ace--,440,12,2,0.00
2,--ace--,113200,24,4,4.99
3,--ionex--,730,5,1,14.99
4,--ionex--,105600,20,4,9.99
...,...,...,...,...,...
49458,zyr0n1c,4000,30,2,9.99
49459,zyr0n1c,8980,30,2,19.99
49460,zyr0n1c,17470,15,1,19.99
49461,zyr0n1c,208090,45,3,0.00


In [29]:
df_games_reviews = games_reviews.groupby('user_id').agg({'Cantidad_items': 'sum', 'reviews_recommend': 'sum', 'price': 'sum'}).reset_index()
df_games_reviews['Dinero_gastado'] = df_games_reviews['price'] * df_games_reviews['Cantidad_items']
df_games_reviews['Recomendacion (%)'] = round((df_games_reviews['reviews_recommend'] / df_games_reviews['Cantidad_items']) * 100).astype(int)
df_games_reviews

Unnamed: 0,user_id,Cantidad_items,reviews_recommend,price,Dinero_gastado,Recomendacion (%)
0,--000--,1,1,19.99,19.99,100
1,--ace--,36,6,4.99,179.64,17
2,--ionex--,25,5,24.98,624.50,20
3,-2SV-vuLB-Kg,25,5,64.97,1624.25,20
4,-Azsael-,4,2,29.99,119.96,50
...,...,...,...,...,...,...
23256,zvanik,144,12,14.99,2158.56,8
23257,zwanzigdrei,4,2,0.00,0.00,50
23258,zy0705,4,2,0.00,0.00,50
23259,zynxgameth,16,4,0.00,0.00,25


In [30]:
df_games_reviews.drop(columns= ['reviews_recommend', 'price'], inplace= True)

In [31]:
df_games_reviews

Unnamed: 0,user_id,Cantidad_items,Dinero_gastado,Recomendacion (%)
0,--000--,1,19.99,100
1,--ace--,36,179.64,17
2,--ionex--,25,624.50,20
3,-2SV-vuLB-Kg,25,1624.25,20
4,-Azsael-,4,119.96,50
...,...,...,...,...
23256,zvanik,144,2158.56,8
23257,zwanzigdrei,4,0.00,50
23258,zy0705,4,0.00,50
23259,zynxgameth,16,0.00,25


In [32]:
df_games_reviews.duplicated().value_counts()

False    23261
dtype: int64

 ### Top 3 de desarrolladores con juegos MÁS recomendados por usuarios para el año dado

In [33]:
df_genre_sentiment = df_reviews[['reviews_item_id', 'reviews_posted', 'reviews_recommend', 'sentiment_analysis']]

In [34]:
df_genre_sentiment = df_genre_sentiment.rename(columns= {'reviews_item_id': 'item_id'})

In [35]:
df_col_games = df_games[['id', 'app_name', 'developer']]

In [36]:
df_col_games = df_col_games.rename(columns= {'id': 'item_id'})

In [37]:
agg_genero = df_genre_sentiment.merge(df_col_games, on= 'item_id')

In [38]:
agg_genero 

Unnamed: 0,item_id,reviews_posted,reviews_recommend,sentiment_analysis,app_name,developer
0,1250,2011-11-05,True,2,Killing Floor,Tripwire Interactive
1,1250,2015-01-15,True,1,Killing Floor,Tripwire Interactive
2,1250,2014-12-12,True,1,Killing Floor,Tripwire Interactive
3,1250,2013-12-13,True,0,Killing Floor,Tripwire Interactive
4,1250,2014-08-19,True,1,Killing Floor,Tripwire Interactive
...,...,...,...,...,...,...
411047,232950,2014-01-19,True,1,Bridge Project,Halycon Media GmbH &amp; Co. KG
411048,205080,2013-12-31,False,0,BIT.TRIP FATE,Gaijin Games
411049,205080,2013-12-31,False,0,BIT.TRIP FATE,Gaijin Games
411050,205080,2013-12-31,False,0,BIT.TRIP FATE,Gaijin Games


In [39]:
def convertir_fecha(fecha):
    try:
        return datetime.strptime(str(fecha), '%Y-%m-%d').strftime("%Y")
    except ValueError:
        return 'dato invalido'

In [40]:
## debemos extraer el valor del año de la columna reviews_posted.
    

agg_genero['reviews_posted'] = agg_genero['reviews_posted'].apply(lambda x: convertir_fecha(x))

In [41]:
agg_genero

Unnamed: 0,item_id,reviews_posted,reviews_recommend,sentiment_analysis,app_name,developer
0,1250,2011,True,2,Killing Floor,Tripwire Interactive
1,1250,2015,True,1,Killing Floor,Tripwire Interactive
2,1250,2014,True,1,Killing Floor,Tripwire Interactive
3,1250,2013,True,0,Killing Floor,Tripwire Interactive
4,1250,2014,True,1,Killing Floor,Tripwire Interactive
...,...,...,...,...,...,...
411047,232950,2014,True,1,Bridge Project,Halycon Media GmbH &amp; Co. KG
411048,205080,2013,False,0,BIT.TRIP FATE,Gaijin Games
411049,205080,2013,False,0,BIT.TRIP FATE,Gaijin Games
411050,205080,2013,False,0,BIT.TRIP FATE,Gaijin Games


In [42]:
agg_genero.drop(columns= 'item_id', inplace= True)

In [43]:
user_recom_dev_pos =agg_genero[(agg_genero['reviews_recommend'] == True) & (agg_genero['sentiment_analysis'] == 2)].drop(columns= 'app_name')

In [44]:
user_recom_dev_pos.drop(columns= 'reviews_recommend', inplace= True)

In [45]:
user_recom_dev_pos = user_recom_dev_pos.rename(columns= {'sentiment_analysis': 'most_recommended_developer'})

In [46]:
user_recom_dev_pos

Unnamed: 0,reviews_posted,most_recommended_developer,developer
0,2011,2,Tripwire Interactive
8,2015,2,Tripwire Interactive
9,2014,2,Tripwire Interactive
11,2014,2,Tripwire Interactive
14,2013,2,Tripwire Interactive
...,...,...,...
411042,2012,2,Redlynx
411043,2012,2,Redlynx
411044,2012,2,Redlynx
411045,2012,2,Redlynx


In [47]:
# de la columna 'most_recommended_game'
user_recom_dev_pos = pd.DataFrame(user_recom_dev_pos.groupby(['reviews_posted', 'developer']).count().sort_values(by=['reviews_posted', 'most_recommended_developer'], ascending=[False, False]).reset_index())

In [48]:
user_recom_dev_pos

Unnamed: 0,reviews_posted,developer,most_recommended_developer
0,dato invalido,Facepunch Studios,1628
1,dato invalido,Smartly Dressed Games,1625
2,dato invalido,"Psyonix, Inc.",1120
3,dato invalido,"Studio Wildcard,Instinct Games,Efecto Studios,...",997
4,dato invalido,Valve,892
...,...,...,...
2842,2010,Nadeo,1
2843,2010,Team17 Digital Ltd,1
2844,2010,Zaratustra Productions,1
2845,2010,Zoë Mode,1


In [49]:
## tomamos los registros en donde si existe un valor numerico (un año) para el campo 'reviews_posted'
user_recom_dev_pos = user_recom_dev_pos[user_recom_dev_pos['reviews_posted'] != 'dato invalido'].reset_index(drop= True)

In [50]:
user_recom_dev_pos

Unnamed: 0,reviews_posted,developer,most_recommended_developer
0,2015,Facepunch Studios,3180
1,2015,Smartly Dressed Games,3000
2,2015,Valve,2357
3,2015,"Studio Wildcard,Instinct Games,Efecto Studios,...",1950
4,2015,Freejam,1908
...,...,...,...
2124,2010,Nadeo,1
2125,2010,Team17 Digital Ltd,1
2126,2010,Zaratustra Productions,1
2127,2010,Zoë Mode,1


### Usuario con mas horas de juego acumuladas de un genero

In [51]:
# Extraemos de df_items las columnas 'playtime_forever', 'item_id y 'user_id'

playtime= df_items[['playtime_forever', 'item_id', 'user_id']]

In [52]:
#de df_games extraemos se extraen las columnas 'id', 'genres' y 'release_date

genre_item = df_games[['genres', 'id', 'release_date']]
# Renombramos la columna 'id' para que coincida con la columna item_id de df_items y asi poder joinearlas
genre_item = genre_item.rename(columns={'id':'item_id'})

In [53]:
# unimos ambos dataframes

df_playtime_user = playtime.merge(genre_item, on= 'item_id')

In [54]:
df_playtime_user

Unnamed: 0,playtime_forever,item_id,user_id,genres,release_date
0,6,10,76561197970982479,Action,2000
1,0,10,js41637,Action,2000
2,0,10,Riot-Punch,Action,2000
3,93,10,doctr,Action,2000
4,108,10,corrupted_soul,Action,2000
...,...,...,...,...,...
9877299,164,354280,76561198107283457,Indie,2016
9877300,164,354280,76561198107283457,Simulation,2016
9877301,0,433920,inven,Adventure,2016
9877302,0,433920,inven,Indie,2016


In [55]:
#eliminamos la columna 'item_id', porque ya no es necesaria
df_playtime_user.drop(columns= 'item_id', inplace= True)

In [56]:
#agrupamos por genero primero, y por año despues. Realizamos a su vez, la sumatoria de las horas jugadas por genero por año
df_playtime_user = df_playtime_user.groupby(['genres', 'user_id', 'release_date'])['playtime_forever'].sum().reset_index()
df_playtime_user

Unnamed: 0,genres,user_id,release_date,playtime_forever
0,Action,--000--,2009,5329
1,Action,--000--,2010,22
2,Action,--000--,2011,6522
3,Action,--000--,2012,109346
4,Action,--000--,2013,363
...,...,...,...,...
3449993,Web Publishing,zepavil,2015,9010
3449994,Web Publishing,zeshirky,2007,1
3449995,Web Publishing,zevlupine,2012,4
3449996,Web Publishing,zilaman,2013,9


In [57]:
# para una mejor optimizacion, tomaremos de este dataframe, los registros del usuario con mayor valor de playtime_forever(horas de juego) total para cada genero
new_df= None
df_playtime_user_final = None
generos = df_playtime_user['genres'].unique()  # obtenemos los valores unicos de genero
for genero in generos:  
    temp = df_playtime_user[df_playtime_user['genres'] == genero]  # segmentamos y tomamos los registros de dicho genero iterado
    max_playtime_user = temp.groupby('user_id')['playtime_forever'].sum().idxmax()  # agrupamos nuevamente los datos por usuario y obtenemos el usuario con el valor mas alto de playtime_forever
    new_df= pd.DataFrame(temp[temp['user_id'] == max_playtime_user])  # creamos un dataframe con los valores del usuario que cuenta con mayor cantidad de horas jugadas para el genero asignado
    if df_playtime_user_final is None:  # Si es el primer dataframe que creamos, lo asignamos 
        df_playtime_user_final = new_df 
    else:
        df_playtime_user_final = pd.concat([df_playtime_user_final, new_df])  # concatenamos el dataframe generado a df_playtime_user_final

In [58]:
df_playtime_user_final = df_playtime_user_final.reset_index(drop= True)

In [59]:
df_playtime_user_final

Unnamed: 0,genres,user_id,release_date,playtime_forever
0,Action,Sp3ctre,1993,0
1,Action,Sp3ctre,1995,217
2,Action,Sp3ctre,1996,0
3,Action,Sp3ctre,1998,0
4,Action,Sp3ctre,1999,44
...,...,...,...,...
165,Utilities,76561198073642113,2014,207651
166,Video Production,ScottyG555,2015,168314
167,Web Publishing,Xyphien,2005,7296
168,Web Publishing,Xyphien,2012,64657


In [60]:
#  Entendemos que los valores registrados en playtime_forever son en minutos, por o que los pasamos a su equivalente en horas

df_playtime_user_final['playtime_forever'] = df_playtime_user_final['playtime_forever'].apply(lambda x: int(x/60))

In [61]:
# renombramos la columna playtime_forever por playtime_hours

df_playtime_user_final = df_playtime_user_final.rename(columns= {'playtime_forever': 'playtime_hours'})
df_playtime_user_final

Unnamed: 0,genres,user_id,release_date,playtime_hours
0,Action,Sp3ctre,1993,0
1,Action,Sp3ctre,1995,3
2,Action,Sp3ctre,1996,0
3,Action,Sp3ctre,1998,0
4,Action,Sp3ctre,1999,0
...,...,...,...,...
165,Utilities,76561198073642113,2014,3460
166,Video Production,ScottyG555,2015,2805
167,Web Publishing,Xyphien,2005,121
168,Web Publishing,Xyphien,2012,1077


In [62]:
df_games.dtypes

publisher        object
genres           object
app_name         object
release_date     object
price           float64
early_access    float64
id                int64
developer        object
dtype: object

### Cargamos los Datasets

In [63]:
dfs = [df_sentimiento_x_desarrollador, df_dev_free, df_games_reviews, df_playtime_user_final, user_recom_dev_pos]
# lista de nombres de cada dataframe
names = ['df_sentimiento_x_desarrollador', 'df_dev_free', 'df_games_reviews', 'df_playtime_user_final', 'df_user_recom_dev_pos']

for df, name in zip(dfs, names):
    archivo = f'../data/{name}_unido.csv'
    df.to_csv(archivo, index=False, encoding='utf-8')



In [64]:
for df, name in zip(dfs, names):
    archivo = f'../data/{name}.parquet'
    pq.write_table(pa.Table.from_pandas(df), archivo)