In [1]:
# Creamos el DataFrame
import ast
import pandas as pd

# Load the raw records. Every non-blank line is parsed with ast.literal_eval
# (i.e. as a Python literal rather than strict JSON) and becomes one row.
# - The file object is iterated lazily instead of materializing all lines
#   with readlines().
# - The encoding is pinned so decoding does not depend on the platform default.
#   NOTE(review): assumes the file is UTF-8 (or plain ASCII) -- confirm.
# - Blank lines (e.g. a trailing newline at the end of the file) are skipped,
#   because literal_eval raises on empty input.
with open("steam_games.json", encoding="utf-8") as f:
    rows = [ast.literal_eval(line) for line in f if line.strip()]

df = pd.DataFrame(rows)

In [2]:
# Summarize the raw DataFrame: column names, non-null counts, and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32135 entries, 0 to 32134
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   publisher       24083 non-null  object 
 1   genres          28852 non-null  object 
 2   app_name        32133 non-null  object 
 3   title           30085 non-null  object 
 4   url             32135 non-null  object 
 5   release_date    30068 non-null  object 
 6   tags            31972 non-null  object 
 7   discount_price  225 non-null    float64
 8   reviews_url     32133 non-null  object 
 9   specs           31465 non-null  object 
 10  price           30758 non-null  object 
 11  early_access    32135 non-null  bool   
 12  id              32133 non-null  object 
 13  developer       28836 non-null  object 
 14  sentiment       24953 non-null  object 
 15  metascore       2677 non-null   object 
dtypes: bool(1), float64(1), object(14)
memory usage: 3.7+ MB


In [3]:
# Columns retained for the analysis
columnas_deseadas = [
    "release_date",
    "genres",
    "metascore",
    "price",
    "early_access",
]

# Take an independent copy restricted to those columns, so later edits to
# df_reduced never write back into df
df_reduced = df.loc[:, columnas_deseadas].copy()


In [4]:
# Display the last rows of the reduced DataFrame
df_reduced.tail()

Unnamed: 0,release_date,genres,metascore,price,early_access
32130,2018-01-04,"[Casual, Indie, Simulation, Strategy]",,1.99,False
32131,2018-01-04,"[Casual, Indie, Strategy]",,4.99,False
32132,2018-01-04,"[Indie, Racing, Simulation]",,1.99,False
32133,2017-09-02,"[Casual, Indie]",,4.99,False
32134,,,,4.99,True


In [5]:
# Convert "price" to a numeric column.
# - Entries that are present but not numeric (text labels) are mapped to 0.
# - Entries that are genuinely missing stay NaN, so they can be dropped
#   afterwards instead of being silently recorded as a price of 0, which
#   would corrupt the regression target.
# Reading from df_reduced (not df) keeps the cell safe to re-run.
price_raw = df_reduced['price']
price_numeric = pd.to_numeric(price_raw, errors='coerce')

# True only where a value existed but could not be parsed as a number
price_is_label = price_raw.notna() & price_numeric.isna()

df_reduced['price'] = price_numeric.mask(price_is_label, 0.0)

In [6]:
# Drop, in place, the rows whose "price" is null.
# This only removes rows if NaN prices are still present at this point.
df_reduced.dropna(subset=['price'], inplace=True)

In [7]:
# Drop, in place, the rows whose "metascore" is null
df_reduced.dropna(subset=['metascore'], inplace=True)

In [8]:


# Treat the literal string "NA" as missing, then coerce the column to numbers;
# any other entry that cannot be parsed also becomes NaN
metascore_clean = df_reduced["metascore"].replace("NA", pd.NA)
df_reduced["metascore"] = pd.to_numeric(metascore_clean, errors='coerce')

# Remove, in place, the rows left without a numeric metascore
df_reduced.dropna(subset=["metascore"], inplace=True)


In [9]:
# Check row count and dtypes after cleaning "price" and "metascore"
df_reduced.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2607 entries, 28 to 32117
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   release_date  2553 non-null   object 
 1   genres        2545 non-null   object 
 2   metascore     2607 non-null   float64
 3   price         2607 non-null   float64
 4   early_access  2607 non-null   bool   
dtypes: bool(1), float64(2), object(2)
memory usage: 104.4+ KB


In [10]:
# Release year extracted from "release_date".
# errors="coerce" turns any unparseable date string into NaT (NaN year)
# instead of aborting the cell, matching how the other columns are coerced;
# rows with a NaN year can then be removed with dropna.
df_reduced["year"] = pd.to_datetime(df_reduced["release_date"], errors="coerce").dt.year

# One-hot encode "genres": each list is joined into a comma-separated string
# and then expanded into one 0/1 indicator column per distinct genre.
generos_dummies = df_reduced["genres"].str.join(",").str.get_dummies(sep=",")
df_reduced = pd.concat([df_reduced, generos_dummies], axis=1)

# The source columns are now represented by "year" and the indicator columns
df_reduced.drop(["release_date", "genres"], axis=1, inplace=True)


In [11]:
# Drop, in place, the rows of df_reduced whose "year" is NaN
df_reduced.dropna(subset=["year"], inplace=True)

In [12]:
# Inspect the final feature table (numeric columns plus genre indicators)
df_reduced.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2553 entries, 28 to 32117
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   metascore              2553 non-null   float64
 1   price                  2553 non-null   float64
 2   early_access           2553 non-null   bool   
 3   year                   2553 non-null   float64
 4   Action                 2553 non-null   int64  
 5   Adventure              2553 non-null   int64  
 6   Casual                 2553 non-null   int64  
 7   Early Access           2553 non-null   int64  
 8   Free to Play           2553 non-null   int64  
 9   Indie                  2553 non-null   int64  
 10  Massively Multiplayer  2553 non-null   int64  
 11  RPG                    2553 non-null   int64  
 12  Racing                 2553 non-null   int64  
 13  Simulation             2553 non-null   int64  
 14  Sports                 2553 non-null   int64  
 15  St

In [13]:

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# "price" is the prediction target; every other column of df_reduced is a feature
y = df_reduced['price']
X = df_reduced.drop('price', axis=1)

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a multiple linear regression on the training split
model = LinearRegression().fit(X_train, y_train)

# Predict prices for the held-out rows
y_pred = model.predict(X_test)


In [14]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Mean squared error between the held-out prices and the predictions
mse = mean_squared_error(y_test, y_pred)

# Root mean squared error: same units as "price"
rmse = np.sqrt(mse)

# Report the RMSE
print("Root Mean Squared Error:", rmse)

Root Mean Squared Error: 8.364149912715206


In [15]:
import pickle

# Serialize the fitted model to model.pkl.
# NOTE: pickle files must only be loaded from trusted sources.
with open('model.pkl', 'wb') as file:
    pickle.dump(model, file)

In [16]:
# Residuals: actual price minus predicted price
diferencias = y_test - y_pred

# Table with the actual price, the prediction, and the residual side by side
resultados = pd.DataFrame({'Precio Real': y_test, 'Predicción': y_pred, 'Diferencia': diferencias})

# Show ten examples: the rows at positions 10 through 19 of the test set
print(resultados.iloc[10:20])

       Precio Real  Predicción  Diferencia
2845         14.99   13.166942    1.823058
28690        14.99   15.319067   -0.329067
18950        44.99   19.424351   25.565649
31840         9.99   11.831373   -1.841373
838           9.99   19.055511   -9.065511
28385         9.99   12.933436   -2.943436
31381         9.99   16.409726   -6.419726
16046        19.99   22.064652   -2.074652
24354         9.99    9.582067    0.407933
142           9.99   13.527662   -3.537662
