In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.io as pio
import scipy.stats as st
import math
import datetime
import missingno as msno
from scipy.stats import norm, skew
from sklearn import metrics
from collections import Counter

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.metrics import mean_squared_error, mean_squared_log_error, r2_score
from sklearn import model_selection
from sklearn.pipeline import make_pipeline
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV
from mlxtend.regressor import StackingCVRegressor

### Adquisicion de datos



In [4]:
vg = pd.read_csv("vgsales.csv")
df = vg.copy()
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16539 entries, 0 to 16538
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Name          16539 non-null  object 
 1   Platform      16539 non-null  object 
 2   Year          16290 non-null  float64
 3   Genre         16539 non-null  object 
 4   Publisher     16539 non-null  object 
 5   NA_Sales      16539 non-null  float64
 6   EU_Sales      16539 non-null  float64
 7   JP_Sales      16539 non-null  float64
 8   Other_Sales   16539 non-null  float64
 9   Global_Sales  16539 non-null  float64
dtypes: float64(6), object(4)
memory usage: 1.3+ MB


### Limpieza de datos


In [8]:
# Eliminar duplicados
df = df.drop_duplicates()

# Eliminar filas con valores nulos en 'Year'
df = df.dropna(subset=['Year'])

# Actualizar la columna 'Release_Date' después de limpiar
df['Release_Date'] = pd.to_datetime(df['Year'].astype(int).astype(str) + '-01-01', errors='coerce')

# Guardar una copia limpia por si el usuario desea usarla más adelante
cleaned_file_path = 'D:\escUelea\Facu\FCFM\\7° semesse\Minería de datos\Mineria\MineriaDeDatos1950644/vgsales_cleaned.csv'
df.to_csv(cleaned_file_path, index=False)

# Mostrar ruta del archivo limpio y primeras filas
df.shape, df.head()


  cleaned_file_path = 'D:\escUelea\Facu\FCFM\\7° semesse\Minería de datos\Mineria\MineriaDeDatos1950644/vgsales_cleaned.csv'


((16290, 11),
          Name Platform    Year     Genre   Publisher  NA_Sales  EU_Sales  \
 0     Kaboom!     2600  1980.0      Misc  Activision      1.07      0.07   
 1      Boxing     2600  1980.0  Fighting  Activision      0.72      0.04   
 2  Ice Hockey     2600  1980.0    Sports  Activision      0.46      0.03   
 3     Freeway     2600  1980.0    Action  Activision      0.32      0.02   
 4      Bridge     2600  1980.0      Misc  Activision      0.25      0.02   
 
    JP_Sales  Other_Sales  Global_Sales Release_Date  
 0       0.0         0.01          1.15   1980-01-01  
 1       0.0         0.01          0.77   1980-01-01  
 2       0.0         0.01          0.49   1980-01-01  
 3       0.0         0.00          0.34   1980-01-01  
 4       0.0         0.00          0.27   1980-01-01  )

### Estadisticas descriptivas


In [9]:
# Cargar el dataset limpio
df = pd.read_csv("vgsales_cleaned.csv")

# Estadísticas de columnas numéricas
print("Estadísticas numéricas:")
print(df.describe())

# Cantidad de juegos por género
print("\nCantidad de juegos por género:")
print(df['Genre'].value_counts())

# Cantidad de juegos por plataforma
print("\nCantidad de juegos por plataforma:")
print(df['Platform'].value_counts())

# Top 10 publishers
print("\nTop 10 publishers:")
print(df['Publisher'].value_counts().head(10))

# Correlaciones entre variables numéricas
print("\nCorrelación entre variables numéricas:")
print(df.corr(numeric_only=True))

Estadísticas numéricas:
               Year      NA_Sales      EU_Sales      JP_Sales   Other_Sales  \
count  16290.000000  16290.000000  16290.000000  16290.000000  16290.000000   
mean    2006.404727      0.265646      0.147740      0.078838      0.048428   
std        5.831618      0.822457      0.509317      0.311888      0.190089   
min     1980.000000      0.000000      0.000000      0.000000      0.000000   
25%     2003.000000      0.000000      0.000000      0.000000      0.000000   
50%     2007.000000      0.080000      0.020000      0.000000      0.010000   
75%     2010.000000      0.240000      0.110000      0.040000      0.040000   
max     2017.000000     41.490000     29.020000     10.220000     10.570000   

       Global_Sales  
count  16290.000000  
mean       0.540926  
std        1.567391  
min        0.010000  
25%        0.060000  
50%        0.170000  
75%        0.480000  
max       82.740000  

Cantidad de juegos por género:
Genre
Action          3251
Sports 