### Librerías

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
from joblib import load
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error

### Información general sobre el DataFrame

In [2]:
# Download the scoring dataset (features only, no 'Sales' target) into `df`.
url = (
    "https://raw.githubusercontent.com/data-bootcamp-v4/data/main/"
    "ironkaggle_notarget.csv"
)
df = pd.read_csv(url)

In [3]:
# (rows, columns) of the freshly loaded frame.
df.shape

(71205, 9)

In [4]:
# Preview the first rows of the dataset.
df.head()

Unnamed: 0,True_index,Store_ID,Day_of_week,Date,Nb_customers_on_day,Open,Promotion,State_holiday,School_holiday
0,7,764,4,2013-12-26,0,0,0,c,1
1,19,22,3,2013-05-22,449,1,0,0,1
2,31,1087,6,2013-06-29,622,1,0,0,0
3,45,139,6,2013-08-17,314,1,0,0,0
4,56,568,1,2014-04-07,356,1,0,0,0


### Transformación de los datos

In [5]:
# Parse the 'Date' column into pandas datetimes so date arithmetic works below.
df['Date'] = pd.to_datetime(df['Date'])

# Earliest date present in the data.
primera_fecha = df['Date'].min()

print("La primera fecha en la columna 'Date' es:", primera_fecha)


La primera fecha en la columna 'Date' es: 2013-01-01 00:00:00


In [6]:
# Reference date for the elapsed-days feature. It equals the earliest date
# printed by the previous cell (2013-01-01).
# NOTE(review): kept as a fixed literal rather than df['Date'].min() because the
# feature presumably has to use the same origin as at training time — confirm.
fecha_referencia = pd.to_datetime('2013-01-01')

# 'date -1' = whole days elapsed between each row's Date and the reference date.
# (`df` is already a DataFrame, so no re-wrapping with pd.DataFrame is needed.)
df['date -1'] = (df['Date'] - fecha_referencia).dt.days

# Preview the frame to check the new column.
df.head()


Unnamed: 0,True_index,Store_ID,Day_of_week,Date,Nb_customers_on_day,Open,Promotion,State_holiday,School_holiday,date -1
0,7,764,4,2013-12-26,0,0,0,c,1,359
1,19,22,3,2013-05-22,449,1,0,0,1,141
2,31,1087,6,2013-06-29,622,1,0,0,0,179
3,45,139,6,2013-08-17,314,1,0,0,0,228
4,56,568,1,2014-04-07,356,1,0,0,0,461


In [7]:
# Make sure 'Day_of_week' is numeric. With errors='coerce' any unparseable value
# becomes NaN, and that NaN propagates into 'Seno' / 'Coseno' below.
df['Day_of_week'] = pd.to_numeric(df['Day_of_week'], errors='coerce')

# Cyclical encoding of the weekday: map the day onto a circle with period 7 and
# store its sine and cosine. The angle is computed once and reused.
day_angle = 2 * np.pi * df['Day_of_week'] / 7
df['Seno'] = np.sin(day_angle)
df['Coseno'] = np.cos(day_angle)

# Preview the resulting frame.
df.head()


Unnamed: 0,True_index,Store_ID,Day_of_week,Date,Nb_customers_on_day,Open,Promotion,State_holiday,School_holiday,date -1,Seno,Coseno
0,7,764,4,2013-12-26,0,0,0,c,1,359,-0.433884,-0.900969
1,19,22,3,2013-05-22,449,1,0,0,1,141,0.433884,-0.900969
2,31,1087,6,2013-06-29,622,1,0,0,0,179,-0.781831,0.62349
3,45,139,6,2013-08-17,314,1,0,0,0,228,-0.781831,0.62349
4,56,568,1,2014-04-07,356,1,0,0,0,461,0.781831,0.62349


In [8]:
# Column dtypes after the date and weekday transformations.
df.dtypes

True_index                      int64
Store_ID                        int64
Day_of_week                     int64
Date                   datetime64[ns]
Nb_customers_on_day             int64
Open                            int64
Promotion                       int64
State_holiday                  object
School_holiday                  int64
date -1                         int64
Seno                          float64
Coseno                        float64
dtype: object

In [9]:
# One-hot encode 'State_holiday'.
# The levels are pinned explicitly so the columns State_holiday_a/_b/_c are
# always produced, even if one of them is absent from this particular file;
# drop_first=True removes the first level ('0'), which acts as the baseline.
# Values outside these levels become NaN and therefore get all-False dummies.
STATE_HOLIDAY_LEVELS = ['0', 'a', 'b', 'c']

# Guard so that re-running the cell (after the column was already encoded and
# removed) is a no-op instead of a KeyError.
if 'State_holiday' in df.columns:
    df = pd.get_dummies(
        df.assign(
            State_holiday=pd.Categorical(
                df['State_holiday'].astype(str),
                categories=STATE_HOLIDAY_LEVELS,
            )
        ),
        columns=['State_holiday'],
        prefix='State_holiday',
        drop_first=True,
    )

# Check the resulting dtypes.
df.dtypes

True_index                      int64
Store_ID                        int64
Day_of_week                     int64
Date                   datetime64[ns]
Nb_customers_on_day             int64
Open                            int64
Promotion                       int64
School_holiday                  int64
date -1                         int64
Seno                          float64
Coseno                        float64
State_holiday_a                  bool
State_holiday_b                  bool
State_holiday_c                  bool
dtype: object

In [10]:
# Add an empty 'Sales' placeholder column (filled with None) at the end.
df['Sales'] = None

# Target slot: the position immediately after 'School_holiday'.
posicion = 1 + df.columns.get_loc('School_holiday')

# Move 'Sales' from the end of the frame into that slot.
df.insert(loc=posicion, column='Sales', value=df.pop('Sales'))

## Predicción de las ventas con el modelo Random Forest

In [11]:
# Load the persisted Random Forest model from the working directory.
# NOTE: joblib.load unpickles arbitrary Python objects — only load trusted files.
random_forest_model = load('random_forest_model.joblib')

# Feature columns fed to the scaler and then to the model, in this order.
columns_to_scale = ['Store_ID', 'Nb_customers_on_day', 'Open', 'Promotion',
                    'State_holiday_a', 'State_holiday_b', 'State_holiday_c',
                    'School_holiday', 'date -1', 'Seno', 'Coseno']

# Keep only the feature columns.
df_to_scale = df[columns_to_scale]

# Create a fresh scaler.
scaler = StandardScaler()

# Standardise the features. The scaler is fitted on THIS scoring data.
# NOTE(review): if the model was trained on features scaled with a scaler fitted
# on the training set, re-fitting here applies a different mean/std — confirm,
# and load the persisted training scaler instead if one exists.
df_scaled = scaler.fit_transform(df_to_scale)

# Wrap the scaled array back into a DataFrame with the feature names.
df_scaled = pd.DataFrame(df_scaled, columns=columns_to_scale)

# Predict with the loaded model.
result = random_forest_model.predict(df_scaled)

# Store the predictions in the 'Sales' column of the original frame.
df['Sales'] = result

# Display the updated frame.
df




Unnamed: 0,True_index,Store_ID,Day_of_week,Date,Nb_customers_on_day,Open,Promotion,School_holiday,Sales,date -1,Seno,Coseno,State_holiday_a,State_holiday_b,State_holiday_c
0,7,764,4,2013-12-26,0,0,0,1,0.00,359,-0.433884,-0.900969,False,False,True
1,19,22,3,2013-05-22,449,1,0,1,4049.59,141,0.433884,-0.900969,False,False,False
2,31,1087,6,2013-06-29,622,1,0,0,6158.87,179,-0.781831,0.623490,False,False,False
3,45,139,6,2013-08-17,314,1,0,0,3383.37,228,-0.781831,0.623490,False,False,False
4,56,568,1,2014-04-07,356,1,0,0,3759.97,461,0.781831,0.623490,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71200,712004,217,2,2015-01-13,633,1,1,0,6316.67,742,0.974928,-0.222521,False,False,False
71201,712018,604,3,2014-04-30,743,1,1,0,8378.18,484,0.433884,-0.900969,False,False,False
71202,712020,1021,5,2014-07-18,1852,1,1,1,16897.30,563,-0.974928,-0.222521,False,False,False
71203,712023,28,3,2014-08-27,0,0,0,1,0.00,603,0.433884,-0.900969,False,False,False


### Comparación de los resultados reales con las predicciones

In [12]:
# Load the ground-truth sales for the scoring set.
url = "https://raw.githubusercontent.com/data-bootcamp-v4/data/main/ironkaggle_solutions.csv"
df_real = pd.read_csv(url)

# Pair each prediction with its true value via 'True_index'.
# This is an inner join, so rows whose key is missing on either side are dropped.
comparison_df = pd.merge(
    df[['True_index', 'Sales']],
    df_real[['True_index', 'Sales']],
    on='True_index',
    suffixes=('_pred', '_real'),
)

# Signed error: prediction minus actual.
comparison_df['Sales_difference'] = comparison_df['Sales_pred'] - comparison_df['Sales_real']

# Evaluation metrics.
mae = mean_absolute_error(comparison_df['Sales_real'], comparison_df['Sales_pred'])
r2 = r2_score(comparison_df['Sales_real'], comparison_df['Sales_pred'])
# RMSE as the square root of the MSE. The `squared=False` argument was deprecated
# in scikit-learn 1.4 and removed in 1.6, so it is avoided here; this form works
# on every version and gives the same value.
rmse = np.sqrt(mean_squared_error(comparison_df['Sales_real'], comparison_df['Sales_pred']))

# Print results.
print(comparison_df[['True_index', 'Sales_pred', 'Sales_real', 'Sales_difference']])
print(f'Mean Absolute Error (MAE): {mae}')
print(f'R² Score: {r2}')
print(f'Root Mean Square Error (RMSE): {rmse}')


       True_index  Sales_pred    Sales_real  Sales_difference
0               7        0.00      0.000000          0.000000
1              19     4049.59   3792.528564        257.061436
2              31     6158.87   5999.958008        158.911992
3              45     3383.37   3365.519287         17.850713
4              56     3759.97   3458.130127        301.839873
...           ...         ...           ...               ...
71200      712004     6316.67   5714.028320        602.641680
71201      712018     8378.18   9701.739258      -1323.559258
71202      712020    16897.30  15175.256836       1722.043164
71203      712023        0.00      0.000000          0.000000
71204      712027     4615.06   4220.416504        394.643496

[71205 rows x 4 columns]
Mean Absolute Error (MAE): 444.78218419569265
R² Score: 0.9612215663541407
Root Mean Square Error (RMSE): 734.7340196510895




# Mean Absolute Error (MAE): 444.78218419569265
# R² Score: 0.9612215663541407
# Root Mean Square Error (RMSE): 734.7340196510895