In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import r2_score
from sklearn import tree
from sklearn.base import BaseEstimator
from typing import Type, Callable
from sklearn.ensemble import RandomForestRegressor

np.random.seed(5)

In [2]:
def load_data():
    csv_path = "train.csv"
    return pd.read_csv(csv_path)

database = load_data()
database.head()

Unnamed: 0,Id,Open Date,City,City Group,Type,P1,P2,P3,P4,P5,...,P29,P30,P31,P32,P33,P34,P35,P36,P37,revenue
0,0,07/17/1999,İstanbul,Big Cities,IL,4,5.0,4.0,4.0,2,...,3.0,5,3,4,5,5,4,3,4,5653753.0
1,1,02/14/2008,Ankara,Big Cities,FC,4,5.0,4.0,4.0,1,...,3.0,0,0,0,0,0,0,0,0,6923131.0
2,2,03/09/2013,Diyarbakır,Other,IL,2,4.0,2.0,5.0,2,...,3.0,0,0,0,0,0,0,0,0,2055379.0
3,3,02/02/2012,Tokat,Other,IL,6,4.5,6.0,6.0,4,...,7.5,25,12,10,6,18,12,12,6,2675511.0
4,4,05/09/2009,Gaziantep,Other,IL,3,4.0,3.0,4.0,2,...,3.0,5,1,3,2,3,4,3,3,4316715.0


In [3]:
database.columns

Index(['Id', 'Open Date', 'City', 'City Group', 'Type', 'P1', 'P2', 'P3', 'P4',
       'P5', 'P6', 'P7', 'P8', 'P9', 'P10', 'P11', 'P12', 'P13', 'P14', 'P15',
       'P16', 'P17', 'P18', 'P19', 'P20', 'P21', 'P22', 'P23', 'P24', 'P25',
       'P26', 'P27', 'P28', 'P29', 'P30', 'P31', 'P32', 'P33', 'P34', 'P35',
       'P36', 'P37', 'revenue'],
      dtype='object')

In [4]:
num_attribs = database.columns[5:-1]
cat_attribs = ["City", "City Group", "Type"]

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),  # Imputación de valores faltantes usando la mediana
    ('scaler', StandardScaler())  # Escalado de características para normalizar los datos
])

cat_pipeline = Pipeline([
    ("encoder", OneHotEncoder())
])

full_pipeline = ColumnTransformer([
    ("categorical", cat_pipeline, cat_attribs),
    ("numerical", num_pipeline, num_attribs)
])

data_prepared = full_pipeline.fit_transform(database)
data_labels = database["revenue"].copy()

In [5]:
print("Número de columnas en data_prepared:", data_prepared.shape[1])
print("Nombres de las columnas en data_prepared:")

Número de columnas en data_prepared: 76
Nombres de las columnas en data_prepared:


In [6]:
data_prepared

array([[ 0.        ,  0.        ,  0.        , ...,  0.57563408,
         0.18982104,  1.61595083],
       [ 0.        ,  0.        ,  0.        , ..., -0.59268991,
        -0.53255348, -0.62592526],
       [ 0.        ,  0.        ,  0.        , ..., -0.59268991,
        -0.53255348, -0.62592526],
       ...,
       [ 0.        ,  0.        ,  0.        , ..., -0.59268991,
        -0.53255348, -0.62592526],
       [ 0.        ,  0.        ,  0.        , ..., -0.59268991,
        -0.53255348, -0.62592526],
       [ 0.        ,  0.        ,  0.        , ..., -0.59268991,
        -0.53255348, -0.62592526]])

In [7]:
lin_reg = LinearRegression()
lin_reg.fit(data_prepared, data_labels)

In [8]:
predictions_lin = lin_reg.predict(data_prepared)
r2_lin = r2_score(data_labels, predictions_lin)
print(f"Coeficiente de determinación de regresión lineal (R²): {r2_lin}")

Coeficiente de determinación de regresión lineal (R²): 0.5100986105191018


In [9]:
d_tree = tree.DecisionTreeRegressor(max_depth=8,max_features=0.75,random_state=5)
d_tree = d_tree.fit(data_prepared, data_labels)

In [10]:
predictions_dt = d_tree.predict(data_prepared)
r2_dt = r2_score(data_labels, predictions_dt)
print(f"Coeficiente de determinación decision tree (R²): {r2_dt}")

Coeficiente de determinación decision tree (R²): 0.8385518713888007


In [11]:
ridge_reg = Ridge(alpha=0.1, solver='cholesky')
ridge_reg = ridge_reg.fit(data_prepared, data_labels)
predictions_ridge = ridge_reg.predict(data_prepared)
r2_ridge = r2_score(data_labels, predictions_ridge)
print(f"Coeficiente de determinación de regresión ridge (R²): {r2_ridge}")

Coeficiente de determinación de regresión ridge (R²): 0.4984438831965218


In [12]:
type(data_prepared)

numpy.ndarray

In [13]:
def weighted_model_creator(models: list[Type[BaseEstimator]], models_proportions: np.array) -> Callable[[np.ndarray], np.ndarray]:
    '''
    Retorna una función que pondera distintos modelos de scikit para predecir un valor.

    Parámetros
    ----------
    models : list[type[BaseEstimator]]
        Lista de modelos de scikit-learn ya entrenados con los datos.

    models_proportions : np.array()
        Lista de ponderación de los distintos modelos (en orden respectivo) para la predicción.

    Returns
    -------
    weighted_model_predictor : function
        Función que recibe data y retorna una predicción ponderando los modelos.
    '''

    def weighted_model_predictor(data: np.ndarray) -> np.ndarray:
        number_of_rows = data.shape[0]
        accumulated_sum = np.zeros(number_of_rows)
        for model, proportion in zip(models, models_proportions):
            accumulated_sum += model.predict(data) * proportion
        ponderation = accumulated_sum / np.sum(models_proportions)
        return ponderation

    return weighted_model_predictor


In [14]:
list_of_models = [lin_reg, ridge_reg, d_tree]
list_of_different_ponderations = [
    [1, 1, 1], [1, 1, 2], [1, 1, 3], [1, 1, 4], [1, 1, 5],
    [1, 1, 6], [1, 1, 7], [1, 1, 8], [1, 1, 11], [2, 1, 10],
    [3, 1, 10], [2, 2, 10]
]

results = []

for i, ponderations in enumerate(list_of_different_ponderations):
    weighted_model = weighted_model_creator(list_of_models, ponderations)
    prediction = weighted_model(data_prepared)
    r2 = r2_score(data_labels, prediction)
    results.append({'Setting': i, 'W1': ponderations[0], 'W2': ponderations[1], 'W3': ponderations[2], 'R2 score (%)': r2*100})

df = pd.DataFrame(results)
df_transposed = df.set_index('Setting').T
df_transposed

Setting,0,1,2,3,4,5,6,7,8,9,10,11
W1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,3.0,2.0
W2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
W3,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,11.0,10.0,10.0,10.0
R2 score (%),69.956587,76.503531,79.448563,81.002279,81.911436,82.483579,82.863557,83.126576,83.556498,82.775471,81.953679,81.911436


In [15]:
reg = RandomForestRegressor(n_estimators=20, max_depth=8, max_features=0.75, random_state=5)
random_f = reg.fit(data_prepared, data_labels)

In [16]:
predictions_rf = random_f.predict(data_prepared)
r2_rf = r2_score(data_labels, predictions_rf)
print(f"Coeficiente de determinación de random forest (R²): {r2_rf}")

Coeficiente de determinación de random forest (R²): 0.7584724776743246


In [17]:
predictions_baseModels = np.column_stack((predictions_lin, predictions_ridge, predictions_dt))

weights_lin_model = LinearRegression()
weights_lin_model.fit(predictions_baseModels, data_labels)

optimal_weights = weights_lin_model.coef_
intercept = weights_lin_model.intercept_

print("Pesos óptimos:", optimal_weights)
print("Término de sesgo:", intercept)

Pesos óptimos: [ 0.1819362  -0.10067317  0.94833319]
Término de sesgo: -131812.5602719877


In [18]:
secondStage_model = weighted_model_creator(list_of_models, optimal_weights)
predictions_sS = secondStage_model(data_prepared)
r2_sS = r2_score(data_labels, predictions_sS)
print(f"Coeficiente de determinación de Second Stage Model (R²): {r2_sS}")

Coeficiente de determinación de Second Stage Model (R²): 0.8397110227688542


In [19]:
print(f"Coeficiente de determinación de regresión lineal (R²): {r2_lin}")
print(f"Coeficiente de determinación de regresión ridge (R²): {r2_ridge}")
print(f"Coeficiente de determinación decision tree (R²): {r2_dt}")
print(f"Coeficiente de determinación de random forest (R²): {r2_rf}")
print(f"Coeficiente de determinación de Second Stage Model (R²): {r2_sS}")

Coeficiente de determinación de regresión lineal (R²): 0.5100986105191018
Coeficiente de determinación de regresión ridge (R²): 0.4984438831965218
Coeficiente de determinación decision tree (R²): 0.8385518713888007
Coeficiente de determinación de random forest (R²): 0.7584724776743246
Coeficiente de determinación de Second Stage Model (R²): 0.8397110227688542


In [20]:
final_model = secondStage_model