In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import r2_score
from sklearn import tree
from sklearn.base import BaseEstimator
from typing import Type, Callable
from sklearn.ensemble import RandomForestRegressor

np.random.seed(5)

In [2]:
def load_data():
    csv_path = "train.csv"
    return pd.read_csv(csv_path)

database = load_data()
database.head()

Unnamed: 0,Id,Open Date,City,City Group,Type,P1,P2,P3,P4,P5,...,P29,P30,P31,P32,P33,P34,P35,P36,P37,revenue
0,0,07/17/1999,İstanbul,Big Cities,IL,4,5.0,4.0,4.0,2,...,3.0,5,3,4,5,5,4,3,4,5653753.0
1,1,02/14/2008,Ankara,Big Cities,FC,4,5.0,4.0,4.0,1,...,3.0,0,0,0,0,0,0,0,0,6923131.0
2,2,03/09/2013,Diyarbakır,Other,IL,2,4.0,2.0,5.0,2,...,3.0,0,0,0,0,0,0,0,0,2055379.0
3,3,02/02/2012,Tokat,Other,IL,6,4.5,6.0,6.0,4,...,7.5,25,12,10,6,18,12,12,6,2675511.0
4,4,05/09/2009,Gaziantep,Other,IL,3,4.0,3.0,4.0,2,...,3.0,5,1,3,2,3,4,3,3,4316715.0


In [3]:
database.columns

Index(['Id', 'Open Date', 'City', 'City Group', 'Type', 'P1', 'P2', 'P3', 'P4',
       'P5', 'P6', 'P7', 'P8', 'P9', 'P10', 'P11', 'P12', 'P13', 'P14', 'P15',
       'P16', 'P17', 'P18', 'P19', 'P20', 'P21', 'P22', 'P23', 'P24', 'P25',
       'P26', 'P27', 'P28', 'P29', 'P30', 'P31', 'P32', 'P33', 'P34', 'P35',
       'P36', 'P37', 'revenue'],
      dtype='object')

In [4]:
for i in range(1,38):
    column_name = f'P{i}'
    database[column_name] += np.random.normal(loc=0, scale=0.1, size=len(database))

In [5]:
database.head()

Unnamed: 0,Id,Open Date,City,City Group,Type,P1,P2,P3,P4,P5,...,P29,P30,P31,P32,P33,P34,P35,P36,P37,revenue
0,0,07/17/1999,İstanbul,Big Cities,IL,4.044123,5.07098,4.144295,3.8851,2.143271,...,2.926795,5.197823,2.894935,4.075853,4.876688,4.996042,3.993054,3.095695,4.039992,5653753.0
1,1,02/14/2008,Ankara,Big Cities,FC,3.966913,5.074715,3.8756,3.863484,0.935838,...,3.178976,0.011086,0.078483,0.030565,0.116402,-0.222511,-0.134362,-0.064968,0.092304,6923131.0
2,2,03/09/2013,Diyarbakır,Other,IL,2.243077,4.14631,2.062888,4.921076,1.889147,...,2.871157,0.063734,0.122052,-0.178183,0.103369,-0.137776,0.074223,-0.153699,-0.077217,2055379.0
3,3,02/02/2012,Tokat,Other,IL,5.974791,4.673845,5.957441,6.072996,3.81053,...,7.431931,24.935029,12.017341,9.817811,6.011682,17.979468,12.129175,11.887015,5.950585,2675511.0
4,4,05/09/2009,Gaziantep,Other,IL,3.010961,4.14652,3.100321,3.918612,2.000622,...,2.903065,4.95461,1.108884,2.958862,1.879918,2.896871,4.093422,3.147932,3.158667,4316715.0


In [6]:
num_attribs = database.columns[5:-1]
cat_attribs = ["City", "City Group", "Type"]

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),  # Imputación de valores faltantes usando la mediana
    ('scaler', StandardScaler())  # Escalado de características para normalizar los datos
])

cat_pipeline = Pipeline([
    ("encoder", OneHotEncoder())
])

full_pipeline = ColumnTransformer([
    ("categorical", cat_pipeline, cat_attribs),
    ("numerical", num_pipeline, num_attribs)
])

data_prepared = full_pipeline.fit_transform(database)
data_labels = database["revenue"].copy()

In [7]:
print("Número de columnas en data_prepared:", data_prepared.shape[1])
print("Nombres de las columnas en data_prepared:")

Número de columnas en data_prepared: 76
Nombres de las columnas en data_prepared:


In [8]:
data_prepared

array([[ 0.        ,  0.        ,  0.        , ...,  0.5744861 ,
         0.21588898,  1.64455541],
       [ 0.        ,  0.        ,  0.        , ..., -0.62874628,
        -0.54827291, -0.57608868],
       [ 0.        ,  0.        ,  0.        , ..., -0.56793913,
        -0.56972545, -0.67144697],
       ...,
       [ 0.        ,  0.        ,  0.        , ..., -0.59641685,
        -0.56288493, -0.65656255],
       [ 0.        ,  0.        ,  0.        , ..., -0.60946948,
        -0.58953493, -0.7263346 ],
       [ 0.        ,  0.        ,  0.        , ..., -0.60641931,
        -0.51113641, -0.58291116]])

In [9]:
lin_reg = LinearRegression()
lin_reg.fit(data_prepared, data_labels)

In [10]:
predictions_lin = lin_reg.predict(data_prepared)
r2_lin = r2_score(data_labels, predictions_lin)
print(f"Coeficiente de determinación de regresión lineal (R²): {r2_lin}")

Coeficiente de determinación de regresión lineal (R²): 0.4773351275986826


In [11]:
d_tree = tree.DecisionTreeRegressor(max_depth=8,max_features=0.75,random_state=5)
d_tree = d_tree.fit(data_prepared, data_labels)

In [12]:
predictions_dt = d_tree.predict(data_prepared)
r2_dt = r2_score(data_labels, predictions_dt)
print(f"Coeficiente de determinación decision tree (R²): {r2_dt}")

Coeficiente de determinación decision tree (R²): 0.8472308286703703


In [13]:
ridge_reg = Ridge(alpha=0.1, solver='cholesky')
ridge_reg = ridge_reg.fit(data_prepared, data_labels)
predictions_ridge = ridge_reg.predict(data_prepared)
r2_ridge = r2_score(data_labels, predictions_ridge)
print(f"Coeficiente de determinación de regresión ridge (R²): {r2_ridge}")

Coeficiente de determinación de regresión ridge (R²): 0.46970545092571936


In [14]:
type(data_prepared)

numpy.ndarray

In [15]:
def weighted_model_creator(models: list[Type[BaseEstimator]], models_proportions: np.array) -> Callable[[np.ndarray], np.ndarray]:
    '''
    Retorna una función que pondera distintos modelos de scikit para predecir un valor.

    Parámetros
    ----------
    models : list[type[BaseEstimator]]
        Lista de modelos de scikit-learn ya entrenados con los datos.

    models_proportions : np.array()
        Lista de ponderación de los distintos modelos (en orden respectivo) para la predicción.

    Returns
    -------
    weighted_model_predictor : function
        Función que recibe data y retorna una predicción ponderando los modelos.
    '''

    def weighted_model_predictor(data: np.ndarray) -> np.ndarray:
        number_of_rows = data.shape[0]
        acumulated_sum = np.zeros(number_of_rows)
        for model, proportion in zip(models, models_proportions):
            acumulated_sum += model.predict(data) * proportion
        ponderation = acumulated_sum / np.sum(models_proportions)
        return ponderation

    return weighted_model_predictor


In [16]:
list_of_models = [lin_reg, ridge_reg, d_tree]
list_of_different_ponderations = [
    [1, 1, 1], [1, 1, 2], [1, 1, 3], [1, 1, 4], [1, 1, 5],
    [1, 1, 6], [1, 1, 7], [1, 1, 8], [1, 1, 11], [2, 1, 10],
    [3, 1, 10], [2, 2, 10]
]

results = []

for i, ponderations in enumerate(list_of_different_ponderations):
    weighted_model = weighted_model_creator(list_of_models, ponderations)
    prediction = weighted_model(data_prepared)
    r2 = r2_score(data_labels, prediction)
    results.append({'Setting': i, 'W1': ponderations[0], 'W2': ponderations[1], 'W3': ponderations[2], 'R2 score (%)': r2*100})

df = pd.DataFrame(results)
df_transposed = df.set_index('Setting').T
df_transposed

Setting,0,1,2,3,4,5,6,7,8,9,10,11
W1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,3.0,2.0
W2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
W3,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,11.0,10.0,10.0,10.0
R2 score (%),71.940368,79.636783,82.814397,84.332707,85.123289,85.555485,85.796384,85.929093,86.034278,85.773825,85.198814,85.123289


In [17]:
reg = RandomForestRegressor(n_estimators=20, max_depth=8, max_features=0.75, random_state=5)
random_f = reg.fit(data_prepared, data_labels)

In [18]:
predictions_rf = random_f.predict(data_prepared)
r2_rf = r2_score(data_labels, predictions_rf)
print(f"Coeficiente de determinación de random forest (R²): {r2_rf}")

Coeficiente de determinación de random forest (R²): 0.8380794139195342


In [19]:
predictions_baseModels = np.column_stack((predictions_lin, predictions_ridge, predictions_dt))

weights_lin_model = LinearRegression()
weights_lin_model.fit(predictions_baseModels, data_labels)

optimal_weights = weights_lin_model.coef_
intercept = weights_lin_model.intercept_

print("Pesos óptimos:", optimal_weights)
print("Término de sesgo:", intercept)

Pesos óptimos: [ 0.4961639  -0.21511473  0.86572498]
Término de sesgo: -653448.5299222348


In [20]:
secondStage_model = weighted_model_creator(list_of_models, optimal_weights)
predictions_sS = secondStage_model(data_prepared)
r2_sS = r2_score(data_labels, predictions_sS)
print(f"Coeficiente de determinación de Second Stage Model (R²): {r2_sS}")

Coeficiente de determinación de Second Stage Model (R²): 0.8588087021660439


In [21]:
print(f"Coeficiente de determinación de regresión lineal (R²): {r2_lin}")
print(f"Coeficiente de determinación de regresión ridge (R²): {r2_ridge}")
print(f"Coeficiente de determinación decision tree (R²): {r2_dt}")
print(f"Coeficiente de determinación de random forest (R²): {r2_rf}")
print(f"Coeficiente de determinación de Second Stage Model (R²): {r2_sS}")

Coeficiente de determinación de regresión lineal (R²): 0.4773351275986826
Coeficiente de determinación de regresión ridge (R²): 0.46970545092571936
Coeficiente de determinación decision tree (R²): 0.8472308286703703
Coeficiente de determinación de random forest (R²): 0.8380794139195342
Coeficiente de determinación de Second Stage Model (R²): 0.8588087021660439


In [22]:
final_model = secondStage_model