In [1]:
import data_integration


In [2]:
# Pull the modelling dataset from the project's data_integration module.
# The call yields a pair that is unpacked into X (used as the feature table
# further down) and y (used as the regression target).
X, y = data_integration.get_final_dataset()

# Echo both shapes so a row-count mismatch between X and y is visible at a glance.
print(
    "Data loaded",
    " ".join(["X shape:", str(X.shape)]),
    " ".join(["y shape:", str(y.shape)]),
    sep="\n",
)

Data loaded
X shape: (50044, 10)
y shape: (50044,)


In [4]:
# Split the feature columns by dtype so each group can get its own preprocessing.
#
# The split is keyed on "number" rather than on an explicit list of
# non-numeric dtypes: every numeric column goes to `numerical_cols`, and
# everything else (object, bool, pandas "category", ...) goes to
# `categorical_cols`. Listing only "object"/"bool" as categorical would send any
# other non-numeric dtype into the numeric group, where it would be passed
# through to the model un-encoded.
#
# bool is not part of "number" in pandas, so boolean columns stay categorical.
numerical_cols = X.select_dtypes(include="number").columns
categorical_cols = X.select_dtypes(exclude="number").columns

# Display both groups for a visual check that every column landed where expected.
categorical_cols, numerical_cols


(Index(['activity_type', 'Insee_code', 'Studying', 'sex', 'Household',
        'Occupation_42', 'HIGHEST_DIPLOMA', 'JOB_SECURITY'],
       dtype='object'),
 Index(['AGE_2018', 'sport_member'], dtype='object'))

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Column-wise preprocessing:
#   * "cat": one-hot encode the categorical columns. handle_unknown="ignore"
#     encodes categories unseen during fit as an all-zero row instead of
#     raising; sparse_output=False makes the encoder return a dense array.
#   * "num": numeric columns are forwarded unchanged.
# NOTE(review): a dense one-hot encoding of a high-cardinality column
# (Insee_code looks like one) can produce a very wide matrix -- confirm that
# memory use is acceptable.
preprocessor = ColumnTransformer(
    [
        (
            "cat",
            OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
            categorical_cols,
        ),
        ("num", "passthrough", numerical_cols),
    ]
)


In [9]:
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.pipeline import Pipeline

# Histogram-based gradient boosting regressor. The seed is fixed so repeated
# fits on the same data give the same model.
model = HistGradientBoostingRegressor(
    learning_rate=0.05,
    max_iter=300,
    max_depth=8,
    random_state=42,
)

# Bundle preprocessing and the regressor so that fit/predict always apply the
# same column transformations before the model sees the data.
pipeline = Pipeline(
    [
        ("preprocess", preprocessor),
        ("model", model),
    ]
)


In [10]:
# Fit the preprocessing + model pipeline on the complete dataset (no rows held out).
pipeline.fit(X=X, y=y)
print("Model trained with preprocessing")


Model trained with preprocessing


In [11]:
# Predict for every row of X. These are in-sample predictions: the pipeline
# was fitted on this same X in the previous cell.
y_pred = pipeline.predict(X)

# The prediction vector should have one entry per row of X.
print(
    "Predictions generated",
    " ".join(["y_pred shape:", str(y_pred.shape)]),
    sep="\n",
)


Predictions generated
y_pred shape: (50044,)


In [12]:
import numpy as np
from sklearn.metrics import mean_squared_error

# Root-mean-squared error of y_pred against y. Because y_pred was produced on
# the same rows the pipeline was trained on, this is a training (in-sample)
# error and is expected to be optimistic compared with held-out error.
rmse = np.sqrt(mean_squared_error(y_true=y, y_pred=y_pred))
rmse


np.float64(0.1973913939645815)

In [13]:
from sklearn.base import clone
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows to estimate out-of-sample error; the fixed seed
# keeps the split reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit an unfitted copy of the pipeline on the training split. Calling
# pipeline.fit(...) here would refit the shared `pipeline` object in place and
# silently replace the full-data fit, so any later (or re-run) use of
# `pipeline` would come from the 80% model. clone() copies the hyperparameters
# only, so the validation model is configured identically.
val_pipeline = clone(pipeline)
val_pipeline.fit(X_train, y_train)
y_val_pred = val_pipeline.predict(X_val)

# RMSE on rows the validation model never saw during fitting.
rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))
rmse_val


np.float64(0.19905892602171898)

In [14]:
import pandas as pd

# Pair each row label of X with its predicted value.
# NOTE(review): this assumes X's index holds the primary key and that y_pred
# is still aligned row-for-row with X (it was computed from X in an earlier
# cell) -- confirm before relying on the exported file.
predictions = pd.DataFrame(
    {
        "primary_key": X.index,
        "predicted_target": y_pred,
    }
)

# index=False: the key is already an explicit column, so the frame's own
# positional index is not written out.
predictions.to_csv(path_or_buf="final_predictions.csv", index=False)
print("Predictions saved")


Predictions saved
