In [16]:
# ============================================================
# 1. Import librerie e caricamento dati
# ============================================================

import numpy as np
import pandas as pd

from sklearn.linear_model import Ridge, Lasso
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# Show up to 50 columns when a wide frame is displayed.
pd.set_option("display.max_columns", 50)

# Load the dataset.
# low_memory=False lets pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
# NOTE(review): the saved output of this cell looks like the tail of a pandas
# warning pointing at this read_csv call (presumably a mixed-type
# DtypeWarning) -- confirm that it is gone after re-running.
df = pd.read_csv("Melbourne_housing.csv", low_memory=False)

df.head()





  df = pd.read_csv("Melbourne_housing.csv")


Unnamed: 0,Suburb,Address,Rooms,Type,Method,SellerG,Date,Distance,Postcode,Bedroom,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Latitude,Longtitude,Regionname,Propertycount,ParkingArea,Price
0,Abbotsford,68 Studley St,2,h,SS,Jellis,3/9/2016,2.5,3067.0,2.0,1.0,1.0,126.0,inf,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0,Carport,
1,Airport West,154 Halsey Rd,3,t,PI,Nelson,3/9/2016,13.5,3042.0,3.0,2.0,1.0,303.0,225.0,2016.0,Moonee Valley City Council,-37.718,144.878,Western Metropolitan,3464.0,Detached Garage,840000.0
2,Albert Park,105 Kerferd Rd,2,h,S,hockingstuart,3/9/2016,3.3,3206.0,2.0,1.0,0.0,120.0,82.0,1900.0,Port Phillip City Council,-37.8459,144.9555,Southern Metropolitan,3280.0,Attached Garage,1275000.0
3,Albert Park,85 Richardson St,2,h,S,Thomson,3/9/2016,3.3,3206.0,2.0,1.0,0.0,159.0,inf,,Port Phillip City Council,-37.845,144.9538,Southern Metropolitan,3280.0,Indoor,1455000.0
4,Alphington,30 Austin St,3,h,SN,McGrath,3/9/2016,6.4,3078.0,3.0,2.0,1.0,174.0,122.0,2003.0,Darebin City Council,-37.7818,145.0198,Northern Metropolitan,2211.0,Parkade,


In [2]:
# ============================================================
# 2. Select the columns used by the model
# ============================================================

# Features plus the target ("Price"); the order here is the column order
# of df_new.
columns_to_use = [
    "Rooms",
    "Type",
    "Distance",
    "Bedroom",
    "Bathroom",
    "Car",
    "Latitude",
    "Longtitude",
    "Price",
    "Regionname",
    "Propertycount",
]

# Work on an independent copy so the raw frame `df` is never modified.
df_new = df.loc[:, columns_to_use].copy()
df_new.info()




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34857 entries, 0 to 34856
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Rooms          34857 non-null  int64  
 1   Type           34857 non-null  object 
 2   Distance       34856 non-null  float64
 3   Bedroom        26640 non-null  float64
 4   Bathroom       26631 non-null  float64
 5   Car            26129 non-null  float64
 6   Latitude       26881 non-null  float64
 7   Longtitude     26881 non-null  float64
 8   Price          27247 non-null  float64
 9   Regionname     34857 non-null  object 
 10  Propertycount  34854 non-null  float64
dtypes: float64(8), int64(1), object(2)
memory usage: 2.9+ MB


In [3]:
# ============================================================
# 3. "Neutral" cleaning before the split
#    (no means/medians here, only impossible rows are removed)
# ============================================================

# Turn +/-inf into NaN (returns a new frame, df_new is left untouched),
# then drop the rows that have no Price or no Distance.
df_clean = df_new.replace([np.inf, -np.inf], np.nan)
df_clean = df_clean.dropna(subset=["Price", "Distance"])

# Basic logical constraints on the always-required columns.
core_ok = (
    (df_clean["Price"] > 0)
    & (df_clean["Rooms"] >= 1)
    & (df_clean["Distance"] >= 0)
)

# Bedroom, Bathroom and Car must not be negative. Missing values are kept
# here (treated as 0 for this check only); they are imputed after the split.
counts_ok = df_clean[["Bedroom", "Bathroom", "Car"]].fillna(0).ge(0).all(axis=1)

df_clean = df_clean.loc[core_ok & counts_ok]

df_clean.shape


(27246, 11)

In [4]:
# ============================================================
# 4. Define X and y + train-test split
#    (the split happens HERE, before any imputation, to avoid leakage)
# ============================================================

y = df_clean["Price"]
X = df_clean.drop(columns=["Price"])

# 80/20 shuffled split with a fixed seed for reproducibility.
split_options = {"test_size": 0.2, "random_state": 42, "shuffle": True}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

for split_label, split_X, split_y in (
    ("Train:", X_train, y_train),
    ("Test :", X_test, y_test),
):
    print(split_label, split_X.shape, split_y.shape)


Train: (21796, 10) (21796,)
Test : (5450, 10) (5450,)


In [5]:
# ============================================================
# 5. Impute Latitude and Longtitude with the per-Regionname median
#    - medians are computed on the TRAIN split only
#    - the same medians are applied to both train and test
# ============================================================

def _fill_with_region_median(train, test, column, group_col="Regionname"):
    """Fill missing `column` values in place using train-only group medians.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames to impute; both are modified in place.
    column : str
        Numeric column whose missing values are filled.
    group_col : str, default "Regionname"
        Column defining the groups used for the medians.

    Returns
    -------
    (pandas.Series, float)
        The per-group medians (indexed by group) and the global train median
        used as fallback.
    """
    group_medians = train.groupby(group_col)[column].median()

    train[column] = train[column].fillna(train[group_col].map(group_medians))

    # Global fallback, taken from the train column after the group fill.
    global_median = train[column].median()

    # A group with no observed value in train has a NaN median, so its train
    # rows would stay missing without this fallback.
    train[column] = train[column].fillna(global_median)

    # Test: same group medians; groups unseen in train (or with a NaN median)
    # fall back to the global train median.
    test[column] = (
        test[column]
        .fillna(test[group_col].map(group_medians))
        .fillna(global_median)
    )
    return group_medians, global_median


X_train_imputed = X_train.copy()
X_test_imputed = X_test.copy()

lat_medians, global_lat_median = _fill_with_region_median(
    X_train_imputed, X_test_imputed, "Latitude"
)
lon_medians, global_lon_median = _fill_with_region_median(
    X_train_imputed, X_test_imputed, "Longtitude"
)


In [6]:
# ============================================================
# 6. Structural imputation of Bedroom via the Bedroom/Rooms ratio
#    - the ratio is estimated on the TRAIN split only
# ============================================================

# Row-wise ratio on train; rows where either value is missing yield NaN
# and are discarded before averaging.
valid_ratio = (X_train_imputed["Bedroom"] / X_train_imputed["Rooms"]).dropna()

ratio_mean = valid_ratio.mean()
print("Rapporto medio Bedroom/Rooms (train) =", ratio_mean)

# Fill the missing Bedroom values in train and test with the same train
# ratio, rounded to a whole number of bedrooms.
for split_frame in (X_train_imputed, X_test_imputed):
    bedroom_missing = split_frame["Bedroom"].isna()
    estimated_bedrooms = split_frame.loc[bedroom_missing, "Rooms"] * ratio_mean
    split_frame.loc[bedroom_missing, "Bedroom"] = estimated_bedrooms.round()


Rapporto medio Bedroom/Rooms (train) = 0.9978981302109227


In [7]:
# ============================================================
# 7. Median imputation for the remaining numeric variables
#    - medians are computed on the TRAIN split only
# ============================================================

num_to_fill = ["Bathroom", "Car", "Propertycount"]

# One median per column, taken from train and reused unchanged for test.
train_medians = X_train_imputed[num_to_fill].median()

X_train_imputed[num_to_fill] = X_train_imputed[num_to_fill].fillna(train_medians)
X_test_imputed[num_to_fill] = X_test_imputed[num_to_fill].fillna(train_medians)

# Sanity check: remaining missing values per column (train, test).
X_train_imputed[num_to_fill].isnull().sum(), X_test_imputed[num_to_fill].isnull().sum()


(Bathroom         0
 Car              0
 Propertycount    0
 dtype: int64,
 Bathroom         0
 Car              0
 Propertycount    0
 dtype: int64)

In [8]:
# ============================================================
# 8. One-hot encoding of the categorical columns without leakage
#    - the set of dummy columns is decided by the train split
#    - the test split is re-aligned to the train columns
# ============================================================

categorical_cols = ["Type", "Regionname"]

# Train decides the layout: the first level of each variable is dropped and
# acts as the baseline.
X_train_encoded = pd.get_dummies(X_train_imputed, columns=categorical_cols, drop_first=True)

# Test is encoded WITHOUT drop_first. With drop_first=True the test split
# would drop its own first level, which differs from the train baseline when
# that level is absent from test; after re-alignment those rows would be all
# zeros and silently read as the train baseline category.
X_test_encoded = pd.get_dummies(X_test_imputed, columns=categorical_cols, drop_first=False)

# Re-align test to the train columns: the train baseline and levels unseen in
# train are discarded, train levels missing from test are added as 0.
X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)

print("Shape train encoded:", X_train_encoded.shape)
print("Shape test encoded :", X_test_encoded.shape)


Shape train encoded: (21796, 17)
Shape test encoded : (5450, 17)


In [9]:
# ============================================================
# 9. Feature standardization
#    - the scaler is fitted on the TRAIN split only
# ============================================================

# Learn the scaling statistics from train, then apply the same fitted
# transformation to both splits.
scaler = StandardScaler().fit(X_train_encoded)

X_train_scaled = scaler.transform(X_train_encoded)
X_test_scaled = scaler.transform(X_test_encoded)


In [20]:
from sklearn.linear_model import LinearRegression,Lasso, Ridge


from sklearn.datasets import make_classification
from sklearn.model_selection import StratifiedKFold, cross_val_score, KFold



# Ridge with a fixed alpha, evaluated with K-Fold cross-validation.
# The scaler is part of the pipeline so that it is re-fitted on the training
# part of every fold. Cross-validating on X_train_scaled would reuse a scaler
# fitted on the whole training set, letting each validation fold influence
# the scaling statistics.
# NOTE(review): the imputations in sections 5-7 are still fitted once on the
# full training split, so the folds are not completely isolated from them.
model = make_pipeline(StandardScaler(), Ridge(alpha=1, random_state=42))

# --- K-Fold ---
# Five validation rounds, shuffled with a fixed seed.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# --- Run the cross-validation ---
# Unscaled encoded features go in; scaling happens inside each fold.
scores = cross_val_score(model, X_train_encoded, y_train, cv=cv, scoring='neg_mean_squared_error')

# --- Convert to positive errors and compute the RMSE ---
mse_scores = -scores  # the scorer returns negated MSE
rmse_scores = np.sqrt(mse_scores)  # per-fold RMSE, in the units of the target

print("\n--- Risultati Cross-Validation ---")
for fold_number, (mse, rmse) in enumerate(zip(mse_scores, rmse_scores), start=1):
    print(f"Fold {fold_number}: MSE = {mse:,.0f} | RMSE = {rmse:,.0f}")

print("-" * 40)
print(f"MSE Medio: {mse_scores.mean():,.0f}")
print(f"RMSE Medio: {rmse_scores.mean():,.0f}")
print(f"Stabilità (Std RMSE): +/- {rmse_scores.std():,.0f}")




--- Risultati Cross-Validation ---
Fold 1: MSE = 199,513,679,628 | RMSE = 446,670
Fold 2: MSE = 186,695,187,290 | RMSE = 432,082
Fold 3: MSE = 173,477,194,792 | RMSE = 416,506
Fold 4: MSE = 169,836,840,592 | RMSE = 412,113
Fold 5: MSE = 194,375,627,657 | RMSE = 440,881
----------------------------------------
MSE Medio: 184,779,705,992
RMSE Medio: 429,650
Stabilità (Std RMSE): +/- 13,432


In [12]:
# ============================================================
# RidgeCV (L2) - automatic choice of alpha + MSE, RMSE, R2
# ============================================================

# Candidate regularization strengths; RidgeCV picks one with 5-fold CV.
ridge_alphas = [0.001, 0.01, 0.1, 1, 10, 100]

ridge_cv = RidgeCV(alphas=ridge_alphas, cv=5).fit(X_train_scaled, y_train)

print("=== RidgeCV (L2) ===")
print("Miglior alpha:", ridge_cv.alpha_)

# Train split: predictions and metrics
y_train_pred_ridge_cv = ridge_cv.predict(X_train_scaled)
mse_train_ridge_cv = mean_squared_error(y_train, y_train_pred_ridge_cv)
rmse_train_ridge_cv = np.sqrt(mse_train_ridge_cv)
r2_train_ridge_cv = r2_score(y_train, y_train_pred_ridge_cv)

# Test split: predictions and metrics
y_test_pred_ridge_cv = ridge_cv.predict(X_test_scaled)
mse_test_ridge_cv = mean_squared_error(y_test, y_test_pred_ridge_cv)
rmse_test_ridge_cv = np.sqrt(mse_test_ridge_cv)
r2_test_ridge_cv = r2_score(y_test, y_test_pred_ridge_cv)

# Report, one metric per line.
ridge_report = (
    ("MSE Train:", mse_train_ridge_cv),
    ("MSE Test :", mse_test_ridge_cv),
    ("RMSE Train:", rmse_train_ridge_cv),
    ("RMSE Test :", rmse_test_ridge_cv),
    ("R2  Train:", r2_train_ridge_cv),
    ("R2  Test :", r2_test_ridge_cv),
)
for metric_label, metric_value in ridge_report:
    print(metric_label, metric_value)



=== RidgeCV (L2) ===
Miglior alpha: 10.0
MSE Train: 184461997135.0624
MSE Test : 159385068612.02213
RMSE Train: 429490.3923664212
RMSE Test : 399230.5957864729
R2  Train: 0.5592749634747562
R2  Test : 0.5840163859743919


In [13]:
# ============================================================
# LassoCV (L1) - automatic choice of alpha + MSE, RMSE, R2
# ============================================================

lasso_cv_options = {
    "alphas": None,      # let the estimator build its own alpha grid
    "cv": 5,
    "random_state": 42,
    "max_iter": 10000,
}
lasso_cv = LassoCV(**lasso_cv_options)

lasso_cv.fit(X_train_scaled, y_train)

print("=== LassoCV (L1) ===")
print("Miglior alpha:", lasso_cv.alpha_)

# Train split: predictions and metrics
y_train_pred_lasso_cv = lasso_cv.predict(X_train_scaled)
mse_train_lasso_cv = mean_squared_error(y_train, y_train_pred_lasso_cv)
rmse_train_lasso_cv = np.sqrt(mse_train_lasso_cv)
r2_train_lasso_cv = r2_score(y_train, y_train_pred_lasso_cv)

# Test split: predictions and metrics
y_test_pred_lasso_cv = lasso_cv.predict(X_test_scaled)
mse_test_lasso_cv = mean_squared_error(y_test, y_test_pred_lasso_cv)
rmse_test_lasso_cv = np.sqrt(mse_test_lasso_cv)
r2_test_lasso_cv = r2_score(y_test, y_test_pred_lasso_cv)

# Report, one metric per line.
lasso_report = (
    ("MSE Train:", mse_train_lasso_cv),
    ("MSE Test :", mse_test_lasso_cv),
    ("RMSE Train:", rmse_train_lasso_cv),
    ("RMSE Test :", rmse_test_lasso_cv),
    ("R2  Train:", r2_train_lasso_cv),
    ("R2  Test :", r2_test_lasso_cv),
)
for metric_label, metric_value in lasso_report:
    print(metric_label, metric_value)


=== LassoCV (L1) ===
Miglior alpha: 302.1478594836664
MSE Train: 184463843475.6697
MSE Test : 159387572671.61517
RMSE Train: 429492.5418161178
RMSE Test : 399233.7318809812
R2  Train: 0.559270552113369
R2  Test : 0.584009850558193


In [14]:
# ============================================================
# Final model comparison with MSE, RMSE and R2
# (append a row here for the fixed-alpha Ridge/Lasso if you use them)
# ============================================================

# One record per model; the key order defines the column order of the table.
summary_rows = [
    {
        "Model": "RidgeCV (L2) tuned alpha",
        "MSE Train": mse_train_ridge_cv,
        "MSE Test": mse_test_ridge_cv,
        "RMSE Train": rmse_train_ridge_cv,
        "RMSE Test": rmse_test_ridge_cv,
        "R2 Train": r2_train_ridge_cv,
        "R2 Test": r2_test_ridge_cv,
    },
    {
        "Model": "LassoCV (L1) tuned alpha",
        "MSE Train": mse_train_lasso_cv,
        "MSE Test": mse_test_lasso_cv,
        "RMSE Train": rmse_train_lasso_cv,
        "RMSE Test": rmse_test_lasso_cv,
        "R2 Train": r2_train_lasso_cv,
        "R2 Test": r2_test_lasso_cv,
    },
]

results = pd.DataFrame(summary_rows)

results


Unnamed: 0,Model,MSE Train,MSE Test,RMSE Train,RMSE Test,R2 Train,R2 Test
0,RidgeCV (L2) tuned alpha,184462000000.0,159385100000.0,429490.392366,399230.595786,0.559275,0.584016
1,LassoCV (L1) tuned alpha,184463800000.0,159387600000.0,429492.541816,399233.731881,0.559271,0.58401
