In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# -----------------------------
# 1️⃣ Chargement du dataset
# -----------------------------
df = pd.read_csv(r'C:\Users\gopit\OneDrive\Documents\MASTER2SISE\m2_Enedis\m2_enedis\Dataset_Model\donnees_ml_preparees.csv',sep=',')
df.columns = df.columns.str.strip().str.lower()

y = df['etiquette_dpe']
X = df.drop(columns=['etiquette_dpe'])

# -----------------------------
# 2️⃣ Split complet train/test
# -----------------------------
X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# -----------------------------
# 3️⃣ Sous-échantillon stratifié
# -----------------------------
sample_train_size = 5000
sample_test_size = 1000

X_train, _, y_train, _ = train_test_split(
    X_train_full, y_train_full,
    train_size=sample_train_size,
    stratify=y_train_full,
    random_state=42
)

X_test, _, y_test, _ = train_test_split(
    X_test_full, y_test_full,
    train_size=sample_test_size,
    stratify=y_test_full,
    random_state=42
)

# -----------------------------
# 4️⃣ Standardisation
# -----------------------------
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# -----------------------------
# 5️⃣ RandomizedSearchCV pour hyperparamètres
# -----------------------------
param_dist = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l1', 'l2']
}

logreg = LogisticRegression(solver='saga', max_iter=5000, random_state=42)

random_search = RandomizedSearchCV(
    estimator=logreg,
    param_distributions=param_dist,
    n_iter=5,
    scoring='f1_weighted',
    cv=2,
    n_jobs=1,
    verbose=1,
    random_state=42
)

random_search.fit(X_train_scaled, y_train)

best_params = random_search.best_params_
print("✅ Meilleurs paramètres :", best_params)



Fitting 2 folds for each of 5 candidates, totalling 10 fits


KeyboardInterrupt: 

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler

# -----------------------------
# 1️⃣ Chargement du dataset
# -----------------------------
df = pd.read_csv(
    r'C:\Users\gopit\OneDrive\Documents\MASTER2SISE\m2_Enedis\m2_enedis\donnees_ml_preparees.csv',
    sep=','
)
df.columns = df.columns.str.strip().str.lower()

X = df.drop(columns=['etiquette_dpe'])
y = df['etiquette_dpe']

# -----------------------------
# 2️⃣ Split complet train/test
# -----------------------------
X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# -----------------------------
# 3️⃣ Sous-échantillon stratifié
# -----------------------------
sample_train_size = 10000
sample_test_size = 3000

X_train, _, y_train, _ = train_test_split(
    X_train_full, y_train_full,
    train_size=sample_train_size,
    stratify=y_train_full,
    random_state=42
)

X_test, _, y_test, _ = train_test_split(
    X_test_full, y_test_full,
    train_size=sample_test_size,
    stratify=y_test_full,
    random_state=42
)

# -----------------------------
# 4️⃣ Standardisation
# -----------------------------
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# -----------------------------
# 5️⃣ Hyperparamètres (à adapter)
# -----------------------------
best_params = {'C': 10, 'penalty': 'l2', 'solver': 'saga', 'max_iter': 5000}

# -----------------------------
# 6️⃣ Entraînement sur 10 runs
# -----------------------------
n_runs = 10
scores = []

print("🔁 Entraînement sur 10 runs...\n")

for i in range(n_runs):
    model = LogisticRegression(**best_params, random_state=i, n_jobs=1)
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    f1 = f1_score(y_test, y_pred, average='weighted')
    scores.append(f1)
    print(f"Run {i+1:2d}: F1-score = {f1:.4f}")

mean_f1 = np.mean(scores)
std_f1 = np.std(scores)

print("\n✅ Moyenne des F1-scores :", round(mean_f1, 4))
print("📉 Écart-type :", round(std_f1, 4))



🔁 Entraînement sur 10 runs...

Run  1: F1-score = 0.7804
Run  2: F1-score = 0.7804
Run  3: F1-score = 0.7804
Run  4: F1-score = 0.7804
Run  5: F1-score = 0.7804
Run  6: F1-score = 0.7804
Run  7: F1-score = 0.7804
Run  8: F1-score = 0.7804
Run  9: F1-score = 0.7804
Run 10: F1-score = 0.7804

✅ Moyenne des F1-scores : 0.7804
📉 Écart-type : 0.0
