✅ Objectif :
Charger ton dataset et vérifier les colonnes.

In [1]:
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Plot defaults for the whole notebook: white-grid theme and 8x5 inch figures.
# The rc override is applied by seaborn after the theme itself is set.
sns.set(style="whitegrid", rc={"figure.figsize": (8, 5)})

# Load the log dataset from the path relative to the notebook's directory.
csv_path = "../data/synthetic_satellite_logs.csv"
df = pd.read_csv(csv_path)
print("✅ Data loaded:", df.shape)

# Preview the first rows to check the columns.
df.head()


ModuleNotFoundError: No module named 'seaborn'

💡 Regarde les ordres de grandeur : SNR, RSSI, latences, etc.

In [None]:
# pandas' default summary statistics, kept in a variable and shown as the cell output.
summary_stats = df.describe()
summary_stats

Corrélation et premières visualisations

Tu verras probablement :
    SNR, RSSI, battery corrélés positivement à success
    distance, humidity, latency_sat corrélés négativement

In [None]:
# Pairwise correlations restricted to numeric columns.
corr = df.corr(numeric_only=True)

# Draw on an explicit Axes instead of relying on pyplot's implicit current axes.
fig, ax = plt.subplots()
sns.heatmap(corr, annot=True, cmap="coolwarm", ax=ax)
ax.set_title("Correlation heatmap")
plt.show()

Tu vérifies visuellement ce que ton intuition (et le logit) disait.

In [None]:
# One box plot per metric, split by the `success` column.
# Each entry is (column to plot on the y axis, figure title).
boxplot_specs = [
    ("snr", "SNR distribution by success"),
    ("rssi", "RSSI distribution by success"),
    ("distance", "Distance vs success"),
]

for column, title in boxplot_specs:
    sns.boxplot(x="success", y=column, data=df)
    plt.title(title)
    plt.show()


Préparation des données pour le modèle

In [None]:
# Numeric and categorical predictors fed to the model.
# Columns of df not listed here are not passed to either transformer.
num_cols = [
    "snr", "rssi", "distance", "latency_bt", "latency_sat",
    "temperature", "humidity", "battery", "latitude", "longitude", "altitude",
]
cat_cols = ["firmware", "time_of_day"]

# Target vector and feature frame (every column except the target).
y = df["success"]
X = df.drop(columns=["success"])

# Preprocessor: standardize numeric columns, one-hot encode categorical ones.
# handle_unknown="ignore" keeps transform from failing on categories unseen at fit time.
numeric_step = ("num", StandardScaler(), num_cols)
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
preproc = ColumnTransformer([numeric_step, categorical_step])

# Stratified 80/20 train/test split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
print(X_train.shape, X_test.shape)


Modèle baseline : Logistic Regression

Interprétation :
    Accuracy = pourcentage de bonnes prédictions
    ROC AUC = capacité du modèle à attribuer une probabilité plus élevée à un succès qu'à un échec (0.5 = hasard, 1 = parfait)
    Tu devrais être entre 0.80 et 0.90 (normal, car ton modèle apprend la logique du logit simulé).

# Baseline classifier: preprocessing and logistic regression chained in one
# pipeline, so the scaler and encoder are fitted on the training split only.
pipeline_steps = [
    ("preproc", preproc),
    ("model", LogisticRegression(max_iter=1000)),
]
clf = Pipeline(pipeline_steps)
clf.fit(X_train, y_train)

# Hard class predictions, plus the probability from column 1 of predict_proba
# (presumably the success == 1 class; verify against clf.classes_).
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]

# Held-out metrics, rounded to three decimals for display.
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)
print("Accuracy:", round(accuracy, 3))
print("ROC AUC:", round(roc_auc, 3))
print(classification_report(y_test, y_pred))

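Tu vois quelles variables influencent le plus la probabilité de succès :
→ snr, rssi, distance, firmware_v2.0, etc.
Les variables numériques étant standardisées, leurs coefficients sont comparables entre eux : le signe donne le sens de l'effet, la valeur absolue son intensité.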

In [None]:
# Pull the fitted estimator and preprocessor back out of the pipeline.
model = clf.named_steps["model"]
preprocessor = clf.named_steps["preproc"]

# Rebuild the feature names in the order the preprocessor lists its
# transformers: numeric columns first, then the expanded one-hot columns.
feature_names = [
    *preprocessor.named_transformers_["num"].get_feature_names_out(num_cols),
    *preprocessor.named_transformers_["cat"].get_feature_names_out(cat_cols),
]

# One coefficient per transformed feature, taken from the first row of coef_.
# Sorted by signed value so the bar chart runs from most positive to most negative.
importance = pd.DataFrame({
    "feature": feature_names,
    "coef": model.coef_[0]
}).sort_values("coef", ascending=False)

# NOTE(review): recent seaborn releases appear to warn when `palette` is passed
# without `hue`; confirm against the installed version before changing the call.
sns.barplot(data=importance, y="feature", x="coef", palette="coolwarm")
plt.title("Feature importance (Logistic Regression)")
plt.show()

# Rank by absolute coefficient for the "top 10" table. Taking head(10) of the
# signed ordering would show only the most positive features and hide strongly
# negative ones, which influence the prediction just as much.
importance.sort_values("coef", key=lambda coef: coef.abs(), ascending=False).head(10)


Sauvegarde du modèle

In [None]:
# Persist the whole fitted pipeline (preprocessing + model) as a single artifact.
# `os` and `joblib` are imported in the notebook's top import cell.
# NOTE: joblib files are pickle-based; only load them from trusted sources.
model_dir = "../models"
model_filename = "qc_baseline_model.joblib"

# Create the output directory if needed; exist_ok makes re-running the cell safe.
os.makedirs(model_dir, exist_ok=True)
joblib.dump(clf, os.path.join(model_dir, model_filename))

print(f"✅ Model saved as {model_filename}")