# 1-Librairies

In [None]:
# Standard
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Dataset
import kagglehub

# Preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler

# Division du dataset
from sklearn.model_selection import train_test_split

# Modèle de regression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
import lightgbm as lgb
import xgboost as xgb
from sklearn.linear_model import Ridge, Lasso
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

# Métriques de performance
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.metrics import mean_squared_error, r2_score

# Optimisation du modèle testé
from sklearn.model_selection import GridSearchCV

import scipy.stats as stats

# Silence warnings for the rest of the session.
# NOTE(review): with no category or message given, this hides every warning,
# including ones that may point at real problems — consider narrowing the filter.
import warnings
warnings.filterwarnings("ignore")

# 2-Dataset

In [None]:
# Download the Kaggle dataset; the returned value is used below as the
# directory that contains the CSV file.
path = kagglehub.dataset_download("miadul/brain-tumor-dataset")

# Read the raw table into a DataFrame
csv_file = f"{path}/brain_tumor_dataset.csv"
df = pd.read_csv(csv_file)


# 3- Analyse Exploratoire des Données (EDA)

## 3.1-Information sur les données

In [None]:
# Preview the first rows of the dataset
df.head()

In [None]:
# Drop the column considered useless for the analysis.
# errors='ignore' keeps this cell re-runnable: once the column is gone, a
# second execution is a no-op instead of raising KeyError.
df = df.drop(columns='Patient_ID', errors='ignore')
df.head()

In [None]:
# Summary of the dataset as reported by DataFrame.info()
df.info()

In [None]:
# Check for missing data: number of null values per column
df.isnull().sum()

## 3.2-Valeurs Catégorielles

In [None]:
# Names of the object-typed columns, reused later for the one-hot encoding
categorical_cols = list(df.select_dtypes(include=['object']).columns)

# Report how many distinct values each of those columns holds
for col in categorical_cols:
    print(f"Column: {col}, Cardinality: {df[col].nunique()}")

## 3.3-Valeurs Numériques

In [None]:
# Collect the int64/float64 columns and report the dtype of each one
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
for col in numerical_cols:
    print(f"Column: {col}, Type: {df[col].dtype}")

# Size the subplot grid from the number of columns instead of assuming a fixed
# 2x2 layout, which fails as soon as a fifth numeric column is present.
n_plot_cols = 2
n_plot_rows = max(1, -(-len(numerical_cols) // n_plot_cols))  # ceiling division
fig, axes = plt.subplots(
    n_plot_rows,
    n_plot_cols,
    figsize=(15, 5 * n_plot_rows),  # 15x10 for the two-row case
    squeeze=False,  # always get a 2-D array of axes, even for a single row
)
axes = axes.ravel()

# One histogram (with KDE overlay) per numeric column
for ax, col in zip(axes, numerical_cols):
    sns.histplot(data=df, x=col, kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')

# Hide the grid cells that have no column to display
for ax in axes[len(numerical_cols):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()

# 4-Preprocessing

## 4.1-One-Hot Encoding (pd.get_dummies) sur les colonnes de type object

In [None]:
# Apply one-hot encoding to the categorical columns (drop_first=True is passed
# so the first level of each column is not given its own indicator)
df_processed = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Show the first rows of the encoded DataFrame
print("DataFrame after One-Hot Encoding:")
display(df_processed.head())

## 4.2-Features et Target

In [None]:
# Separate the regression target (y) from the remaining columns (X)
y = df_processed['Survival_Rate']
X = df_processed.drop(columns=['Survival_Rate'])

## 4.3-Split Train/Test

In [None]:
# Hold out 30% of the rows for testing (no stratification, as this is a
# regression task); the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Report the shape of each split
for split_label, split_X, split_y in (
    ("Dimensions de l'ensemble d'entraînement (X_train, y_train):", X_train, y_train),
    ("Dimensions de l'ensemble de test (X_test, y_test):", X_test, y_test),
):
    print(split_label, split_X.shape, split_y.shape)

## 4.4-StandardScaler sur les features numériques du df_processed

In [None]:
# Scale the numeric features.
# Only int64/float64 columns are selected, so the boolean columns created by
# the one-hot encoding are left untouched.
numerical_cols_to_scale = X_train.select_dtypes(include=['float64', 'int64']).columns

# Work on explicit copies: the frames returned by train_test_split are slices
# of X, and assigning into them directly can trigger pandas'
# SettingWithCopyWarning (currently hidden by the notebook-wide warning filter).
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler on the training split only, then apply the same statistics
# to the test split so no test information leaks into the scaling.
scaler = StandardScaler()
X_train[numerical_cols_to_scale] = scaler.fit_transform(X_train[numerical_cols_to_scale])
X_test[numerical_cols_to_scale] = scaler.transform(X_test[numerical_cols_to_scale])

# Show the first rows of the scaled training data
print("\nPremières lignes de X_train après mise à l'échelle:")
display(X_train.head())

# 5-Entraînement et évaluation


## 5.1-Modèles

In [None]:
# Candidate regressors, keyed by the name used when printing results and when
# looking up each model's grid in params_grid.
# The two tree-based models are given random_state=42; the others are built
# with their default arguments.
models = {
    'LinearRegression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Support Vector Regressor' : SVR(),
    'KNN': KNeighborsRegressor(),
}

## 5.2-Entraînements et résultats

In [None]:
# Metrics of every baseline model, keyed by model name
results = {}

for name, model in models.items():
    print(f"Training {name}...")

    # Fit on the training split and predict the held-out test split
    y_pred = model.fit(X_train, y_train).predict(X_test)

    # MAPE is expressed in percent; "Accuracy" is reported as 100 - MAPE
    mape = mean_absolute_percentage_error(y_test, y_pred) * 100
    scores = {
        'MAE': mean_absolute_error(y_test, y_pred),
        'MSE': mean_squared_error(y_test, y_pred),
        'R2': r2_score(y_test, y_pred),
        'Accuracy': 100 - mape,
    }
    results[name] = scores

    # Print the metrics that were just stored
    print(f"Performance de {name}:")
    print(f"Mean Absolute Error: {scores['MAE']:.4f}")
    print(f"Mean Squared Error: {scores['MSE']:.4f}")
    print(f"R^2 Score: {scores['R2']:.4f}")
    print(f"Accuracy: {scores['Accuracy']:.2f}%\n")


# 6-Optimisation

## 6.1-Paramètres

In [None]:
# List the current parameters of each model, one separator line after each
separator = "-" * 50
for model_name, model in models.items():
    print(f"Paramètres de {model_name}:", model.get_params(), separator, sep="\n")

In [None]:
# Search space for each model; the keys match those of the `models` dict.
# NOTE(review): 'copy_X' is a data-handling switch and presumably has no effect
# on the fitted coefficients — confirm whether it belongs in the search.
params_grid = {
    "LinearRegression": {
        "copy_X": [True, False],
        "fit_intercept": [True, False],
        "positive": [True, False],
    },
    "Decision Tree": {
        "max_depth": [3, 5, 10, None],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
    },
    "Random Forest": {
        "n_estimators": [10, 50, 100],
        "max_depth": [None, 10, 20],
    },
    "Support Vector Regressor": {
        "C": [0.1, 1, 10],
        "kernel": ["linear", "rbf"],
    },
    "KNN": {
        "n_neighbors": [3, 5, 10, 20],
        "weights": ["uniform", "distance"],
        "p": [1, 2],
    },
}

## 6.2-Entraînements et résultats

In [17]:
# Metrics of every tuned model, keyed by model name
results_grid = {}

for name, model in models.items():
    # 5-fold cross-validated search over this model's grid, using all cores
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=params_grid[name],
        cv=5,
        verbose=1,
        n_jobs=-1
    )

    grid_search.fit(X_train, y_train)

    # Score the predictions of the tuned search itself. The metrics must be
    # computed from y_pred_grid: using the leftover `y_pred` from the baseline
    # cell would report the same (stale) numbers for every model.
    y_pred_grid = grid_search.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred_grid)
    mse = mean_squared_error(y_test, y_pred_grid)
    r2 = r2_score(y_test, y_pred_grid)
    mape = mean_absolute_percentage_error(y_test, y_pred_grid)
    accuracy = 100 - (mape*100)  # "Accuracy" is reported as 100 - MAPE (in %)

    results_grid[name] = {
        'MAE': mae,
        'MSE': mse,
        'R2': r2,
        'Accuracy': accuracy
    }

    # Print the selected parameters and the resulting test metrics
    print(f"Performance de {name}:")
    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Mean Absolute Error: {mae:.4f}")
    print(f"Mean Squared Error: {mse:.4f}")
    print(f"R^2 Score: {r2:.4f}")
    print(f"Accuracy: {accuracy:.2f}%\n")

KeyboardInterrupt: 