# In [1]: (notebook cell exported as a script)

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor

def train_random_forest(test_size=0.2, random_state=42, param_tuning=False, output_dir=None):
    """
    Train a Random Forest regressor and write its test predictions to a CSV file.

    The training and test features and the training target are obtained from
    ``preprocessing_v1(apply_one_hot=True, apply_scaling=True)``; the
    ``sample_name`` column is dropped from both feature sets before fitting.
    Predictions on the test features are passed to ``submission_file`` and the
    result is saved as ``submission_file_random_forest.csv`` in ``output_dir``.

    NOTE(review): ``preprocessing_v1`` and ``submission_file`` are not defined
    in this cell; they must be defined or imported before this function is
    called.

    Args:
        test_size (float): Currently unused. The train/test split comes from
            ``preprocessing_v1``; the parameter is kept so existing callers
            keep working.
        random_state (int): Random seed passed to the Random Forest
            (default=42).
        param_tuning (bool): If True, run a 5-fold ``GridSearchCV`` scored with
            ``neg_mean_squared_error`` and keep its best estimator; otherwise
            fit a single forest with 500 trees.
        output_dir (str | None): Directory where the submission CSV is written.
            It is created if missing. Defaults to the original hard-coded
            project directory when None.

    Returns:
        tuple: A 1-tuple ``(model,)`` holding the fitted estimator. No metrics
        are returned because ``preprocessing_v1`` does not provide test labels
        to score the predictions against.
    """
    if output_dir is None:
        output_dir = '/Users/alicepriolet/Desktop/ML/epfl-bio-322-2024'

    # Load the preprocessed features and training target.
    X_train, X_test, y_train = preprocessing_v1(apply_one_hot=True, apply_scaling=True)
    X_train = X_train.drop(columns=['sample_name'])
    X_test = X_test.drop(columns=['sample_name'])

    print(f"Taille X_train : {X_train.shape}, X_test : {X_test.shape}, y_train : {y_train.shape}")

    if param_tuning:
        # Exhaustive hyperparameter search: 4 * 4 * 3 * 3 * 3 = 432 candidates,
        # each cross-validated on 5 folds.
        param_grid = {
            'n_estimators': [50, 100, 200, 500],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 5],
            'max_features': ['sqrt', 'log2', None],
        }
        grid_search = GridSearchCV(
            RandomForestRegressor(random_state=random_state),
            param_grid,
            cv=5,
            scoring='neg_mean_squared_error',
        )
        grid_search.fit(X_train, y_train)
        model = grid_search.best_estimator_
        print("Meilleurs hyperparamètres :", grid_search.best_params_)
    else:
        # Default model without tuning.
        model = RandomForestRegressor(n_estimators=500, max_depth=None, random_state=random_state)
        model.fit(X_train, y_train)

    # Predict on the test features.
    y_pred = model.predict(X_test)

    # Interpolate the shape into the message; the original printed a set
    # literal containing the shape next to a placeholder-less f-string.
    print(f"Taille Y_Pred : {y_pred.shape}")

    # Build the submission table; presumably a pandas DataFrame, since it is
    # saved with ``to_csv`` below — confirm against ``submission_file``.
    submission = submission_file(y_pred)

    # Create the output directory if needed, without a separate existence check.
    os.makedirs(output_dir, exist_ok=True)

    submission_path = os.path.join(output_dir, 'submission_file_random_forest.csv')
    submission.to_csv(submission_path, index=False)
    print(f"Submission file saved successfully at {submission_path}")

    # The original ``return model,`` produced a 1-tuple; keep that shape for
    # existing callers, but make it explicit.
    return (model,)


def main():
    """Script entry point: run train_random_forest() with its default arguments."""
    train_random_forest()
 
   

# Run the training pipeline only when this file is executed as a script,
# not when it is imported.
if __name__ == '__main__':
    main() 

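# Output of the cell above:
#   NameError: name 'preprocessing_v1' is not defined
# The helper preprocessing_v1 must be defined or imported before this cell is run.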