In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset from a CSV file resolved relative to the current working directory
df = pd.read_csv("mental_health_social_media_dataset.csv")
# Preview the first rows of the loaded frame
df.head()

In [None]:
# Print a concise summary of the frame (column dtypes and non-null counts)
df.info()

In [None]:
# Descriptive statistics of the frame (pandas summarises the numeric columns by default)
df.describe()

In [None]:
# Global plot styling
sns.set_theme(style="whitegrid")

# Numeric columns that enter the correlation matrix
numeric_cols = [
    'age',
    'daily_screen_time_min',
    'social_media_time_min',
    'negative_interactions_count',
    'positive_interactions_count',
    'sleep_hours',
    'physical_activity_min',
]

# Pairwise correlations between the selected columns
correlation_matrix = df[numeric_cols].corr()

# Annotated heatmap on a fixed [-1, 1] colour scale so colours are comparable across runs
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    ax=ax,
)
ax.set_title("Corrélation entre Usage Numérique et Santé Mentale")
plt.show()


In [None]:
# Side-by-side boxplots comparing platforms on two variables.
# `palette` needs a real `hue` variable: mapping hue to the same column as `x`
# gives one colour per platform, and `legend=False` hides the legend that would
# only repeat the x tick labels. (Passing an arbitrary scalar as `hue` creates a
# single hue level, so every box gets the same colour and the legend is meaningless.)
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

panels = [
    # (column plotted on y, panel title)
    ('anxiety_level', "Niveau d'Anxiété par Plateforme"),
    ('sleep_hours', "Heures de Sommeil par Plateforme"),
]

for ax, (y_col, title) in zip(axes, panels):
    sns.boxplot(
        data=df,
        x='platform',
        y=y_col,
        hue='platform',
        palette="Set2",
        legend=False,
        ax=ax,
    )
    ax.set_title(title)

fig.tight_layout()
plt.show()


In [None]:
# Dataset clean-up required by the project guidelines
print("Suppression des colonnes anxiété, stress, humeur, nom de la personne et date pour eviter la fuite de données")
print("Colonnes avant nettoyage:", df.columns.tolist())

# Columns requested for removal; keep only those still present so the cell can be re-run safely
requested_drops = ['person_name', 'date', 'anxiety_level', 'stress_level', 'mood_level']
columns_to_drop = [name for name in requested_drops if name in df.columns]

nothing_to_drop = not columns_to_drop
if not nothing_to_drop:
    df = df.drop(columns=columns_to_drop)

print("\nColonnes après nettoyage:", df.columns.tolist())
if nothing_to_drop:
    print("Aucune colonne à supprimer")

df.head()

In [None]:

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix

# 1. Encode the categorical variables (gender, platform) as integer codes
le_gender = LabelEncoder()
la_platform = LabelEncoder()
# Work on a copy so the cleaned frame itself is left untouched
df_ml = df.copy()
df_ml['gender_encoded'] = le_gender.fit_transform(df_ml['gender'])
df_ml['platform_encoded'] = la_platform.fit_transform(df_ml['platform'])

# 2. Features (X) and target (y)
# The obvious "outcome" columns (anxiety_level, stress_level, mood_level) are left out
# to see whether usage ALONE can predict the mental state.
features = [
    'age', 'gender_encoded', 'platform_encoded',
    'daily_screen_time_min', 'social_media_time_min',
    'negative_interactions_count', 'positive_interactions_count',
    'sleep_hours', 'physical_activity_min'
]
target = 'mental_state'

X = df_ml[features]
y = df_ml[target]

# 3. Train/test split
# Stratify on the target so both subsets keep the class proportions, matching the
# stratified split used for the grid-search models further down in the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 4. Model training
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# 5. Evaluation on the held-out split
y_pred = model.predict(X_test)
print("--- Rapport de Classification ---")
print(classification_report(y_test, y_pred))

# 6. Feature importances, most important first
feature_imp = pd.Series(model.feature_importances_, index=features).sort_values(ascending=False)

# `palette` needs a real `hue` variable: map hue to the feature names (one colour per bar)
# and hide the legend, which would only duplicate the y tick labels.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=feature_imp,
    y=feature_imp.index,
    hue=feature_imp.index,
    palette="viridis",
    legend=False,
    ax=ax,
)
ax.set_title("Quels facteurs influencent le plus l'état mental ?")
ax.set_xlabel("Score d'importance")
plt.show()


In [None]:
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd
import numpy as np

# --- Tunable settings ---
# Seed reused by the splitters and the estimators below
RANDOM_STATE = 42

# Metric optimised by the grid searches.
# Alternatives to try: 'recall_macro', 'f1_macro'
SCORING = 'accuracy'

# Shuffled, stratified 5-fold splitter shared by every grid search
CV = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=RANDOM_STATE,
)

In [None]:
# Preparation and cleaning: copy the frame and remove exact duplicate rows
df_ml = df.copy().drop_duplicates()

# Integer-encode each categorical column into a "<column>_encoded" companion column
for categorical_col in ('gender', 'platform'):
    df_ml[f'{categorical_col}_encoded'] = LabelEncoder().fit_transform(df_ml[categorical_col])

# Model inputs: demographics, encoded categoricals and usage/lifestyle measurements
features = [
    'age',
    'gender_encoded',
    'platform_encoded',
    'daily_screen_time_min',
    'social_media_time_min',
    'negative_interactions_count',
    'positive_interactions_count',
    'sleep_hours',
    'physical_activity_min',
]
X, y = df_ml[features], df_ml['mental_state']

# 80/20 split, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)

In [None]:
def run_grid(name, estimator, grid):
    """Grid-search ``estimator`` and report its cross-validation and test scores.

    Relies on the notebook-level ``CV``, ``SCORING``, ``X_train``, ``y_train``,
    ``X_test`` and ``y_test`` variables.

    Parameters
    ----------
    name : str
        Label printed in the section header.
    estimator : estimator or Pipeline
        Object handed to ``GridSearchCV``.
    grid : dict or list of dict
        Parameter grid handed to ``GridSearchCV``.

    Returns
    -------
    tuple
        ``(best_estimator, fitted_search)``.
    """
    print(f"\n--- {name} GridSearch ---")

    # GridSearchCV.fit returns the fitted search object itself
    search = GridSearchCV(
        estimator,
        grid,
        cv=CV,
        scoring=SCORING,
        n_jobs=-1,
        verbose=1,
    ).fit(X_train, y_train)
    tuned = search.best_estimator_

    print("Best params:", search.best_params_)
    print(f"CV best score: {search.best_score_:.4f}")

    # Evaluate the refitted best estimator on the held-out split
    test_predictions = tuned.predict(X_test)
    test_accuracy = accuracy_score(y_test, test_predictions)
    print(f"Test accuracy: {test_accuracy:.4f}")
    print(classification_report(y_test, test_predictions))

    return tuned, search

In [None]:
# Logistic regression: standardise the features, then fit with the saga solver
pipe_lr = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(solver='saga', max_iter=5000, random_state=RANDOM_STATE)),
])

# Grid keys use the "model__" prefix to address the pipeline's final step
grid_lr = dict(
    model__C=[1, 10, 100],
    model__penalty=['l1', 'l2'],
    model__class_weight=[None, 'balanced'],
)

best_lr, gs_lr = run_grid("LogisticRegression", pipe_lr, grid_lr)

In [None]:
# Decision tree: no scaling step, the estimator is searched directly
dt = DecisionTreeClassifier(random_state=RANDOM_STATE)

# Depth / split / leaf constraints, impurity criterion and class weighting
grid_dt = dict(
    max_depth=[3, 5, 10],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    criterion=['gini', 'entropy'],
    class_weight=[None, 'balanced'],
)

best_dt, gs_dt = run_grid("DecisionTree", dt, grid_dt)

In [None]:
# SVM: standardise the features, then fit a support vector classifier
pipe_svc = Pipeline([('scaler', StandardScaler()),
                     ('model', SVC(probability=True, random_state=RANDOM_STATE))])

# One sub-grid per kernel: `gamma` is a kernel coefficient that the linear kernel
# does not use, so crossing it with kernel='linear' would fit each linear
# candidate twice for identical results. The rbf grid comes first so that the
# first candidate evaluated is the same as with a single cartesian grid.
grid_svc = [
    {
        'model__kernel': ['rbf'],
        'model__C': [0.1, 1, 10],
        'model__gamma': ['scale', 'auto'],
    },
    {
        'model__kernel': ['linear'],
        'model__C': [0.1, 1, 10],
    },
]
best_svc, gs_svc = run_grid("SVM", pipe_svc, grid_svc)

In [None]:
# Final comparison of the tuned estimators on the held-out split.
# The loop variable is deliberately not called `model`, so the RandomForest
# `model` fitted in the baseline cell is not silently overwritten.
# NOTE(review): ranking models by test accuracy turns the test split into a
# selection set; prefer the cross-validation scores for choosing the winner.
print("\nComparaison finale :")
tuned_estimators = [
    ('- Régression logistique', best_lr),
    ('- Arbre de décision', best_dt),
    ('- SVM', best_svc),
]
for name, estimator in tuned_estimators:
    acc = accuracy_score(y_test, estimator.predict(X_test))
    print(f"{name}: {acc:.4f}")

In [None]:
import joblib

# Export the tuned decision tree.
# NOTE(review): the exported estimator is fixed to `best_dt`; confirm it is still
# the best model whenever the grids or the data change.
model_filename = 'best_mental_health_model.pkl'
joblib.dump(best_dt, model_filename)

# Report the accuracy actually measured on the held-out split rather than a
# hard-coded figure, so the printed value always matches the exported estimator.
exported_accuracy = accuracy_score(y_test, best_dt.predict(X_test))
print(f"✓ Modèle exporté : {model_filename}")
print(f"  - Type: {type(best_dt).__name__}")
print(f"  - Accuracy: {exported_accuracy:.2%}")

# Export encoders so future raw inputs can be encoded before prediction.
# They are re-fitted on `df`; the training features were encoded from the
# de-duplicated copy, which holds the same set of category values, so the
# resulting mapping is expected to be identical.
encoders = {
    'gender': LabelEncoder().fit(df['gender']),
    'platform': LabelEncoder().fit(df['platform'])
}
encoders_filename = 'label_encoders.pkl'
joblib.dump(encoders, encoders_filename)
print(f"\n✓ Encodeurs exportés : {encoders_filename}")
