In [None]:
import re

import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC


# Load the training tables: categorical metadata, quantitative metadata,
# functional-connectome matrices and the solution (label) table.
train_cat = pd.read_excel("/TRAIN_NEW/TRAIN_CATEGORICAL_METADATA_new.xlsx")
train_quant = pd.read_excel("/TRAIN_NEW/TRAIN_QUANTITATIVE_METADATA_new.xlsx")
train_fmri = pd.read_csv("/TRAIN_NEW/TRAIN_FUNCTIONAL_CONNECTOME_MATRICES_new_36P_Pearson.csv")
train_sol = pd.read_excel("/TRAIN_NEW/TRAINING_SOLUTIONS.xlsx")

# One-hot encoding of the categorical data.
# Integer columns are cast to 'category' so that get_dummies expands them;
# columns of any other dtype are left untouched by this loop.
for col in train_cat.select_dtypes(include='int').columns:
    train_cat[col] = train_cat[col].astype('category')

# Every column except the first one is handed to the encoder
# (the first column is kept aside and later used as the merge key).
columns_to_encode = train_cat.columns[1:].tolist()

# dtype=int makes the indicator columns 0/1 integers directly, replacing the
# former element-wise applymap(... x is True ...) pass, which depended on
# identity comparison with the bool singletons and on a deprecated API.
train_encoded = pd.get_dummies(train_cat[columns_to_encode], drop_first=True, dtype=int)

cat_train_final = pd.concat([train_cat.drop(columns=columns_to_encode), train_encoded], axis=1)

# Demographic dataset: encoded categorical data + quantitative data + solutions.
train_demo = pd.merge(cat_train_final, train_quant, on='participant_id')
train_final_demo = pd.merge(train_demo, train_sol, on='participant_id')

# Connectivity dataset: fMRI connectome data + solutions.
train_final_connett = pd.merge(train_fmri, train_sol, on='participant_id')

# Only the last expression of a cell is rendered, so a single preview is kept here.
train_final_connett.head()

In [None]:
def fill_missing_values(frame, mode_columns, mean_columns):
    """Impute missing values column by column, writing the result back into ``frame``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table to impute; the listed columns are reassigned on this object.
    mode_columns : list of str
        Columns whose missing values are replaced with the column's most
        frequent value (first entry returned by ``Series.mode()``).
    mean_columns : list of str
        Columns whose missing values are replaced with the column's mean.

    Returns
    -------
    pandas.DataFrame
        The same ``frame`` object, for convenience.

    Raises
    ------
    KeyError
        If a listed column is not present in ``frame``.
    IndexError
        If a mode column has no non-missing value (``mode()`` is then empty).
    """
    for column in mode_columns:
        frame[column] = frame[column].fillna(frame[column].mode()[0])
    for column in mean_columns:
        frame[column] = frame[column].fillna(frame[column].mean())
    return frame


# Categorical variables: missing values are replaced with the mode.
# The test list is the train list without 'MRI_Track_Scan_Location'.
mode_columns_test = [
    'PreInt_Demos_Fam_Child_Ethnicity',
    'PreInt_Demos_Fam_Child_Race',
    'Barratt_Barratt_P1_Edu',
    'Barratt_Barratt_P1_Occ',
    'Barratt_Barratt_P2_Edu',
    'Barratt_Barratt_P2_Occ',
]
mode_columns_train = mode_columns_test + ['MRI_Track_Scan_Location']

# Quantitative variables: missing values are replaced with the mean.
mean_columns = [
    'EHQ_EHQ_Total',
    'ColorVision_CV_Score',
    'APQ_P_APQ_P_CP',
    'APQ_P_APQ_P_ID',
    'APQ_P_APQ_P_INV',
    'APQ_P_APQ_P_OPD',
    'APQ_P_APQ_P_PM',
    'APQ_P_APQ_P_PP',
    'SDQ_SDQ_Conduct_Problems',
    'SDQ_SDQ_Difficulties_Total',
    'SDQ_SDQ_Emotional_Problems',
    'SDQ_SDQ_Externalizing',
    'SDQ_SDQ_Generating_Impact',
    'SDQ_SDQ_Hyperactivity',
    'SDQ_SDQ_Internalizing',
    'SDQ_SDQ_Peer_Problems',
    'SDQ_SDQ_Prosocial',
    'MRI_Track_Age_at_Scan',
]

# NOTE(review): the statistics are computed on the whole training frame, i.e.
# before the cross-validation split performed in the modelling cells — confirm
# this is acceptable for the reported scores.
fill_missing_values(train_final_demo, mode_columns_train, mean_columns)

# NOTE(review): test_final_demo is not created in any cell visible in this
# notebook — confirm it is defined before this cell runs. The test frame is
# imputed with its own modes/means, not with the training statistics.
fill_missing_values(test_final_demo, mode_columns_test, mean_columns)

participant_id                        0
PreInt_Demos_Fam_Child_Ethnicity     43
PreInt_Demos_Fam_Child_Race          54
MRI_Track_Scan_Location               3
Barratt_Barratt_P1_Edu               15
Barratt_Barratt_P1_Occ               31
Barratt_Barratt_P2_Edu              198
Barratt_Barratt_P2_Occ              222
Basic_Demos_Enroll_Year_2016          0
Basic_Demos_Enroll_Year_2017          0
Basic_Demos_Enroll_Year_2018          0
Basic_Demos_Enroll_Year_2019          0
Basic_Demos_Enroll_Year_2020          0
Basic_Demos_Study_Site_2              0
Basic_Demos_Study_Site_3              0
Basic_Demos_Study_Site_4              0
EHQ_EHQ_Total                        13
ColorVision_CV_Score                 23
APQ_P_APQ_P_CP                       12
APQ_P_APQ_P_ID                       12
APQ_P_APQ_P_INV                      12
APQ_P_APQ_P_OPD                      12
APQ_P_APQ_P_PM                       12
APQ_P_APQ_P_PP                       12
SDQ_SDQ_Conduct_Problems              9


In [4]:
# === MANUAL FEATURE SELECTION FOR THE DEMOGRAPHIC DATA ===
# TRAIN DATA
# NOTE(review): SelectKBest and f_classif are imported here but not used in this cell.
from sklearn.feature_selection import SelectKBest, f_classif


# List of the features (columns) to remove
features_da_eliminare = ["Basic_Demos_Study_Site_2", "Basic_Demos_Study_Site_3", "Basic_Demos_Study_Site_4", "MRI_Track_Scan_Location"]

# Number of columns before the manual removal (INCLUDING the ID and target columns)
n_feature_iniziali = train_final_demo.shape[1]

# Drop the listed features; errors='ignore' skips any name that is not a column
train_final_demo_selected = train_final_demo.drop(columns=features_da_eliminare, errors='ignore')

# Number of columns after the manual removal
n_feature_post_manuale = train_final_demo_selected.shape[1]

In [None]:
# === RANDOM FOREST MODEL (300 estimators) FOR ADHD USING THE DEMOGRAPHIC DATA ===

# Features and target.
# NOTE(review): only 'participant_id' and 'ADHD_Outcome' are dropped here, so any
# other label column still present in train_final_demo_selected (e.g. 'Sex_F', if
# the merged solutions table carries it) remains in X as a feature — confirm this
# is intended.
X = train_final_demo_selected.drop(columns=['participant_id', 'ADHD_Outcome'])
y = train_final_demo_selected['ADHD_Outcome'].astype(int)

# Stratified cross-validation on ADHD_Outcome
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Per-fold results
acc_scores_adhd = []
f1_scores_adhd = []

for fold, (train_index, test_index) in enumerate(skf.split(X, y), 1):
    print(f"\n=== Fold {fold} ===")

    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    print(f"  Dimensione X_train: {X_train.shape}")
    print(f"  Dimensione X_test: {X_test.shape}")
    print(f"  Dimensione y_train: {y_train.shape}")
    print(f"  Dimensione y_test: {y_test.shape}")
    print(f"  Valori unici in y_train: {np.unique(y_train)}")
    print(f"  Valori unici in y_test: {np.unique(y_test)}")

    # Standardisation: the scaler is fitted on the training fold only and then
    # applied to the held-out fold (optional for Random Forest, kept for
    # consistency with the other models).
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Random Forest model
    rf = RandomForestClassifier(
        n_estimators=300,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    )
    rf.fit(X_train_scaled, y_train)

    y_pred = rf.predict(X_test_scaled)

    print(f"  Predizioni y_pred (prime 5): {y_pred[:5]}")
    print(f"  Valori reali y_test (prime 5): {y_test.iloc[:5].values}")

    # Metrics for this fold
    acc_scores_adhd.append(accuracy_score(y_test, y_pred))
    f1_scores_adhd.append(f1_score(y_test, y_pred))


# === FINAL RESULTS ===
# The fold count is read from the splitter so the header cannot drift from n_splits.
print(f"\n=== Media delle metriche su {skf.get_n_splits()} fold ===")
print(f"F1-score medio ADHD_Outcome: {np.mean(f1_scores_adhd):.4f}")
print(f"Deviazione standard F1-score ADHD_Outcome: {np.std(f1_scores_adhd):.4f}")
# Accuracy was collected per fold but never reported; include it in the summary.
print(f"Accuracy media ADHD_Outcome: {np.mean(acc_scores_adhd):.4f}")
print(f"Deviazione standard accuracy ADHD_Outcome: {np.std(acc_scores_adhd):.4f}")


# === Data for the F1-score boxplot (one row per fold) ===
f1_data = [
    {'Target': 'ADHD_Outcome', 'Metrica': 'F1-score', 'Valore': f1_value}
    for f1_value in f1_scores_adhd
]
f1_df_ADHD = pd.DataFrame(f1_data)


=== Fold 1 ===
  Dimensione X_train: (1091, 30)
  Dimensione X_test: (122, 30)
  Dimensione y_train: (1091,)
  Dimensione y_test: (122,)
  Valori unici in y_train: [0 1]
  Valori unici in y_test: [0 1]
  Predizioni y_pred (prime 5): [1 0 0 1 0]
  Valori reali y_test (prime 5): [1 0 1 1 1]

=== Fold 2 ===
  Dimensione X_train: (1091, 30)
  Dimensione X_test: (122, 30)
  Dimensione y_train: (1091,)
  Dimensione y_test: (122,)
  Valori unici in y_train: [0 1]
  Valori unici in y_test: [0 1]
  Predizioni y_pred (prime 5): [1 1 1 0 1]
  Valori reali y_test (prime 5): [1 1 0 0 1]

=== Fold 3 ===
  Dimensione X_train: (1091, 30)
  Dimensione X_test: (122, 30)
  Dimensione y_train: (1091,)
  Dimensione y_test: (122,)
  Valori unici in y_train: [0 1]
  Valori unici in y_test: [0 1]
  Predizioni y_pred (prime 5): [1 1 1 0 1]
  Valori reali y_test (prime 5): [1 0 1 0 1]

=== Fold 4 ===
  Dimensione X_train: (1092, 30)
  Dimensione X_test: (121, 30)
  Dimensione y_train: (1092,)
  Dimensione y_te

In [6]:
# === MANUAL FEATURE SELECTION FOR THE CONNECTIVITY DATA ===
# TRAIN DATA

# Integers to look for in the column names (as strings)
numeri_da_cercare_str = ['76', '185', '75', '184', '80', '81', '82', '187', '188', '189', '77', '78', '79', '186', '191', '88', '89', '90', '91', '192', '87', '83', '84', '85', '86', '190', '92', '93', '94', '95', '193', '96', '194', '98', '196', '97', '195', '61', '167', '58', '59', '60', '165', '166', '63', '64', '65', '168', '169', '62', '66', '170', '57', '68', '173', '174', '175', '176', '180', '69', '70', '71', '177', '178', '179', '67', '171', '172', '74', '183', '72', '73', '181', '182', '31', '132', '32', '33', '34', '133', '134', '135', '136', '29', '30', '131', '39', '141', '35', '36', '37', '38', '137', '138', '139', '140']

# Non-connectivity columns that are always kept
colonne_fisse = ['participant_id', 'Sex_F', 'ADHD_Outcome']

# A column matches when ANY occurrence of a listed number is a whole number,
# i.e. not adjacent to another digit. The previous str.find() scan inspected
# only the first occurrence of each number, so a standalone match located
# after an embedded one (e.g. '76' inside '176') was missed.
pattern_numeri = re.compile(
    r'(?<!\d)(?:' + '|'.join(re.escape(numero) for numero in numeri_da_cercare_str) + r')(?!\d)'
)

# Connectivity columns = everything except the fixed columns
colonne_connettivita = [col for col in train_final_connett.columns if col not in colonne_fisse]

# Fixed columns first, then the matching connectivity columns in their original
# order. No set() round-trip: set ordering of strings changes between kernel
# runs, which made the feature order (and hence the seeded model) irreproducible.
colonne_da_mantenere = colonne_fisse + [
    col for col in colonne_connettivita if pattern_numeri.search(col)
]

# Build the filtered DataFrame; the original name is rebound to it because the
# following cells read train_final_connett.
train_connett_manual = train_final_connett[colonne_da_mantenere]
train_final_connett = train_connett_manual

In [None]:
# === RANDOM FOREST MODEL FOR SEX USING THE CONNECTIVITY DATA ===

# Features and target.
# NOTE(review): only 'participant_id' and 'Sex_F' are dropped here, so any other
# label column still present in train_final_connett (e.g. 'ADHD_Outcome') remains
# in X as a feature — confirm this is intended.
X = train_final_connett.drop(columns=['participant_id', 'Sex_F'])
y = train_final_connett['Sex_F']  # a Series, not a DataFrame

# Stratified cross-validation on Sex_F
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Per-fold results
acc_scores_sex = []
f1_scores_sex = []

for fold, (train_index, test_index) in enumerate(skf.split(X, y), 1):
    print(f"\n=== Fold {fold} ===")

    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    print(f"  Dimensione X_train: {X_train.shape}")
    print(f"  Dimensione X_test: {X_test.shape}")
    print(f"  Dimensione y_train: {y_train.shape}")
    print(f"  Dimensione y_test: {y_test.shape}")
    print(f"  Valori unici in y_train: {np.unique(y_train)}")
    print(f"  Valori unici in y_test: {np.unique(y_test)}")

    # Standardisation: the scaler is fitted on the training fold only and then
    # applied to the held-out fold.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Random Forest model (library-default number of trees)
    rf = RandomForestClassifier(class_weight='balanced', random_state=42)
    rf.fit(X_train_scaled, y_train)

    y_pred = rf.predict(X_test_scaled)

    print(f"  Predizioni y_pred (prime 5): {y_pred[:5]}")
    print(f"  Valori reali y_test (prime 5): {y_test.iloc[:5].values}")

    # Metrics for this fold
    acc_scores_sex.append(accuracy_score(y_test, y_pred))
    f1_scores_sex.append(f1_score(y_test, y_pred))

# === FINAL RESULTS ===
# The fold count is read from the splitter so the header cannot drift from n_splits.
print(f"\n=== Media delle metriche su {skf.get_n_splits()} fold ===")
print(f"F1-score medio Sex_F: {np.mean(f1_scores_sex):.4f}")
print(f"Deviazione standard F1-score Sex_F: {np.std(f1_scores_sex):.4f}")
# Accuracy was collected per fold but never reported; include it in the summary.
print(f"Accuracy media Sex_F: {np.mean(acc_scores_sex):.4f}")
print(f"Deviazione standard accuracy Sex_F: {np.std(acc_scores_sex):.4f}")

# === Data for the F1-score boxplot (one row per fold) ===
f1_data = [
    {'Target': 'Sex_F', 'Metrica': 'F1-score', 'Valore': f1_value}
    for f1_value in f1_scores_sex
]
f1_df_sesso = pd.DataFrame(f1_data)


=== Fold 1 ===
  Dimensione X_train: (1091, 14545)
  Dimensione X_test: (122, 14545)
  Dimensione y_train: (1091,)
  Dimensione y_test: (122,)
  Valori unici in y_train: [0 1]
  Valori unici in y_test: [0 1]
  Predizioni y_pred (prime 5): [0 0 0 0 0]
  Valori reali y_test (prime 5): [0 0 0 0 0]

=== Fold 2 ===
  Dimensione X_train: (1091, 14545)
  Dimensione X_test: (122, 14545)
  Dimensione y_train: (1091,)
  Dimensione y_test: (122,)
  Valori unici in y_train: [0 1]
  Valori unici in y_test: [0 1]
  Predizioni y_pred (prime 5): [0 0 0 0 0]
  Valori reali y_test (prime 5): [1 0 1 0 1]

=== Fold 3 ===
  Dimensione X_train: (1091, 14545)
  Dimensione X_test: (122, 14545)
  Dimensione y_train: (1091,)
  Dimensione y_test: (122,)
  Valori unici in y_train: [0 1]
  Valori unici in y_test: [0 1]
  Predizioni y_pred (prime 5): [0 0 0 0 0]
  Valori reali y_test (prime 5): [1 1 1 0 0]

=== Fold 4 ===
  Dimensione X_train: (1092, 14545)
  Dimensione X_test: (121, 14545)
  Dimensione y_train: (

In [8]:
import os

# Write each per-fold F1 table to its CSV file and confirm the write on stdout.
for tabella_f1, nome_file in (
    (f1_df_sesso, 'tesi_4.3_RF_sex.csv'),
    (f1_df_ADHD, 'tesi_4.3_RF_ADHD.csv'),
):
    tabella_f1.to_csv(nome_file, index=False)
    print(f"File '{nome_file}' salvato con successo!")

# Show the files in the current working directory
print(os.listdir('.'))

File 'tesi_4.3_RF_sex.csv' salvato con successo!
File 'tesi_4.3_RF_ADHD.csv' salvato con successo!
['tesi_4.3_RF_ADHD.csv', 'tesi_4.3_RF_sex.csv', '__notebook__.ipynb']
