In [None]:
import pandas as pd
import numpy as np

# Paths and filenames
# NOTE(review): the Excel files live under '/data/' while the ASL CSVs are read
# from '/home/martin/data/' -- confirm both roots are intended on the target machine.
data_partition_path = '/data/'
study_data_path = f'{data_partition_path}/UNSAM/CovidProject2/'
results_path = f'{study_data_path}/DataAnalysis/'
response_excel_filename = f'{study_data_path}/Respuestas.xlsx'
summary_excel_filename = f'{study_data_path}/ResumenRespuestas.xlsx'
volunteers_excel_filename = f'{study_data_path}/VoluntariosProyectoCovidProlongado.xlsx'

# Load ASL CSVs
asl_csv_paths = '/home/martin/data/UNSAM/CovidProject2/DataAnalysis/ASL/CleanData/'
scov_gm_filename = f'{asl_csv_paths}/scov_total_gm_clean.csv'

scov_table = pd.read_csv(scov_gm_filename)

# Match groups: look up each imaging participant in the summary sheet by ID.
summary_table = pd.read_excel(summary_excel_filename, sheet_name='resumenTotal')

# Map every ID to the index label of its FIRST row in the summary sheet.
first_summary_rows = summary_table.drop_duplicates(subset='ID', keep='first')
summary_row_by_id = dict(zip(first_summary_rows['ID'], first_summary_rows.index))

indices_scov = []      # summary_table index labels of matched participants (in scov_table order)
indices_non_cov = []   # row positions (in the table as loaded) of participants with no summary row

for i, pid in enumerate(scov_table['participant_id']):
    if pid in summary_row_by_id:
        indices_scov.append(summary_row_by_id[pid])
    else:
        print(f'Warning: Subject {pid} not found')
        indices_non_cov.append(i)

# Drop the unmatched participants so that scov_table and summary_table_matched have
# the same rows in the same order. Without this, the positional column assignments
# below would shift every row after the first missing subject onto the wrong person.
if indices_non_cov:
    scov_table = scov_table.drop(index=scov_table.index[indices_non_cov])
scov_table = scov_table.reset_index(drop=True)

summary_table_matched = summary_table.loc[indices_scov].reset_index(drop=True)
assert len(summary_table_matched) == len(scov_table), "matched tables must be row-aligned"

group = summary_table_matched['Grupo']
scov_table['group'] = group
scov_table['sex'] = summary_table_matched['Genero']
scov_table['age'] = summary_table_matched['Edad']

group_counts = group.value_counts()
print("COVID count:", group_counts.get('COVID', 0))
print("CONTROL count:", group_counts.get('CONTROL', 0))


: 

In [None]:
# Keep only the columns listed in main_vars (group label, demographics and
# the imaging-derived measures) for the modelling cells that follow.
main_vars = [
    'group', 'sex', 'age',
    'GM_vol', 'WM_vol', 'CSF_vol',
    'WMH_vol', 'WMH_count',
    'TotalGM_B', 'TotalGM_L', 'TotalGM_R',
]
scov_table_main = scov_table.loc[:, main_vars]
print(scov_table_main)

In [None]:
from imblearn.over_sampling import SMOTE, SMOTENC
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Prepare features and labels
X = scov_table_main.drop(columns=['group'])
y = scov_table_main['group']

# numeric + categorical data
num_cols = ['age', 'GM_vol', 'WM_vol', 'CSF_vol', 'WMH_vol', 'WMH_count', 'TotalGM_B']#, 'TotalGM_L', 'TotalGM_R']
cat_cols = ['sex']

# Pre-processing: rescale numeric features and one-hot encode categorical features.
# Columns not listed in num_cols/cat_cols are dropped (ColumnTransformer default).
# sparse_threshold=0 forces a dense array so the result can be wrapped in a DataFrame below.
preprocess = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
], sparse_threshold=0)
X_preprocessed = preprocess.fit_transform(X)

# Locate EVERY one-hot column in the transformed array. One categorical input expands
# into one column per category, so the count of categorical columns is not len(cat_cols);
# any one-hot column left out of this list would be interpolated as a continuous value.
transformed_names = preprocess.get_feature_names_out()
cat_indices = [i for i, name in enumerate(transformed_names) if name.startswith('cat__')]

# Apply SMOTE with categorical variables
smote = SMOTENC(categorical_features=cat_indices,
                sampling_strategy='auto',
                random_state=42)
print(smote)
X_resampled, y_resampled = smote.fit_resample(X_preprocessed, y)

# Build the augmented table and a shuffled copy of it.
# NOTE: scaling and resampling here use the whole dataset, so a model evaluated on a
# split of this table would be scored on data that influenced its own training set.
scov_table_smote = X_resampled.copy()
group_smote = y_resampled.copy()
scov_table_smote = pd.DataFrame(scov_table_smote, columns=transformed_names)
#scov_table_smote['group'] = group_smote 

# Seeded generator so the shuffle is reproducible across kernel restarts.
shuffle_rng = np.random.default_rng(42)
indices_shuffle_smote = shuffle_rng.permutation(scov_table_smote.shape[0])
scov_table_shuffled_smote = scov_table_smote.iloc[indices_shuffle_smote].reset_index(drop=True)
group_shuffled_smote = group_smote.iloc[indices_shuffle_smote].reset_index(drop=True)

print(scov_table_shuffled_smote)
smote_counts = group_smote.value_counts()
print("COVID count:", smote_counts.get('COVID', 0))
print("CONTROL count:", smote_counts.get('CONTROL', 0))

In [None]:
import numpy, matplotlib, seaborn
print(numpy.__version__, matplotlib.__version__, seaborn.__version__)
import seaborn as sns

import matplotlib.pyplot as plt

# Augmented table with the group label attached, used for plotting only.
scov_full_table_smote = scov_table_smote.copy()
scov_full_table_smote['group'] = group_smote 
print(scov_full_table_smote.shape)
print(scov_table.shape)

# One figure per table, three panels each. Every panel is described by
# (column to plot on the y axis, panel title). The first figure shows the
# standardized columns of the augmented table, the second the raw columns.
violin_figures = [
    (scov_full_table_smote, [
        ('num__TotalGM_B', 'TotalGM_B (SMOTE)'),
        ('num__age', 'Age (SMOTE)'),
        ('num__WMH_count', 'WMH_count (SMOTE)'),
    ]),
    (scov_table, [
        ('TotalGM_B', 'TotalGM_B (Raw)'),
        ('age', 'Age (Raw)'),
        ('WMH_count', 'WMH_count (Raw)'),
    ]),
]

for frame, panels in violin_figures:
    fig, axes = plt.subplots(1, 3, figsize=(18, 6))
    for panel_ax, (column, title) in zip(axes, panels):
        sns.violinplot(
            ax=panel_ax,
            x='group',
            y=column,
            data=frame,
            palette='Set2', inner="point"
        )
        panel_ax.set_title(title)
    plt.tight_layout()
    plt.show()

In [None]:
from imblearn.over_sampling import SMOTENC
from sklearn.base import clone
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.inspection import permutation_importance

# Split the ORIGINAL (non-augmented) data first. Oversampling or scaling before the
# split leaks information: synthetic samples interpolated from test subjects end up in
# the training set, and the test set itself would contain synthetic rows.
X_raw = scov_table_main.drop(columns=['group'])
y_raw = scov_table_main['group']
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X_raw, y_raw, test_size=0.2, random_state=42, stratify=y_raw)

# Fit a fresh copy of the preprocessing on the training portion only, then apply it
# unchanged to the test portion. sparse_threshold=0 keeps the output dense.
prep_train = clone(preprocess).set_params(sparse_threshold=0)
X_train_prep = prep_train.fit_transform(X_train_raw)
X_test_prep = prep_train.transform(X_test_raw)
transformed_names = prep_train.get_feature_names_out()

# Oversample the training portion only; every one-hot column is flagged as categorical.
onehot_idx = [i for i, name in enumerate(transformed_names) if name.startswith('cat__')]
smote_train = SMOTENC(categorical_features=onehot_idx,
                      sampling_strategy='auto',
                      random_state=42)
X_train_res, y_train_res = smote_train.fit_resample(X_train_prep, y_train_raw)

# Prepare features and labels from SMOTE-augmented (training) data
X_aug = pd.DataFrame(X_train_res, columns=transformed_names)
y_aug = y_train_res
X_train, y_train = X_aug, y_aug
X_test = pd.DataFrame(X_test_prep, columns=transformed_names)

# Train SVM classifier
svm_clf = SVC(kernel='rbf', random_state=42)
svm_clf.fit(X_train, y_train)

# Predict and evaluate on real, untouched test subjects
y_pred = svm_clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Compute feature importances using permutation importance
result = permutation_importance(svm_clf, X_test, y_test, n_repeats=10, random_state=42)
importances = result.importances_mean

# Display feature importances
feature_names = X_aug.columns
importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
importance_df = importance_df.sort_values(by='importance', ascending=False)
print(importance_df)

In [None]:
from imblearn.over_sampling import SMOTENC
from imblearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.inspection import permutation_importance
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Prepare features and labels
X = scov_table_main.drop(columns=['group'])
y = scov_table_main['group']

# numeric + categorical data
num_cols = ['age', 'GM_vol', 'WM_vol', 'CSF_vol', 'WMH_vol', 'WMH_count', 'TotalGM_B']#, 'TotalGM_L', 'TotalGM_R']
cat_cols = ['sex']

# Fix the category vocabulary up front so the one-hot block has the same width in
# every CV fold; this is what makes the SMOTENC column indices below valid everywhere.
cat_levels = [sorted(X[col].dropna().unique()) for col in cat_cols]

# Pre-processing, rescale numeric features and one-hot encode categorical features
preprocess = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(categories=cat_levels, handle_unknown='ignore'), cat_cols)
])

# Positions of ALL one-hot columns in the transformed array: the numeric block comes
# first (len(num_cols) columns), followed by one column per category level.
n_onehot_cols = sum(len(levels) for levels in cat_levels)
smote_cat_idx = list(range(len(num_cols), len(num_cols) + n_onehot_cols))

# ------------------------------------------------------------------
# 1.  Hold out a test set BEFORE any model selection
# ------------------------------------------------------------------
# The grid search must never see these rows, otherwise the "held-out" score below
# would be computed on data that already influenced the chosen hyper-parameters.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# ------------------------------------------------------------------
# 2.  Build pipeline:  preprocess  →  SMOTE  →  SVM
# ------------------------------------------------------------------
# The imblearn Pipeline applies the sampler during fit only, so each CV fold is
# resampled using its own training part.
pipe = Pipeline(steps=[
    ('prep',  preprocess),
    ('smote', SMOTENC(categorical_features=smote_cat_idx,
                sampling_strategy='auto',
                random_state=42)),
    ('svm',   SVC())                      # kernel & params set later by GridSearch
])
# ------------------------------------------------------------------
# 3.  Hyper-parameter grid
#     - two kernels   (rbf, poly-degree3)
#     - a modest sweep of C and gamma
#     - class_weight is searched here, so it is not preset on the pipeline
# ------------------------------------------------------------------
param_grid = {
    'svm__kernel': ['rbf', 'poly'],
    'svm__degree': [3],                  # ignored for 'rbf'
    'svm__C':      [0.1, 1, 10],
    'svm__gamma':  ['scale', 0.01, 0.001],
    'svm__class_weight': [None,
                          {'CONTROL': 2, 'COVID': 1},
                          {'CONTROL': 3, 'COVID': 1}]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(pipe,
                    param_grid=param_grid,
                    scoring='balanced_accuracy',   # good for slight imbalance
                    cv=cv,
                    n_jobs=-1,                     # parallel if CPU allows
                    verbose=2)

grid.fit(X_train, y_train)        # training portion only; CV handles the inner split

print(f"Best model:  {grid.best_params_}")
print(f"CV balanced accuracy: {grid.best_score_:.3f}")

# ------------------------------------------------------------------
# 4.  Final evaluation on the held-out test set
# ------------------------------------------------------------------
# With the default refit=True, best_estimator_ has already been refit on all of X_train.
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)

print("\nHeld‑out accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Compute feature importances using permutation importance
result = permutation_importance(best_model, X_test, y_test, n_repeats=10, random_state=42)
importances = result.importances_mean

# Display feature importances (one entry per raw input column of X)
feature_names = X.columns
importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
importance_df = importance_df.sort_values(by='importance', ascending=False)
print(importance_df)

In [None]:
# Now with weights instead of SMOTE
from imblearn.pipeline import Pipeline
from sklearn.inspection import permutation_importance
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Prepare features and labels
X = scov_table_main.drop(columns=['group'])
y = scov_table_main['group']

# ------------------------------------------------------------------
# 1.  Hold out a test set BEFORE any model selection
# ------------------------------------------------------------------
# Searching hyper-parameters on the full dataset and then scoring on a subset of it
# would report an optimistic "held-out" result, so the split comes first.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# ------------------------------------------------------------------
# 2.  Build pipeline:  preprocess  →  SVM
# ------------------------------------------------------------------
# `preprocess` is the ColumnTransformer defined in the previous cell; GridSearchCV
# clones the pipeline, so the shared object itself is not fitted here.
pipe = Pipeline(steps=[
    ('prep',  preprocess),
    ('svm',  SVC(class_weight='balanced'))
])
# ------------------------------------------------------------------
# 3.  Hyper-parameter grid
#     - two kernels   (rbf, poly-degree3)
#     - a modest sweep of C and gamma
# ------------------------------------------------------------------
param_grid = {
    'svm__kernel': ['rbf', 'poly'],
    'svm__degree': [3],                  # ignored for 'rbf'
    'svm__C':      [0.1, 1, 10],
    'svm__gamma':  ['scale', 0.01, 0.001]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(pipe,
                    param_grid=param_grid,
                    scoring='balanced_accuracy',   # good for slight imbalance
                    cv=cv,
                    n_jobs=-1,                     # parallel if CPU allows
                    verbose=2)

grid.fit(X_train, y_train)        # training portion only; CV handles the inner split

print(f"Best model:  {grid.best_params_}")
print(f"CV balanced accuracy: {grid.best_score_:.3f}")

# ------------------------------------------------------------------
# 4.  Final evaluation on the held-out test set
# ------------------------------------------------------------------
# With the default refit=True, best_estimator_ has already been refit on all of X_train.
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)

print("\nHeld‑out accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Compute feature importances using permutation importance
result = permutation_importance(best_model, X_test, y_test, n_repeats=10, random_state=42)
importances = result.importances_mean

# Display feature importances (one entry per raw input column of X)
feature_names = X.columns
importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
importance_df = importance_df.sort_values(by='importance', ascending=False)
print(importance_df)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_validate, cross_val_predict
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, roc_curve, RocCurveDisplay

# ---------------------------------------------------------------
# 0. Define the data  (COVID = 1, CONTROL = 0)
# ---------------------------------------------------------------
# numeric + categorical data
num_cols = ['age', 'GM_vol', 'WM_vol', 'CSF_vol', 'WMH_vol', 'WMH_count', 'TotalGM_B']#, 'TotalGM_L', 'TotalGM_R']
cat_cols = ['sex']                            # will be one-hot encoded

# Use exactly the modelled columns. scov_table_main has no 'sCOV' column, so the
# features are selected through num_cols/cat_cols rather than a hard-coded list.
X = scov_table_main[num_cols + cat_cols]
y = scov_table_main['group'].map({'CONTROL': 0, 'COVID': 1})  # ensure 0/1 labels
if y.isna().any():
    # .map() yields NaN for any label other than 'CONTROL'/'COVID'
    raise ValueError("'group' contains labels other than 'CONTROL'/'COVID'")

# ---------------------------------------------------------------
# 1. Pre-processing: scale numerics, one-hot categoricals
# ---------------------------------------------------------------
preprocess = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(drop='first'), cat_cols)   # avoid dummy trap
])

# ---------------------------------------------------------------
# 2. Logistic-lasso model  (L1 penalty → automatic feature shrinkage)
# ---------------------------------------------------------------
logit = LogisticRegression(
    penalty='l1',
    solver='saga',
    class_weight='balanced',   # compensates for imbalance
    max_iter=5000,
    random_state=42
)

pipe = Pipeline([
    ('prep', preprocess),
    ('clf',  logit)
])

# ---------------------------------------------------------------
# 3. Cross-validation performance
# ---------------------------------------------------------------
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scoring = {
    'bal_acc': 'balanced_accuracy',
    'roc_auc': 'roc_auc',
}

cv_results = cross_validate(pipe, X, y, cv=cv, scoring=scoring,
                            return_estimator=True, n_jobs=-1)

print(f"Balanced accuracy (CV mean ± SD): "
      f"{cv_results['test_bal_acc'].mean():.3f} ± {cv_results['test_bal_acc'].std():.3f}")
print(f"ROC‑AUC          (CV mean ± SD): "
      f"{cv_results['test_roc_auc'].mean():.3f} ± {cv_results['test_roc_auc'].std():.3f}")

# ---------------------------------------------------------------
# 4. Fit on the full data to inspect coefficients / odds ratios
# ---------------------------------------------------------------
pipe.fit(X, y)

# Names of the transformed features, taken from the fitted transformer itself so
# they always line up with the coefficient vector.
feature_names = pipe.named_steps['prep'].get_feature_names_out()

coefs = pipe.named_steps['clf'].coef_.flatten()
odds_ratios = np.exp(coefs)           # OR = e^β (numeric features are standardized, so per 1 SD)

odds_df = pd.DataFrame({
    'feature': feature_names,
    'coef (β)': coefs,
    'odds_ratio': odds_ratios
}).sort_values('odds_ratio', ascending=False)

print("\nOdds ratios (L1‑regularised model):")
print(odds_df.to_string(index=False, float_format="%.3f"))

# ---------------------------------------------------------------
# 5. (Optional) Plot ROC curve for visual appraisal
# ---------------------------------------------------------------
# Pool the out-of-fold probabilities from the same five folds. Plotting the model
# fitted on all data against that same data would show an in-sample curve instead.
oof_proba = cross_val_predict(pipe, X, y, cv=cv, method='predict_proba', n_jobs=-1)[:, 1]

RocCurveDisplay.from_predictions(y, oof_proba)
plt.title('Logistic‑lasso ROC curve (five‑fold CV pooled)')
plt.show()