# Let's begin! 
### First, we import some useful Python libraries...

In [None]:
## Imports
from nilearn import datasets
from nilearn.connectome import ConnectivityMeasure
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
import seaborn as sns

## Load data

In [None]:
# Folder that holds the tab-separated input tables (relative to this notebook).
data_dir = "../../data"

# Paths of the participant table and the brain feature table.
pheno_data_tsv = f"{data_dir}/participants.tsv"
brain_data_tsv = f"{data_dir}/abide_nbsub-100_atlas-ho_meas-correlation_relmat.tsv"

# Read both tables the same way: tab-delimited, first column used as row index.
pheno_df, brain_df = (
    pd.read_csv(tsv_path, sep="\t", index_col=0)
    for tsv_path in (pheno_data_tsv, brain_data_tsv)
)

# Preview the first rows of the participant table.
pheno_df.head()

In [None]:
# Preview the first five rows of the brain feature table.
brain_df.head(n=5)

## ML-ready data

In [None]:
# input: feature matrix taken from the brain table as a plain array
X = brain_df.values
# NOTE(review): X and y are paired purely by row position. This assumes
# brain_df and pheno_df list the same participants in the same order -- confirm.

# output: the participant-table column used as the prediction target
outcome = "DX_GROUP"
y = pheno_df[outcome]
y_counts = y.value_counts()

# Report how many rows fall into each class before encoding.
print(f"Unique output classes:\n{y_counts}")

# Encode labels to integer categories
le = preprocessing.LabelEncoder()
y = le.fit_transform(y)

## Create train-test split
- 80/20 train/test ratio
- Stratify by the outcome label, so both subsets keep the same class proportions

In [None]:
from sklearn.model_selection import train_test_split

test_subset_fraction = 0.2  # share of the samples held out for testing
stratification = y  # split is stratified on the encoded labels

# Collect the split settings in one place, then apply them.
split_options = dict(
    test_size=test_subset_fraction,
    shuffle=True,  # shuffle dataset before splitting
    stratify=stratification,
    random_state=123,  # same shuffle each time
)
X_train, X_test, y_train, y_test = train_test_split(
    X,  # input features
    y,  # output labels
    **split_options,
)

# print the size of our training and test groups
print("training:", len(X_train), "testing:", len(X_test))

## Okay, finally, let's train our first model!

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Classifier selector: 'LR' (logistic regression) or 'RF' (random forest).
model = "LR"  # 'LR' or 'RF'

if model == "RF":
    clf = RandomForestClassifier(max_depth=3, class_weight="balanced", random_state=0)
elif model == "LR":
    # NOTE(review): the saga solver may emit a ConvergenceWarning with the
    # default iteration limit on unscaled features -- consider raising max_iter.
    clf = LogisticRegression(
        penalty="l1", C=1, class_weight="balanced", solver="saga", random_state=0
    )
else:
    # Fail loudly: continuing would either hit an undefined `clf` or silently
    # refit whatever classifier an earlier run of the notebook left behind.
    raise ValueError(f"Unknown model: {model} (expected 'LR' or 'RF')")

print(f"Using model: {model}")

# Fit on the training subset only.
clf.fit(X_train, y_train)

# Accuracy on the data the model was fitted on (compare with the test accuracy).
train_acc = clf.score(X_train, y_train)
print(f"train acc: {train_acc:.3f}")

## Evaluate on test set
- accuracy
- confusion_matrix
- precision_recall_fscore 

In [None]:
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support

# Accuracy of the fitted classifier on the held-out samples.
test_acc = clf.score(X_test, y_test)
print(f"test acc: {test_acc:.3f}")

# Predicted labels for the held-out samples; reused by the metrics below.
y_pred = clf.predict(X_test)

# Counts of true label vs. predicted label on the test set.
test_cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

### Note the difference between train and test accuracy. If the difference is large, we are most likely overfitting the model to the training set.

#### Things to try:
- Increase regularization (for scikit-learn's `LogisticRegression`, this means using a *smaller* `C`)
- Reduce dimensionality of X

In [None]:
# Scale up all seaborn text so labels stay readable on the large figure.
sns.set_theme(font_scale=3)

with sns.axes_style("whitegrid"):
    f, ax = plt.subplots(figsize=(15, 10))

    # Heatmap of the test-set confusion matrix with the value written in each cell.
    g = sns.heatmap(
        data=test_cm,
        annot=True,
        annot_kws={"fontsize": 35},
        cmap="Reds",
        ax=ax,
    )
    # Title and axis labels set in a single call.
    g.set(title="Confusion matrix", ylabel="True label", xlabel="Pred label")

In [None]:
# Precision, recall and F1 averaged over classes, weighted by class support.
p, r, f1, _ = precision_recall_fscore_support(
    y_true=y_test, y_pred=y_pred, average="weighted"
)

# One-shot summary of the run: which model, which target, and the test metrics.
report = (
    f"model: {model}, outcome: {outcome}\n"
    f" Acc:{test_acc:.2f}, precision: {p:.2f}, recall: {r:.2f}, f1: {f1:.2f}"
)
print(report)

## Now let's predict scanning site!

In [None]:
# New prediction target: the participant-table column "SITE_ID".
# X is reused unchanged from the earlier cells.
# NOTE(review): y is still paired with X purely by row position -- confirm
# that pheno_df and brain_df share the same row order.
outcome = "SITE_ID"
y = pheno_df[outcome]
y_counts = y.value_counts()

# Report how many rows fall into each class before encoding.
print(f"Unique output classes:\n{y_counts}")

# Encode labels to integer categories
le = preprocessing.LabelEncoder()
y = le.fit_transform(y)

## Create train-test split
- 80/20 train/test ratio
- Stratify by the outcome label, so both subsets keep the same class proportions

In [None]:
from sklearn.model_selection import train_test_split

test_subset_fraction = 0.2  # share of the samples held out for testing
stratification = y  # split is stratified on the encoded labels

# Same split settings as before, gathered into one mapping.
split_options = dict(
    test_size=test_subset_fraction,
    shuffle=True,  # shuffle dataset before splitting
    stratify=stratification,
    random_state=123,  # same shuffle each time
)
X_train, X_test, y_train, y_test = train_test_split(
    X,  # input features
    y,  # output labels
    **split_options,
)

# print the size of our training and test groups
print("training:", len(X_train), "testing:", len(X_test))

## Fit the model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Classifier selector: 'LR' (logistic regression) or 'RF' (random forest).
model = "LR"  # 'LR' or 'RF'

if model == "RF":
    clf = RandomForestClassifier(max_depth=3, class_weight="balanced", random_state=0)
elif model == "LR":
    # NOTE(review): the saga solver may emit a ConvergenceWarning with the
    # default iteration limit on unscaled features -- consider raising max_iter.
    clf = LogisticRegression(
        penalty="l2", C=1, class_weight="balanced", solver="saga", random_state=0
    )
else:
    # Fail loudly: continuing would silently refit a `clf` left over from an
    # earlier cell (or raise NameError if none exists).
    raise ValueError(f"Unknown model: {model} (expected 'LR' or 'RF')")

print(f"Using model: {model}")

# Fit on the training subset only.
clf.fit(X_train, y_train)

# Accuracy on the data the model was fitted on (compare with the test accuracy).
train_acc = clf.score(X_train, y_train)
print(f"train acc: {train_acc:.3f}")

## Evaluate on test set
- accuracy
- confusion_matrix
- precision_recall_fscore 

In [None]:
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support

# Accuracy of the fitted classifier on the held-out samples.
test_acc = clf.score(X_test, y_test)
print(f"test acc: {test_acc:.3f}")

# Predicted labels for the held-out samples; reused by the metrics below.
y_pred = clf.predict(X_test)

# Counts of true label vs. predicted label on the test set.
test_cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Scale up all seaborn text so labels stay readable on the large figure.
sns.set_theme(font_scale=3)

# One font size shared by the cell annotations, the title and both axis labels.
text_size = 40

with sns.axes_style("whitegrid"):
    f, ax = plt.subplots(figsize=(15, 10))

    # Heatmap of the test-set confusion matrix with the value written in each cell.
    g = sns.heatmap(
        data=test_cm,
        annot=True,
        annot_kws={"fontsize": text_size},
        ax=ax,
    )
    g.set_title("Confusion matrix", fontsize=text_size)
    g.set_ylabel("True label", fontsize=text_size)
    g.set_xlabel("Pred label", fontsize=text_size)

In [None]:
# Precision, recall and F1 averaged over classes, weighted by class support.
p, r, f1, _ = precision_recall_fscore_support(
    y_true=y_test, y_pred=y_pred, average="weighted"
)

# One-shot summary of the run: which model, which target, and the test metrics.
report = (
    f"model: {model}, outcome: {outcome}\n"
    f" Acc:{test_acc:.2f}, precision: {p:.2f}, recall: {r:.2f}, f1: {f1:.2f}"
)
print(report)