### Import required libraries

In [871]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt

### Read source file(s)

In [872]:
# Paths use forward slashes: they resolve on Windows and POSIX alike, and avoid
# the invalid "\D" escape sequence that a backslash-separated literal contains.
sd = pd.read_csv("data/DTI-AAL-SD.csv")
mean = pd.read_csv("data/DTI-AAL-Mean.csv")
volume = pd.read_csv("data/DTI-AAL-Volume.csv")

### Remove empty rows

In [873]:
# Discard every row holding at least one missing value, in each of the three tables.
sd, mean, volume = (table.dropna() for table in (sd, mean, volume))

### Eliminate replicated columns

In [874]:
# These columns are removed from the SD and mean tables only; `volume` is not
# touched here, so it keeps its own copy of them.
columns = [
    "age",
    "gender",
    "handedness",
    "affected hemisphere",
    "post-stroke time",
    "group",
]
sd, mean = (table.drop(columns=columns) for table in (sd, mean))

### Fix the name of the columns
- Remove the numeric suffix that pandas appends to duplicated column names
- Prefix each column with the name of the DTI parameter it belongs to
- Prefix each column with its dataset name, so the columns stay distinguishable after the upcoming merge

In [875]:
# The trailing "omid" entry appears to be a placeholder: it keeps the
# `parameter_index+1` look-ahead below in range while "RA" is the current parameter.
DTI_PARAMETERS = ["FA", "MD", "AxD", "RD", "RA", "omid"]
datasets = {
    'mean': mean,
    'sd': sd
}

# pandas disambiguates repeated CSV headers by appending ".1", ".2", ...
# Match only a literal dot followed by digits at the end of the name, so that
# names which merely end in a digit (e.g. "..._3") are left intact.
PANDAS_DUPLICATE_SUFFIX = re.compile(r'\.\d+$')

renamed_datasets = {}
for dataset_key, dataset in datasets.items():
    parameter_index = 0
    columns = [PANDAS_DUPLICATE_SUFFIX.sub('', name) for name in dataset.columns]
    # Column 0 is skipped so that it keeps its original name in every table.
    for index in range(1, len(columns)):
        if columns[index].startswith(DTI_PARAMETERS[parameter_index+1]):
            # This column opens the next parameter's group and already carries its name.
            parameter_index += 1
        elif not columns[index].startswith(DTI_PARAMETERS[parameter_index]):
            # Column without a parameter prefix: attach the current parameter.
            columns[index] = "{0}_{1}".format(DTI_PARAMETERS[parameter_index], columns[index])
        # Every column after the first additionally gets the dataset name in front.
        columns[index] = "{0}_{1}".format(dataset_key, columns[index])
    # Rebuilding from the raw values resets the row index to 0..n-1 and gives all
    # columns the common dtype of the underlying array.
    renamed_datasets[dataset_key] = pd.DataFrame(data=dataset.to_numpy(), columns=columns)

# Rebind the notebook-level names to the renamed frames.
mean = renamed_datasets['mean']
sd = renamed_datasets['sd']

### Fix data type of some columns

In [876]:
# `astype` converts whole columns at once; it replaces the element-wise
# `applymap`, which is deprecated in recent pandas releases.
columns = ["file code", "post-stroke time", "age"]
volume[columns] = volume[columns].astype(np.int64)
# Every column of `mean` is cast to float64.
# NOTE(review): `sd` gets no equivalent cast here — confirm that is intentional.
columns = mean.columns
mean[columns] = mean[columns].astype(np.float64)

### Merge the three dataframes into one comprehensive dataframe

In [877]:
# Outer joins on "file code": a row present in any of the three tables is kept.
df = (
    volume
    .merge(mean, on="file code", how="outer")
    .merge(sd, on="file code", how="outer")
)

### Encode some columns

In [878]:
from sklearn.preprocessing import LabelEncoder
# A single encoder is re-fitted for each column in turn, so once the loop ends
# it only holds the classes of the last column processed.
encoder = LabelEncoder()
for categorical_column in ("group", "gender", "handedness", "affected hemisphere"):
    df[categorical_column] = encoder.fit_transform(df[categorical_column])

### Split the target and source columns

In [879]:
# Target: the encoded "group" column. Features: every remaining column of `df`.
y = df["group"]
x = df.drop(columns="group")

### Split dataset into train and test

In [880]:
from sklearn.model_selection import train_test_split
# 80/20 split. A fixed `random_state` makes the shuffle, and therefore every
# downstream result, reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

### Define a custom confusion matrix illustrator

In [881]:
from sklearn.metrics import confusion_matrix
def show_confusion_matrix(y_test, y_pred):
    """Draw the confusion matrix of `y_pred` against `y_test` as a heatmap.

    Each cell is annotated with its share of all samples, formatted as a
    percentage with two decimals. Axis labels are set on the current
    matplotlib axes; nothing is returned.
    """
    counts = confusion_matrix(y_test, y_pred)
    # Normalise by the grand total so that the cells sum to 100%.
    proportions = counts / np.sum(counts)
    palette_cmap = sns.light_palette("#FF9F29", reverse=True, as_cmap=True)
    sns.heatmap(proportions, annot=True, fmt='.2%', cmap=palette_cmap)
    plt.xlabel('Predicted Label')
    plt.ylabel('True label')