### Import required libraries

In [None]:
import re
import pickle
import json
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

### Read source file(s)

In [None]:
# Load the SD, Mean and Volume CSV files from the "data" directory.
# Forward slashes are used on purpose: in a normal string literal "\D" is an
# invalid escape sequence, and a backslash only separates path components on
# Windows, whereas "/" is accepted on every platform.
sd = pd.read_csv("data/DTI-AAL-SD.csv")
mean = pd.read_csv("data/DTI-AAL-Mean.csv")
volume = pd.read_csv("data/DTI-AAL-Volume.csv")

### Remove rows with missing values

In [None]:
# Drop, in place, every row that contains at least one missing value.
for frame in (sd, mean, volume):
    frame.dropna(inplace=True)

### Eliminate replicated columns

In [None]:
# These columns are removed, in place, from `sd` and `mean` only;
# `volume` is left untouched.
columns = [
    "age",
    "gender",
    "handedness",
    "affected hemisphere",
    "post-stroke time",
    "group",
]
for frame in (sd, mean):
    frame.drop(columns=columns, inplace=True)

### Fix the name of the columns
- Remove the numeric suffixes that pandas appends to duplicated column names
- Prefix each column with the name of its related DTI parameter
- Prefix each column with the name of its data set, because the data sets will be merged later

In [None]:
# Parameter prefixes, in the order their column groups are expected to appear.
# The trailing "TEMP" entry is kept from the original list, where it gave the
# look-ahead below a "next" entry after "RA".
DTI_PARAMETERS = ["FA", "MD", "AxD", "RD", "RA", "TEMP"]
datasets = {
    'mean': mean,
    'sd': sd,
}
# pandas renames repeated CSV headers to "<name>.<n>". The dot is escaped and
# the whole digit run is matched, so only that suffix is stripped: names that
# merely end in a digit (e.g. "..._3") are left alone and suffixes >= 10 are
# removed completely.
duplicate_suffix = re.compile(r'\.\d+$')
for dataset_key, dataset in datasets.items():
    parameter_index = 0
    columns = [duplicate_suffix.sub('', name) for name in dataset.columns]
    # Column 0 is skipped so it keeps its original name.
    for index in range(1, len(columns)):
        next_index = parameter_index + 1
        # Bounds check: never index past the end of DTI_PARAMETERS, even if a
        # column happens to start with the last entry.
        if (next_index < len(DTI_PARAMETERS)
                and columns[index].startswith(DTI_PARAMETERS[next_index])):
            # This column opens the next parameter group and already carries
            # the parameter name, so only the data set prefix is added below.
            parameter_index = next_index
        elif not columns[index].startswith(DTI_PARAMETERS[parameter_index]):
            columns[index] = "{0}_{1}".format(DTI_PARAMETERS[parameter_index], columns[index])
        columns[index] = "{0}_{1}".format(dataset_key, columns[index])
    # Rebind the notebook-level variable (`mean` / `sd`) to a new frame that
    # carries the renamed columns.
    globals()[dataset_key] = pd.DataFrame(data=dataset.to_numpy(), columns=columns)

### Fix data type of some columns

In [None]:
# Cast with the vectorised `astype` instead of `applymap`, which is deprecated
# since pandas 2.1 and converts one element at a time.
columns = ["file code", "post-stroke time", "age"]
volume[columns] = volume[columns].astype(np.int64)
# Every column of `mean` is converted to float64.
# NOTE(review): `sd` is not converted here - confirm that this is intended.
columns = mean.columns
mean[columns] = mean[columns].astype(np.float64)

### Merge the three dataframes into one comprehensive dataframe

In [None]:
# Outer-join the three tables on "file code": volume first, then mean, then sd.
df = (
    volume
    .merge(mean, on="file code", how="outer")
    .merge(sd, on="file code", how="outer")
)

### Encode some columns

In [None]:
# Replace each listed column with integer labels. One encoder instance is
# re-fitted for every column, in this order, so afterwards it only holds the
# classes of the last column.
encoder = LabelEncoder()
for categorical_column in ("group", "gender", "handedness", "affected hemisphere"):
    df[categorical_column] = encoder.fit_transform(df[categorical_column])

### Split the target and source columns

In [None]:
# The JSON file supplies the column selection used to build `x`
# (presumably a list of column names - verify against the config file).
with open("config/train_params.json", encoding="utf-8") as file:
    columns = json.load(file)
# Take an explicit copy: new columns are assigned to `x` later in the
# notebook, and writing into a bare selection of `df` triggers pandas'
# SettingWithCopyWarning.
x = df[columns].copy()
y = df["group"]

### Compute the combined columns

In [None]:
# Each entry is (new column label in `x`, left factor in `df`, right factor in
# `df`); the new column is the element-wise product of the two factors.
combined_features = (
    (
        "mean_AxD_Frontal_Sup_L * mean_RD_Frontal_Inf_Oper_R",
        "mean_AxD_Frontal_Sup_L",
        "mean_RD_Frontal_Inf_Oper_R",
    ),
    (
        "Occipital_Sup_R * mean_AxD_Heschl_L",
        "Occipital_Sup_R",
        "mean_AxD_Heschl_L",
    ),
    (
        "Volume Occipital_Sup_R * mean_FA_Frontal_Inf_Tri_L",
        "Occipital_Sup_R",
        "mean_FA_Frontal_Inf_Tri_L",
    ),
    (
        "mean_MD_Caudate_L * Paracentral_Lobule_L",
        "mean_MD_Caudate_L",
        "Paracentral_Lobule_L",
    ),
    (
        "mean_AxD_Rolandic_Oper_R * mean_MD_Frontal_Mid_L",
        "mean_AxD_Rolandic_Oper_R",
        "mean_MD_Frontal_Mid_L",
    ),
)
for feature_name, left_factor, right_factor in combined_features:
    x[feature_name] = df[left_factor] * df[right_factor]

### Save the data frame into a pickle file

In [None]:
# Store the feature frame, the target column and the full merged frame together
# as one pickled list. A forward slash is used because "\D" is an invalid
# escape sequence in a normal string literal and a backslash is only a path
# separator on Windows.
with open("data/DataFrame.pickle", "wb") as file:
    pickle.dump([x, y, df], file, protocol=pickle.HIGHEST_PROTOCOL)