In [127]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier

# Data Preprocessing

In [128]:
dataset = pd.read_csv('data.csv')

In [129]:
X = dataset.iloc[:, :-1]
y = dataset.iloc[:, -1]

In [130]:
X.head()

Unnamed: 0,Gender,Age,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI
0,M,44,6.8,64,4.9,4.9,2.8,2.0,1.8,1.2,21.0
1,F,35,3.9,38,5.4,3.8,5.9,0.5,4.3,1.0,22.0
2,M,40,5.0,63,4.0,4.8,2.5,1.1,2.7,1.1,23.0
3,F,35,2.8,50,5.0,4.7,2.5,1.3,2.4,1.1,20.0
4,F,42,2.1,45,4.9,3.0,1.1,1.1,1.4,1.4,21.0


## Data Encoding

In [131]:
X_encoded = pd.get_dummies(X, columns=['Gender'])

In [132]:
print(X_encoded)

     Age  Urea  Cr  HbA1c  Chol   TG  HDL  LDL  VLDL   BMI  Gender_F  \
0     44   6.8  64    4.9   4.9  2.8  2.0  1.8   1.2  21.0     False   
1     35   3.9  38    5.4   3.8  5.9  0.5  4.3   1.0  22.0      True   
2     40   5.0  63    4.0   4.8  2.5  1.1  2.7   1.1  23.0     False   
3     35   2.8  50    5.0   4.7  2.5  1.3  2.4   1.1  20.0      True   
4     42   2.1  45    4.9   3.0  1.1  1.1  1.4   1.4  21.0      True   
..   ...   ...  ..    ...   ...  ...  ...  ...   ...   ...       ...   
995   52   4.7  34    9.9   5.1  4.5  0.6  2.7   2.0  26.0      True   
996   38   5.8  59    6.7   5.3  2.0  1.6  2.9  14.0  40.5     False   
997   54   5.0  67    6.9   3.8  1.7  1.1  3.0   0.7  33.0     False   
998   30   7.1  81    6.7   4.1  1.1  1.2  2.4   8.1  27.4     False   
999   31   3.0  60   12.3   4.1  2.2  0.7  2.4  15.4  37.2     False   

     Gender_M  Gender_f  
0        True     False  
1       False     False  
2        True     False  
3       False     False  
4    

In [133]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

## Standard Feature Scaling

In [134]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_encoded)

## ANOVA F-Value Feature Selection

In [135]:
k = 5
selector = SelectKBest(score_func=f_classif, k=k)
X_selected = selector.fit_transform(X_scaled, y_encoded)

## Training and Test Set Allocation

In [136]:
X_train, X_test, y_train, y_test = train_test_split(X_selected, y_encoded, test_size = 0.2, random_state=42)

# Model Training and Prediction

## Hyperparameter Optimization

In [137]:
# param_grid = [
#     {
#      'n_estimators': [100, 1000, 10000],
#      'max_depth': [50, 75, 100, 150, 200],
#      'bootstrap': [True, False],
#      'criterion': ['gini', 'entropy'] 
#     }
# ]

# optimal_params = GridSearchCV(
#     RandomForestClassifier(),
#     param_grid,
#     cv = 5,
#     scoring = 'accuracy',
#     verbose = 0
# )

In [138]:
# optimal_params.fit(X_train, y_train)
# print(optimal_params.best_params_)

## Training

In [139]:
rf = RandomForestClassifier(n_estimators = 100, max_depth = 100, bootstrap = True, criterion = "entropy")
rf.fit(X_train, y_train)

## Prediction

In [140]:
y_pred = rf.predict(X_test)

# Results

## Decoding

In [141]:
y_pred_decoded = label_encoder.inverse_transform(y_pred)
y_test_decoded = label_encoder.inverse_transform(y_test)

## Confusion Matrix

In [142]:
cm = confusion_matrix(y_test, prediction)
cm_fig = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = ['ND', 'PD', 'D'])
cm_fig.plot()
plt.show()

## Classification Report

In [144]:
print(classification_report(y_test_decoded, y_pred_decoded))

              precision    recall  f1-score   support

           N       1.00      1.00      1.00        21
           P       1.00      1.00      1.00         6
           Y       1.00      1.00      1.00       173

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200

