In [1]:
import pandas as pd

df = pd.read_csv('./data/5v_cleandf_truncated.csv')
df

Unnamed: 0,dep_name,esi,age,gender,ethnicity,race,lang,religion,maritalstatus,employstatus,...,cc_vaginaldischarge,cc_vaginalpain,cc_weakness,cc_wheezing,cc_withdrawal-alcohol,cc_woundcheck,cc_woundinfection,cc_woundre-evaluation,cc_wristinjury,cc_wristpain
0,B,4.0,40.0,Male,Hispanic or Latino,White or Caucasian,English,,Single,Full Time,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,B,4.0,66.0,Male,Hispanic or Latino,Native Hawaiian or Other Pacific Islander,English,Pentecostal,Married,Not Employed,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,B,2.0,66.0,Male,Hispanic or Latino,Native Hawaiian or Other Pacific Islander,English,Pentecostal,Married,Not Employed,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,A,2.0,66.0,Male,Hispanic or Latino,Native Hawaiian or Other Pacific Islander,English,Pentecostal,Married,Not Employed,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,A,3.0,84.0,Female,Hispanic or Latino,Other,Other,Pentecostal,Widowed,Retired,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,B,3.0,100.0,Female,Non-Hispanic,White or Caucasian,English,Catholic,Widowed,Retired,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4996,B,4.0,26.0,Female,Unknown,White or Caucasian,English,Catholic,Single,Full Time,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4997,B,3.0,25.0,Male,Hispanic or Latino,White or Caucasian,English,,Single,Full Time,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4998,C,4.0,26.0,Male,Hispanic or Latino,White or Caucasian,English,,Single,Full Time,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [2]:
# Import necessary packages
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [3]:
len(df.columns)

972

# inspect

In [4]:
# slice out the esi and disposition columns
df_slice_esi = df[["esi", "disposition"]]

discharge_counts = df_slice_esi[df_slice_esi['disposition'] == 'Discharge']['esi'].value_counts()
admit_counts = df_slice_esi[df_slice_esi['disposition'] == 'Admit']['esi'].value_counts()

import pprint

print("number of patients with each ESI level that were discharged")
pprint.pprint(discharge_counts) 
print()

print("number of patients with each ESI level that were admitted")
pprint.pprint(admit_counts)

# the lower ESI: more severe
# the higher ESI: less severe 

number of patients with each ESI level that were discharged
esi
3.0    1496
4.0     991
2.0     570
5.0     181
1.0      10
Name: count, dtype: int64

number of patients with each ESI level that were admitted
esi
2.0    893
3.0    773
1.0     44
4.0     26
5.0      1
Name: count, dtype: int64


In [5]:
print("ratio of patients with each ESI level that were discharged")
pprint.pprint(round(discharge_counts/ discharge_counts.sum() * 100, 2))
print()

print("ratio of patients with each ESI level that were admitted")
pprint.pprint(round(admit_counts/ admit_counts.sum() * 100, 2))


ratio of patients with each ESI level that were discharged
esi
3.0    46.06
4.0    30.51
2.0    17.55
5.0     5.57
1.0     0.31
Name: count, dtype: float64

ratio of patients with each ESI level that were admitted
esi
2.0    51.41
3.0    44.50
1.0     2.53
4.0     1.50
5.0     0.06
Name: count, dtype: float64


In [6]:
# slice out the esi and disposition columns
df_slice_esi = df[["esi", "disposition"]]

# Function to calculate count and ratio
def calculate_count_and_ratio(df, esi_level):
    df_filtered = df[df['esi'] == esi_level]
    count = df_filtered['disposition'].value_counts()
    ratio = count / count.sum()
    result_df = pd.DataFrame({'count': count, 'ratio': round(ratio * 100,2)})
    return result_df

# Apply the function to each ESI level
esi_1 = calculate_count_and_ratio(df, 1)
esi_2 = calculate_count_and_ratio(df, 2)
esi_3 = calculate_count_and_ratio(df, 3)
esi_4 = calculate_count_and_ratio(df, 4)
esi_5 = calculate_count_and_ratio(df, 5)

# Print the results
print("ESI 1:")
pprint.pprint(esi_1)
print("\nESI 2:")
pprint.pprint(esi_2)
print("\nESI 3:")
pprint.pprint(esi_3)
print("\nESI 4:")
pprint.pprint(esi_4)
print("\nESI 5:")
pprint.pprint(esi_5)

# the lower ESI: more severe
# the higher ESI: less severe 

ESI 1:
             count  ratio
disposition              
Admit           44  81.48
Discharge       10  18.52

ESI 2:
             count  ratio
disposition              
Admit          893  61.04
Discharge      570  38.96

ESI 3:
             count  ratio
disposition              
Discharge     1496  65.93
Admit          773  34.07

ESI 4:
             count  ratio
disposition              
Discharge      991  97.44
Admit           26   2.56

ESI 5:
             count  ratio
disposition              
Discharge      181  99.45
Admit            1   0.55


In [7]:
type(esi_1)

pandas.core.frame.DataFrame

# test one, drop all columns with null values

In [8]:
missing_values = df.isnull().sum() # default value for the axis parameter in isnull() is 0, which means it operates column-wise.
columns_with_missing_values = missing_values[missing_values > 0].index
print(len(columns_with_missing_values))

# drop all columns with null values
df_missing = df[df.isnull().any(axis=1)][columns_with_missing_values]
df_missing


590


Unnamed: 0,esi,religion,arrivalmode,absolutelymphocytecount_last,acetonebld_last,alanineaminotransferase(alt)_last,albumin_last,alkphos_last,anc(absneutrophilcount)_last,aniongap_last,...,cc_vaginaldischarge,cc_vaginalpain,cc_weakness,cc_wheezing,cc_withdrawal-alcohol,cc_woundcheck,cc_woundinfection,cc_woundre-evaluation,cc_wristinjury,cc_wristpain
0,4.0,,Walk-in,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4.0,Pentecostal,Car,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2.0,Pentecostal,Walk-in,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2.0,Pentecostal,Car,1.9,,12.0,,71.0,9.2,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.0,Pentecostal,Walk-in,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,3.0,Catholic,ambulance,0.9,,,,,5.5,14.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4996,4.0,Catholic,Car,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4997,3.0,,Car,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4998,4.0,,Car,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# drop columns with missing values > 20%
columns_to_drop= missing_values[missing_values > 0.2 * len(df)].index.tolist()
len(columns_to_drop)

387

In [10]:
columns_to_drop = columns_to_drop + ["religion"]
len(columns_to_drop)

388

In [11]:
# drop `columns_to_drop`
df_small = df.drop(columns=columns_to_drop)
df_small

Unnamed: 0,dep_name,esi,age,gender,ethnicity,race,lang,maritalstatus,employstatus,insurance_status,...,cc_vaginaldischarge,cc_vaginalpain,cc_weakness,cc_wheezing,cc_withdrawal-alcohol,cc_woundcheck,cc_woundinfection,cc_woundre-evaluation,cc_wristinjury,cc_wristpain
0,B,4.0,40.0,Male,Hispanic or Latino,White or Caucasian,English,Single,Full Time,Other,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,B,4.0,66.0,Male,Hispanic or Latino,Native Hawaiian or Other Pacific Islander,English,Married,Not Employed,Commercial,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,B,2.0,66.0,Male,Hispanic or Latino,Native Hawaiian or Other Pacific Islander,English,Married,Not Employed,Commercial,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,A,2.0,66.0,Male,Hispanic or Latino,Native Hawaiian or Other Pacific Islander,English,Married,Not Employed,Commercial,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,A,3.0,84.0,Female,Hispanic or Latino,Other,Other,Widowed,Retired,Medicare,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,B,3.0,100.0,Female,Non-Hispanic,White or Caucasian,English,Widowed,Retired,Medicare,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4996,B,4.0,26.0,Female,Unknown,White or Caucasian,English,Single,Full Time,Commercial,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4997,B,3.0,25.0,Male,Hispanic or Latino,White or Caucasian,English,Single,Full Time,Commercial,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4998,C,4.0,26.0,Male,Hispanic or Latino,White or Caucasian,English,Single,Full Time,Commercial,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
df_numeric = df_small.select_dtypes(include='number')
df_numeric


Unnamed: 0,esi,age,2ndarymalig,abdomhernia,abdomnlpain,abortcompl,acqfootdef,acrenlfail,acutecvd,acutemi,...,cc_vaginaldischarge,cc_vaginalpain,cc_weakness,cc_wheezing,cc_withdrawal-alcohol,cc_woundcheck,cc_woundinfection,cc_woundre-evaluation,cc_wristinjury,cc_wristpain
0,4.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4.0,66.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2.0,66.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2.0,66.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.0,84.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,3.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4996,4.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4997,3.0,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4998,4.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:


df_numeric.dropna(inplace=True)
print(df_numeric.isnull().any().any())

False


In [15]:
X = df_numeric.drop(['esi'], axis=1)
y = df_numeric['esi']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Decision Tree Classifier

In [16]:
def train_decision_tree(X_train, X_test, y_train, y_test, model):
    # Train the model
    model.fit(X_train, y_train)

    # Predict the target variable for the test set
    y_pred = model.predict(X_test)
    y_pred_train = model.predict(X_train)

    
    print("=====FOR TRAINING:=====")
    

    # Calculate accuracy of each class
    class_accuracy = classification_report(y_train, y_pred_train)
    print("Accuracy of each class:\n", class_accuracy)

    # Calculate accuracy of all classes
    accuracy = accuracy_score(y_train, y_pred_train)
    print("Accuracy of all classes:", accuracy)

    # Calculate macro F1 score
    macro_f1 = f1_score(y_train, y_pred_train, average='macro')
    print("Macro F1 score:", macro_f1)

    # Calculate micro F1 score
    micro_f1 = f1_score(y_train, y_pred_train, average='micro')
    print("Micro F1 score:", micro_f1)

    print("====FOR TESTING:====")

    # Calculate accuracy of each class
    class_accuracy = classification_report(y_test, y_pred)
    print("Accuracy of each class:\n", class_accuracy)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy of all classes:", accuracy)

    # Calculate macro F1 score
    macro_f1 = f1_score(y_test, y_pred, average='macro')
    print("Macro F1 score:", macro_f1)

    # Calculate micro F1 score
    micro_f1 = f1_score(y_test, y_pred, average='micro')
    print("Micro F1 score:", micro_f1)

In [106]:
from sklearn.exceptions import UndefinedMetricWarning
import warnings

# Suppress the warning for zero division
warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings("ignore", category=UndefinedMetricWarning, module='sklearn.metrics._classification')

from sklearn.model_selection import train_test_split


model = DecisionTreeClassifier(min_samples_split=20, max_depth=10, min_samples_leaf=2)
# model = DecisionTreeClassifier()

train_decision_tree(X_train, X_test, y_train, y_test, model)


=====FOR TRAINING:=====
Accuracy of each class:
               precision    recall  f1-score   support

         1.0       0.83      0.11      0.20        44
         2.0       0.76      0.47      0.58      1158
         3.0       0.60      0.74      0.66      1813
         4.0       0.46      0.56      0.50       809
         5.0       0.00      0.00      0.00       146

    accuracy                           0.59      3970
   macro avg       0.53      0.38      0.39      3970
weighted avg       0.60      0.59      0.58      3970

Accuracy of all classes: 0.5919395465994962
Macro F1 score: 0.3893401962358779
Micro F1 score: 0.5919395465994962
====FOR TESTING:====
Accuracy of each class:
               precision    recall  f1-score   support

         1.0       0.00      0.00      0.00        10
         2.0       0.65      0.38      0.48       300
         3.0       0.54      0.67      0.60       442
         4.0       0.44      0.58      0.50       205
         5.0       0.00      0.

# data handling 2: fill in missing value



In [17]:
df_filled = df.fillna(df.mode().iloc[0])
df_filled

Unnamed: 0,dep_name,esi,age,gender,ethnicity,race,lang,religion,maritalstatus,employstatus,...,cc_vaginaldischarge,cc_vaginalpain,cc_weakness,cc_wheezing,cc_withdrawal-alcohol,cc_woundcheck,cc_woundinfection,cc_woundre-evaluation,cc_wristinjury,cc_wristpain
0,B,4.0,40.0,Male,Hispanic or Latino,White or Caucasian,English,Catholic,Single,Full Time,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,B,4.0,66.0,Male,Hispanic or Latino,Native Hawaiian or Other Pacific Islander,English,Pentecostal,Married,Not Employed,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,B,2.0,66.0,Male,Hispanic or Latino,Native Hawaiian or Other Pacific Islander,English,Pentecostal,Married,Not Employed,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,A,2.0,66.0,Male,Hispanic or Latino,Native Hawaiian or Other Pacific Islander,English,Pentecostal,Married,Not Employed,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,A,3.0,84.0,Female,Hispanic or Latino,Other,Other,Pentecostal,Widowed,Retired,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,B,3.0,100.0,Female,Non-Hispanic,White or Caucasian,English,Catholic,Widowed,Retired,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4996,B,4.0,26.0,Female,Unknown,White or Caucasian,English,Catholic,Single,Full Time,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4997,B,3.0,25.0,Male,Hispanic or Latino,White or Caucasian,English,Catholic,Single,Full Time,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4998,C,4.0,26.0,Male,Hispanic or Latino,White or Caucasian,English,Catholic,Single,Full Time,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
df_filled = df_filled.select_dtypes(include='number')
df_filled.dropna(inplace=True)
print(df_filled.isnull().any().any())

False


In [19]:
X = df_numeric.drop('esi', axis=1)
y = df_numeric['esi']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = DecisionTreeClassifier()

train_decision_tree(X_train, X_test, y_train, y_test, model)



=====FOR TRAINING:=====
Accuracy of each class:
               precision    recall  f1-score   support

         1.0       1.00      1.00      1.00        44
         2.0       1.00      1.00      1.00      1158
         3.0       1.00      1.00      1.00      1813
         4.0       1.00      1.00      1.00       809
         5.0       1.00      1.00      1.00       146

    accuracy                           1.00      3970
   macro avg       1.00      1.00      1.00      3970
weighted avg       1.00      1.00      1.00      3970

Accuracy of all classes: 0.998992443324937
Macro F1 score: 0.9992215209136383
Micro F1 score: 0.998992443324937
====FOR TESTING:====
Accuracy of each class:
               precision    recall  f1-score   support

         1.0       0.11      0.10      0.11        10
         2.0       0.57      0.52      0.54       300
         3.0       0.57      0.59      0.58       442
         4.0       0.46      0.49      0.48       205
         5.0       0.15      0.17

# logistic regression

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, accuracy_score, f1_score, roc_auc_score

def train_logistic_regression(X_train, X_test, y_train, y_test, model, n_components):
    # Feature scaling
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # PCA
    pca = PCA(n_components=n_components)
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_test_pca = pca.transform(X_test_scaled)

    # Train the model
    model.fit(X_train_pca, y_train)

    # Predict the target variable for the test set
    y_pred = model.predict(X_test_pca)
    y_pred_train = model.predict(X_train_pca)

    # Predict probabilities for AUC calculation
    y_pred_proba = model.predict_proba(X_test_pca)
    y_pred_train_proba = model.predict_proba(X_train_pca)

    print("=====FOR TRAINING:=====")

    # Calculate accuracy of each class
    class_accuracy = classification_report(y_train, y_pred_train)
    print("Accuracy of each class:\n", class_accuracy)

    # Calculate accuracy of all classes
    accuracy = accuracy_score(y_train, y_pred_train)
    print("Accuracy of all classes:", accuracy)

    # Calculate macro F1 score
    macro_f1 = f1_score(y_train, y_pred_train, average='macro')
    print("Macro F1 score:", macro_f1)

    # Calculate micro F1 score
    micro_f1 = f1_score(y_train, y_pred_train, average='micro')
    print("Micro F1 score:", micro_f1)

    # Calculate AUC
    auc_train = roc_auc_score(y_train, y_pred_train_proba, multi_class='ovr')
    print("AUC:", auc_train)

    print("====FOR TESTING:====")

    # Calculate accuracy of each class
    class_accuracy = classification_report(y_test, y_pred)
    print("Accuracy of each class:\n", class_accuracy)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy of all classes:", accuracy)

    # Calculate macro F1 score
    macro_f1 = f1_score(y_test, y_pred, average='macro')
    print("Macro F1 score:", macro_f1)

    # Calculate micro F1 score
    micro_f1 = f1_score(y_test, y_pred, average='micro')
    print("Micro F1 score:", micro_f1)

    # Calculate AUC
    auc_test = roc_auc_score(y_test, y_pred_proba, multi_class='ovr')
    print("AUC:", auc_test)

In [21]:
# Example usage
model = LogisticRegression(max_iter=300, random_state=42)
train_logistic_regression(X_train, X_test, y_train, y_test, model, n_components=10)

=====FOR TRAINING:=====
Accuracy of each class:
               precision    recall  f1-score   support

         1.0       0.00      0.00      0.00        44
         2.0       0.60      0.28      0.38      1158
         3.0       0.48      0.85      0.62      1813
         4.0       0.50      0.13      0.21       809
         5.0       0.00      0.00      0.00       146

    accuracy                           0.50      3970
   macro avg       0.32      0.25      0.24      3970
weighted avg       0.50      0.50      0.44      3970

Accuracy of all classes: 0.4987405541561713
Macro F1 score: 0.24128635209567156
Micro F1 score: 0.4987405541561713
AUC: 0.717917376410141
====FOR TESTING:====
Accuracy of each class:
               precision    recall  f1-score   support

         1.0       0.00      0.00      0.00        10
         2.0       0.58      0.28      0.38       300
         3.0       0.46      0.83      0.60       442
         4.0       0.48      0.15      0.22       205
       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# XGBoost

In [22]:
from sklearn.ensemble import RandomForestClassifier

def train_random_forest(X_train, X_test, y_train, y_test, model):
    # Feature scaling
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train the model
    model.fit(X_train_scaled, y_train)

    # Predict the target variable for the test set
    y_pred = model.predict(X_test_scaled)
    y_pred_train = model.predict(X_train_scaled)

    # Predict probabilities for AUC calculation
    y_pred_proba = model.predict_proba(X_test_scaled)
    y_pred_train_proba = model.predict_proba(X_train_scaled)

    print("=====FOR TRAINING:=====")

    # Calculate accuracy of each class
    class_accuracy = classification_report(y_train, y_pred_train)
    print("Accuracy of each class:\n", class_accuracy)

    # Calculate accuracy of all classes
    accuracy = accuracy_score(y_train, y_pred_train)
    print("Accuracy of all classes:", accuracy)

    # Calculate macro F1 score
    macro_f1 = f1_score(y_train, y_pred_train, average='macro')
    print("Macro F1 score:", macro_f1)

    # Calculate micro F1 score
    micro_f1 = f1_score(y_train, y_pred_train, average='micro')
    print("Micro F1 score:", micro_f1)

    # Calculate AUC
    auc_train = roc_auc_score(y_train, y_pred_train_proba, multi_class='ovr')
    print("AUC:", auc_train)

    print("====FOR TESTING:====")

    # Calculate accuracy of each class
    class_accuracy = classification_report(y_test, y_pred)
    print("Accuracy of each class:\n", class_accuracy)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy of all classes:", accuracy)

    # Calculate macro F1 score
    macro_f1 = f1_score(y_test, y_pred, average='macro')
    print("Macro F1 score:", macro_f1)

    # Calculate micro F1 score
    micro_f1 = f1_score(y_test, y_pred, average='micro')
    print("Micro F1 score:", micro_f1)

    # Calculate AUC
    auc_test = roc_auc_score(y_test, y_pred_proba, multi_class='ovr')
    print("AUC:", auc_test)



In [23]:
# model = RandomForestClassifier(random_state=42, max_depth=10, min_samples_leaf=2)
model = RandomForestClassifier()
train_random_forest(X_train, X_test, y_train, y_test, model)

=====FOR TRAINING:=====
Accuracy of each class:
               precision    recall  f1-score   support

         1.0       1.00      1.00      1.00        44
         2.0       1.00      1.00      1.00      1158
         3.0       1.00      1.00      1.00      1813
         4.0       1.00      1.00      1.00       809
         5.0       1.00      1.00      1.00       146

    accuracy                           1.00      3970
   macro avg       1.00      1.00      1.00      3970
weighted avg       1.00      1.00      1.00      3970

Accuracy of all classes: 0.998992443324937
Macro F1 score: 0.9992229975964593
Micro F1 score: 0.998992443324937
AUC: 0.9999990709946249
====FOR TESTING:====
Accuracy of each class:
               precision    recall  f1-score   support

         1.0       1.00      0.10      0.18        10
         2.0       0.63      0.55      0.59       300
         3.0       0.59      0.70      0.64       442
         4.0       0.57      0.51      0.53       205
         

# use neural network to predict esi

In [24]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, accuracy_score, f1_score

def train_neural_network(X_train, X_test, y_train, y_test, model, n_components):
    # Feature scaling
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # PCA
    pca = PCA(n_components=n_components)
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_test_pca = pca.transform(X_test_scaled)

    # Train the model
    model.fit(X_train_pca, y_train)

    # Predict the target variable for the test set
    y_pred = model.predict(X_test_pca)
    y_pred_train = model.predict(X_train_pca)

    print("=====FOR TRAINING:=====")

    # Calculate accuracy of each class
    class_accuracy = classification_report(y_train, y_pred_train)
    print("Accuracy of each class:\n", class_accuracy)

    # Calculate accuracy of all classes
    accuracy = accuracy_score(y_train, y_pred_train)
    print("Accuracy of all classes:", accuracy)

    # Calculate macro F1 score
    macro_f1 = f1_score(y_train, y_pred_train, average='macro')
    print("Macro F1 score:", macro_f1)

    # Calculate micro F1 score
    micro_f1 = f1_score(y_train, y_pred_train, average='micro')
    print("Micro F1 score:", micro_f1)

    print("====FOR TESTING:====")

    # Calculate accuracy of each class
    class_accuracy = classification_report(y_test, y_pred)
    print("Accuracy of each class:\n", class_accuracy)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy of all classes:", accuracy)

    # Calculate macro F1 score
    macro_f1 = f1_score(y_test, y_pred, average='macro')
    print("Macro F1 score:", macro_f1)

    # Calculate micro F1 score
    micro_f1 = f1_score(y_test, y_pred, average='micro')
    print("Micro F1 score:", micro_f1)

In [25]:
# Example usage
model = MLPClassifier(hidden_layer_sizes=(64, 16), max_iter=100, random_state=42)
train_neural_network(X_train, X_test, y_train, y_test, model, n_components=500)

=====FOR TRAINING:=====
Accuracy of each class:
               precision    recall  f1-score   support

         1.0       1.00      0.98      0.99        44
         2.0       0.99      0.98      0.99      1158
         3.0       0.99      0.99      0.99      1813
         4.0       0.96      0.98      0.97       809
         5.0       0.97      0.95      0.96       146

    accuracy                           0.98      3970
   macro avg       0.98      0.97      0.98      3970
weighted avg       0.98      0.98      0.98      3970

Accuracy of all classes: 0.9828715365239294
Macro F1 score: 0.9778440737526495
Micro F1 score: 0.9828715365239294
====FOR TESTING:====
Accuracy of each class:
               precision    recall  f1-score   support

         1.0       0.60      0.30      0.40        10
         2.0       0.60      0.53      0.56       300
         3.0       0.59      0.66      0.62       442
         4.0       0.56      0.55      0.56       205
         5.0       0.16      0.



# Neural network pytorch

In [35]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, accuracy_score, f1_score

def train_neural_network(model, X_train, X_test, y_train, y_test, epochs=100, lr=0.001):
    # Feature scaling
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Convert to PyTorch tensors
    X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
    X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train.values - 1, dtype=torch.long)
    y_test_tensor = torch.tensor(y_test.values - 1, dtype=torch.long)

    # Define the model, loss function, and optimizer
    criterion = nn.CrossEntropyLoss() # input is raw logits, target label is class label [0, C)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Training loop
    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        outputs = model(X_train_tensor)
        loss = criterion(outputs, y_train_tensor)
        loss.backward()
        optimizer.step()

    # Evaluation
    model.eval()
    with torch.no_grad():
        y_pred_train = model(X_train_tensor).argmax(dim=1).numpy()
        y_pred = model(X_test_tensor).argmax(dim=1).numpy()

    print("=====FOR TRAINING:=====")
    print("Accuracy of each class:\n", classification_report(y_train_tensor, y_pred_train))
    print("Accuracy of all classes:", round(accuracy_score(y_train_tensor, y_pred_train), 2))
    print("Macro F1 score:", round(f1_score(y_train_tensor, y_pred_train, average='macro'), 2))
    print("Micro F1 score:", round(f1_score(y_train_tensor, y_pred_train, average='micro'), 2))

    print("====FOR TESTING:====")
    print("Accuracy of each class:\n", classification_report(y_test_tensor, y_pred))
    print("Accuracy of all classes:", round(accuracy_score(y_test_tensor, y_pred), 2))
    print("Macro F1 score:", round(f1_score(y_test_tensor, y_pred, average='macro'), 2))
    print("Micro F1 score:", round(f1_score(y_test_tensor, y_pred, average='micro'), 2))

In [36]:
class NeuralNetwork(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(NeuralNetwork, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)  # Add dropout layer
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        # self.batch_norm = nn.BatchNorm1d(hidden_dim)  # Add batch normalization layer
    
    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.dropout(out)
        out = self.fc2(out)
        # out = self.batch_norm(out)
        return out
    
# Example usage
# Assuming X_train, X_test, y_train, y_test are already defined
hidden_dim = 100  # Example hidden layer size
model = NeuralNetwork(input_dim=X_train.shape[1], hidden_dim=hidden_dim, output_dim=len(set(y_train)))

train_neural_network(model, X_train, X_test, y_train, y_test, epochs=100, lr=0.001)

=====FOR TRAINING:=====
Accuracy of each class:
               precision    recall  f1-score   support

           0       1.00      0.45      0.62        44
           1       0.86      0.82      0.84      1158
           2       0.84      0.88      0.86      1813
           3       0.80      0.86      0.83       809
           4       0.89      0.50      0.64       146

    accuracy                           0.84      3970
   macro avg       0.88      0.70      0.76      3970
weighted avg       0.84      0.84      0.84      3970

Accuracy of all classes: 0.84
Macro F1 score: 0.76
Micro F1 score: 0.84
====FOR TESTING:====
Accuracy of each class:
               precision    recall  f1-score   support

           0       0.80      0.40      0.53        10
           1       0.64      0.58      0.61       300
           2       0.60      0.66      0.63       442
           3       0.58      0.61      0.59       205
           4       0.29      0.11      0.16        36

    accuracy      

In [39]:
class NeuralNetwork(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(NeuralNetwork, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        # self.batch_norm = nn.BatchNorm1d(hidden_dim)  # Add batch normalization layer
    
    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        # out = self.batch_norm(out)
        return out
    
# Example usage
# Assuming X_train, X_test, y_train, y_test are already defined
hidden_dim = 32  # Example hidden layer size
model = NeuralNetwork(input_dim=X_train.shape[1], hidden_dim=hidden_dim, output_dim=len(set(y_train)))

train_neural_network(model, X_train, X_test, y_train, y_test, epochs=100, lr=0.001)

=====FOR TRAINING:=====
Accuracy of each class:
               precision    recall  f1-score   support

           0       1.00      0.16      0.27        44
           1       0.81      0.78      0.80      1158
           2       0.79      0.84      0.82      1813
           3       0.78      0.81      0.79       809
           4       0.87      0.51      0.64       146

    accuracy                           0.80      3970
   macro avg       0.85      0.62      0.66      3970
weighted avg       0.80      0.80      0.79      3970

Accuracy of all classes: 0.8
Macro F1 score: 0.66
Micro F1 score: 0.8
====FOR TESTING:====
Accuracy of each class:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        10
           1       0.66      0.57      0.61       300
           2       0.60      0.67      0.63       442
           3       0.55      0.59      0.57       205
           4       0.33      0.14      0.20        36

    accuracy        

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# hanlde imbalanced data

In [65]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, accuracy_score, f1_score
from imblearn.over_sampling import SMOTE

def train_neural_network_handle_imbalanced(model, X_train, X_test, y_train, y_test, epochs=100, lr=0.001):
    # Feature scaling
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Apply SMOTE to handle data imbalance
    smote = SMOTE()
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)
    print(y_train_resampled.value_counts())

    # Convert to PyTorch tensors
    X_train_tensor = torch.tensor(X_train_resampled, dtype=torch.float32)
    X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train_resampled.values - 1, dtype=torch.long)
    y_test_tensor = torch.tensor(y_test.values - 1, dtype=torch.long)

    # Define the model, loss function, and optimizer
    criterion = nn.CrossEntropyLoss() # input is raw logits, target label is class label [0, C)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Training loop
    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        outputs = model(X_train_tensor)
        loss = criterion(outputs, y_train_tensor)
        loss.backward()
        optimizer.step()

    # Evaluation
    model.eval()
    with torch.no_grad():
        y_pred_train = model(X_train_tensor).argmax(dim=1).numpy()
        y_pred = model(X_test_tensor).argmax(dim=1).numpy()

    print("=====FOR TRAINING:=====")
    print("Accuracy of each class:\n", classification_report(y_train_tensor, y_pred_train))
    print("Accuracy of all classes:", round(accuracy_score(y_train_tensor, y_pred_train), 2))
    print("Macro F1 score:", round(f1_score(y_train_tensor, y_pred_train, average='macro'), 2))
    print("Micro F1 score:", round(f1_score(y_train_tensor, y_pred_train, average='micro'), 2))

    print("====FOR TESTING:====")
    print("Accuracy of each class:\n", classification_report(y_test_tensor, y_pred))
    print("Accuracy of all classes:", round(accuracy_score(y_test_tensor, y_pred), 2))
    print("Macro F1 score:", round(f1_score(y_test_tensor, y_pred, average='macro'), 2))
    print("Micro F1 score:", round(f1_score(y_test_tensor, y_pred, average='micro'), 2))

In [66]:
class NeuralNetwork(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(NeuralNetwork, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)  # Add dropout layer
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        # self.batch_norm = nn.BatchNorm1d(hidden_dim)  # Add batch normalization layer
    
    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.dropout(out)
        out = self.fc2(out)
        # out = self.batch_norm(out)
        return out
    

hidden_dim = 128  # Example hidden layer size
model = NeuralNetwork(input_dim=X_train.shape[1], hidden_dim=hidden_dim, output_dim=len(set(y_train)))

train_neural_network_handle_imbalanced(model, X_train, X_test, y_train, y_test, epochs=100, lr=0.001)

esi
3.0    1813
4.0    1813
2.0    1813
5.0    1813
1.0    1813
Name: count, dtype: int64
=====FOR TRAINING:=====
Accuracy of each class:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      1813
           1       0.87      0.86      0.87      1813
           2       0.87      0.78      0.82      1813
           3       0.86      0.86      0.86      1813
           4       0.87      0.95      0.91      1813

    accuracy                           0.89      9065
   macro avg       0.89      0.89      0.89      9065
weighted avg       0.89      0.89      0.89      9065

Accuracy of all classes: 0.89
Macro F1 score: 0.89
Micro F1 score: 0.89
====FOR TESTING:====
Accuracy of each class:
               precision    recall  f1-score   support

           0       0.26      0.50      0.34        10
           1       0.61      0.60      0.60       300
           2       0.63      0.57      0.60       442
           3       0.56      0.62    