In [1]:
import pandas as pd
import math
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.utils import resample

---
---

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# import data
row_data = pd.read_csv('/content/drive/MyDrive/MedicalProj/DATA.csv')

In [6]:
row_data.head()

Unnamed: 0,seqn,slq310,slq300,whd010,whd050,whd020,mcq010new,mcq053new,mcq092new,mcq160anew,...,ds2tmfat,ds2tpfat,ds2tlyco,ds2tlz,ds2tvc,ds2tzinc,ds2tsodi,ds2tpota,ds2tsele,ds2tcaff
0,93703,,,,,,2.0,2.0,,,...,,,,,,,,,,
1,93704,,,,,,2.0,2.0,,,...,,,,,,,,,,
2,93705,7:00,23:00,63.0,165.0,165.0,1.0,2.0,2.0,1.0,...,,,300.0,250.0,730.0,17.48,,80.0,24.1,
3,93706,10:00,23:30,68.0,145.0,145.0,2.0,2.0,2.0,,...,,,,,,,,,,
4,93707,,,,,,2.0,2.0,2.0,,...,,,,,,,,,,


In [7]:
# drop id columns
row_data = row_data.drop('seqn', axis=1)

In [8]:
# drop minute part of time columns and change them to int
for column in ['slq310', 'slq300']:
    for idx, row in row_data.iterrows():
        if row[column] == '.':
            row_data[column][idx] = float('Nan')
        elif type(row[column]) == str and row[column] != '.':
            row_data[column][idx] = int(row[column].split(':')[0])


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  row_data[column][idx] = int(row[column].split(':')[0])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  row_data[column][idx] = float('Nan')


In [9]:
row_data.head()

Unnamed: 0,slq310,slq300,whd010,whd050,whd020,mcq010new,mcq053new,mcq092new,mcq160anew,mcq160nnew,...,ds2tmfat,ds2tpfat,ds2tlyco,ds2tlz,ds2tvc,ds2tzinc,ds2tsodi,ds2tpota,ds2tsele,ds2tcaff
0,,,,,,2.0,2.0,,,,...,,,,,,,,,,
1,,,,,,2.0,2.0,,,,...,,,,,,,,,,
2,7.0,23.0,63.0,165.0,165.0,1.0,2.0,2.0,1.0,2.0,...,,,300.0,250.0,730.0,17.48,,80.0,24.1,
3,10.0,23.0,68.0,145.0,145.0,2.0,2.0,2.0,,,...,,,,,,,,,,
4,,,,,,2.0,2.0,2.0,,,...,,,,,,,,,,


---
---

In [10]:
# utils
def impute_missing_values(df, num_neighbors):
    imputer = KNNImputer(n_neighbors=num_neighbors, weights='distance')
    imputed = imputer.fit_transform(df)
    imputed_data = pd.DataFrame(imputed, columns=df.columns)
    return imputed_data

def extract_int_cols(df):
    int_cols = []
    columns = df.columns
    for col in columns:
        is_int = True
        for item in df[col]:

            if  (not math.isnan(item)) and (float(int(item)) != float(item)): #(type(item) != str) and
                is_int = False
                break
        if is_int:
            int_cols.append(col)
    return int_cols


def handle_int_columns(df, int_columns):
    for column in int_columns:
        df[column] = df[column].apply(lambda x: round(x))
    df['mcq010new'] = df['mcq010new'].apply(lambda x: 0 if x == 2 else 1)
    return df


def up_sampler(df):
    #create two different dataframe of majority and minority class
    df_majority = df[(df['mcq010new']==0)]
    df_minority = df[(df['mcq010new']==1)]
    # upsample minority class
    df_minority_upsampled = resample(df_minority,
                                     replace=True,    # sample with replacement
                                     n_samples= len(df[(df['mcq010new']==0)]), # to match majority class
                                     random_state=42)  # reproducible results
    # Combine majority class with upsampled minority class
    df_upsampled = pd.concat([df_minority_upsampled, df_majority])
    return df_upsampled

def trainer(model, upsampled_train_data, Test_data):

    X_train_upsampled = upsampled_train_data.drop('mcq010new', axis=1)
    y_train_upsampled = upsampled_train_data['mcq010new']

    X_test = Test_data.drop('mcq010new', axis=1)
    y_test = Test_data['mcq010new']

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_upsampled)
    X_test_scaled = scaler.transform(X_test)


    # Train the model
    model.fit(X_train_scaled, y_train_upsampled)

    # Make predictions on the test set
    y_pred = model.predict(X_test_scaled)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    # Print the results
    print(f'Accuracy on Test Data: {accuracy * 100:.2f}%')
    print('Confusion Matrix:\n', conf_matrix)
    print('Classification Report:\n', classification_rep)
    print(f'ROC AUC on Test Data: {roc_auc:.2f}')


In [11]:
imputed_data = impute_missing_values(row_data, 10)
int_columns = extract_int_cols(imputed_data)
rounded_data = handle_int_columns(imputed_data, int_columns)

In [13]:
Train_data, Test_data = train_test_split(rounded_data, test_size=0.2, stratify=rounded_data['mcq010new'], random_state=42)

In [14]:
upsampled_train_data = up_sampler(Train_data)

In [15]:
# Support Vector Machine
from sklearn.svm import SVC
model = SVC(kernel='linear', C=1)
trainer(model, upsampled_train_data, Test_data)

Accuracy on Test Data: 72.88%
Confusion Matrix:
 [[1146  399]
 [ 103  203]]
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.74      0.82      1545
           1       0.34      0.66      0.45       306

    accuracy                           0.73      1851
   macro avg       0.63      0.70      0.63      1851
weighted avg       0.82      0.73      0.76      1851

ROC AUC on Test Data: 0.70


In [26]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=20, random_state=42)
trainer(model, upsampled_train_data, Test_data)

Accuracy on Test Data: 84.87%
Confusion Matrix:
 [[1520   25]
 [ 255   51]]
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.98      0.92      1545
           1       0.67      0.17      0.27       306

    accuracy                           0.85      1851
   macro avg       0.76      0.58      0.59      1851
weighted avg       0.83      0.85      0.81      1851

ROC AUC on Test Data: 0.58


In [23]:
# K nearest neghbours
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=15)
trainer(model, upsampled_train_data, Test_data)

Accuracy on Test Data: 65.32%
Confusion Matrix:
 [[1051  494]
 [ 148  158]]
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.68      0.77      1545
           1       0.24      0.52      0.33       306

    accuracy                           0.65      1851
   macro avg       0.56      0.60      0.55      1851
weighted avg       0.77      0.65      0.69      1851

ROC AUC on Test Data: 0.60


In [29]:
# Xgboost
import xgboost as xgb
model = xgb.XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    use_label_encoder=False,  # To avoid a warning in newer versions
)
trainer(model, upsampled_train_data, Test_data)

Accuracy on Test Data: 83.25%
Confusion Matrix:
 [[1452   93]
 [ 217   89]]
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.94      0.90      1545
           1       0.49      0.29      0.36       306

    accuracy                           0.83      1851
   macro avg       0.68      0.62      0.63      1851
weighted avg       0.81      0.83      0.81      1851

ROC AUC on Test Data: 0.62


In [34]:
# Adaboost
from sklearn.ensemble import AdaBoostClassifier
model = AdaBoostClassifier(n_estimators=15, random_state=42)
trainer(model, upsampled_train_data, Test_data)

Accuracy on Test Data: 70.88%
Confusion Matrix:
 [[1116  429]
 [ 110  196]]
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.72      0.81      1545
           1       0.31      0.64      0.42       306

    accuracy                           0.71      1851
   macro avg       0.61      0.68      0.61      1851
weighted avg       0.81      0.71      0.74      1851

ROC AUC on Test Data: 0.68


In [35]:
# Naive Bayes
from sklearn.naive_bayes import BernoulliNB
model = BernoulliNB()
trainer(model, upsampled_train_data, Test_data)

Accuracy on Test Data: 60.72%
Confusion Matrix:
 [[927 618]
 [109 197]]
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.60      0.72      1545
           1       0.24      0.64      0.35       306

    accuracy                           0.61      1851
   macro avg       0.57      0.62      0.53      1851
weighted avg       0.79      0.61      0.66      1851

ROC AUC on Test Data: 0.62


---
---

In [37]:
upsampled_train_data.shape

(12354, 131)

In [20]:
from torch.utils.data import Dataset, DataLoader

class Dataset(Dataset):
    def __init__(self, data):
        self.data = data
    def __len__(self):
        return self.data.shape[0]
    def __getitem__(self, ind):
        x = torch.tensor(self.data.drop('mcq010new', axis=1).iloc[ind])
        y = torch.tensor(self.data['mcq010new'].iloc[ind])
        return x, y


train_set = Dataset(upsampled_train_data)
test_set  = Dataset(Test_data)

batch_size = 128
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
test_loader  = DataLoader(test_set,  batch_size=batch_size, shuffle=False)

In [47]:

import torch
import torch.nn as nn

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class MLP(nn.Module):
    def __init__(self, input_size, hidden_size1, hidden_size2, output_size):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size1)
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.fc3 = nn.Linear(hidden_size2, output_size)


    def forward(self, x):
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.fc3(x)
        return x


input_size = 130
hidden_size1 = 256
hidden_size2 = 512
output_size = 2

# Create an instance of the model
model = MLP(input_size, hidden_size1, hidden_size2, output_size).to(device)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

In [None]:
from sklearn.metrics import f1_score

epochs = 10

# Training loop

for epoch in range(epochs):
    model.train()
    losses = []
    for batch_num, input_data in enumerate(train_loader):
        optimizer.zero_grad()
        x, y = input_data
        x = x.to(device).float()
        y = y.to(device)
        output = model(x)
        loss = criterion(output, y)
        loss.backward()
        losses.append(loss.item())
        optimizer.step()

    print('Epoch %d | Training Loss %6.2f' % (epoch, sum(losses)/len(losses)))

    # Evaluation loop
    model.eval()
    val_losses = []
    all_predictions = []
    all_labels = []
    with torch.no_grad():
        for val_batch_num, val_data in enumerate(test_loader):
            val_x, val_y = val_data
            val_x = val_x.to(device).float()
            val_y = val_y.to(device)

            val_output = model(val_x)
            val_loss = criterion(val_output.squeeze(), val_y)
            val_losses.append(val_loss.item())

            # Calculate F1-score
            _, predicted = val_output.max(1)
            all_predictions.extend(predicted.cpu().numpy())
            all_labels.extend(val_y.cpu().numpy())

    f1 = f1_score(all_labels, all_predictions)
    print('Epoch %d | Validation Loss %6.2f | Validation F1-Score: %6.2f' % (epoch, sum(val_losses)/len(val_losses), f1))


Epoch 0 | Training Loss  16.50
Epoch 0 | Validation Loss  20.07 | Validation F1-Score:   0.28
Epoch 1 | Training Loss  10.14
Epoch 1 | Validation Loss   5.16 | Validation F1-Score:   0.28
Epoch 2 | Training Loss   9.43


In [43]:

import torch
import torch.nn as nn

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class MLP(nn.Module):
    def __init__(self, input_size, hidden_size1, hidden_size2, output_size):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size1)
        self.relu1 = nn.LeakyReLU()
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.relu2 = nn.LeakyReLU()
        self.fc3 = nn.Linear(hidden_size2, output_size)
        #self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.relu1(self.fc1(x))
        x = self.relu2(self.fc2(x))
        x = self.sigmoid(self.fc3(x))
        return x


input_size = 130
hidden_size1 = 256
hidden_size2 = 512
output_size = 2

# Create an instance of the model
model = MLP(input_size, hidden_size1, hidden_size2, output_size).to(device)

# Loss function and optimizer
criterion = nn.BCELoss()  # Binary Cross Entropy Loss for binary classification
optimizer = torch.optim.Adam(model.parameters(), lr=0.00001)

In [42]:
from sklearn.metrics import f1_score

epochs = 10

# Training loop

for epoch in range(epochs):
    model.train()
    losses = []
    for batch_num, input_data in enumerate(train_loader):
        optimizer.zero_grad()
        x, y = input_data
        x = x.to(device).float()
        y = y.to(device)

        output = model(x)
        # print(output.squeeze())
        # print(y)
        loss = criterion(output.squeeze(), y.float())
        loss.backward()
        losses.append(loss.item())
        optimizer.step()

    print('Epoch %d | Training Loss %6.2f' % (epoch, sum(losses)/len(losses)))

    # Evaluation loop
    model.eval()
    val_losses = []
    all_predictions = []
    all_labels = []
    with torch.no_grad():
        for val_batch_num, val_data in enumerate(test_loader):
            val_x, val_y = val_data
            val_x = val_x.to(device).float()
            val_y = val_y.to(device)

            val_output = model(val_x)
            val_loss = criterion(val_output.squeeze(), val_y.float())
            val_losses.append(val_loss.item())

            # Calculate F1-score
            predictions = (val_output > 0.5).float()
            all_predictions.extend(predictions.cpu().numpy())
            all_labels.extend(val_y.cpu().numpy())

    f1 = f1_score(all_labels, all_predictions)
    print('Epoch %d | Validation Loss %6.2f | Validation F1-Score: %6.2f' % (epoch, sum(val_losses)/len(val_losses), f1))


Epoch 0 | Training Loss  19.85
Epoch 0 | Validation Loss  24.48 | Validation F1-Score:   0.26
Epoch 1 | Training Loss  13.22
Epoch 1 | Validation Loss  15.31 | Validation F1-Score:   0.26
Epoch 2 | Training Loss  10.22
Epoch 2 | Validation Loss  15.45 | Validation F1-Score:   0.26
Epoch 3 | Training Loss   8.37
Epoch 3 | Validation Loss  12.51 | Validation F1-Score:   0.25
Epoch 4 | Training Loss   7.20
Epoch 4 | Validation Loss  10.42 | Validation F1-Score:   0.27
Epoch 5 | Training Loss   6.01
Epoch 5 | Validation Loss   8.05 | Validation F1-Score:   0.25
Epoch 6 | Training Loss   5.39
Epoch 6 | Validation Loss   8.16 | Validation F1-Score:   0.25
Epoch 7 | Training Loss   5.03
Epoch 7 | Validation Loss   8.18 | Validation F1-Score:   0.26
Epoch 8 | Training Loss   4.46
Epoch 8 | Validation Loss   6.62 | Validation F1-Score:   0.26
Epoch 9 | Training Loss   4.12
Epoch 9 | Validation Loss   6.14 | Validation F1-Score:   0.25
