In [None]:
import pandas as pd
import math
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.utils import resample

---
---

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# import data
row_data = pd.read_csv('/content/drive/MyDrive/MedicalProj/DATA.csv')

In [None]:
row_data.head()

Unnamed: 0,seqn,slq310,slq300,whd010,whd050,whd020,mcq010new,mcq053new,mcq092new,mcq160anew,...,ds2tmfat,ds2tpfat,ds2tlyco,ds2tlz,ds2tvc,ds2tzinc,ds2tsodi,ds2tpota,ds2tsele,ds2tcaff
0,93703,,,,,,2.0,2.0,,,...,,,,,,,,,,
1,93704,,,,,,2.0,2.0,,,...,,,,,,,,,,
2,93705,7:00,23:00,63.0,165.0,165.0,1.0,2.0,2.0,1.0,...,,,300.0,250.0,730.0,17.48,,80.0,24.1,
3,93706,10:00,23:30,68.0,145.0,145.0,2.0,2.0,2.0,,...,,,,,,,,,,
4,93707,,,,,,2.0,2.0,2.0,,...,,,,,,,,,,


In [None]:
# drop id columns
row_data = row_data.drop('seqn', axis=1)

In [None]:
# drop time columns
row_data = row_data.drop(['slq310', 'slq300'], axis=1)

In [None]:
row_data.head()

Unnamed: 0,whd010,whd050,whd020,mcq010new,mcq053new,mcq092new,mcq160anew,mcq160nnew,mcq160bnew,mcq160cnew,...,ds2tmfat,ds2tpfat,ds2tlyco,ds2tlz,ds2tvc,ds2tzinc,ds2tsodi,ds2tpota,ds2tsele,ds2tcaff
0,,,,2.0,2.0,,,,,,...,,,,,,,,,,
1,,,,2.0,2.0,,,,,,...,,,,,,,,,,
2,63.0,165.0,165.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0,...,,,300.0,250.0,730.0,17.48,,80.0,24.1,
3,68.0,145.0,145.0,2.0,2.0,2.0,,,,,...,,,,,,,,,,
4,,,,2.0,2.0,2.0,,,,,...,,,,,,,,,,


---
---

In [None]:
# utils
def drop_high_missing_column(df, threshold):
    '''
    drop columns which have missing value percentage above threshold
    '''

    low_missing_df = df.copy()
    # calculate nan item percentage for each columns
    col_missing_percentage = df.isnull().mean()

    for col in col_missing_percentage.index:
        if col_missing_percentage[col] > threshold:
            # print(f'before: {low_missing_df.shape}')
            low_missing_df = low_missing_df.drop(col, axis=1)
            # print(f'after: {low_missing_df.shape}')
    return low_missing_df

def impute_missing_values(df, num_neighbors):
    imputer = KNNImputer(n_neighbors=num_neighbors, weights='distance')
    imputed = imputer.fit_transform(df)
    imputed_data = pd.DataFrame(imputed, columns=df.columns)
    return imputed_data

def extract_int_cols(df):
    int_cols = []
    columns = df.columns
    for col in columns:
        is_int = True
        for item in df[col]:

            if  (not math.isnan(item)) and (float(int(item)) != float(item)): #(type(item) != str) and
                is_int = False
                break
        if is_int:
            int_cols.append(col)
    return int_cols


def handle_int_columns(df, int_columns):
    for column in int_columns:
        df[column] = df[column].apply(lambda x: round(x))
    df['mcq010new'] = df['mcq010new'].apply(lambda x: 0 if x == 2 else 1)
    return df


def up_sampler(df):
    #create two different dataframe of majority and minority class
    df_majority = df[(df['mcq010new']==0)]
    df_minority = df[(df['mcq010new']==1)]
    # upsample minority class
    df_minority_upsampled = resample(df_minority,
                                     replace=True,    # sample with replacement
                                     n_samples= len(df[(df['mcq010new']==0)]), # to match majority class
                                     random_state=42)  # reproducible results
    # Combine majority class with upsampled minority class
    df_upsampled = pd.concat([df_minority_upsampled, df_majority])
    return df_upsampled

def trainer(model, upsampled_train_data, Test_data):

    X_train_upsampled = upsampled_train_data.drop('mcq010new', axis=1)
    y_train_upsampled = upsampled_train_data['mcq010new']

    X_test = Test_data.drop('mcq010new', axis=1)
    y_test = Test_data['mcq010new']

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_upsampled)
    X_test_scaled = scaler.transform(X_test)


    # Train the model
    model.fit(X_train_scaled, y_train_upsampled)

    # Make predictions on the test set
    y_pred = model.predict(X_test_scaled)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    # Print the results
    print(f'Accuracy on Test Data: {accuracy * 100:.2f}%')
    print('Confusion Matrix:\n', conf_matrix)
    print('Classification Report:\n', classification_rep)
    print(f'ROC AUC on Test Data: {roc_auc:.2f}')


In [None]:
low_missing_data = drop_high_missing_column(row_data, 1)
imputed_data = impute_missing_values(low_missing_data, 10)
int_columns = extract_int_cols(imputed_data)
rounded_data = handle_int_columns(imputed_data, int_columns)

In [None]:
# row_data.isnull().mean()

In [None]:
# low_missing_data = drop_high_missing_column(row_data, 0.3)
row_data.shape, low_missing_data.shape

((9254, 129), (9254, 129))

In [None]:
Train_data, Test_data = train_test_split(rounded_data, test_size=0.2, stratify=rounded_data['mcq010new'], random_state=42)

In [None]:
upsampled_train_data = up_sampler(Train_data)

In [None]:
from sklearn.svm import SVC
model = SVC(kernel='linear', C=1)
trainer(model, upsampled_train_data, Test_data)

Accuracy on Test Data: 72.83%
Confusion Matrix:
 [[1147  398]
 [ 105  201]]
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.74      0.82      1545
           1       0.34      0.66      0.44       306

    accuracy                           0.73      1851
   macro avg       0.63      0.70      0.63      1851
weighted avg       0.82      0.73      0.76      1851

ROC AUC on Test Data: 0.70


In [None]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
trainer(model, upsampled_train_data, Test_data)

Accuracy on Test Data: 84.22%
Confusion Matrix:
 [[1522   23]
 [ 269   37]]
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.99      0.91      1545
           1       0.62      0.12      0.20       306

    accuracy                           0.84      1851
   macro avg       0.73      0.55      0.56      1851
weighted avg       0.81      0.84      0.80      1851

ROC AUC on Test Data: 0.55


In [None]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=10)
trainer(model, upsampled_train_data, Test_data)

Accuracy on Test Data: 67.59%
Confusion Matrix:
 [[1117  428]
 [ 172  134]]
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.72      0.79      1545
           1       0.24      0.44      0.31       306

    accuracy                           0.68      1851
   macro avg       0.55      0.58      0.55      1851
weighted avg       0.76      0.68      0.71      1851

ROC AUC on Test Data: 0.58


---
---

In [None]:
upsampled_train_data.shape

(12354, 129)

In [None]:
from torch.utils.data import Dataset, DataLoader

class Dataset(Dataset):
    def __init__(self, data):
        self.data = data
    def __len__(self):
        return self.data.shape[0]
    def __getitem__(self, ind):
        x = torch.tensor(self.data.drop('mcq010new', axis=1).iloc[ind])
        y = torch.tensor(self.data['mcq010new'].iloc[ind])
        return x, y


train_set = Dataset(upsampled_train_data)
test_set  = Dataset(Test_data)

batch_size = 512
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
test_loader  = DataLoader(test_set,  batch_size=batch_size, shuffle=False)

In [None]:

import torch
import torch.nn as nn

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.mlp = nn.Sequential(
            nn.Linear(128, 512),
            nn.Linear(512, 2)
        )
    def forward(self, x):
        out = self.mlp(x)
        return out

model = MLP().to(device)
optimizer = torch.optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

print(model)

MLP(
  (mlp): Sequential(
    (0): Linear(in_features=128, out_features=512, bias=True)
    (1): Linear(in_features=512, out_features=2, bias=True)
  )
)


In [None]:
epochs = 10

model.train()
for epoch in range(epochs):
    losses = []
    for batch_num, input_data in enumerate(train_loader):
        optimizer.zero_grad()
        x, y = input_data
        x = x.to(device).float()
        y = y.to(device)

        output = model(x)
        # print(output)
        # print(y)
        loss = criterion(output, y)
        loss.backward()
        losses.append(loss.item())

        optimizer.step()

        if batch_num % 40 == 0:
            print('\tEpoch %d | Batch %d | Loss %6.2f' % (epoch, batch_num, loss.item()))
            print(torch.sum(output.argmax(dim=1) == y))
    print('Epoch %d | Loss %6.2f' % (epoch, sum(losses)/len(losses)))

	Epoch 0 | Batch 0 | Loss  12.63
tensor(265, device='cuda:0')
Epoch 0 | Loss  21.28
	Epoch 1 | Batch 0 | Loss  19.67
tensor(270, device='cuda:0')
Epoch 1 | Loss  27.55
	Epoch 2 | Batch 0 | Loss  51.52
tensor(264, device='cuda:0')
Epoch 2 | Loss  25.88
	Epoch 3 | Batch 0 | Loss  20.80
tensor(251, device='cuda:0')
Epoch 3 | Loss  18.65
	Epoch 4 | Batch 0 | Loss  13.20
tensor(290, device='cuda:0')
Epoch 4 | Loss  16.35
	Epoch 5 | Batch 0 | Loss  35.39
tensor(292, device='cuda:0')
Epoch 5 | Loss  20.14
	Epoch 6 | Batch 0 | Loss  16.61
tensor(262, device='cuda:0')
Epoch 6 | Loss  14.32
	Epoch 7 | Batch 0 | Loss  16.17
tensor(258, device='cuda:0')
Epoch 7 | Loss  11.24
	Epoch 8 | Batch 0 | Loss   5.61
tensor(303, device='cuda:0')
Epoch 8 | Loss  20.86
	Epoch 9 | Batch 0 | Loss  16.93
tensor(280, device='cuda:0')
Epoch 9 | Loss  14.74


In [None]:
model.eval()

with torch.no_grad():
    for batch_num, input_data in enumerate(test_loader):
        x, y = input_data
        x = x.to(device).float()
        y = y.to(device)
        output = model(x)

        print(output.argmax(dim=1).shape, y.shape)
        print(torch.sum(output.argmax(dim=1) == y))

torch.Size([512]) torch.Size([512])
tensor(129, device='cuda:0')
torch.Size([512]) torch.Size([512])
tensor(140, device='cuda:0')
torch.Size([512]) torch.Size([512])
tensor(151, device='cuda:0')
torch.Size([315]) torch.Size([315])
tensor(89, device='cuda:0')
