In [None]:
import pandas as pd
import matplotlib.pyplot as pls
import seaborn as sns
import numpy as np
import torch
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
import joblib

In [None]:
# Load the disease/symptom table and preview the first rows.
# NOTE(review): "/content/..." looks like a Google Colab path — the notebook will not
# run elsewhere unless the file is placed at this absolute location.
df = pd.read_csv("/content/Disease-Prediction-Data.csv")
df.head()

Unnamed: 0,diseases,anxiety and nervousness,depression,shortness of breath,depressive or psychotic symptoms,sharp chest pain,dizziness,insomnia,abnormal involuntary movements,chest tightness,...,stuttering or stammering,problems with orgasm,nose deformity,lump over jaw,sore in nose,hip weakness,back swelling,ankle stiffness or tightness,ankle weakness,neck weakness
0,panic disorder,1,0,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,panic disorder,0,0,1,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,panic disorder,1,1,1,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,panic disorder,1,0,0,1,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,panic disorder,1,1,0,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Write a template CSV with one row per distinct disease name. The three detail
# columns are created empty (None); presumably they are filled in elsewhere.
# to_csv is called without index=False, so the row index is written as well.
unique_diseases = df['diseases'].unique()
disease_df = pd.DataFrame({'Disease': unique_diseases})
disease_df = disease_df.assign(**{
    "Description": None,
    "Symptoms": None,
    "Medicines and Treatment": None,
})
disease_df.to_csv("DiseaseDetails.csv")

In [None]:
# (rows, columns) of the table as loaded, before any filtering.
df.shape

(246945, 378)

In [None]:
# Number of distinct disease labels in the unfiltered data.
df['diseases'].nunique()

773

In [None]:
# Remove diseases that occur only once: a class with a single sample cannot be
# represented on both sides of the stratified train/test split done later.
disease_counts = df['diseases'].value_counts()
rare_diseases = disease_counts[disease_counts == 1].index.tolist()
# .copy() detaches the filtered frame from the original, so the later column
# assignment (label encoding of df['diseases']) does not raise
# SettingWithCopyWarning.
df = df[~df['diseases'].isin(rare_diseases)].copy()
# Sanity check: the smallest remaining class size (expected to be >= 2).
df['diseases'].value_counts().min()

2

In [None]:
# Number of distinct disease labels left after single-occurrence diseases were dropped.
df['diseases'].nunique()

754

In [None]:
# Replace the disease-name column with integer class ids from a LabelEncoder.
# NOTE(review): the column is overwritten in place, so re-running this cell would
# refit the encoder on the integer ids instead of the names — run top to bottom only.
le = LabelEncoder()
le.fit(df['diseases'])
df['diseases'] = le.transform(df['diseases'])
df.head()

Unnamed: 0,diseases,anxiety and nervousness,depression,shortness of breath,depressive or psychotic symptoms,sharp chest pain,dizziness,insomnia,abnormal involuntary movements,chest tightness,...,stuttering or stammering,problems with orgasm,nose deformity,lump over jaw,sore in nose,hip weakness,back swelling,ankle stiffness or tightness,ankle weakness,neck weakness
0,516,1,0,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,516,0,0,1,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,516,1,1,1,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,516,1,0,0,1,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,516,1,1,0,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Persist the fitted encoder to disk; presumably it is reloaded at inference time
# to turn predicted class ids back into disease names.
joblib.dump(le, 'label_encoder.pkl')

['label_encoder.pkl']

In [None]:
# (rows, columns) after the single-occurrence diseases were removed.
df.shape

(246926, 378)

In [None]:
# Feature matrix: every column except the encoded target. The target is excluded
# by name rather than by position, so the label cannot end up inside X if the
# column order of the input file ever changes.
X = torch.from_numpy(df.drop(columns=['diseases']).to_numpy())
X

tensor([[1, 0, 1,  ..., 0, 0, 0],
        [0, 0, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])

In [None]:
# Target tensor: the label-encoded disease id of every row.
y = torch.from_numpy(df['diseases'].to_numpy())
y

tensor([516, 516, 516,  ..., 491, 491, 491])

In [None]:
# Stratified 80/20 split with a fixed seed, so class proportions are kept in
# both parts and the split is reproducible.
RANDOM_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_SEED,
)
# Features are cast to float for the network's Linear layers; labels keep their integer dtype.
X_train = X_train.to(torch.float)
X_test = X_test.to(torch.float)
X_train

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
# The value is a plain string ("cuda" / "cpu") that is passed to .to(device) later.
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [None]:
class DiseasePredictor(nn.Module):
  """Fully connected classifier that maps a feature vector to class logits.

  The network has three hidden stages (512 -> 1024 -> 512 units), each built as
  Linear -> LeakyReLU -> BatchNorm1d -> Dropout(0.3), followed by a final Linear
  layer that emits one raw logit per class (no softmax is applied here).

  Args:
    input_size: Number of input features per sample.
    output_size: Number of logits (classes) produced per sample.
  """

  def __init__(self, input_size=377, output_size=754):
    super().__init__()

    # (in_features, out_features, LeakyReLU negative slope) for each hidden stage.
    # NOTE(review): the third stage uses slope 0.01 while the first two use
    # 0.001 — confirm the difference is intentional.
    hidden_stages = (
        (input_size, 512, 0.001),
        (512, 1024, 0.001),
        (1024, 512, 0.01),
    )

    modules = []
    for in_features, out_features, slope in hidden_stages:
      modules += [
          nn.Linear(in_features, out_features),
          nn.LeakyReLU(slope),
          nn.BatchNorm1d(out_features),
          nn.Dropout(0.3),
      ]
    modules.append(nn.Linear(512, output_size))

    # One flat Sequential, so parameter names remain "layers.<index>.<name>".
    self.layers = nn.Sequential(*modules)

  def forward(self, X):
    """Return the raw class logits for a batch of feature vectors."""
    return self.layers(X)

In [None]:
def accuracy_func(y_logits, y_true):
  """Return the top-1 accuracy, in percent, as a 0-dim tensor.

  Args:
    y_logits: Tensor of raw class scores with classes along dim 1.
    y_true: Tensor of integer class ids, one per row of ``y_logits``.

  Returns:
    0-dim tensor equal to 100 * (number of correct predictions) / len(y_true).
  """
  # softmax is strictly increasing, so the argmax of the raw logits selects the
  # same class. Skipping it avoids allocating a second tensor the size of
  # y_logits (and tracking it in autograd) on every call.
  y_pred = y_logits.argmax(dim=1)
  accuracy = (y_pred.eq(y_true).sum() / len(y_true)) * 100
  return accuracy

In [None]:
def focal_loss(logits, targets, alpha=1, gamma=2):
    """Mean focal loss computed from raw logits and integer class targets.

    Each sample's cross-entropy is scaled by ``alpha * (1 - pt) ** gamma`` with
    ``pt = exp(-cross_entropy)``; for ``gamma > 0`` samples that already have
    a low loss therefore contribute less to the mean.

    Args:
        logits: Raw class scores, forwarded to ``cross_entropy``.
        targets: Target class ids, forwarded to ``cross_entropy``.
        alpha: Constant multiplier applied to every sample's loss.
        gamma: Exponent of the ``(1 - pt)`` modulating factor.

    Returns:
        Scalar tensor holding the mean of the per-sample focal losses.
    """
    per_sample_ce = nn.functional.cross_entropy(logits, targets, reduction='none')
    prob_true = torch.exp(per_sample_ce.neg())
    modulation = (1 - prob_true) ** gamma
    return torch.mean(alpha * modulation * per_sample_ce)

In [None]:
def train_model(model, epochs, lr, class_weights, X_train=X_train, y_train=y_train):
  """Train ``model`` with full-batch Adam and class-weighted cross-entropy.

  Every 10th epoch the training loss and accuracy are printed and
  ``test_model`` is called to report the metrics on the test split.

  Args:
    model: Module to optimise; it is moved to ``device``.
    epochs: Number of optimisation steps (each step uses the whole training set).
    lr: Learning rate for Adam.
    class_weights: Per-class weight tensor passed to ``nn.CrossEntropyLoss``.
    X_train: Training features. The default is the notebook-level ``X_train``
      as it existed when this function was defined.
    y_train: Training labels (same default-binding caveat as ``X_train``).

  Returns:
    None. The model is updated in place.
  """
  torch.manual_seed(42)
  torch.cuda.manual_seed(42)

  model.to(device)
  X_train = X_train.to(device)
  y_train = y_train.to(device)

  loss_fn = nn.CrossEntropyLoss(weight=class_weights)
  optimizer = torch.optim.Adam(params=model.parameters(), lr=lr, weight_decay=1e-5)

  for epoch in range(1, epochs + 1):
    # test_model() switches the model to eval mode and leaves it there, so
    # training mode has to be re-enabled at the start of every epoch. Without
    # this, every step after the first evaluation would run with Dropout
    # disabled and BatchNorm using its running statistics.
    model.train()

    y_logits = model(X_train)
    loss = loss_fn(y_logits, y_train)
    accuracy = accuracy_func(y_logits, y_train)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if ((epoch % 10) == 0):
      print(f"Epoch: {epoch} | Loss: {loss:.4f} | Accuracy: {accuracy:.4f}%")
      test_model(model, class_weights)
      print("-----------------------------------------------------------------")

In [None]:
def test_model(model, class_weights, X_test=X_test, y_test=y_test):
  """Print the class-weighted cross-entropy loss and accuracy on a test set.

  Side effect: ``model`` is switched to eval mode and moved to ``device``; it
  is left in eval mode when the function returns.

  Args:
    model: Module to evaluate.
    class_weights: Per-class weight tensor used for the cross-entropy loss.
    X_test: Test features. The default is the notebook-level ``X_test`` as it
      existed when this function was defined.
    y_test: Test labels (same default-binding caveat as ``X_test``).
  """
  model.eval()
  model.to(device)

  features = X_test.to(device)
  labels = y_test.to(device)

  # The forward pass runs without autograd bookkeeping.
  with torch.inference_mode():
    logits = model(features)

  test_loss = nn.functional.cross_entropy(logits, labels, weight=class_weights)
  test_accuracy = accuracy_func(logits, labels)

  print(f"Test Loss: {test_loss:.4f} | Test Accuracy: {test_accuracy:.4f}%")

In [None]:
# Fresh, untrained network with the default layer sizes, placed on the chosen device.
model = DiseasePredictor().to(device)

# 'balanced' class weights for the loss, one weight per class id present in y.
# NOTE(review): the weights are derived from all labels in y (train and test rows together).
y_numpy = y.cpu().numpy()
balanced_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_numpy),
    y=y_numpy,
)
class_weights = torch.tensor(balanced_weights, dtype=torch.float).to(device)

In [None]:
# Baseline: metrics of the still-untrained model on the test split.
test_model(model, class_weights)

Test Loss: 6.6261 | Test Accuracy: 0.0324%


In [None]:
# Training hyperparameters.
EPOCHS = 300
lr = 3e-4

# Balanced class weights, recomputed in this cell from the full label tensor y.
y_numpy = y.cpu().numpy()
balanced_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_numpy),
    y=y_numpy,
)
class_weights = torch.tensor(balanced_weights, dtype=torch.float).to(device)

train_model(model, EPOCHS, lr, class_weights)

Epoch: 10 | Loss: 5.2479 | Accuracy: 16.4508%
Test Loss: 6.5880 | Test Accuracy: 0.0790%
-----------------------------------------------------------------
Epoch: 20 | Loss: 6.5160 | Accuracy: 0.2835%
Test Loss: 6.5170 | Test Accuracy: 0.3847%
-----------------------------------------------------------------
Epoch: 30 | Loss: 6.4454 | Accuracy: 18.8316%
Test Loss: 6.4449 | Test Accuracy: 22.2411%
-----------------------------------------------------------------
Epoch: 40 | Loss: 6.3434 | Accuracy: 39.6654%
Test Loss: 6.3387 | Test Accuracy: 40.6674%
-----------------------------------------------------------------
Epoch: 50 | Loss: 6.1560 | Accuracy: 48.7172%
Test Loss: 6.1398 | Test Accuracy: 48.1898%
-----------------------------------------------------------------
Epoch: 60 | Loss: 5.7961 | Accuracy: 41.9677%
Test Loss: 5.7570 | Test Accuracy: 40.6532%
-----------------------------------------------------------------
Epoch: 70 | Loss: 5.1204 | Accuracy: 31.0504%
Test Loss: 5.0498 | T

In [None]:
# Metrics of the trained model on the test split.
test_model(model, class_weights)

Test Loss: 0.3550 | Test Accuracy: 86.6116%


In [None]:
# Manual re-computation of the test metrics (the same calculation test_model does).
# eval() is set explicitly so Dropout and BatchNorm use their inference behaviour
# regardless of the mode an earlier cell left the model in.
model.to(device)
model.eval()
X_test = X_test.to(device)
y_test = y_test.to(device)

with torch.inference_mode():
  y_logits = model(X_test)

  loss_fn = nn.CrossEntropyLoss(weight=class_weights)
  loss = loss_fn(y_logits, y_test)
  accuracy = accuracy_func(y_logits, y_test)

  print(f"Test Loss: {loss:.4f} | Test Accuracy: {accuracy:.4f}%")

Test Loss: 0.3550 | Test Accuracy: 86.6116%


In [None]:
# Save only the model's state_dict (parameters and buffers), not the module object;
# loading it back requires constructing a DiseasePredictor first.
torch.save(model.state_dict(), 'model_weights.pth')