<a href="https://colab.research.google.com/github/omarabohammer/tuberculosis-classification/blob/main/building_svm_model_from_scratch_with_penguin_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [44]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import torch
import torch.nn as nn
import torch.optim as optim

warnings.filterwarnings('ignore')


In [45]:
import kagglehub

path = kagglehub.dataset_download("miadul/tuberculosis-x-ray-dataset-synthetic")

print("path to dataset files: ", path)

path to dataset files:  /root/.cache/kagglehub/datasets/miadul/tuberculosis-x-ray-dataset-synthetic/versions/1


In [46]:
import os


path = kagglehub.dataset_download("miadul/tuberculosis-x-ray-dataset-synthetic")

# Find the CSV file within the downloaded directory
for filename in os.listdir(path):
    if filename.endswith(".csv"):
        csv_file_path = os.path.join(path, filename)
        break  # Stop searching once a CSV file is found

# Now use the CSV file path to read the data
df = pd.read_csv(csv_file_path)
df.head()



Unnamed: 0,Patient_ID,Age,Gender,Chest_Pain,Cough_Severity,Breathlessness,Fatigue,Weight_Loss,Fever,Night_Sweats,Sputum_Production,Blood_in_Sputum,Smoking_History,Previous_TB_History,Class
0,PID000001,69,Male,Yes,1,2,3,2.37,Moderate,Yes,Medium,Yes,Former,Yes,Normal
1,PID000002,32,Female,Yes,3,0,9,6.09,Moderate,No,Medium,No,Current,Yes,Normal
2,PID000003,89,Male,No,7,0,3,2.86,Mild,Yes,Medium,No,Current,No,Tuberculosis
3,PID000004,78,Female,Yes,2,0,6,4.57,Moderate,No,High,Yes,Never,Yes,Tuberculosis
4,PID000005,38,Male,No,7,2,5,13.86,High,Yes,Low,No,Never,Yes,Tuberculosis


In [47]:
df.tail()

Unnamed: 0,Patient_ID,Age,Gender,Chest_Pain,Cough_Severity,Breathlessness,Fatigue,Weight_Loss,Fever,Night_Sweats,Sputum_Production,Blood_in_Sputum,Smoking_History,Previous_TB_History,Class
19995,PID019996,53,Male,No,0,4,6,11.72,Moderate,Yes,Medium,No,Never,Yes,Normal
19996,PID019997,27,Female,No,3,1,1,5.67,Moderate,Yes,Low,Yes,Current,Yes,Tuberculosis
19997,PID019998,23,Male,No,1,2,8,3.91,High,Yes,Low,Yes,Never,No,Normal
19998,PID019999,24,Female,No,8,4,6,8.77,Moderate,Yes,Medium,Yes,Never,Yes,Normal
19999,PID020000,48,Male,No,9,4,4,4.84,High,No,High,No,Never,Yes,Normal


In [48]:
df.dtypes


Unnamed: 0,0
Patient_ID,object
Age,int64
Gender,object
Chest_Pain,object
Cough_Severity,int64
Breathlessness,int64
Fatigue,int64
Weight_Loss,float64
Fever,object
Night_Sweats,object


In [49]:
print(df[['Class']]) # 'class' should be 'Class'

              Class
0            Normal
1            Normal
2      Tuberculosis
3      Tuberculosis
4      Tuberculosis
...             ...
19995        Normal
19996  Tuberculosis
19997        Normal
19998        Normal
19999        Normal

[20000 rows x 1 columns]


In [50]:
x = df.drop(columns=['Class'])
y = df['Class']


In [51]:
from sklearn.preprocessing import LabelEncoder

label_encoder = {}
for col in x.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    x[col] = le.fit_transform(x[col])
    label_encoder[col] = le


In [52]:
if y.dtype == 'object':
    y_le = LabelEncoder()
    y = y_le.fit_transform(y)

In [53]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)



In [39]:
print(x_train.shape)

(16000, 14)


In [40]:
print(x_train)

       Patient_ID  Age  Gender  Chest_Pain  Cough_Severity  Breathlessness  \
5894         5894   27       1           0               0               1   
3728         3728   42       0           0               9               3   
8958         8958   41       0           0               0               3   
7671         7671   51       0           1               8               2   
5999         5999   79       0           0               8               0   
...           ...  ...     ...         ...             ...             ...   
11284       11284   21       0           1               0               1   
11964       11964   46       1           1               8               3   
5390         5390   66       0           1               8               3   
860           860   74       0           0               2               1   
15795       15795   42       1           1               4               2   

       Fatigue  Weight_Loss  Fever  Night_Sweats  Sputum_Produc

In [57]:
y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)
print(y_train)

[0. 1. 0. ... 1. 0. 0.]


In [58]:
x_train = x_train.astype(np.float32)
x_test = x_test.astype(np.float32)
print(x_train)

       Patient_ID   Age  Gender  Chest_Pain  Cough_Severity  Breathlessness  \
5894       5894.0  27.0     1.0         0.0             0.0             1.0   
3728       3728.0  42.0     0.0         0.0             9.0             3.0   
8958       8958.0  41.0     0.0         0.0             0.0             3.0   
7671       7671.0  51.0     0.0         1.0             8.0             2.0   
5999       5999.0  79.0     0.0         0.0             8.0             0.0   
...           ...   ...     ...         ...             ...             ...   
11284     11284.0  21.0     0.0         1.0             0.0             1.0   
11964     11964.0  46.0     1.0         1.0             8.0             3.0   
5390       5390.0  66.0     0.0         1.0             8.0             3.0   
860         860.0  74.0     0.0         0.0             2.0             1.0   
15795     15795.0  42.0     1.0         1.0             4.0             2.0   

       Fatigue  Weight_Loss  Fever  Night_Sweats  S

In [63]:
x_train = torch.tensor(x_train, dtype=torch.float32)
x_test = torch.tensor(x_test, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32)


In [65]:
class SVM(nn.Module):
    def __init__(self, input_dim):
        super(SVM, self).__init__()
        self.fc = nn.Linear(input_dim, 1)

    def forward(self, x):
      return self.fc(x)

In [67]:
model = SVM(x_train.shape[1])
criterion = nn.HingeEmbeddingLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

In [69]:
epochs = 100
for epoch in range(epochs):
    optimizer.zero_grad()
    outputs = model(x_train).squeeze()
    labels = 2 * y_train - 1  # Convert labels to {-1, 1} for hinge loss
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()

In [71]:
from sklearn.metrics import accuracy_score
with torch.no_grad():
    y_pred = model(x_test).squeeze().numpy()
    y_pred = (y_pred > 0).astype(int)
accuracy = accuracy_score(y_test.numpy().astype(int), y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.70
