## Import relevant libraries

In [39]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

## Load the data

In [41]:
patients_data = pd.read_csv("patients_preprocessed.csv")
patients_data.head()

Unnamed: 0,PatientID,Name,Age,Diagnosis,LabResult,Diagnosis_Encoded
0,1001,Patient_1,20,Diabetes,167,0
1,1002,Patient_2,74,Diabetes,153,0
2,1003,Patient_3,40,Hypertension,196,1
3,1004,Patient_4,23,Unknown,157,2
4,1005,Patient_5,35,Diabetes,115,0


## Preprocess the data

### Separate the data that has known diagnosis and "Unknown" diagnosis

In [42]:
# Prepare the dataset for training
# We will remove rows with "Unknown" diagnosis to train the model and then predict for the "Unknown" ones
known_diagnosis_data = patients_data[patients_data['Diagnosis'] != 'Unknown']
unknown_diagnosis_data = patients_data[patients_data['Diagnosis'] == 'Unknown']

### Assign Independent and Dependent Variable

In [58]:
# Features and target
X = known_diagnosis_data[['Age', 'LabResult']]
y = known_diagnosis_data['Diagnosis']

### Split the data into training and testing

In [63]:
# Split the data into training and testing sets for model validation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Train the Model

In [94]:
# Train a Decision Tree classifier
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

In [77]:
# Validate the model on the test set
accuracy = clf.score(X_test, y_test)
accuracy

0.0625

## Predict the diagnosis

In [82]:
# Predict the diagnosis for the rows with "Unknown" diagnosis
X_unknown = unknown_diagnosis_data[['Age', 'LabResult']]
predicted_diagnosis = clf.predict(X_unknown)
predicted_diagnosis

array(['Flu', 'Flu', 'Hypertension', 'Diabetes', 'Diabetes', 'Diabetes',
       'Diabetes', 'Common Cold', 'Flu', 'Diabetes', 'Flu', 'Diabetes',
       'Common Cold', 'Diabetes', 'Common Cold', 'Hypertension', 'Flu',
       'Common Cold', 'Common Cold', 'Common Cold', 'Common Cold'],
      dtype=object)

In [84]:
# Update the dataframe with the imputed values
unknown_diagnosis_data['Diagnosis'] = predicted_diagnosis

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unknown_diagnosis_data['Diagnosis'] = predicted_diagnosis


In [109]:
# Combine the data back together
imputed_data = pd.concat([known_diagnosis_data, unknown_diagnosis_data], axis=0).sort_index()
imputed_data.head()

Unnamed: 0,PatientID,Name,Age,Diagnosis,LabResult,Diagnosis_Encoded
0,1001,Patient_1,20,Diabetes,167,0
1,1002,Patient_2,74,Diabetes,153,0
2,1003,Patient_3,40,Hypertension,196,1
3,1004,Patient_4,23,Flu,157,2
4,1005,Patient_5,35,Diabetes,115,0


In [107]:
accuracy, imputed_data[imputed_data['Diagnosis'] == 'Unknown'].shape[0]

(0.0625, 0)

## Standardize the features

In [97]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Since k-NN is sensitive to feature scales, we'll standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_unknown_scaled = scaler.transform(X_unknown)

In [99]:
# Train a k-NN classifier
knn = KNeighborsClassifier(n_neighbors=5)  # We'll use the commonly chosen k=5 for simplicity
knn.fit(X_train_scaled, y_train)

In [101]:
# Validate the model on the test set
knn_accuracy = knn.score(X_test_scaled, y_test)

knn_accuracy

0.375