In [2]:
import pandas as pd

In [3]:
patients_data = pd.read_csv("patients_preprocessed.csv")

In [4]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# Prepare the dataset for training
# We will remove rows with "Unknown" diagnosis to train the model and then predict for the "Unknown" ones
known_diagnosis_data = patients_data[patients_data['Diagnosis'] != 'Unknown']
unknown_diagnosis_data = patients_data[patients_data['Diagnosis'] == 'Unknown']

# Features and target
X = known_diagnosis_data[['Age', 'LabResult']]
y = known_diagnosis_data['Diagnosis']

# Split the data into training and testing sets for model validation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Decision Tree classifier
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Validate the model on the test set
accuracy = clf.score(X_test, y_test)

# Predict the diagnosis for the rows with "Unknown" diagnosis
X_unknown = unknown_diagnosis_data[['Age', 'LabResult']]
predicted_diagnosis = clf.predict(X_unknown)

# Update the dataframe with the imputed values
unknown_diagnosis_data['Diagnosis'] = predicted_diagnosis

# Combine the data back together
imputed_data = pd.concat([known_diagnosis_data, unknown_diagnosis_data], axis=0).sort_index()

accuracy, imputed_data[imputed_data['Diagnosis'] == 'Unknown'].shape[0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


(0.0625, 0)

In [5]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Since k-NN is sensitive to feature scales, we'll standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_unknown_scaled = scaler.transform(X_unknown)

# Train a k-NN classifier
knn = KNeighborsClassifier(n_neighbors=5)  # We'll use the commonly chosen k=5 for simplicity
knn.fit(X_train_scaled, y_train)

# Validate the model on the test set
knn_accuracy = knn.score(X_test_scaled, y_test)

knn_accuracy


0.375