In [79]:
import pandas
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [80]:
# Load the training and test CSV files into DataFrames (paths are relative to the working directory).
trainData, testData = (
    pandas.read_csv(csvName)
    for csvName in ("NHANES_Data_P1_MI_train.csv", "NHANES_Data_P1_MI_test.csv")
)

In [81]:
# Keep only the rows whose "MI" value is present, then discard the columns
# that are not used for modelling.
unusedColumns = ["ID", "Income", "Edu", "Insurance"]
trainData = trainData.dropna(subset=["MI"]).drop(columns=unusedColumns)


In [82]:
# Clean data: fill missing predictor values with a 10-nearest-neighbour imputer.
#
# The "MI" label is deliberately kept out of the imputer. Passing the whole
# frame (label included) to KNNImputer lets the target take part in the
# neighbour distances used to fill in the predictors, which leaks the label
# into the features. "MI" has no missing values at this point (rows without it
# were dropped above), so it needs no imputation anyway.
imputedColumns = [column for column in trainData.columns if column != "MI"]

imputer = KNNImputer(n_neighbors=10)
imputedFeatureValues = imputer.fit_transform(trainData[imputedColumns])

# Rebuild a frame with a fresh 0..n-1 index, re-attach the untouched label
# positionally (to_numpy avoids index alignment against the filtered
# trainData index), and restore the original column order.
imputerTrainData = pandas.DataFrame(imputedFeatureValues, columns=imputedColumns)
imputerTrainData["MI"] = trainData["MI"].to_numpy(dtype=float)
imputerTrainData = imputerTrainData[list(trainData.columns)]

In [83]:
# Separate the imputed frame into the predictor columns (x) and the
# single-column target frame (y).
predictorColumns = [
    "Sex", "Age", "Race",
    "Systolic", "Diastolic", "Pulse",
    "BMI", "HDL", "Trig", "LDL", "TCHOL",
    "eGFR", "CurrentSmoker", "Diabetes",
]
x = imputerTrainData[predictorColumns]
y = imputerTrainData[["MI"]]


In [84]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
xTrain, xTest, yTrain, yTest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [85]:
# Scale data: the scaler is fitted on the training split only, and the fitted
# scaler is then applied to both the training and the validation split.
scaler = RobustScaler().fit(xTrain)
scaledXTrainData = scaler.transform(xTrain)
scaledXTestData = scaler.transform(xTest)

In [86]:
# Fit a 101-neighbour KNN classifier on the scaled training split.
# The single-column target frame is flattened to 1D so that scikit-learn does
# not emit a DataConversionWarning.
trainLabels = yTrain.to_numpy().ravel()
knn = KNeighborsClassifier(n_neighbors=101)
knn.fit(scaledXTrainData, trainLabels)

# Disabled accuracy check on the validation split:
# print('score:', knn.score(scaledXTestData, yTest.values.ravel()))

# Per-class probabilities for the validation split (one column per class).
pp = knn.predict_proba(scaledXTestData)
print('\n', pp)


 [[0.         1.        ]
 [0.00990099 0.99009901]
 [0.01980198 0.98019802]
 ...
 [0.10891089 0.89108911]
 [0.06930693 0.93069307]
 [0.         1.        ]]


In [87]:
# Score the external test file and write one probability per ID to CSV.

# Drop the columns that were also excluded from the training data.
testData = testData.drop(columns = ["Income", "Edu", "Insurance"])

# Impute missing values in testData using the imputer that was already fitted
# on the training data. Calling fit_transform here would re-fit the imputer on
# the test set, and feeding it the raw frame would let "ID" act as a
# neighbour-distance feature and turn the IDs into floats.
# reindex lines the test columns up with exactly the columns (and order) the
# imputer was fitted on; a column that exists only in the training data is
# filled with NaN, which KNNImputer treats as missing.
imputerColumns = list(imputer.feature_names_in_)
imputedValues = pandas.DataFrame(
    imputer.transform(testData.reindex(columns=imputerColumns)),
    columns=imputerColumns,
    index=testData.index,
)

# Copy the imputed values back only for columns that really exist in the test
# file; every other column (notably "ID") keeps its original values and dtype.
sharedColumns = [column for column in imputerColumns if column in testData.columns]
imputedTestData = testData.assign(
    **{column: imputedValues[column] for column in sharedColumns}
)

# Select features for prediction: the same columns, in the same order, as x.
xTestData = imputedTestData[list(x.columns)]

# Scale test data using the scaler fitted on the training split.
scaledXTestDataForPrediction = scaler.transform(xTestData)

# Predict probabilities. Column 1 of predict_proba belongs to knn.classes_[1].
# NOTE(review): the validation output printed earlier shows this column close
# to 1.0 for most rows - confirm that knn.classes_[1] is the MI-positive label
# and not the "no MI" class before relying on this file.
mi_prob = knn.predict_proba(scaledXTestDataForPrediction)[:, 1]

# Add probability column to imputedTestData
imputedTestData["MI_Probability"] = mi_prob

finalData = imputedTestData[["ID", "MI_Probability"]]

# Save to CSV
finalData.to_csv("test_with_prob.csv", index=False)
