In [37]:
import pandas
from sklearn.impute import KNNImputer
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import log_loss

In [38]:
trainData = pandas.read_csv("P2_data_stroke_train.csv")
testData = pandas.read_csv("P2_data_stroke_test.csv")

# Participant ID,Sex,Age,Race,Income,Edu,Systolic,Diastolic,Pulse,BMI,HDL,Trig,LDL,TCHOL,eGFR,A1C,uACR,CurrentSmoker,Diabetes,Insurance,stroke
trainData = trainData.drop(columns = ["Participant ID","Race","Income","Edu","Insurance", "LDL", "Trig"])
trainData = trainData.dropna(subset=["stroke"])

imputer = KNNImputer(n_neighbors=15)
trainData = pandas.DataFrame(imputer.fit_transform(trainData), columns=trainData.columns)

haveStroke = trainData[trainData["stroke"] == 1]
noStroke = trainData[trainData["stroke"] == 2]
noStroke = noStroke.sample(n=len(haveStroke))
trainData = pandas.concat([haveStroke, noStroke])
trainData = trainData.sample(frac=1,)
trainData = trainData.reset_index(drop=True)


x = trainData[["Age","Systolic","Diastolic","Pulse","BMI","HDL","TCHOL",
               "eGFR","A1C","uACR","CurrentSmoker","Diabetes",]]
y = trainData["stroke"]

In [103]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.2, train_size=0.8, random_state=2025)
randForestModel = Pipeline([('scaler', RobustScaler()), ('RanomForest', RandomForestClassifier(n_estimators=100, max_depth=7,min_samples_leaf=1, criterion='entropy', random_state=0))])
randForestModel.fit(x_train, y_train)

y_pred = randForestModel.predict(x_test)

print((randForestModel.score(x_test, y_test)) * 100)
# print(confusion_matrix(y_test, y_pred))
# print(randForestModel.predict_proba(x_test))

74.65437788018433


In [40]:
testData = testData.drop(columns = ["Participant ID", "Sex","Race","Income","Edu","Insurance", "LDL", "Trig"])
testData = pandas.DataFrame(imputer.fit_transform(testData), columns=testData.columns)

finaltest = testData[["Age","Systolic","Diastolic","Pulse","BMI","HDL","TCHOL",
                      "eGFR","A1C","uACR","CurrentSmoker","Diabetes"]]
y_predprob =randForestModel.predict_proba(finaltest)[:,0]
# print(y_predprob)

In [None]:
# Add pred prob to csv
finalFile = pandas.read_csv("predictedProbabilitiesForStroke.csv")
finalFile['RandomForestPrediction'] = y_predprob
finalFile.to_csv("predictedProbabilitiesForStroke.csv", index=False)