In [11]:
import pandas
from sklearn.impute import KNNImputer
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import log_loss

In [12]:
trainData = pandas.read_csv("P2_data_stroke_train.csv")
testData = pandas.read_csv("P2_data_stroke_test.csv")

# Participant ID,Sex,Age,Race,Income,Edu,Systolic,Diastolic,Pulse,BMI,HDL,Trig,LDL,TCHOL,eGFR,A1C,uACR,CurrentSmoker,Diabetes,Insurance,stroke
trainData = trainData.drop(columns = ["Participant ID", "Sex","Race","Income","Edu","Insurance", "LDL", "Trig"])
trainData = trainData.dropna(subset=["stroke"])


imputer = KNNImputer(n_neighbors=15)
trainData = pandas.DataFrame(imputer.fit_transform(trainData), columns=trainData.columns)

haveStroke = trainData[trainData["stroke"] == 1]
noStroke = trainData[trainData["stroke"] == 2]
noStroke = noStroke.sample(n=len(haveStroke))
trainData = pandas.concat([haveStroke, noStroke])
trainData = trainData.sample(frac=1,)
trainData = trainData.reset_index(drop=True)


x = trainData[["Age","Systolic","Diastolic","Pulse","BMI","HDL","TCHOL","eGFR","A1C","uACR","CurrentSmoker","Diabetes"]]
y = trainData["stroke"]


In [13]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.2, train_size=0.8, random_state=2025)
logR_model = Pipeline([('scaler', RobustScaler()), ('model', LogisticRegression(C = 1))])
logR_model.fit(x_train, y_train)

y_pred = logR_model.predict(x_test)

print((logR_model.score(x_test, y_test)) * 100)
print(confusion_matrix(y_test, y_pred))



78.80184331797236
[[78 21]
 [25 93]]


In [40]:
testData = pandas.read_csv("P2_data_stroke_test.csv")
participantId = testData["Participant ID"]

testData = testData.drop(columns = ["Participant ID", "Sex","Race","Income","Edu","Insurance", "LDL", "Trig"])

testData = pandas.DataFrame(imputer.fit_transform(testData), columns=testData.columns)
finaltest = testData[["Age","Systolic","Diastolic","Pulse","BMI","HDL","TCHOL","eGFR","A1C","uACR","CurrentSmoker","Diabetes"]]
y_predprob =logR_model.predict_proba(finaltest)[:,0]
# print(y_predprob)

finalcsv = pandas.DataFrame({"Participant ID": participantId, "LogisticRegressionPrediction": y_predprob})

finalcsv.to_csv("predictedProbabilitiesForStroke", index=False)