In [16]:
import pandas
from sklearn.impute import KNNImputer
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, log_loss
from sklearn import svm


In [17]:
trainData = pandas.read_csv("P2_data_stroke_train.csv")
testData = pandas.read_csv("P2_data_stroke_test.csv")

# Participant ID,Sex,Age,Race,Income,Edu,Systolic,Diastolic,Pulse,BMI,HDL,Trig,LDL,TCHOL,eGFR,A1C,uACR,CurrentSmoker,Diabetes,Insurance,stroke
trainData = trainData.drop(columns = ["Participant ID","Race","Income","Edu","Insurance", "LDL", "Trig"])
trainData = trainData.dropna(subset=["stroke"])

trainData['Age*systolic'] = trainData['Age'] * trainData['Systolic']
trainData['Age*CurrentSmoker'] = trainData['Age'] * trainData['CurrentSmoker']
trainData['TCHOL/HDL'] = trainData['TCHOL'] / trainData['HDL']

imputer = KNNImputer(n_neighbors=15)
trainData = pandas.DataFrame(imputer.fit_transform(trainData), columns=trainData.columns)

haveStroke = trainData[trainData["stroke"] == 1]
noStroke = trainData[trainData["stroke"] == 2]
noStroke = noStroke.sample(n=len(haveStroke))
trainData = pandas.concat([haveStroke, noStroke])
trainData = trainData.sample(frac=1,)
trainData = trainData.reset_index(drop=True)


x = trainData[["Age","Systolic","Diastolic","Pulse","BMI","HDL","TCHOL",
               "eGFR","A1C","uACR","CurrentSmoker","Diabetes","Age*systolic", "Age*CurrentSmoker", "TCHOL/HDL"]]
y = trainData["stroke"]

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.2, train_size=0.8, random_state=2025)
svmModel = Pipeline([('scaler', RobustScaler()), ('svm', svm.SVC(kernel='rbf', C=1, gamma='scale', probability=True))])
svmModel.fit(x_train, y_train)

y_pred = svmModel.predict(x_test)

print((svmModel.score(x_test, y_test)) * 100)
# print(confusion_matrix(y_test, y_pred))
# print(svmModel.predict_proba(x_test))

73.27188940092167
[[93 19]
 [39 66]]
[[0.65518576 0.34481424]
 [0.69383104 0.30616896]
 [0.39745353 0.60254647]
 [0.84055912 0.15944088]
 [0.44573398 0.55426602]
 [0.79224225 0.20775775]
 [0.72182679 0.27817321]
 [0.35340257 0.64659743]
 [0.83657959 0.16342041]
 [0.7697786  0.2302214 ]
 [0.24943432 0.75056568]
 [0.8823586  0.1176414 ]
 [0.73338506 0.26661494]
 [0.65709461 0.34290539]
 [0.67776429 0.32223571]
 [0.55035082 0.44964918]
 [0.82144892 0.17855108]
 [0.60378521 0.39621479]
 [0.38109362 0.61890638]
 [0.24061793 0.75938207]
 [0.14027021 0.85972979]
 [0.75547971 0.24452029]
 [0.43026572 0.56973428]
 [0.74104506 0.25895494]
 [0.75859349 0.24140651]
 [0.86094392 0.13905608]
 [0.21870801 0.78129199]
 [0.47873054 0.52126946]
 [0.18041451 0.81958549]
 [0.39307524 0.60692476]
 [0.72984206 0.27015794]
 [0.61775407 0.38224593]
 [0.18754435 0.81245565]
 [0.51188536 0.48811464]
 [0.25627362 0.74372638]
 [0.17927449 0.82072551]
 [0.76769104 0.23230896]
 [0.54156081 0.45843919]
 [0.625193   

In [None]:
testData = testData.drop(columns = ["Participant ID", "Sex","Race","Income","Edu","Insurance", "LDL", "Trig"])
testData = pandas.DataFrame(imputer.fit_transform(testData), columns=testData.columns)

testData['Age*systolic'] = testData['Age'] * testData['Systolic']
testData['Age*CurrentSmoker'] = testData['Age'] * testData['CurrentSmoker']
testData['TCHOL/HDL'] = testData['TCHOL'] / testData['HDL']

finaltest = testData[["Age","Systolic","Diastolic","Pulse","BMI","HDL","TCHOL",
                      "eGFR","A1C","uACR","CurrentSmoker","Diabetes","Age*systolic", "Age*CurrentSmoker", "TCHOL/HDL"]]
y_predprob =svmModel.predict_proba(finaltest)[:,0]
print(y_predprob)

[0.57435456 0.5        0.29411406 0.1309645  0.66106709 0.48796538
 0.54628921 0.43524069 0.67644394 0.41384055 0.14604385 0.56771002
 0.78571186 0.05038482 0.56947793 0.22497814 0.51467127 0.13046934
 0.25180906 0.74033725 0.08464523 0.70286368 0.243963   0.56626622
 0.55762244 0.54281707 0.58821411 0.81815757 0.29750402 0.2215507
 0.28478797 0.63124798 0.89166396 0.09819276 0.20496309 0.28181444
 0.72348348 0.25906601 0.30772363 0.64373237 0.2828992  0.61051485
 0.14944167 0.4741854  0.18249171 0.18691447 0.12231444 0.61288156
 0.25559962 0.8285169  0.39116686 0.09101925 0.19420198 0.33720908
 0.1746679  0.6215248  0.78895003 0.36996249 0.63067541 0.17701939
 0.42335986 0.76224083 0.82126984 0.82244901 0.75124557 0.13998487
 0.29507264 0.75692759 0.23910762 0.46777401 0.65764309 0.31722379
 0.79599329 0.68155867 0.49196018 0.33334113 0.53730673 0.74560054
 0.70471311 0.18223113 0.78083433 0.32506234 0.18163309 0.07400313
 0.57500832 0.37381043 0.09537644 0.34373839 0.88058444 0.58886

In [None]:
# finalFile = pandas.read_csv("predictedProbabilitiesForStroke.csv")
# finalFile['svmPrediction'] = y_predprob
# finalFile.to_csv("predictedProbabilitiesForStroke.csv", index=False)