In [14]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the CSV file into a Pandas DataFrame
kidney_df_cleaned = pd.read_csv("merged/kidney_clean_1.csv")
kidney_df_cleaned

Unnamed: 0.1,Unnamed: 0,Transplant Year,Age Group,N,Graft_Survival_Rate_1,Patient_Survival_Rate_1,All ABO,O,A,B,...,Retransplant/Graft Failure,Tubular and Interstitial Diseases,"White, Non-Hispanic","Black, Non-Hispanic",Hispanic/Latino,Unknown,"Asian, Non-Hispanic","American Indian/Alaska Native, Non-Hispanic","Pacific Islander, Non-Hispanic","Multiracial, Non-Hispanic"
0,0,2021,<1 Year,0.0,0.00,0.00,0.0,0.0,0.0,0.0,...,0.0,0.0,130,52,49,0,11,4,0,11
1,1,2021,1-5 Years,178.0,98.30,99.42,184.0,77.0,74.0,28.0,...,0.0,8.0,272,94,95,0,36,1,0,13
2,2,2021,6-10 Years,127.0,97.60,100.00,138.0,75.0,43.0,15.0,...,5.0,11.0,142,39,88,0,11,2,1,5
3,3,2021,11-17 Years,444.0,97.49,99.77,497.0,258.0,162.0,59.0,...,21.0,42.0,383,145,235,0,38,10,4,19
4,4,2021,18-34,2428.0,97.39,99.36,2975.0,1432.0,992.0,431.0,...,332.0,116.0,1884,1054,1158,0,217,32,24,38
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
267,267,1988,11-17 Years,271.0,85.19,96.65,407.0,182.0,163.0,44.0,...,3.0,62.0,356,103,62,3,11,4,0,0
268,268,1988,18-34,2076.0,86.15,95.63,2794.0,1286.0,1060.0,322.0,...,10.0,235.0,2419,603,303,3,83,16,7,0
269,269,1988,35-49,2579.0,83.17,93.57,3289.0,1486.0,1274.0,383.0,...,8.0,201.0,3196,838,291,5,109,43,1,0
270,270,1988,50-64,1654.0,80.53,88.41,1926.0,876.0,730.0,232.0,...,2.0,112.0,2487,490,179,6,60,20,3,0


In [4]:
good_survival = pd.cut( kidney_df_cleaned["Patient_Survival_Rate_1"], [-1, 25, 50, 75, 100], labels=["D", "C", "B", "A"])
good_survival

0      D
1      A
2      A
3      A
4      A
      ..
267    A
268    A
269    A
270    A
271    A
Name: Patient_Survival_Rate_1, Length: 272, dtype: category
Categories (4, object): ['D' < 'C' < 'B' < 'A']

In [6]:
#kidney_df_cleaned["Patient_Survival_status"] = good_survival
Y = kidney_df_cleaned["Patient_Survival_Rate_1"]
X = kidney_df_cleaned.drop(columns=['Unnamed: 0', 'N', 'Patient_Survival_Rate_1'])

In [7]:
X = pd.get_dummies(X)
X

Unnamed: 0,Transplant Year,Graft_Survival_Rate_1,All ABO,O,A,B,AB,Deceased Donor,Living Donor,"Congenital, Rare, Familial, and Metaboli",...,"Pacific Islander, Non-Hispanic","Multiracial, Non-Hispanic",Age Group_1-5 Years,Age Group_11-17 Years,Age Group_18-34,Age Group_35-49,Age Group_50-64,Age Group_6-10 Years,Age Group_65+,Age Group_<1 Year
0,2021,0.00,0.0,0.0,0.0,0.0,0.0,223,34,0.0,...,0,11,0,0,0,0,0,0,0,1
1,2021,98.30,184.0,77.0,74.0,28.0,5.0,404,107,76.0,...,0,13,1,0,0,0,0,0,0,0
2,2021,97.60,138.0,75.0,43.0,15.0,5.0,240,48,50.0,...,1,5,0,0,0,0,0,1,0,0
3,2021,97.49,497.0,258.0,162.0,59.0,18.0,712,122,119.0,...,4,19,0,1,0,0,0,0,0,0
4,2021,97.39,2975.0,1432.0,992.0,431.0,120.0,3373,1034,154.0,...,24,38,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
267,1988,85.19,407.0,182.0,163.0,44.0,18.0,369,170,76.0,...,0,0,0,1,0,0,0,0,0,0
268,1988,86.15,2794.0,1286.0,1060.0,322.0,125.0,2581,853,88.0,...,7,0,0,0,1,0,0,0,0,0
269,1988,83.17,3289.0,1486.0,1274.0,383.0,146.0,3982,501,32.0,...,1,0,0,0,0,1,0,0,0,0
270,1988,80.53,1926.0,876.0,730.0,232.0,88.0,3055,190,2.0,...,3,0,0,0,0,0,1,0,0,0


In [9]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=1)
X_train.shape

(204, 36)

In [10]:
# Create a random forest classifier
lr_model = LinearRegression()
# Fitting the model
lr_model.fit(X_train, Y_train)
# Making predictions using the testing data
predictions = lr_model.predict(X_test)
predictions

array([ 9.20515255e+01,  9.35595550e+01,  8.60436170e+01,  9.70005525e+01,
        1.00248502e+02,  1.00050267e+02,  9.98521229e+01,  1.00101004e+02,
        1.69383907e-01,  9.58128325e+01,  9.11008786e+01,  9.42192857e+01,
        8.79325571e+01,  9.89967687e+01,  9.94212359e+01,  9.77401534e+01,
        9.85697589e+01,  9.49104970e+01,  9.95033360e+01,  9.87347759e+01,
        9.82092493e+01,  9.66114498e+01,  9.98077249e+01,  9.88369348e+01,
        9.10327774e+01,  9.98246575e+01,  8.27564408e+01,  8.89250262e+01,
        9.79500176e+01,  9.78547687e+01,  9.36408309e+01,  9.69038095e+01,
        9.82236261e+01,  9.37973814e+01,  9.98661968e+01,  1.00891744e+02,
        9.65321390e+01,  9.51293486e+01,  9.82916224e+01,  9.55061622e+01,
        1.00370822e+02,  9.61622042e+01,  1.00165235e+02,  9.93211475e+01,
        9.79499034e+01,  9.36099045e+01,  9.41039199e+01,  9.95659933e+01,
        1.00507516e+02,  9.98030099e+01,  9.96531155e+01, -8.05159100e-02,
        9.69156581e+01,  

In [15]:
score = lr_model.score(X_train, Y_train, sample_weight=None)
r2 = r2_score(Y_test, predictions)
print(f"Accuracy Score : {score}")
print(f"r2 Score : {r2}")

Accuracy Score : 0.9995454840316024
r2 Score : 0.9985557757197234
