In [1]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve
import pandas as pd
import numpy as np

In [3]:
# Load the dataset and separate the 'Donated' target from the feature columns.
df = pd.read_csv('data/transfusion.data')
y = df['Donated']
xDf = df.drop(columns='Donated')

# Draw one seeded uniform number per row; rows whose draw is below 0.8 form
# the training split and the remaining rows form the test split.
np.random.seed(1)
r = np.random.rand(len(df))
in_train = r < 0.8

X_train, y_train = xDf[in_train], y[in_train]
X_test, y_test = xDf[~in_train], y[~in_train]

In [10]:
# Sanity check: number of rows that landed in the held-out test split.
y_test.shape

(153,)

In [26]:
# Fit the min-max scaler on the training split only, then apply that same
# fitted scaler to both splits so the test data never influences the scaling.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)

  return self.partial_fit(X, y)


In [27]:
# Confirm the (rows, columns) of the scaled train and test matrices.
X_train_scaled.shape, X_test_scaled.shape

((595, 4), (153, 4))

In [29]:
# Inspect the scaled test features.
# NOTE(review): this displays the entire array; a slice such as
# X_test_scaled[:5] would keep the notebook output short.
X_test_scaled

array([[0.01351351, 0.24489796, 0.24489796, 0.46875   ],
       [0.02702703, 0.04081633, 0.04081633, 0.02083333],
       [0.05405405, 0.20408163, 0.20408163, 0.27083333],
       [0.12162162, 0.16326531, 0.16326531, 0.14583333],
       [0.05405405, 0.26530612, 0.26530612, 0.39583333],
       [0.05405405, 0.14285714, 0.14285714, 0.19791667],
       [0.05405405, 0.18367347, 0.18367347, 0.27083333],
       [0.05405405, 0.10204082, 0.10204082, 0.14583333],
       [0.02702703, 0.24489796, 0.24489796, 0.53125   ],
       [0.02702703, 0.16326531, 0.16326531, 0.35416667],
       [0.05405405, 0.12244898, 0.12244898, 0.23958333],
       [0.02702703, 0.06122449, 0.06122449, 0.14583333],
       [0.02702703, 0.02040816, 0.02040816, 0.02083333],
       [0.02702703, 0.06122449, 0.06122449, 0.14583333],
       [0.05405405, 0.10204082, 0.10204082, 0.21875   ],
       [0.02702703, 0.10204082, 0.10204082, 0.27083333],
       [0.05405405, 0.02040816, 0.02040816, 0.02083333],
       [0.05405405, 0.02040816,

In [34]:
# Train a logistic-regression classifier (C=1) on the scaled training split.
clf = LogisticRegression(C=1)
clf.fit(X_train_scaled, y_train)

# Score the scaled test split and keep only column 1 of the class probabilities.
test_probabilities = clf.predict_proba(X_test_scaled)
scores = test_probabilities[:, 1]



In [35]:
# Inspect the per-row test scores (column 1 of predict_proba on the test split).
scores

array([0.41395224, 0.3427626 , 0.40398377, 0.35261903, 0.42588863,
       0.36692555, 0.38414149, 0.34403432, 0.38473454, 0.35972156,
       0.3359104 , 0.32585655, 0.32429121, 0.32585655, 0.32340434,
       0.32742578, 0.30603377, 0.30603377, 0.35049743, 0.32866905,
       0.31508574, 0.3060285 , 0.29636387, 0.30549127, 0.30440788,
       0.29743087, 0.29743087, 0.24797648, 0.24978798, 0.28449299,
       0.27293705, 0.278826  , 0.3120145 , 0.3120145 , 0.3120145 ,
       0.3120145 , 0.28546064, 0.27281737, 0.29594214, 0.25829312,
       0.27051368, 0.24552161, 0.26970126, 0.25212188, 0.23788366,
       0.28867233, 0.28867233, 0.28867233, 0.28867233, 0.28867233,
       0.26879058, 0.28867233, 0.28867233, 0.24562876, 0.33207617,
       0.23011576, 0.20702378, 0.23311004, 0.25692864, 0.26264354,
       0.21664931, 0.21664931, 0.27824455, 0.21178876, 0.22314524,
       0.23337731, 0.20191926, 0.20191926, 0.20191926, 0.21143881,
       0.21858254, 0.20488348, 0.18321036, 0.21572238, 0.19462

In [36]:
# Compute the ROC curve for the test scores, treating label 1 as the positive class.
fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=scores, pos_label=1)

In [37]:
# Inspect the false-positive rates returned by roc_curve.
fpr

array([0.        , 0.        , 0.02380952, 0.02380952, 0.03174603,
       0.03174603, 0.03174603, 0.03174603, 0.07142857, 0.07142857,
       0.11111111, 0.11904762, 0.11904762, 0.12698413, 0.12698413,
       0.13492063, 0.13492063, 0.16666667, 0.18253968, 0.20634921,
       0.20634921, 0.22222222, 0.24603175, 0.24603175, 0.31746032,
       0.36507937, 0.36507937, 0.37301587, 0.37301587, 0.38095238,
       0.38095238, 0.45238095, 0.45238095, 0.49206349, 0.51587302,
       0.53174603, 0.53174603, 0.56349206, 0.58730159, 0.6031746 ,
       0.61904762, 0.65079365, 0.67460317, 0.77777778, 0.80952381,
       0.86507937, 0.9047619 , 0.96031746, 0.97619048, 1.        ])

In [38]:
# Inspect the true-positive rates returned by roc_curve.
tpr

array([0.        , 0.03703704, 0.03703704, 0.07407407, 0.07407407,
       0.14814815, 0.22222222, 0.25925926, 0.25925926, 0.33333333,
       0.33333333, 0.37037037, 0.40740741, 0.40740741, 0.44444444,
       0.44444444, 0.48148148, 0.55555556, 0.55555556, 0.55555556,
       0.59259259, 0.59259259, 0.59259259, 0.62962963, 0.66666667,
       0.66666667, 0.7037037 , 0.7037037 , 0.74074074, 0.74074074,
       0.81481481, 0.81481481, 0.88888889, 0.88888889, 0.88888889,
       0.92592593, 0.96296296, 0.96296296, 0.96296296, 0.96296296,
       0.96296296, 0.96296296, 0.96296296, 0.96296296, 0.96296296,
       0.96296296, 1.        , 1.        , 1.        , 1.        ])

In [41]:
def get(preprocessing, c):
    """Train a logistic-regression model and return its ROC curve points.

    The features are rescaled with the requested scaler (fitted on the
    training split only), a ``LogisticRegression`` is trained on the scaled
    training split, and the ROC curve is computed from the predicted
    probabilities on the scaled test split.

    Relies on the notebook-level ``X_train``, ``X_test``, ``y_train`` and
    ``y_test`` variables; they must be defined before this is called.

    Parameters
    ----------
    preprocessing : str
        ``"standardization"`` selects ``StandardScaler``;
        ``"min-max-scaling"`` selects ``MinMaxScaler``.
    c
        Passed unchanged as the ``C`` argument of ``LogisticRegression``.

    Returns
    -------
    list of dict
        One ``{'tpr': ..., 'fpr': ..., 'threshold': ...}`` entry per point
        returned by ``roc_curve``, in the order ``roc_curve`` returns them.
    dict
        ``{'warning': ...}`` when ``preprocessing`` is not one of the two
        supported values; no model is trained in that case.
    """
    # Pick the scaler requested by the caller; reject anything else up front.
    if preprocessing == "standardization":
        scaler = StandardScaler()
    elif preprocessing == "min-max-scaling":
        scaler = MinMaxScaler()
    else:
        return {'warning': 'Please preprocess the data using either \'standardization\' or \'min-max-scaling\''}

    # Fit on the training split only, then reuse the fitted scaler for both
    # splits so the test data never influences the preprocessing.
    scaler.fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Fit the model on the training set.
    clf = LogisticRegression(C=c).fit(X_train_scaled, y_train)

    # Column 1 of predict_proba is used as the score for the test rows.
    scores = clf.predict_proba(X_test_scaled)[:, 1]

    # roc_curve returns three parallel arrays; pair them up point by point.
    fpr, tpr, thresholds = roc_curve(y_test, scores, pos_label=1)
    return [
        {'tpr': tp_rate, 'fpr': fp_rate, 'threshold': threshold}
        for fp_rate, tp_rate, threshold in zip(fpr, tpr, thresholds)
    ]

In [46]:
# Scratch check: str.isdigit() is False for a string with a leading minus sign,
# so it cannot be used on its own to recognise negative numeric input.
"-1".isdigit()

False