**Run the following two cells before you begin.**

In [1]:
%autosave 10

In [2]:
import pandas as pd
import numpy as np

______________________________________________________________________
**First, import your data set and define the sigmoid function.**
<details>
    <summary>Hint:</summary>
    The definition of the sigmoid is $f(x) = \frac{1}{1 + e^{-x}}$.
</details>

In [3]:
# Read the cleaned data set from the CSV file in the working directory
data_file = 'cleaned_data.csv'
df = pd.read_csv(data_file)

In [4]:
# Preview the first five rows to check that the file loaded with the expected columns
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,...,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month,EDUCATION_CAT,graduate school,high school,others,university
0,798fc410-45c1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,1,university,0,0,0,1
1,8a8c8f3b-8eb4,120000,2,2,2,26,-1,2,0,0,...,1000,1000,0,2000,1,university,0,0,0,1
2,85698822-43f5,90000,2,2,2,34,0,0,0,0,...,1000,1000,1000,5000,0,university,0,0,0,1
3,0737c11b-be42,50000,2,2,1,37,0,0,0,0,...,1200,1100,1069,1000,0,university,0,0,0,1
4,3b7f77cc-dbc0,50000,1,2,1,57,-1,0,-1,0,...,10000,9000,689,679,0,university,0,0,0,1


In [5]:
# Define the sigmoid function
def sig_func(x):
    """Evaluate the logistic sigmoid 1 / (1 + exp(-x)) element-wise.

    Parameters
    ----------
    x : scalar or numpy.ndarray
        Input value(s); in this notebook, the linear combination of the
        intercept and features.

    Returns
    -------
    Sigmoid of ``x`` with the same shape as ``x``; every value lies in [0, 1].
    """
    # sigmoid(x) == exp(-log(1 + exp(-x))). np.logaddexp(0, -x) evaluates
    # log(exp(0) + exp(-x)) without ever forming exp(-x) on its own, so very
    # negative inputs no longer overflow the way 1/(1+np.exp(-x)) does.
    s = np.exp(-np.logaddexp(0, -x))
    return s

**Now, create a train/test split (80/20) with `PAY_1` and `LIMIT_BAL` as features and `default payment next month` as the response variable. Use a random state of 24.**

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# Select the two feature columns and the response column ahead of the split
feature_cols = ['PAY_1', 'LIMIT_BAL']
response_col = 'default payment next month'

X = df[feature_cols]
Y = df[response_col]

(X, Y)

(       PAY_1  LIMIT_BAL
 0          2      20000
 1         -1     120000
 2          0      90000
 3          0      50000
 4         -1      50000
 ...      ...        ...
 26659      0     220000
 26660     -1     150000
 26661      4      30000
 26662      1      80000
 26663      0      50000
 
 [26664 rows x 2 columns],
 0        1
 1        1
 2        0
 3        0
 4        0
         ..
 26659    0
 26660    0
 26661    1
 26662    1
 26663    1
 Name: default payment next month, Length: 26664, dtype: int64)

In [8]:
# 80/20 split with a fixed seed so the partition is reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=24,
)
X_test.shape

(5333, 2)

______________________________________________________________________
**Next, import `LogisticRegression` and instantiate it with the default options, but set the solver to `'liblinear'`.**

In [9]:
# Build the classifier with default options, overriding only the solver ('liblinear')
from sklearn.linear_model import LogisticRegression
log_model = LogisticRegression(solver='liblinear')

______________________________________________________________________
**Now, train on the training data and obtain predicted classes, as well as class probabilities, using the testing data.**

In [10]:
# Fit the logistic regression model on training data
# fit() trains log_model in place and returns the estimator, which is why the
# cell output echoes the model's repr.
log_model.fit(X_train,Y_train)

LogisticRegression(solver='liblinear')

In [11]:
# Make predictions using `.predict()`
# NOTE: despite the `x_` prefix, x_test_pred holds the predicted class labels
# (0 or 1) for the rows of X_test, not feature values.
x_test_pred = log_model.predict(X_test)

In [12]:
# Show the predicted class labels for the test set
x_test_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [13]:
# Find class probabilities using `.predict_proba()`
# One row per test sample and one column per class; columns follow the order
# of log_model.classes_ (0 then 1 for this binary response), so column 1 is
# the probability of default.
x_test_prob = log_model.predict_proba(X_test)

In [14]:
# Show the per-class probabilities; each row sums to 1
x_test_prob

array([[0.74826924, 0.25173076],
       [0.584297  , 0.415703  ],
       [0.79604453, 0.20395547],
       ...,
       [0.584297  , 0.415703  ],
       [0.82721498, 0.17278502],
       [0.66393435, 0.33606565]])

______________________________________________________________________
**Then, pull out the coefficients and intercept from the trained model and manually calculate predicted probabilities. You'll need to add a column of 1s to your features, to multiply by the intercept.**

In [15]:
# Add column of 1s to features
# Size the column from X_test itself instead of a hard-coded row count, so the
# cell stays correct if the data set or the split fraction changes.
ones_col = np.ones((X_test.shape[0], 1))
ones_col,ones_col.shape


(array([[1.],
        [1.],
        [1.],
        ...,
        [1.],
        [1.],
        [1.]]),
 (5333, 1))

In [16]:
# Prepend the ones column so every row reads [1, PAY_1, LIMIT_BAL]; the leading
# 1 is what gets multiplied by the intercept. np.concatenate turns the X_test
# DataFrame into a plain float array.
new_feats = np.concatenate((ones_col,X_test),axis=1)
new_feats

array([[ 1.0e+00,  2.0e+00,  1.6e+05],
       [ 1.0e+00,  1.0e+00,  5.0e+04],
       [ 1.0e+00, -1.0e+00,  2.0e+05],
       ...,
       [ 1.0e+00, -1.0e+00,  5.0e+04],
       [ 1.0e+00,  1.0e+00,  2.3e+05],
       [ 1.0e+00,  2.0e+00,  1.0e+05]])

In [17]:
# Pull the fitted coefficients (one row, one entry per feature) and the
# intercept out of the trained model
log_model_coeff, log_model_intercept = log_model.coef_, log_model.intercept_
(log_model_coeff, log_model_intercept)

(array([[ 8.27451187e-11, -6.80876727e-06]]), array([-6.57647457e-11]))

In [18]:
# Lay out [intercept, coef_PAY_1, coef_LIMIT_BAL] as a single row, then turn it
# into a column so it can be multiplied against the feature matrix
weights_row = np.concatenate((log_model_intercept.reshape(1, 1), log_model_coeff), axis=1)
const_features = weights_row.T
const_features.shape

(3, 1)

In [19]:
# Confirm the feature matrix has three columns, matching the three rows of const_features
new_feats.shape

(5333, 3)

In [20]:
# NOTE: despite its name, prob_array is not a probability yet. It is the linear
# combination intercept + coefficients * features (the log-odds) for each test
# row; the sigmoid still has to be applied to turn it into a probability.
prob_array = new_feats @ const_features
prob_array

array([[-1.08940276],
       [-0.34043836],
       [-1.36175345],
       ...,
       [-0.34043836],
       [-1.56601647],
       [-0.68087673]])

In [21]:
# Pass the linear combination through the sigmoid to obtain the manually
# calculated probability of the positive class for each test row
sigm_prob = sig_func(prob_array)

In [22]:
# Show the manual probabilities; they should match column 1 of x_test_prob
sigm_prob

array([[0.25173076],
       [0.415703  ],
       [0.20395547],
       ...,
       [0.415703  ],
       [0.17278502],
       [0.33606565]])

______________________________________________________________________
**Next, using a threshold of `0.5`, manually calculate predicted classes. Compare this to the class predictions output by scikit-learn.**

In [23]:
# Manually calculate predicted classes
# Boolean column vector of shape (n, 1): True where the manual probability
# reaches the 0.5 threshold.
manual_pred = sigm_prob>=0.5
manual_pred.shape


(5333, 1)

In [24]:
# Compare to scikit-learn's predicted classes
# scikit-learn's predictions are 1-D, shape (n,), whereas manual_pred is 2-D,
# shape (n, 1); the shapes must be reconciled before an element-wise comparison.
x_test_pred.shape

(5333,)

In [25]:
# manual_pred has shape (n, 1) while x_test_pred has shape (n,). Comparing them
# directly with == broadcasts to an (n, n) matrix that checks every manual
# prediction against every scikit-learn prediction, which is only all-True when
# every prediction is the same class. Flatten first so row i is compared only
# with row i.
np.array_equal(manual_pred.ravel(), x_test_pred)

True

______________________________________________________________________
**Finally, calculate ROC AUC using both scikit-learn's predicted probabilities, and your manually predicted probabilities, and compare.**

In [33]:
# ROC AUC from scikit-learn's predicted probabilities.
# NOTE: despite its name, pred_prob_sklearn stores the AUC score, not probabilities.
from sklearn import metrics

# Column 1 of predict_proba's output is the positive-class probability
positive_class_prob = x_test_prob[:, 1]
pred_prob_sklearn = metrics.roc_auc_score(Y_test, positive_class_prob)
pred_prob_sklearn



0.627207450280691

In [36]:
# Use manually calculated predicted probabilities to calculate ROC AUC
# Score with sigm_prob (the sigmoid output), not prob_array: prob_array holds
# the raw linear combination (log-odds), which only yields the same AUC because
# the sigmoid preserves ranking. ravel() turns the (n, 1) column into the 1-D
# score vector that roc_auc_score expects.
# NOTE: despite its name, pred_prob_manual stores the AUC score, not probabilities.
pred_prob_manual  = metrics.roc_auc_score(Y_test, sigm_prob.ravel())
pred_prob_manual

0.627207450280691