In [24]:
import numpy as np
import pandas as pd
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

from IPython.display import Image
init_notebook_mode(connected=True)

In [14]:
breast_cancer = load_breast_cancer()

In [15]:
column_names = np.append(breast_cancer.feature_names, 'target')
breast_cancer_df = pd.DataFrame(data= np.c_[breast_cancer['data'], breast_cancer['target']],
                     columns=column_names)

In [17]:
len(breast_cancer.feature_names) + 1

31

In [7]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

In [172]:
N = 5
x_list = np.linspace(-N, N)
y_list = np.array([sigmoid(x) for x in x_list])

trace = go.Scatter(
    x = x_list,
    y = y_list
)

data = [trace]

layout = go.Layout(
    title='Logistic Function (Sigmoid)',
    xaxis=dict(
        title='X',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    ),
    yaxis=dict(
        title='Y',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    )
)

config={'showLink': False}
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='sigmoid', config=config)

In [137]:
def cost_function(X, y, theta, deriv=False):
    hypothesis = sigmoid(X.dot(theta))
    error = hypothesis - y
    
    if deriv:
        gradient = (1/m) * X.T.dot(error)
        return gradient, error
    else:
        J = -(y * np.log(hypothesis) + (1-y) * np.log(1-hypothesis)).mean()
        return J

In [138]:
def gradient_descent(X, y, alpha, epochs, batch_size, theta):
    cost_list = []
    theta_list = []
    
    for epoch in range(epochs):
        cost = cost_function(X, y, theta)
        cost_list.append(cost)
        theta_list.append(theta)
        
        gradient, error = cost_function(X, y, theta, deriv=True)
        theta = theta - alpha * gradient
        
        if epoch % 1000 == 0:
            print(f"Cost: {cost}")

    return cost_list, theta_list

In [152]:
X = breast_cancer.data
ones = np.ones((breast_cancer.data.shape[0], 1))
X = np.hstack((ones, X))
y = breast_cancer.target.reshape(-1, 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
m = X_train.shape[0]

In [165]:
epochs = 10000
alpha = 0.00001
theta = np.zeros(len(breast_cancer.feature_names) + 1).reshape(-1,1)
batch_size = X_train.shape[0] # batch gradient descent
theta.shape

(31, 1)

In [166]:
cost_list, theta_list = gradient_descent(X_train, y_train, 
                                         alpha, epochs, batch_size, theta)

Cost: 0.6931471805599453
Cost: 0.2698996372684122
Cost: 0.24453280399268248
Cost: 0.2350154047195336
Cost: 0.23000488467947905
Cost: 0.22678614314300202
Cost: 0.22441799084103636
Cost: 0.22250839718139293
Cost: 0.22087362812033262
Cost: 0.21941984452930963


In [167]:
final_theta = theta_list[-1]
print(final_theta)

[[ 1.92703781e-03]
 [ 1.35554847e-02]
 [ 3.60820302e-03]
 [ 6.97526196e-02]
 [ 1.50007303e-02]
 [ 7.88937131e-05]
 [-3.17891121e-04]
 [-5.87657683e-04]
 [-2.38962583e-04]
 [ 1.48930045e-04]
 [ 8.03135029e-05]
 [ 1.43977665e-04]
 [ 7.23420211e-04]
 [-8.92767882e-04]
 [-2.03585319e-02]
 [ 2.00443469e-06]
 [-8.34870861e-05]
 [-1.11410046e-04]
 [-2.51145348e-05]
 [ 5.29275936e-06]
 [-3.60122336e-06]
 [ 1.41606597e-02]
 [-1.00640685e-03]
 [ 6.08403155e-02]
 [-2.69886811e-02]
 [ 6.22253222e-05]
 [-1.19446183e-03]
 [-1.63175322e-03]
 [-4.17573006e-04]
 [ 2.96157208e-05]
 [ 3.20680483e-06]]
