In [62]:
import numpy as np
import sklearn as skl
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
import pandas as pd
from numpy import linalg
from scipy.special import expit

In [3]:
# Load scikit-learn's bundled breast-cancer dataset into a DataFrame.
# The label is appended as the final column because SGD() below reads the
# last column of each frame as the target.
data = datasets.load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names).assign(target=data.target)

In [58]:
def SGD(data,validate,epsilon=0.1,target=None,batch_size=None,alpha=0.005,max_iter=1000000):
    """Fit logistic-regression weights with mini-batch stochastic gradient descent.

    Both frames must hold the feature columns first and the target in the
    LAST column (presumably 0/1 labels, since predictions come from a sigmoid).

    Parameters
    ----------
    data : pandas.DataFrame
        Training rows. A fresh mini-batch is drawn from it on every iteration.
    validate : pandas.DataFrame
        Held-out rows used only for the early-stopping check.
    epsilon : float
        Training stops once the validation RMSE falls below this value.
        The check runs every 1000 iterations (including iteration 0).
    target : unused
        Ignored; kept only so existing callers keep working.
    batch_size : int or None
        Rows per mini-batch. Defaults to 10% of ``data`` (at least one row
        whenever ``data`` is non-empty).
    alpha : float
        Learning rate.
    max_iter : int
        Maximum number of gradient steps.

    Returns
    -------
    w : numpy.ndarray
        Weight vector of length ``n_features + 1``; ``w[0]`` is the bias.
    error : float
        Validation RMSE of the returned ``w``.
    i : int
        Index of the last iteration executed (0 if no iteration ran).
    """
    def _add_bias(frame):
        # Prepend a column of ones so that w[0] acts as the bias term.
        features = frame.iloc[:, :-1].values
        return np.column_stack([np.ones(features.shape[0]), features])

    def _validation_rmse(weights):
        return np.sqrt(((expit(x_v.dot(weights)) - t_v) ** 2).mean())

    if batch_size is None:
        # 10% of the population, but never 0 rows for a small non-empty frame:
        # an empty batch has a zero gradient and the weights would never move.
        batch_size = min(data.shape[0], max(1, int(0.1 * data.shape[0])))

    # The validation design matrix does not change, so build it once.
    x_v = _add_bias(validate)
    t_v = validate.iloc[:, -1].values

    # Initial weights and bias drawn from a standard normal.
    w = np.random.normal(0, 1, data.shape[1])

    i = 0  # keeps the return value defined when max_iter is 0
    for i in range(int(max_iter)):
        sample = data.sample(n=batch_size)
        x = _add_bias(sample)
        t = sample.iloc[:, -1].values
        y = expit(x.dot(w))
        # Gradient of the cross-entropy loss: sum_n (y_n - t_n) * x_n.
        g = x.T.dot(y - t)
        w = w - alpha * g
        # Early stopping is evaluated only every 1000 steps to keep it cheap.
        if i % 1000 == 0 and _validation_rmse(w) < epsilon:
            break

    # Recompute so the reported error always describes the returned weights,
    # not the weights from the last periodic check.
    error = _validation_rmse(w)
    return w, error, i

In [31]:
# Rescale every column of df with MinMaxScaler (default range [0, 1]) and
# rebuild a DataFrame with the original column names.
# NOTE(review): the scaler is fit on all rows before the train/validate/test
# split, so test-set statistics leak into the features — consider fitting on
# the training rows only.
scaler = MinMaxScaler().fit(df)
scaled_data = scaler.transform(df)
scaled_df = pd.DataFrame(data=scaled_data, columns=df.columns)

In [76]:
# Preview the first five rows of the rescaled frame.
scaled_df.head(n=5)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,0.521037,0.022658,0.545989,0.363733,0.593753,0.792037,0.70314,0.731113,0.686364,0.605518,...,0.141525,0.66831,0.450698,0.601136,0.619292,0.56861,0.912027,0.598462,0.418864,0.0
1,0.643144,0.272574,0.615783,0.501591,0.28988,0.181768,0.203608,0.348757,0.379798,0.141323,...,0.303571,0.539818,0.435214,0.347553,0.154563,0.192971,0.639175,0.23359,0.222878,0.0
2,0.601496,0.39026,0.595743,0.449417,0.514309,0.431017,0.462512,0.635686,0.509596,0.211247,...,0.360075,0.508442,0.374508,0.48359,0.385375,0.359744,0.835052,0.403706,0.213433,0.0
3,0.21009,0.360839,0.233501,0.102906,0.811321,0.811361,0.565604,0.522863,0.776263,1.0,...,0.385928,0.241347,0.094008,0.915472,0.814012,0.548642,0.88488,1.0,0.773711,0.0
4,0.629893,0.156578,0.630986,0.48929,0.430351,0.347893,0.463918,0.51839,0.378283,0.186816,...,0.123934,0.506948,0.341575,0.437364,0.172415,0.319489,0.558419,0.1575,0.142595,0.0


In [38]:
# Shuffle once with a fixed seed so the 60/20/20 split is reproducible under
# Restart & Run All, then slice by position. Plain .iloc slicing replaces
# np.split, which routes DataFrames through the deprecated
# DataFrame.swapaxes.
SPLIT_SEED = 42
shuffled_df = scaled_df.sample(frac=1, random_state=SPLIT_SEED)
train_end = int(.6 * len(shuffled_df))
validate_end = int(.8 * len(shuffled_df))
train = shuffled_df.iloc[:train_end]
validate = shuffled_df.iloc[train_end:validate_end]
test = shuffled_df.iloc[validate_end:]

In [61]:
# Fit on the training split (early stopping on the validation split); only
# the weight vector from SGD's return value is needed here.
fitted_w = SGD(train, validate)[0]

# Build the test design matrix the same way SGD does: a leading column of
# ones for the bias, features next, target taken from the last column.
test_target = test.iloc[:, -1].values
test_features = np.insert(test.iloc[:, :-1].values, 0, np.ones([test.shape[0],]), axis=1)

# Sigmoid of the linear score gives the predicted probability per test row.
predicted_target = expit(test_features.dot(fitted_w))

In [75]:
# Round the predicted probabilities to hard labels, then report per-class
# precision / recall / F1 on the held-out test split.
predicted_labels = np.round(predicted_target)
print(classification_report(test_target, predicted_labels))

              precision    recall  f1-score   support

         0.0       0.93      0.88      0.91        49
         1.0       0.91      0.95      0.93        65

    accuracy                           0.92       114
   macro avg       0.92      0.92      0.92       114
weighted avg       0.92      0.92      0.92       114

