In [63]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
# import seaborn as sns
import plotly.express as px

In [64]:
# Load the dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("../scratch algo/dataset/breast-cancer.csv")

In [65]:
# Preview the first five rows to check the column names and value formats.
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [66]:
# Count of rows per diagnosis label, one bar colour per label.
px.histogram(
    df,
    x="diagnosis",
    color="diagnosis",
    color_discrete_sequence=["#05445E", "#75E6DA"],
)

In [67]:
# Remove the identifier column before computing correlations.
# Re-binding `df` (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second execution is a no-op rather than
# a KeyError on the already-missing 'id' column.
df = df.drop(columns='id', errors='ignore')

In [68]:
# Encode the target as integers: label 'M' -> 1, any other label -> 0.
# The dtype guard makes the cell idempotent: without it, re-running the cell
# would compare the already-encoded integers against 'M' and silently turn
# every label into 0.
if not pd.api.types.is_numeric_dtype(df['diagnosis']):
    df['diagnosis'] = (df['diagnosis'] == 'M').astype(int)

In [69]:
# Pairwise correlation matrix of all columns (pandas' default method).
corr = df.corr()

In [70]:
# Strength (sign ignored) of each column's correlation with the target.
corr_target = corr['diagnosis'].abs()

# Keep only the columns whose absolute correlation exceeds 0.2.
rfeatures = corr_target.loc[corr_target > 0.2]

# Selected feature names; the target itself is not a feature, so drop it.
names = rfeatures.index.tolist()
names.remove('diagnosis')
names

['radius_mean',
 'texture_mean',
 'perimeter_mean',
 'area_mean',
 'smoothness_mean',
 'compactness_mean',
 'concavity_mean',
 'concave points_mean',
 'symmetry_mean',
 'radius_se',
 'perimeter_se',
 'area_se',
 'compactness_se',
 'concavity_se',
 'concave points_se',
 'radius_worst',
 'texture_worst',
 'perimeter_worst',
 'area_worst',
 'smoothness_worst',
 'compactness_worst',
 'concavity_worst',
 'concave points_worst',
 'symmetry_worst',
 'fractal_dimension_worst']

In [71]:
# Feature matrix (selected columns only) and target vector as NumPy arrays.
x = df[names].to_numpy()
y = df['diagnosis'].to_numpy()

In [72]:
def scale(x):
    """Standardize ``x`` column-wise: subtract the mean, divide by the std.

    Statistics are computed along axis 0 (population std, ``ddof=0``).

    Args:
        x: array of values; for a 2-D array each column is scaled
           independently.

    Returns:
        Array of the same shape with zero mean along axis 0. Columns with
        non-zero spread have unit std; constant columns come back as zeros.
    """
    mean = np.mean(x, axis=0)
    std = np.std(x, axis=0)
    # A constant column has std == 0; dividing by it would fill the column
    # with NaN and poison every downstream computation. Its centered values
    # are already all zero, so dividing by 1 is the neutral choice.
    std = np.where(std == 0, 1.0, std)

    return (x - mean) / std

In [73]:
# Standardize every feature column.
# NOTE(review): the mean/std are computed over the full dataset, before the
# train/test split below, so the held-out rows influence the scaling.
x= scale(x)

In [74]:
def train_test_split(x, y, randon_state=42, test_size=0.2, shuffle=False):
    """Split features and labels into a training part and a test part.

    The first ``int(n * test_size)`` rows form the test set and the remaining
    rows form the training set. By default the rows are taken in their
    existing order, so an ordered dataset yields a biased split; pass
    ``shuffle=True`` to permute the rows first.

    Args:
        x: array with one sample per row (must expose ``.shape``).
        y: labels, one per row of ``x``.
        randon_state: seed for the permutation, used only when ``shuffle`` is
            True. (The misspelled name is kept so existing keyword callers
            keep working.)
        test_size: fraction of the rows assigned to the test set.
        shuffle: when True, rows of ``x`` and ``y`` are permuted together
            before splitting.

    Returns:
        Tuple ``(x_train, x_test, y_train, y_test)``.

    Raises:
        ValueError: if ``x`` and ``y`` do not have the same number of rows.
    """
    n = x.shape[0]
    if len(y) != n:
        raise ValueError(
            f"x and y must have the same number of rows, got {n} and {len(y)}"
        )
    test_split = int(n * test_size)

    if shuffle:
        # One shared permutation keeps every row aligned with its label.
        order = np.random.default_rng(randon_state).permutation(n)
        x = x[order]
        y = np.asarray(y)[order]

    return x[test_split:], x[:test_split], y[test_split:], y[:test_split]


In [75]:
# Hold out the default 20% of the rows as the test set.
x_train, x_test, y_train, y_test = train_test_split(x, y)

In [76]:
def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, evaluated without overflow.

    Args:
        x: scalar or array of real values.

    Returns:
        Value(s) in [0, 1] with the same shape as ``x`` (a NumPy scalar for
        scalar input).
    """
    x = np.asarray(x, dtype=float)
    # exp(-|x|) always lies in (0, 1], so it cannot overflow. The naive
    # exp(-x) overflows (RuntimeWarning) once x drops below roughly -709.
    e = np.exp(-np.abs(x))
    sg = np.where(x >= 0, 1.0 / (1.0 + e), e / (1.0 + e))
    # Unwrap 0-d results so scalar input still yields a scalar.
    return sg[()] if sg.ndim == 0 else sg


In [87]:
class LogisticRegression:
    """Binary logistic regression trained with full-batch gradient descent.

    Attributes set by ``fit``:
        X, y:   training inputs and (flattened) labels used by the helpers.
        W, b:   weight vector (one entry per feature) and scalar bias.
        dw, db: most recent gradients of the cost w.r.t. ``W`` and ``b``.
        costs:  cost value recorded at every iteration.
    """

    def __init__(self, lr=0.0001):
        """Store the gradient-descent step size ``lr``."""
        self.lr = lr

    def initialize_parameters(self):
        """Reset the weights to zeros (one per column of ``self.X``) and the bias to 0."""
        self.W = np.zeros(self.X.shape[1])
        self.b = 0.0

    def sigmoid(self, z):
        """Logistic function ``1 / (1 + exp(-z))``, evaluated without overflow."""
        z = np.asarray(z, dtype=float)
        # exp(-|z|) lies in (0, 1], so it cannot overflow, whereas the naive
        # exp(-z) overflows for large negative z.
        e = np.exp(-np.abs(z))
        return np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))

    def forward(self, x):
        """Return the predicted probability of class 1 for each row of ``x``."""
        z = np.matmul(x, self.W) + self.b
        return self.sigmoid(z)

    def compute_cost(self, predictions):
        """Return the mean binary cross-entropy of ``predictions`` against ``self.y``.

        A small constant (1e-8) is added inside each logarithm so that
        predictions of exactly 0 or 1 do not produce ``log(0)``.
        """
        m = int(self.X.shape[0])
        cost = np.sum((-np.log(predictions + 1e-8) * self.y) + (-np.log(1 - predictions + 1e-8)) * (
                1 - self.y))
        return cost / m

    def compute_gradient(self, predictions):
        """Store the gradients of the mean cost in ``self.dw`` and ``self.db``."""
        m = self.X.shape[0]
        error = predictions - self.y
        # X.T @ error is already one value per feature, so no further
        # per-feature reduction is needed.
        self.dw = np.matmul(self.X.T, error) / m
        self.db = np.sum(error) / m

    def accuracy(self, predictions, y):
        """Return the fraction of rounded ``predictions`` that equal ``y``."""
        ap = np.round(predictions)
        return np.mean(ap == y)

    def fit(self, X, y, iterations, print_every=1000):
        """Train the model with ``iterations`` steps of gradient descent.

        Args:
            X: 2-D array of shape (n_samples, n_features).
            y: labels (0/1), one per row of ``X``. A column vector is
               flattened so it cannot broadcast against the predictions.
            iterations: number of gradient-descent updates.
            print_every: print the cost every this many iterations; pass
                ``None`` or ``0`` to train silently.

        Raises:
            ValueError: if ``X`` is not 2-D, is empty, or its row count
                differs from the number of labels.
        """
        self.X = np.asarray(X)
        # Labels of shape (m, 1) would broadcast against the (m,) predictions
        # into an (m, m) matrix and silently give wrong gradients.
        self.y = np.asarray(y).ravel()
        if self.X.ndim != 2 or self.X.shape[0] == 0:
            raise ValueError("X must be a non-empty 2-D array")
        if self.X.shape[0] != self.y.shape[0]:
            raise ValueError("X and y must contain the same number of samples")

        self.initialize_parameters()
        self.costs = []
        for i in range(iterations):
            pred = self.forward(self.X)
            cost = self.compute_cost(pred)
            self.costs.append(cost)
            self.compute_gradient(pred)

            self.W = self.W - self.lr * self.dw
            self.b = self.b - self.lr * self.db

            if print_every and i % print_every == 0:
                print(cost)

    def predict(self, x):
        """Return the predicted probabilities for ``x`` rounded to 0.0 or 1.0."""
        pred = self.forward(x)
        return np.round(pred)

In [89]:
# Train with the default learning rate for 100,000 gradient-descent steps;
# fit() prints the cost every 1000 iterations.
lg = LogisticRegression()
lg.fit(x_train, y_train, iterations=100000)

0.6931471605599454
0.544278048676167
0.4566162960281572
0.3992215914602175
0.3585635754070937
0.3280966280520068
0.30429946344446673
0.28511361961704784
0.2692556024871911
0.2558831639057658
0.24442023952769604
0.23445910832551464
0.22570278108665845
0.21792956283417883
0.2109704315934799
0.20469413091125824
0.1989970700350751
0.1937963131251678
0.18902460710268712
0.18462678723797638
0.18055713378240326
0.17677739772097803
0.1732553054654756
0.16996341176012306
0.1668782093885095
0.16397943075568092
0.16124949456502186
0.15867306343589319
0.1562366872204407
0.15392851315270517
0.15173804857905773
0.1496559653997297
0.1476739378538431
0.1457845071515194
0.143980967868821
0.1422572720964494
0.1406079481585038
0.13902803135617373
0.1375130046889236
0.13605874789631076
0.1346614934721187
0.13331778854774104
0.13202446173784227
0.1307785941989811
0.12957749427931298
0.12841867524100214
0.1272998356214624
0.1262188418688153
0.12517371294399782
0.12416260662911462
0.12318380732078052
0.12223

In [90]:
# Hard 0/1 predictions for the held-out rows.
pred = lg.predict(x_test)

In [91]:
# Fraction of held-out rows whose predicted label matches the true label.
lg.accuracy(pred, y_test)

0.9557522123893806