In [267]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import math
import matplotlib.pyplot as plt

In [259]:
def generateXvector(X):
    """ Taking the original independent variables matrix and add a row of 1 which corresponds to x_0
    x -> feature
        Parameters:
          X:  independent variables matrix
        Return value: the matrix that contains all the values in the dataset, not include the outcomes variables.
        shape[0] rows
    """
    vectorX = np.c_[np.ones((X.shape[0], 1)), X]
    return vectorX

In [263]:
def weights_intilization(X):
    """ Generate an initial value of vector θ weights from the original independent variables matrix
         Parameters:
          X:  independent variables matrix
        Return value: a vector of theta filled with initial guess

    """
    theta = np.random.randn(X.shape[1]+1, 1)
    return theta

In [264]:
def sigmoid_function(X):
    """ Calculate the sigmoid value of the inputs
         Parameters:
          X:  values
        Return value: the sigmoid value
        make output 0,1 binary classification
    """
    return 1/(1+math.e**(-X))

In [284]:
def Logistics_Regression(X , y, learningrate, iterations):
    """ Find the Logistics regression model for the data set
         Parameters:
          X: independent variables matrix
          y: dependent variables matrix
          learningrate: learningrate of Gradient Descent
          iterations: the number of iterations
        Return value: the final theta vector and the plot of cost function
    """
    y_new = np.reshape(y, (len(y), 1))   
    cost_lst = []
    vectorX = generateXvector(X)
    theta = weights_intilization(X)
    m = len(X)
    # training
    for i in range(iterations):
        gradients = 2/m * vectorX.T.dot(sigmoid_function(vectorX.dot(theta)) - y_new)
        theta = theta - learningrate * gradients
        y_pred = sigmoid_function(vectorX.dot(theta))
        cost_value = - np.sum(np.dot(y_new.T,np.log(y_pred)+ np.dot((1-y_new).T,np.log(1-y_pred)))) /(len(y_pred))
        print(cost_value)
 #Calculate the loss for each training instance
        cost_lst.append(cost_value)
    return theta

In [228]:
cols=['Oldpeak','RestBP','Chol','Thal','AHD']

In [229]:
df=pd.read_csv('./heart.csv',usecols=cols)

In [230]:
df

Unnamed: 0,RestBP,Chol,Oldpeak,Thal,AHD
0,145,233,2.3,fixed,No
1,160,286,1.5,normal,Yes
2,120,229,2.6,reversable,Yes
3,130,250,3.5,normal,No
4,130,204,1.4,normal,No
...,...,...,...,...,...
298,110,264,1.2,reversable,Yes
299,144,193,3.4,reversable,Yes
300,130,131,1.2,reversable,Yes
301,130,236,0.0,normal,Yes


In [231]:
df.dropna(inplace=True) # drop nulls

### Encoding Categorical Variables

In [232]:
df.AHD.replace({'Yes':1,'No':0},inplace=True)

In [233]:
df.head()

Unnamed: 0,RestBP,Chol,Oldpeak,Thal,AHD
0,145,233,2.3,fixed,0
1,160,286,1.5,normal,1
2,120,229,2.6,reversable,1
3,130,250,3.5,normal,0
4,130,204,1.4,normal,0


In [234]:
ordinal_mapping={'fixed': 0, 'normal': 1, 'reversable': 2}

In [235]:
df['Thal'].replace(ordinal_mapping,inplace=True)

In [236]:
df['Thal']

0      0
1      1
2      2
3      1
4      1
      ..
298    2
299    2
300    2
301    1
302    1
Name: Thal, Length: 301, dtype: int64

In [237]:
df.head()

Unnamed: 0,RestBP,Chol,Oldpeak,Thal,AHD
0,145,233,2.3,0,0
1,160,286,1.5,1,1
2,120,229,2.6,2,1
3,130,250,3.5,1,0
4,130,204,1.4,1,0


In [238]:
X=df.drop('AHD',axis=1)

In [239]:
X

Unnamed: 0,RestBP,Chol,Oldpeak,Thal
0,145,233,2.3,0
1,160,286,1.5,1
2,120,229,2.6,2
3,130,250,3.5,1
4,130,204,1.4,1
...,...,...,...,...
298,110,264,1.2,2
299,144,193,3.4,2
300,130,131,1.2,2
301,130,236,0.0,1


In [240]:
y=np.array(df['AHD']).astype(int)

In [241]:
y

array([0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0,
       1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0]

In [242]:
X_train,X_test,y_train,y_test=train_test_split(X, y, test_size = 0.2, random_state = 0)

### LogisticsRegression function from SkLearn

In [243]:
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

LogisticRegression()

In [244]:
predictions=classifier.predict(X_test)

In [245]:
accuracy_score(y_test,predictions)

0.7868852459016393

### LogisticsRegression from scratch

In [285]:
Logistics_Regression(X_train,y_train, 0.01, 1000)

  return 1/(1+math.e**(-X))
  cost_value = - np.sum(np.dot(y_new.T,np.log(y_pred)+ np.dot((1-y_new).T,np.log(1-y_pred)))) /(len(y_pred))


nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
22.75218077158657
nan
80.93369119097783
nan
121.8231627673667
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
33.02981046365177
nan
98.40124930916299
nan
139.29072088555185
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
12.99482256731502
nan
66.96023904939285
nan
107.84971062578173
nan
148.7391822021706
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
424.53164152171865
126.97488162187116
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
16.88692229457008
nan
57.77664367338087
nan
98.66611524976975
nan
139.5555868261586
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
23.890653166152223
nan
64.78012480981177
nan
105.66959638620065
nan
146.5590679625895
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
578.7157573712982
nan
nan
nan
nan
nan
13.714316298728537
nan
54.61126754692849
nan
95.50073912331736
nan
136.39021069970622
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
26.30224985715058
nan
67.19172143461842
nan
108.081193

array([[-2.2174378 ],
       [ 0.27671854],
       [ 0.42452141],
       [ 4.10453527],
       [ 0.60901947]])