In [19]:
import numpy as np
import matplotlib as plt
import pickle

# 1 - Creating Logistic Regression Class

In [20]:
class LogisticRegression:
    def __init__(self, learning_rate=0.01) -> None:
        self.w = None
        self.b = None

        self.learning_rate = learning_rate

        # epsilon will be use for handling log(0)
        self.epsilon = 1e-15

        # creating for visualization
        self.cost_track = []

    def sigmoid(self, a):
        return 1 / (1 + np.exp(-a))

    def feed_forward(self, x):
        return self.sigmoid(
            np.dot(x, self.w) + self.b
        )

    def binary_crossentropy(self, y, y_pred): # also named as log loss 
        # cliping y_pred because of preventing log(0) error
        y_pred = np.clip(y_pred, self.epsilon, 1 - 1e-3)
        return - (1 / y.shape[0]) * np.sum((y * np.log(y_pred)) + ((1 - y) * np.log(1 - y_pred)))

    def back_propagation(self, x, y, y_pred):
        cost = self.binary_crossentropy(y, y_pred)
        dw = (1 / x.shape[0]) * np.dot(x.T, (y_pred - y))
        db = (1 / x.shape[0]) * np.sum(y_pred - y)

        # updating weights and bias
        self.w = self.w - self.learning_rate * dw
        self.b = self.b - self.learning_rate * db

        return cost

    def initialize_paramters(self, feature_count):
        # initialize weights and bias
        self.w = np.zeros((feature_count, 1))
        self.b = np.zeros((1, 1))

    def calculate_accuracy(self, y, y_pred):
        if y.shape != (y.shape[0], 1):
            y = y.reshape(-1, 1)
        return  ((y_pred > 0.5) == y).sum() / y.shape[0]

    def fit(self, x_train, y_train, epoch, learning_rate=0.001, print_cost=True):
        # getting shapes of data
        (x_length, feature_count) = x_train.shape[0], x_train.shape[1]
        y_train = y_train.reshape(-1, 1)

        # initialize weight and bias for input data
        self.initialize_paramters(feature_count)
        
        for e in range(1, epoch + 1):
            # predict train data
            y_pred = self.feed_forward(x_train)
            # calculate cost and update weights with back propagation
            cost = self.back_propagation(x_train, y_train, y_pred)

            if e % 100 == 0:
                print(f"Epoch: {e}, train_loss: {np.squeeze(cost)}, train_accuracy: {self.calculate_accuracy(y_train, y_pred)}")
                self.cost_track.setdefault(e, cost)

    def predict(self, x):
        return self.feed_forward(x)

    def save(self, file_path):
        with open(file_path, "wb") as f:
            pickle.dump({"w": self.w, "b": self.b}, f, protocol=pickle.HIGHEST_PROTOCOL)

    def load(self, file_path):
        with open(file_path, "rb") as f:
            temp = pickle.load(f)

        self.w = temp["w"]
        self.b = temp["b"]
        del temp

# 2 - Load Heart Disease Dataset

In [21]:
import pandas as pd

df = pd.read_csv("framingham.csv").dropna()

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3656 entries, 0 to 4237
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   male             3656 non-null   int64  
 1   age              3656 non-null   int64  
 2   education        3656 non-null   float64
 3   currentSmoker    3656 non-null   int64  
 4   cigsPerDay       3656 non-null   float64
 5   BPMeds           3656 non-null   float64
 6   prevalentStroke  3656 non-null   int64  
 7   prevalentHyp     3656 non-null   int64  
 8   diabetes         3656 non-null   int64  
 9   totChol          3656 non-null   float64
 10  sysBP            3656 non-null   float64
 11  diaBP            3656 non-null   float64
 12  BMI              3656 non-null   float64
 13  heartRate        3656 non-null   float64
 14  glucose          3656 non-null   float64
 15  TenYearCHD       3656 non-null   int64  
dtypes: float64(9), int64(7)
memory usage: 485.6 KB


In [22]:
df.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [23]:
df.tail()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
4231,1,58,3.0,0,0.0,0.0,0,1,0,187.0,141.0,81.0,24.96,80.0,81.0,0
4232,1,68,1.0,0,0.0,0.0,0,1,0,176.0,168.0,97.0,23.14,60.0,79.0,1
4233,1,50,1.0,1,1.0,0.0,0,1,0,313.0,179.0,92.0,25.97,66.0,86.0,1
4234,1,51,3.0,1,43.0,0.0,0,0,0,207.0,126.5,80.0,19.71,65.0,68.0,0
4237,0,52,2.0,0,0.0,0.0,0,0,0,269.0,133.5,83.0,21.47,80.0,107.0,0


In [24]:
x = df.drop("TenYearCHD", axis=1).values
y = df["TenYearCHD"].values

In [25]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

In [26]:
#creating model object

lrm_hearth = LogisticRegression()
lrm_hearth.fit(x_train=x_train, y_train=y_train, epoch=500, learning_rate=0.0015)

Epoch: 100, train_loss: 5.878369613728269, train_accuracy: 0.1490404246631278
Epoch: 200, train_loss: 5.147673901242303, train_accuracy: 0.8509595753368722
Epoch: 300, train_loss: 5.136523866826367, train_accuracy: 0.8509595753368722
Epoch: 400, train_loss: 5.147673901242303, train_accuracy: 0.8509595753368722
Epoch: 500, train_loss: 5.147673901242303, train_accuracy: 0.8509595753368722


  return 1 / (1 + np.exp(-a))


In [27]:
y_test_pred = lrm_hearth.predict(x_test)
accuracy = lrm_hearth.calculate_accuracy(y_test, y_test_pred)
print(f"Test Set Accuracy {accuracy}")

Test Set Accuracy 0.8409279204639603


  return 1 / (1 + np.exp(-a))


In [28]:
lrm_hearth.save("./hearth_model_weights.pckl")
# lrm_hearth.load("./test.pckl")

# 3 - Diabetics Dataset

In [29]:
diabate_df = pd.read_csv("diabetes2.csv")

diabate_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [30]:
diabate_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [31]:
diabate_df.tail()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.34,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1
767,1,93,70,31,0,30.4,0.315,23,0


In [32]:
x = diabate_df.drop("Outcome", axis=1).values
y = diabate_df["Outcome"].values

In [33]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

In [34]:

lrm_diabate = LogisticRegression()
lrm_diabate.fit(x_train=x_train, y_train=y_train, epoch=800, learning_rate=0.000000015)

Epoch: 100, train_loss: 11.875501206345978, train_accuracy: 0.6381322957198443
Epoch: 200, train_loss: 12.092991163241003, train_accuracy: 0.6459143968871596
Epoch: 300, train_loss: 3.5297843467577352, train_accuracy: 0.6498054474708171
Epoch: 400, train_loss: 4.507938532168054, train_accuracy: 0.3560311284046693
Epoch: 500, train_loss: 11.897495553394915, train_accuracy: 0.6342412451361867
Epoch: 600, train_loss: 2.939311065917618, train_accuracy: 0.6828793774319066
Epoch: 700, train_loss: 2.497848933345334, train_accuracy: 0.6108949416342413
Epoch: 800, train_loss: 11.962117168074139, train_accuracy: 0.6459143968871596


In [35]:
y_test_pred = lrm_diabate.predict(x_test)
accuracy = lrm_diabate.calculate_accuracy(y_test, y_test_pred)
print(f"Test Set Accuracy {accuracy}")

Test Set Accuracy 0.6732283464566929


In [36]:
lrm_diabate.save("./diabate_model_weights.pckl")
# lrm_diabate.load("./test.pckl")