In [1]:
import numpy as np
import pandas as pd
import math

## DataFrame

A Pandas DataFrame is a 2 dimensional data structure, like a 2 dimensional array, or a table with rows and columns.

DataFrame documentation : https://pandas.pydata.org/docs/reference/frame.html

## Read csv file as DataFrame

In [2]:
customerData = pd.read_csv('customer_data.csv')

### Displaying the first rows

In [3]:
customerData.head()

Unnamed: 0,age,salary,purchased
0,44,39000,0
1,32,120000,1
2,38,50000,0
3,32,135000,1
4,52,21000,1


we can see that the dataframe has 3 columns.
we will consider age and salary columns as inputs 
and purchased as output

### Data processing and feature scaling

![feature scaling.png](attachment:5d7beccf-ebf8-4af1-921c-a565ea95c226.png)

we use normalization when the data distribution is unknown or the data doesn't have Gaussian Distribution. 

standardized data is usually applied when the data has a bell curve

![normal vs standard.png](attachment:c69171c7-0eaa-45f4-96dd-803c791b8846.png)

**in our case we are using normalization**

In [4]:
# shuffle the data
customerData.sample(frac=1)
# print(customerData)


# X => values of features 1,2 (age, salary) & Y => values of output (purchased)
X = customerData.iloc[:, 0:2].values
Y = customerData.iloc[:, 2:3].values

# Feature Scaling using minmax normalization
X0_temp = []
X1_temp = []
for i in X:
    X0_temp.append(i[0])
    X1_temp.append(i[1])
minVal = min(X0_temp)
maxVal = max(X0_temp)
for i in range(len(X0_temp)):
    X0_temp[i] = (X0_temp[i] - minVal) / (maxVal - minVal)

minVal = min(X1_temp)
maxVal = max(X1_temp)
for i in range(len(X1_temp)):
    X1_temp[i] = (X1_temp[i] - minVal) / (maxVal - minVal)

X = X.astype(float)
for i in range(len(X)):
    X[i][0] = (X0_temp[i])
    X[i][1] = (X1_temp[i])

# Split the dataset into training and testing sets (req2)
X_Train = X[0:320]
Y_Train = Y[0:320]
X_Test = X[320:]
Y_Test = Y[320:]
numOfTrainingItems = int(X_Train.size / 2)

### Model functions  (to modify)

![logistic regression.png](attachment:3a11cb7b-7b98-4ca0-af89-8431a34ec3bb.png)

![cost function.png](attachment:cf61f67a-26cb-4d1b-90d5-c1e3ce4df582.png)

![gradient descent.png](attachment:afc5a853-df22-4b2b-b111-fb60e39a4fed.png)

In [5]:
class LogisticRegression:
    # Z = X*w+b => h(x)
    def __init__(self, x_train, y_train, w, b, numOfTrainingItems, learning_rate,epochs):
        self.x_train = x_train
        self.y_train = y_train
        self.w = w
        self.b = b
        self.numOfTrainingItems = numOfTrainingItems
        self.learning_rate = learning_rate
        self.epochs = epochs

    # Sigmoid Function (Z = w*X+b) .. w=theta1 , b=theta0
    def Sigmoid(self, w, x, b):
        z = "........"     ##to modify
        out = "....."     ##to modify
        return out

    # Cost Function 
    def Cost_Function(self):
        cost0 = "......"       # to modify
        cost1 = "......"       # to modify
        cost = -((cost1 + cost0))/numOfTrainingItems
        return cost

    # get the values of w,b by gradient descent
    def gradient_descent(self):
        hypothesis = self.Sigmoid(self.w, self.x_train, self.b)
        self.w += "......"      # to modify
        self.b += "......"      # to modify
        return [self.w, self.b]

    
    # training the data
    def train(self):
        for i in range(self.epochs):
            self.w, self.b = self.gradient_descent()
        return [self.w, self.b]

    # predict new values
    def predict(self, x_test):
        hypothesis = self.Sigmoid(self.w, x_test, self.b)
        return [1 if val >= 0.5 else 0 for val in hypothesis]

## initializing weights and biases

In [6]:
# random initial values for w and b
w = np.random.random((2, 1))
b = np.random.random()

## set the learning rate and number of epochs

In [7]:
alpha = 0.03
epochs = 1000

### Create the model 

In [8]:
LogisticReg = LogisticRegression(X_Train, Y_Train, w, b, numOfTrainingItems, alpha, epochs)

### Train the model

In [9]:
w, b= LogisticReg.train()

UFuncTypeError: ufunc 'add' did not contain a loop with signature matching types (dtype('float64'), dtype('<U6')) -> None

### Predict

In [10]:
Y_Predict = LogisticReg.predict(X_Test)

TypeError: '>=' not supported between instances of 'str' and 'float'

## Metrics (to modify)

![metrics1.png](attachment:e38ac01f-8341-49d6-ba46-b61103966338.png)![metrics2.png](attachment:72c7b971-527c-4c62-8a40-c176967335eb.png)

In [28]:
def metrics(y,y_hat):
    tp,tn,fp,fn = 0,0,0,0
    for i in range(len(y)):
        if y[i] == 1 and y_hat[i] == 1:
            tp += 1
        elif y[i] == 1 and y_hat[i] == 0:
            fn += 1
        elif y[i] == 0 and y_hat[i] == 1:
            fp += 1
        elif y[i] == 0 and y_hat[i] == 0:
            tn += 1
            
    accuracy = "......"   # to modify
    precision = "......"  # to modify
    recall = "........"   # to modify
    f1_score = "......"   # to modify
    return f1_score,accuracy,precision, recall

In [1]:
f1_score,accuracy, precision, recall  = metrics(Y_Test,Y_Predict)
print(f"Accuracy : {accuracy * 100}%")
print(f"f1_score : {f1_score * 100}%")
print(f"precision : {precision * 100}%")
print(f"recall : {recall * 100}%")

NameError: name 'metrics' is not defined

## using scikit learn (to modify)
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

In [28]:
#check scikit-learn website (
from ...... import .....
from ..... import ......
X, y = .......
clf = ........

  y = column_or_1d(y, warn=True)


## Predict with sklearn (to modify)

In [None]:
Y_Predict_sklearn = ........

In [27]:
f1_score,accuracy, precision, recall  = metrics(Y_Test,Y_Predict_sklearn)
print(f"Accuracy : {accuracy * 100}%")
print(f"f1_score : {f1_score * 100}%")
print(f"precision : {precision * 100}%")
print(f"recall : {recall * 100}%")

Accuracy : 86.25%
f1_score : 78.43137254901961%
precision : 95.23809523809523%
recall : 66.66666666666666%
