# Logistic Regression

In [None]:
import numpy as np
import pandas as pd

# Step 1: Load the training data
train_data = pd.read_csv("Train_file.csv", header=None)
train_X = train_data.iloc[:, :-1].values  # Features
train_y = train_data.iloc[:, -1].values   # Labels

# Step 2: Separate the instances with label 0 and 1
X0 = train_X[train_y == 0]
X1 = train_X[train_y == 1]

# Step 3: Calculate the means and covariance matrices
mu0 = np.mean(X0, axis=0)
mu1 = np.mean(X1, axis=0)
C0 = np.cov(X0.T)
C1 = np.cov(X1.T)

# Step 4: Define the discriminant function
def g(x):
    A = np.log(np.sum(train_y == 1) / np.sum(train_y == 0))
    B = -0.5 * np.log(np.linalg.det(C1) / np.linalg.det(C0))
    C = -0.5 * np.dot(np.dot(np.transpose(x - mu1), np.linalg.inv(C1)), x - mu1)
    D = 0.5 * np.dot(np.dot(np.transpose(x - mu0), np.linalg.inv(C0)), x - mu0)
    return A + B + C - D

# Step 5: Load the test data
test_data = pd.read_csv("Test_file.csv", header=None)
test_X = test_data.iloc[:, :-1].values  # Features
test_y = test_data.iloc[:, -1].values   # Labels

# Step 6: Pass each test instance to the discriminant function and assign labels
pred_y = np.zeros_like(test_y)
for i, x in enumerate(test_X):
    if g(x) >= 0:
        pred_y[i] = 1

# Step 7: Calculate the accuracy
class0_acc = np.sum((test_y == 0) & (pred_y == 0)) / np.sum(test_y == 0)
class1_acc = np.sum((test_y == 1) & (pred_y == 1)) / np.sum(test_y == 1)
overall_acc = np.mean(test_y == pred_y)

print("Class 0 accuracy:", class0_acc)
print("Class 1 accuracy:", class1_acc)
print("Overall accuracy:", overall_acc)

Class 0 accuracy: 1.0
Class 1 accuracy: 0.0
Overall accuracy: 0.5078066914498142


**Que. 2**

In [17]:
import numpy as np
import pandas as pd
# Load the training data
train_data = pd.read_csv('Train_file.csv', header=None)
X_train = train_data.iloc[:, :-1].values
y_train = train_data.iloc[:, -1].values

# Load the test data
test_data = pd.read_csv('Test_file.csv', header=None)
X_test = test_data.iloc[:, :-1].values
y_test = test_data.iloc[:, -1].values

  test_data = pd.read_csv('Test_file.csv', header=None)


In [18]:
import random
def initialize_betas(dim):
    b = random.random()
    w = np.random.rand(dim)
    return b,w 

In [19]:
b,w = initialize_betas(X_train.shape[1])
print(b,w)

0.5834627791815397 [0.83696349 0.0456171  0.30140288 0.15055025 0.81401694 0.34453709
 0.24059533 0.42393516 0.05033345 0.95540644 0.29873441 0.92314832
 0.75151438 0.06462148 0.30644611 0.66186701 0.30020754 0.85922962
 0.44449162 0.57430862 0.33603871 0.33302364 0.87375171 0.56445846
 0.79369031 0.56635184 0.1438441  0.05195808 0.99069704 0.09493143
 0.61015516 0.57484555 0.82265471 0.69449658 0.95946716 0.14624593
 0.85619333 0.60369237 0.95853641 0.07308117 0.83417602 0.28287662
 0.74775102 0.12476352 0.04667217 0.61165989 0.446132   0.70450775
 0.84842976 0.88689024 0.29616334 0.80229718 0.28134588 0.82445533
 0.0743282  0.03427572 0.41026584 0.4877764  0.1735361  0.9270242 ]


In [20]:
w.shape

(60,)

In [22]:
def sigmoid(b, w ,X_new):
    Z = b + np.matmul(X_new,w)
    return (1.0 / (1 + np.exp(-Z)))

In [24]:
y_hat = sigmoid(b,w,X_train)
y_hat[0:5]

array([5.93254362e-06, 1.04204229e-03, 1.02018623e-03, 2.00219943e-01,
       2.18925948e-03])

In [25]:
y_train[0:5]

array([1., 0., 0., 0., 1.])

In [26]:
def get_cost( y, y_hat):
      #return np.sum(np.dot(y.T,np.log(1-y_hat)+ np.dot((1-y).T,np.log(1-y_hat)))) / ( len(y))
        return - np.sum(np.dot(y.T,np.log(1-y_hat)+ np.dot((1-y).T,np.log(1-y_hat)))) / ( len(y))

In [27]:
# Reshape Y
Y=y_train.reshape(len(y_train), )
current_cost= get_cost(Y,y_hat)
#print(current_cost)
Y.shape

(20847,)

In [28]:
def update_beta (b_0, w_0 , y , y_hat, X_new, alpha):
    db = np.sum( y_hat - y)/ len(y)
    b_0 = b_0 - alpha * db
    dw = np.dot((y_hat - y), X_new)/ len(y)
    w_0 = w_0 - alpha * dw
   
   
    return b_0,w_0

In [32]:
num_iterations = 500
alpha = 0.5

all_costs = []
b,w = initialize_betas(X_train.shape[1])
print("initial guess of b and w: " , b ,w)

for each_iter in range (num_iterations ):
    y_hat = sigmoid(b, w , X_train)
    current_cost = get_cost (Y, y_hat)
    prev_b = b
    prev_w = w
    b, w = update_beta (prev_b, prev_w, Y, y_hat, X_train, alpha)
    all_costs.append(current_cost)
    if each_iter % 10 == 0:
        print('Iteration: ', each_iter, 'Cost: ', current_cost)
        each_iter += 1
    
#print('b_0:', b_0, 'b_1:',b_1,'b_2:',b_2,'b_3:',b_3,'b_4:', b_4, 'b_5:',b_5,'b_6:',b_6,'b_7:',b_7,'b_8:',b_8,'b_9:',b_9)
print("Final estimates of b and q are: ", b,w)

initial guess of b and w:  0.8022492650595721 [0.42820373 0.54583209 0.25421992 0.12513606 0.29718046 0.89943589
 0.55273814 0.53093339 0.11956235 0.19412798 0.2877121  0.58073825
 0.21427061 0.08775307 0.16945847 0.28682324 0.82310573 0.19666467
 0.09636646 0.0416926  0.02153101 0.60549224 0.25816158 0.45655472
 0.94922843 0.85281775 0.19940806 0.47424098 0.61454815 0.69919583
 0.55005904 0.16539246 0.85694325 0.46555648 0.69148722 0.81905181
 0.33255603 0.23350611 0.52385707 0.70891455 0.85882408 0.82821859
 0.48527091 0.31630699 0.54532678 0.39509164 0.13710905 0.24166709
 0.11576019 0.85499728 0.61322057 0.95388917 0.21942989 0.51927407
 0.36698129 0.21922375 0.49084196 0.97634157 0.1077881  0.42985789]
Iteration:  0 Cost:  nan
Iteration:  10 Cost:  nan
Iteration:  20 Cost:  nan
Iteration:  30 Cost:  nan
Iteration:  40 Cost:  nan
Iteration:  50 Cost:  nan
Iteration:  60 Cost:  nan
Iteration:  70 Cost:  nan
Iteration:  80 Cost:  nan
Iteration:  90 Cost:  nan
Iteration:  100 Cost:  n