In [1]:
import pandas as pd
import numpy as np
from scipy.optimize import minimize

In [2]:
f = open("car/data-desc.txt", "r")
print(f.read())

| label values

unacc, acc, good, vgood

| attributes

buying:   vhigh, high, med, low.
maint:    vhigh, high, med, low.
doors:    2, 3, 4, 5more.
persons:  2, 4, more.
lug_boot: small, med, big.
safety:   low, med, high.

| columns
buying,maint,doors,persons,lug_boot,safety,label



In [3]:
df = pd.read_csv("car/train.csv", 
                 names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'label'])
df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,label
0,low,vhigh,4,4,big,med,acc
1,low,high,5more,4,med,high,vgood
2,vhigh,med,2,2,big,high,unacc
3,high,high,2,2,small,high,unacc
4,vhigh,low,3,2,big,low,unacc


In [4]:
df_hot= pd.get_dummies(df, columns=['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety'])
df_hot.head()

Unnamed: 0,label,buying_high,buying_low,buying_med,buying_vhigh,maint_high,maint_low,maint_med,maint_vhigh,doors_2,...,doors_5more,persons_2,persons_4,persons_more,lug_boot_big,lug_boot_med,lug_boot_small,safety_high,safety_low,safety_med
0,acc,0,1,0,0,0,0,0,1,0,...,0,0,1,0,1,0,0,0,0,1
1,vgood,0,1,0,0,1,0,0,0,0,...,1,0,1,0,0,1,0,1,0,0
2,unacc,0,0,0,1,0,0,1,0,1,...,0,1,0,0,1,0,0,1,0,0
3,unacc,1,0,0,0,1,0,0,0,1,...,0,1,0,0,0,0,1,1,0,0
4,unacc,0,0,0,1,0,1,0,0,0,...,0,1,0,0,1,0,0,0,1,0


In [5]:
df_hot_ready_to_use = df_hot.copy()
Labels = ['unacc', 'acc', 'good', 'vgood']

In [6]:
for i in range(len(Labels)):
    df_hot_ready_to_use.loc[(df_hot_ready_to_use['label'] == Labels[i]), 'label'] = i

In [7]:
df_hot_ready_to_use.head()

Unnamed: 0,label,buying_high,buying_low,buying_med,buying_vhigh,maint_high,maint_low,maint_med,maint_vhigh,doors_2,...,doors_5more,persons_2,persons_4,persons_more,lug_boot_big,lug_boot_med,lug_boot_small,safety_high,safety_low,safety_med
0,1,0,1,0,0,0,0,0,1,0,...,0,0,1,0,1,0,0,0,0,1
1,3,0,1,0,0,1,0,0,0,0,...,1,0,1,0,0,1,0,1,0,0
2,0,0,0,0,1,0,0,1,0,1,...,0,1,0,0,1,0,0,1,0,0
3,0,1,0,0,0,1,0,0,0,1,...,0,1,0,0,0,0,1,1,0,0
4,0,0,0,0,1,0,1,0,0,0,...,0,1,0,0,1,0,0,0,1,0


In [8]:
columns = list(df_hot_ready_to_use.columns)
columns.remove('label')
print(columns)

['buying_high', 'buying_low', 'buying_med', 'buying_vhigh', 'maint_high', 'maint_low', 'maint_med', 'maint_vhigh', 'doors_2', 'doors_3', 'doors_4', 'doors_5more', 'persons_2', 'persons_4', 'persons_more', 'lug_boot_big', 'lug_boot_med', 'lug_boot_small', 'safety_high', 'safety_low', 'safety_med']


In [9]:
label = df_hot_ready_to_use['label'].to_numpy(dtype = int)
label.shape

(1000,)

In [10]:
train_data = df_hot_ready_to_use[columns].to_numpy(dtype = float)
print(train_data.shape)
train_data = np.hstack((np.ones((train_data.shape[0],1)), train_data))
train_data.shape

(1000, 21)


(1000, 22)

In [11]:
def softmax(arr):
    Max = arr.max(axis = 1)
    arr = arr - Max.reshape(-1,1)
    exp = np.exp(arr)
    return exp/exp.sum(axis =1,keepdims = True)

In [12]:
class  multi_class_logistic_regression:
    def __init__(self, n_class, n_features, initial = 'zero', prior_cov = 1):
        if initial == "zero":
            self.w = np.zeros((n_features, n_class))
        elif initial == "random":
            self.w = 1e-4*np.random.normal(loc = 0, scale = 1, size = (n_features, n_class))
        
        
    def train(self, data, label, lr= 1e-3, max_iter = 100, eps = 1e-6):
        self.data = data
        self.label = label
        self.lr = lr
        c = 0
        tolerance = np.inf
        while c < max_iter and tolerance > eps:
            c += 1
            w_old = self.w.copy()
                
            self.forward()
            self.backward()
            tolerance = ((self.w-w_old)**2).sum()
        
        
    def forward(self):
        N, _ = self.data.shape
        scores = self.data @ self.w # (N, n_n_features)x(n_features, n_class) = (N, n_class)
        softmax_prop = softmax(scores) #(N, n_class)

            
        corrected_scores =-np.log(1e-10 + softmax_prop[range(N), self.label])
            
        loss = corrected_scores.sum() + 1 * (self.w.T@self.w).sum()
            
        grad = softmax_prop # (N x n_class)
        grad[range(N), self.label] -= 1
            
        self.grad = self.data.T @ grad - 1/2 * self.w
        

            
    def backward(self):
        self.w -= self.lr * self.grad
            
    def accuracy(self,x, y):
        scores = x @ self.w # (N, n_n_features)x(n_features, n_class) = (N, n_class)
        softmax_prop = softmax(scores)
        y_pred = np.argmax(softmax_prop, axis = 1)
        return (y_pred==y).mean()

## Usnig gradient descent to Train

### 1. Zero initialization 

In [13]:
model_0 = multi_class_logistic_regression(n_class =4 , n_features = 22, initial = 'zero')

In [14]:
model_0.train(train_data, label, lr= 2e-3, max_iter = 1000, eps = 1e-8)

In [15]:
model_0.w.shape

(22, 4)

In [16]:
model_0.accuracy(train_data, label)

0.949

In [17]:
model_r = multi_class_logistic_regression(n_class =4 , n_features = 22, initial = 'random')

In [18]:
model_r.train(train_data, label, lr= 2e-3, max_iter = 1000, eps = 1e-8)

In [19]:
model_r.w.shape

(22, 4)

In [20]:
model_r.accuracy(train_data, label)

0.949

### 1. Random initialization 

# Loading Test Data 

In [21]:
df_test = pd.read_csv("car/test.csv", 
                 names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'label'])
df_test.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,label
0,vhigh,high,5more,2,small,low,unacc
1,low,low,5more,2,small,med,unacc
2,low,vhigh,4,2,med,low,unacc
3,high,vhigh,3,4,med,med,unacc
4,vhigh,low,4,4,med,low,unacc


In [22]:
df_test_hot= pd.get_dummies(df_test, columns=['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety'])
df_test_hot.head()

Unnamed: 0,label,buying_high,buying_low,buying_med,buying_vhigh,maint_high,maint_low,maint_med,maint_vhigh,doors_2,...,doors_5more,persons_2,persons_4,persons_more,lug_boot_big,lug_boot_med,lug_boot_small,safety_high,safety_low,safety_med
0,unacc,0,0,0,1,1,0,0,0,0,...,1,1,0,0,0,0,1,0,1,0
1,unacc,0,1,0,0,0,1,0,0,0,...,1,1,0,0,0,0,1,0,0,1
2,unacc,0,1,0,0,0,0,0,1,0,...,0,1,0,0,0,1,0,0,1,0
3,unacc,1,0,0,0,0,0,0,1,0,...,0,0,1,0,0,1,0,0,0,1
4,unacc,0,0,0,1,0,1,0,0,0,...,0,0,1,0,0,1,0,0,1,0


In [23]:
df_test_hot_ready_to_use = df_test_hot.copy()

In [24]:
for i in range(len(Labels)):
    df_test_hot_ready_to_use.loc[(df_test_hot_ready_to_use['label'] == Labels[i]), 'label'] = i
df_test_hot_ready_to_use.head()

Unnamed: 0,label,buying_high,buying_low,buying_med,buying_vhigh,maint_high,maint_low,maint_med,maint_vhigh,doors_2,...,doors_5more,persons_2,persons_4,persons_more,lug_boot_big,lug_boot_med,lug_boot_small,safety_high,safety_low,safety_med
0,0,0,0,0,1,1,0,0,0,0,...,1,1,0,0,0,0,1,0,1,0
1,0,0,1,0,0,0,1,0,0,0,...,1,1,0,0,0,0,1,0,0,1
2,0,0,1,0,0,0,0,0,1,0,...,0,1,0,0,0,1,0,0,1,0
3,0,1,0,0,0,0,0,0,1,0,...,0,0,1,0,0,1,0,0,0,1
4,0,0,0,0,1,0,1,0,0,0,...,0,0,1,0,0,1,0,0,1,0


In [25]:
label_test = df_test_hot_ready_to_use['label'].to_numpy(dtype = int)
label_test.shape

(728,)

In [26]:
test_data = df_test_hot_ready_to_use[columns].to_numpy(dtype = float)
print(test_data.shape)
test_data = np.hstack((np.ones((test_data.shape[0],1)), test_data))
test_data.shape

(728, 21)


(728, 22)

In [27]:
model_0.accuracy(test_data, label_test)

0.9175824175824175

In [28]:
model_r.accuracy(test_data, label_test)

0.9175824175824175

# Using L-BFGS method

In [29]:
def loss_grad(w, data, label, n_features = 22 , n_class = 4):
    
    N, _ = data.shape
    W = w.reshape(n_features, n_class)
    
    scores = data @ W # (N, n_features)x(n_features, n_class) = (N, n_class)
    softmax_prop = softmax(scores) # (N, n_class)

            
    corrected_scores = -np.log(softmax_prop[range(N), label])
            
    loss = corrected_scores.sum() + 1 * (W.T@ W).sum()
            
    grad = softmax_prop # (N x n_class)
    grad[range(N), label] -= 1
            
    grad = data.T @ grad - 1/2 * W
    return loss, grad.reshape(-1,)

In [30]:
loss = lambda w: loss_grad(w, train_data, label)[0]
grad = lambda w: loss_grad(w, train_data, label)[1]

### Zero initialization

In [31]:
w = np.zeros(22*4)
train_data.shape

(1000, 22)

In [32]:
model3 = minimize(loss, w, method = 'L-BFGS-B', jac= grad, options={'disp': False})

  corrected_scores = -np.log(softmax_prop[range(N), label])


In [33]:
def accuracy(w ,x, y):
    scores = x @ w # (N, n_n_features)x(n_features, n_class) = (N, n_class)
    softmax_prop = softmax(scores)
    y_pred = np.argmax(softmax_prop, axis = 1)
    return (y_pred==y).mean()

In [34]:
w = model3.x.reshape(22,4)
w.shape

(22, 4)

In [35]:
accuracy(w ,train_data, label)

0.928

In [36]:
accuracy(w ,test_data, label_test)

0.907967032967033

### Random Initialization

In [37]:
w = 1e-3*np.random.normal(loc=0.0, scale=1.0, size=22*4)

In [38]:
model3 = minimize(loss, w, method = 'L-BFGS-B', jac= grad, options={'disp': False})

  corrected_scores = -np.log(softmax_prop[range(N), label])


In [39]:
w_r = model3.x.reshape(22,4)

In [40]:
accuracy(w_r ,test_data, label_test)

0.9010989010989011

# Ugly method 

In [41]:
label_ = df_hot['label'].to_numpy()
#label_

In [42]:
def get_ugly_label(label, Labels = Labels):
    label_ugly = {}
    for value in Labels:
        
        temp_label = label.copy()
        temp_label[label == value] = 1
        temp_label[label != value] = 0
        label_ugly[value] = temp_label.astype(int).copy()
    
    return  label_ugly

In [43]:
ugly_label = get_ugly_label(label_)

In [44]:
ugly_label.keys()

dict_keys(['unacc', 'acc', 'good', 'vgood'])

In [45]:
def ugly_logistic_regression(data, label_dict, itital = 'zero'):
    n_features = data.shape[1]
    n_class = len(label_dict.keys())
    W= np.zeros((n_features, n_class))
    keys = list(label_dict.keys())
    for i in range(len(keys)):
        model = multi_class_logistic_regression(n_class =2 , n_features = 22, initial = itital)
        model.train(data, label_dict[keys[i]])
        W[:,i] = model.w[:,1]
    return W

### Zero initialization

In [46]:
W = ugly_logistic_regression(train_data, ugly_label, itital = 'zero')

In [47]:
def predict(W, data):
    scores = data @ W
    y_hat = np.argmax(scores, axis = 1)
    return y_hat

In [48]:
(label == predict(W, train_data)).mean()

0.867

In [49]:
(label_test == predict(W, test_data)).mean()

0.853021978021978

### Random initialization

In [50]:
W = ugly_logistic_regression(train_data, ugly_label, itital = 'random')

In [51]:
(label_test == predict(W, test_data)).mean()

0.853021978021978

## Ugly method via 'L-BFGS-B'

In [52]:
def sigmoid(z):
    return (z>=0) * 1/(1+np.exp(-z * (z>=0))) + (z<0)*np.exp(z*(z<0))/(1+np.exp(z*(z<0)))

In [53]:
def loss_grad_ugly(w, data, label, n_features = 22 , n_class = 4):
    eps = 1e-10
    N, _ = data.shape
    W = w.reshape(n_features, n_class)
    
    scores = data @ W # (N, n_features)x(n_features, n_class) = (N, n_class)
    sigmoid_prop = sigmoid(scores) # (N, n_class)

    mask = np.array([label == i for i in range(n_class)]).astype(int).T #(N, n_class)
    
    Y = -np.log(eps + mask*sigmoid_prop + (1-mask)*(1-sigmoid_prop)) # (N, n_class)
    
    grad = data.T @ (sigmoid_prop - mask) - 1/2 * W
    
    loss = np.sum(Y) + (W*W).sum()

    return loss, grad

In [54]:
loss_ugly = lambda w: loss_grad_ugly(w, train_data, label)[0]
grad_ugly = lambda w: loss_grad_ugly(w, train_data, label)[1].reshape(-1,)

In [55]:
w = 1e-4*np.random.normal(loc=0.0, scale=1.0, size=22*4)
train_data.shape

(1000, 22)

In [56]:
model_ugly = minimize(loss_ugly, w, method = 'L-BFGS-B', jac= grad_ugly, options={'disp': False})

In [57]:
w_ugly= model_ugly.x.reshape(22,4)

In [58]:
(label == predict(w_ugly, train_data)).mean()

0.868

In [59]:
(label_test == predict(w_ugly, test_data)).mean()

0.8612637362637363

### Check if the gradiant is implimented correctly

In [60]:
def num_grad(w, loss_function = loss):
    l = []
    n = len(w)
    for i in range(n):
        w_1 = w.copy()
        w_2 = w.copy()
        w_1[i] -= 1e-6
        w_2[i] += 1e-6
        l.append((loss_function(w_2) -loss_function(w_1))/2e-6)
    return np.array(l)

In [61]:
w = np.random.normal(loc=0.0, scale=1.0, size=22*4)/np.sqrt(5)

num_grad(w, loss_ugly), grad_ugly(w)

(array([-8.84955061e+01,  4.22682204e+02,  3.49597938e+02,  4.06486211e+02,
        -2.23349912e+00,  1.06156695e+02,  8.67990852e+01,  1.12977756e+02,
         4.04693992e-01,  1.05781904e+02,  5.81077936e+01,  9.97855711e+01,
         4.26461156e-01,  8.27749918e+01,  8.59133102e+01,  8.64560611e+01,
        -8.82621589e+01,  1.27252031e+02,  1.21012725e+02,  1.05013907e+02,
        -4.34736378e+01,  8.71308587e+01,  1.02249000e+02,  1.05660434e+02,
         9.79991455e-01,  9.86861357e+01,  4.92096210e+01,  5.93720908e+01,
        -9.31538580e-02,  1.06987062e+02,  6.44716456e+01,  1.12076012e+02,
        -4.38772126e+01,  1.28621914e+02,  1.34807345e+02,  1.30501694e+02,
        -2.66694735e+01,  1.38787462e+02,  1.09575777e+02,  9.54654786e+01,
        -6.19062862e+01,  1.07941044e+02,  8.72978578e+01,  1.10331501e+02,
         4.14892838e-01,  1.06954211e+02,  8.25521925e+01,  1.22755074e+02,
        -5.50580353e-01,  6.81569802e+01,  7.34464536e+01,  7.77835398e+01,
        -1.5