# Uni-variate Linear Regression

### Import the needed libraries

In [81]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

### read_data

In [82]:
# Load the univariate data set; the file has no header row.
data1 = pd.read_csv("./data/univariateData.dat", header=None)

# First column is used as the feature, last column as the target.
# Slicing (rather than indexing) keeps both as 2-D column arrays.
x1 = np.asarray(data1.iloc[:, :1])
y1 = np.asarray(data1.iloc[:, -1:])

# Hyper-parameters for gradient descent.
iteration1 = 1500
alpha1 = 0.01

x1_train, x1_test, y1_train, y1_test = train_test_split(x1, y1)
m1_train = x1_train.shape[0]

# Prepend a column of ones so theta[0] acts as the intercept.
x1_train = np.concatenate((np.ones((m1_train, 1)), x1_train), axis=1)

# One parameter per column of the bias-augmented design matrix.
theta1 = np.zeros((x1_train.shape[1], 1))


In [83]:
# Report the shapes of the split. x1_train already includes the bias column
# at this point; the third entry is the training target, so label it y_train.
print(f"x_train : {x1_train.shape}\nx_test : {x1_test.shape} \ny_train : {y1_train.shape} \ny_test: {y1_test.shape}")

x_train : (72, 2)
x_test : (25, 1) 
x_test : (72, 1) 
y_test: (25, 1)


### Compute the cost function

In [84]:
def compute_cost(x, y, theta):
    """Return the halved mean squared error of the linear model ``x . theta``.

    Args:
        x: Design matrix; multiplied with ``theta`` via ``np.dot``.
        y: Targets; ``len(y)`` is used as the number of samples.
        theta: Model parameters.

    Returns:
        ``(1 / (2 * m)) * sum((x . theta - y) ** 2)`` where ``m = len(y)``.
    """
    n_samples = len(y)
    residuals = np.dot(x, theta) - y
    squared_error_total = np.sum(np.power(residuals, 2))
    return (1 / (2 * n_samples)) * squared_error_total

### compute gradient descent

In [85]:
def gradient_descent(x,y,theta,n_iter, l_rate):
    """Fit linear-regression parameters with batch gradient descent.

    Args:
        x: Design matrix (callers in this file pass it with a leading
            column of ones for the intercept).
        y: Targets; ``len(y)`` is used as the number of samples.
        theta: Initial parameters. The caller's array is not modified;
            each step builds a new array.
        n_iter: Number of update steps to perform.
        l_rate: Step size applied to the averaged gradient.

    Returns:
        A tuple ``(theta, J_history)``: the parameters after the last step
        and the list of costs (from ``compute_cost``) recorded after each
        update. ``J_history`` is empty when ``n_iter`` is 0.
    """
    m = len(y)
    J_history = []
    # The hypothesis is recomputed at the top of every step, so there is no
    # need to evaluate it once before the loop.
    for _ in range(n_iter):
        residuals = np.dot(x, theta) - y
        gradient = np.dot(x.transpose(), residuals)
        theta = theta - (l_rate / m) * gradient
        J_history.append(compute_cost(x, y, theta))

    return theta, J_history
        

### train model

In [86]:
def fit(x,y,theta,n_iter, l_rate) :
    """Train the linear model by delegating to ``gradient_descent``.

    Takes the same arguments as ``gradient_descent`` and returns its
    ``(theta, J_history)`` result unchanged.
    """
    return gradient_descent(x, y, theta, n_iter, l_rate)
    

In [87]:
# Run gradient descent on the bias-augmented univariate training set;
# theta1 is rebound to the fitted parameters.
theta1, J_history1 = fit(x1_train,y1_train,theta1,iteration1,alpha1)

In [88]:
# Sanity check: one parameter per column of x1_train (bias + feature).
theta1.shape

(2, 1)

### prediction

In [89]:
def predict(x, theta):
    """Return the linear-model output ``np.dot(x, theta)``.

    ``x`` must already contain the bias term that matches ``theta[0]``;
    this function does not add one.
    """
    return np.dot(x, theta)

In [90]:
# Predict for a single sample; the leading 1 is the bias term matching the
# column of ones that was prepended to x1_train.
predict( [1,5.5277],theta1)

array([2.76506976])

### evaluation

In [91]:
def evaluate (x_test , y_test , theta):
    """Score a fitted linear model on held-out data and print the result.

    Args:
        x_test: Test features WITHOUT a bias column; a column of ones is
            prepended here before predicting.
        y_test: Test targets; ``len(y_test)`` sets the height of the bias
            column, so it must have one row per row of ``x_test``.
        theta: Fitted parameters, bias first.

    Returns:
        The root mean squared error (RMSE) of the predictions. The value is
        the same one the previous ``squared=False`` call produced; only the
        printed label changed from "MSE" to "RMSE" to match what is computed.
    """
    x_with_bias = np.append(np.ones((len(y_test), 1)), x_test, axis=1)
    y_pred = predict(x_with_bias, theta)
    # The `squared` keyword of mean_squared_error was deprecated in
    # scikit-learn 1.4 and removed in 1.6, so take the square root explicitly.
    # Rooting each output's MSE before averaging matches the uniform-average
    # RMSE definition.
    per_output_mse = mean_squared_error(y_test, y_pred, multioutput="raw_values")
    rmse = np.mean(np.sqrt(per_output_mse))
    print("RMSE = ", rmse)
    return rmse

In [92]:
# NOTE: evaluate returns the root of the mean squared error, so the value
# bound to MSE here is an RMSE despite the variable name.
MSE = evaluate(x1_test , y1_test , theta1)

MSE =  1.9822161078379283


# Multi-variate Linear Regression

### read data 

In [93]:
def normalize(x):
    """Scale ``x`` by its overall norm.

    ``np.linalg.norm`` is called with its defaults, so a single scalar
    (the 2-norm of the flattened array, i.e. the Frobenius norm for a
    matrix) divides every element; columns are NOT scaled independently.

    Args:
        x: Array-like of numbers.

    Returns:
        ``x / norm``. When the norm is zero (all-zero or empty input) an
        all-zero float array of the same shape is returned instead of
        dividing by zero and producing NaNs.
    """
    norm = np.linalg.norm(x)
    if norm == 0:
        return np.zeros_like(x, dtype=float)
    return x / norm

In [94]:
# Load the multivariate data set; the file has no header row.
data = pd.read_csv("./data/multivariateData.dat", header=None)

# Every column but the last is a feature; the last column is the target.
# Features are scaled with normalize() before the train/test split.
x2 = normalize(np.asarray(data.iloc[:, :-1]))
y2 = np.asarray(data.iloc[:, -1:])

x2_train, x2_test, y2_train, y2_test = train_test_split(x2, y2)
m2_train = y2_train.shape[0]

# Hyper-parameters for gradient descent.
iteration2 = 1500
alpha2 = 0.01

# One extra parameter for the bias column that is added to x2_train later.
theta2 = np.zeros((x2_train.shape[1] + 1, 1))

print(x2_train.shape, x2_test.shape, y2_train.shape, y2_test.shape, sep="\n")

(35, 2)
(12, 2)
(35, 1)
(12, 1)


### add bias

In [95]:
# Prepend a column of ones so theta2[0] acts as the intercept.
ones_and_features = (np.ones((m2_train, 1)), x2_train)
x2_train_bias = np.concatenate(ones_and_features, axis=1)
print(x2_train_bias.shape)

(35, 3)


### train model 

In [96]:
# Run gradient descent on the bias-augmented multivariate training set;
# J_history holds the cost recorded after each update.
theta2 , J_history = fit(x2_train_bias,y2_train,theta2,iteration2,alpha2)

In [97]:
# Display the cost recorded after each gradient-descent update.
J_history

[64344620579.53373,
 63175038679.9236,
 62029137311.00632,
 60906436936.811,
 59806467732.07912,
 58728769385.62047,
 57672890907.65112,
 56638390441.03269,
 55624835076.33439,
 54631800670.63973,
 53658871670.022736,
 52705640935.61892,
 51771709573.21827,
 50856686766.30927,
 49960189612.50359,
 49081842963.273285,
 48221279266.933365,
 47378138414.80405,
 46552067590.488144,
 45742721122.20072,
 44949760338.089066,
 44172853424.48247,
 43411675287.01246,
 42665907414.54545,
 41935237745.87068,
 41219360539.087875,
 40517976243.63972,
 39830791374.93575,
 39157518391.51525,
 38497875574.697365,
 37851586910.66859,
 37218381974.957855,
 36597995819.251175,
 35990168860.49808,
 35394646772.2639,
 34811180378.28207,
 34239525548.162067,
 33679443095.209263,
 33130698676.313957,
 32593062693.867725,
 32066310199.66584,
 31550220800.75591,
 31044578567.19288,
 30549171941.6622,
 30063793650.932983,
 29588240619.104427,
 29122313882.60885,
 28665818506.93605,
 28218563505.043964,
 27780361

### prediction

In [98]:
# Predict for a single sample; the leading 1 is the bias term.
# NOTE(review): theta2 was fitted on features that had been scaled by
# normalize(), but the feature values passed here are not scaled the same
# way, so this prediction is on a different scale - confirm intent.
predict([1,1,2],theta2)

array([437283.452321])

In [99]:
# The test features carry no bias column; evaluate() adds it itself.
x2_test.shape

(12, 2)

In [100]:
# NOTE: evaluate returns the root of the mean squared error, so the value
# bound to MSE here is an RMSE despite the variable name.
MSE = evaluate(x2_test,y2_test,theta2)

MSE =  135726.2616304436


# Lasso Regression

In [101]:
from sklearn import linear_model

# Fit scikit-learn's Lasso on the same train/test split. The features are
# passed without the manually added bias column.
clf = linear_model.Lasso(alpha=0.01)
clf.fit(x2_train, y2_train)

# Reshape the predictions into a column so they line up with y2_test.
predictions = clf.predict(x2_test)
predC = predictions.reshape((len(y2_test), 1))

In [102]:
# Side-by-side table: column 0 is the true target, column 1 the Lasso prediction.
predlosso = np.concatenate((y2_test, predC), axis=1)
print(predlosso)

[[239500.         231441.26461994]
 [299900.         266073.38167704]
 [314900.         341139.82903794]
 [255000.         352006.11446305]
 [573900.         637957.55510588]
 [347000.         352019.41256088]
 [699900.         693053.71949538]
 [249900.         215858.39230135]
 [229900.         236887.70543032]
 [239999.         326010.82678688]
 [299900.         448391.15690012]
 [252900.         316768.83731591]]


In [103]:
# The `squared` keyword of mean_squared_error was deprecated in scikit-learn
# 1.4 and removed in 1.6, so take the square root explicitly. The value is an
# RMSE (same number as before); the variable keeps its original name, but the
# printed label now says what is actually computed.
MSE = np.sqrt(mean_squared_error(y2_test, predC))
print("RMSE:   "  + str(MSE))

MSE:   64690.210559202045


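### Lasso regression roughly halves the error (an RMSE, since `squared=False` is used) compared with our gradient-descent implementation
#### 92157 to 50944 (from an earlier run; the run shown above gives 135726 to 64690 — values vary because the train/test split is random)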