In [1]:
import numpy as np
import numpy.linalg as la
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

In [18]:
df = pd.read_csv('mllab-03-auto-mpg.csv')

In [19]:
df.head(5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8.0,307.0,130.0,3504.0,12.0,70.0,1.0,chevrolet chevelle malibu
1,15.0,8.0,350.0,165.0,3693.0,11.5,70.0,1.0,buick skylark 320
2,18.0,8.0,318.0,150.0,3436.0,11.0,70.0,1.0,plymouth satellite
3,16.0,8.0,304.0,150.0,3433.0,12.0,70.0,1.0,amc rebel sst
4,17.0,8.0,302.0,140.0,3449.0,10.5,70.0,1.0,ford torino


In [20]:
y = df['mpg']

In [21]:
x = df.drop(['name','mpg'], axis = 1)

In [22]:
y_train = y[0:int(0.8*len(y))]

In [23]:
y_test = y[int(0.8*len(y)):len(y)]

In [24]:
x_train = x[0:int(0.8*len(x))]

In [25]:
x_test = x[int(0.8*len(x)):len(x)]

In [18]:
x.columns

Index(['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration',
       'year', 'origin'],
      dtype='object')

## Calculate weightts for each feature (a.k.a column)

In [28]:
y_mean= y.mean()

In [24]:
x_means = {}
for col in x.columns:
    x_means[col] = x[col].mean()


In [26]:
n_var_x = {}
for col in x.columns:
    n_var_x[col] = np.sum((x[col]-x_means[col])**2.)

In [29]:
n_cov_xy = {}
for col in x.columns:
    n_cov_xy[col] = np.sum((x[col] - x_means[col])*(y-y_mean))

In [30]:
col_w = {}
for col in x.columns:
    col_w[col] = n_cov_xy[col]/n_var_x[col]

## Calculate bias (a.k.a w0)

In [65]:
w_0 = {}
for col in x.columns:
    w_0[col] = y_mean-x[col].mean() * col_w[col]

In [128]:
y_test

313    24.3
314    19.1
315    34.3
316    29.8
317    31.3
       ... 
387    27.0
388    44.0
389    32.0
390    28.0
391    31.0
Name: mpg, Length: 79, dtype: float64

## Linear Regression Class

In [48]:
class LinearRegression_w:
    
    def __init__(self):
        
        self.w0 = float()
        self.w_0 = {}
        self.x_means = {}
        self.y_mean= y.mean()
        self.n_var_x = {}
        self.n_cov_xy = {}
        self.col_w = {}
    
    def train(self, x, y):

        for col in x.columns:
            self.x_means[col] = x[col].mean()
            #print('x means: ', self.x_means)
                
        for col in x.columns:
            self.n_var_x[col] = sum((x[col]-self.x_means[col])**2.)
            self.n_cov_xy[col] = sum((x[col] - self.x_means[col])*(y-self.y_mean))
            #print('n_var_x: ', self.n_var_x)
            #print('n_cov_xy: ', self.n_cov_xy)
        
        for col in x.columns:
            self.col_w[col] = self.n_cov_xy[col]/self.n_var_x[col]
        
        '''for col in x.columns:
            self.w_0[col] = self.y_mean-x[col].mean() * self.col_w[col]
        self.w0 = np.sum(self.w_0.values())'''
        
        for col in x.columns:
            self.w_0[col] = x[col].mean() * self.col_w[col]
        self.w0 = self.y_mean - sum(self.w_0.values())
        
        print('calculated w_n: ',self.col_w)
        print('calculated w0: ',self.w0)
    def predict(self, x_test):
        y_hat = x_test@np.array(list(self.col_w.values())).T + self.w0
        #y_hat = features_expanded.dot(self.w_best)
        
        return y_hat

In [137]:
model = LinearRegression_w()

In [138]:
model.train(x_train, y_train)

In [140]:
y_hat = model.predict(x_test)

In [141]:
residuals = (y_test - y_hat) 
rss = np.sum(residuals**2)

In [142]:
rss

30672.623346697175

#### Simple experiment

In [5]:
''' Data '''
data = np.array([[5,30530, 50],[7,90000, 79],[15,159899, 124],[28,270564, 300]])
X = data[:,[0]]
y = data[:,[1]]
print ("Independent variables:", X, type(X))
print ("Dependent variable:", y, type(y))

Independent variables: [[ 5]
 [ 7]
 [15]
 [28]] <class 'numpy.ndarray'>
Dependent variable: [[ 30530]
 [ 90000]
 [159899]
 [270564]] <class 'numpy.ndarray'>


In [11]:
dataset = pd.DataFrame({'Age': data[:, 0], 'Mileage': data[:, 1],'Stopping Distance': data[:, 2]})
dataset

Unnamed: 0,Age,Mileage,Stopping Distance
0,5,30530,50
1,7,90000,79
2,15,159899,124
3,28,270564,300


In [45]:
X = dataset['Age']
y = dataset['Mileage']

In [46]:
X = X.to_frame()
X

Unnamed: 0,Age
0,5
1,7
2,15
3,28


In [49]:
model_2 = LinearRegression_w()
model_2.train(X, y)

x means:  {'Age': 13.75}
n_var_x:  {'Age': 326.75}
n_cov_xy:  {'Age': 3180773.25}
calculated w_n:  {'Age': 9734.57765876052}
calculated w0:  3897.8071920428483


In [5]:
def w0_estimate1D(x, y, w1):
    x_mean = np.mean(x, axis=0)
    y_mean = np.mean(y, axis=0)
    return float(y_mean - x_mean * w1)

def w1_estimate1D(x, y):
    x_mean = np.mean(x, axis=0)
    y_mean = np.mean(y, axis=0)
    ncov_xy = np.sum((x-x_mean)*(y-y_mean)) # OR (x1.T - x_mean).dot((y - y_mean))
    nvar_x = np.sum((x-x_mean)**2.)
    return ncov_xy/nvar_x

w1 = w1_estimate1D(X, y)
w0 = w0_estimate1D(X, y, w1)
print (w0, w1)


3897.8071920428483 9734.57765876052
