In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns 
%matplotlib inline

### Implement Linear Regression with Numpy only

In [2]:
# load the raw cars dataset from a CSV path given relative to this notebook
data = '../data/cars.csv'
df = pd.read_csv(data)
# preview the first two rows to check that the file was parsed as expected
df.head(2)

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650


In [6]:
import wrangle as wr

In [7]:
# normalise the column names with the project-local `wrangle` helper
# NOTE(review): judging by the outputs shown in this notebook the helper lower-cases the
# names and replaces spaces with underscores (string values look normalised the same
# way) — confirm against the wrangle module
df = wr.rename_columns(df)
# show the resulting column labels
df.columns

Index(['make', 'model', 'year', 'engine_fuel_type', 'engine_hp',
       'engine_cylinders', 'transmission_type', 'driven_wheels',
       'number_of_doors', 'market_category', 'vehicle_size', 'vehicle_style',
       'highway_mpg', 'city_mpg', 'popularity', 'msrp'],
      dtype='object')

__Train, validate, test split__

In [8]:
# fix the seed of NumPy's global random state so the shuffle below is repeatable
np.random.seed(2912)

n = len(df)

# split sizes: 20% validation, 20% test, whatever is left (about 60%) for training
n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - n_val - n_test

# row positions 0..n-1, shuffled in place
idx = np.arange(n)
np.random.shuffle(idx)

# reorder the rows of the dataframe according to the shuffled positions
df_shuffled = df.iloc[idx]

# cut the shuffled frame into three consecutive slices (train, validation, test)
# and give each slice its own copy with a fresh 0-based index
df_train, df_val, df_test = (
    df_shuffled.iloc[rows].copy().reset_index(drop=True)
    for rows in (
        slice(None, n_train),
        slice(n_train, n_train + n_val),
        slice(n_train + n_val, None),
    )
)

In [9]:
# preview the first two rows of the training split
df_train.head(2)

Unnamed: 0,make,model,year,engine_fuel_type,engine_hp,engine_cylinders,transmission_type,driven_wheels,number_of_doors,market_category,vehicle_size,vehicle_style,highway_mpg,city_mpg,popularity,msrp
0,mercedes-benz,350-class,1991,diesel,134.0,6.0,automatic,rear_wheel_drive,4.0,"diesel,luxury",large,sedan,23,19,617,2178
1,aston_martin,db9,2014,premium_unleaded_(required),510.0,12.0,automatic,rear_wheel_drive,2.0,"exotic,high-performance",midsize,coupe,19,13,259,185800


In [5]:
# sanity check: the three splits together hold exactly as many rows as the original dataframe
len(df) == len(df_train) + len(df_val) + len(df_test)

True

In [11]:
# pull the target (msrp) out of every split and apply log1p, i.e. log(1 + msrp)
y_train, y_val, y_test = (
    np.log1p(split.msrp.values) for split in (df_train, df_val, df_test)
)

# remove the target column in place so it cannot be used as a feature
for split in (df_train, df_val, df_test):
    del split['msrp']

### Linear Regression

For a single observation $x_i$ the prediction is $g(x_i) = w_0 + \sum_{j=0}^{n-1} w_j \cdot x_{ij}$, where:
- $w_0$ is the bias: if we know nothing about the features, the prediction is equal to this number. On a graph it is the `y-intercept`.
- $n$ is the number of features (we index from 0, as in an array, which is why the sum runs from $j=0$ to $n-1$; the usual mathematical convention is to count from 1 to $n$).
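- $w_j$ is the weight of the $j$-th feature. In the code the feature weights live in their own list `w`, separate from the bias `w0`, so `w[0]` is the weight of the first feature, not the bias.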
- $x_{ij}$ the $j$ -th feature of $i$ -th row

In [22]:
# made-up example values: the features of a single observation plus model parameters
xi = [453, 11, 86]
w0 = 5.45 # bias (intercept) term
w = [0.04, -0.1, 0.01] # weights, one per feature value in xi

In [23]:
def linear_regression_1(xi, bias=None, weights=None):
    """Predict the target for one observation with a plain-Python linear model.

    Computes ``bias + sum(weights[j] * xi[j])``, adding the terms one by one
    in feature order.

    Parameters
    ----------
    xi : sequence of numbers
        Feature values of a single observation.
    bias : number, optional
        Intercept term. When omitted, the notebook-level ``w0`` is used.
    weights : sequence of numbers, optional
        One weight per feature. When omitted, the notebook-level ``w`` is used.

    Returns
    -------
    number
        The prediction for ``xi``; equal to ``bias`` when ``xi`` is empty.

    Raises
    ------
    ValueError
        If ``xi`` and the weights do not have the same length.
    """
    # fall back to the notebook globals so existing `linear_regression_1(xi)` calls keep working
    bias = w0 if bias is None else bias
    weights = w if weights is None else weights

    # a length mismatch would otherwise drop weights silently or fail with an opaque IndexError
    if len(xi) != len(weights):
        raise ValueError(
            f"expected {len(weights)} feature values, got {len(xi)}"
        )

    # accumulate term by term, starting from the bias
    pred = bias
    for weight, value in zip(weights, xi):
        pred = pred + weight * value
    return pred

In [24]:
# run the hand-rolled model on the made-up observation and show the prediction
pred = linear_regression_1(xi)
pred

23.33

In [25]:
# convert the prediction from log(y + 1) space back to a price: expm1(x) = exp(x) - 1
price = np.expm1(pred)
display(price.round(2))
# expm1(x) and exp(x) - 1 are the same quantity mathematically but are computed
# differently, so compare them with a tolerance instead of exact float equality
np.isclose(price, np.exp(pred) - 1)

13554711010.88

True