In [312]:
import numpy as np
import pandas as pd
df = pd.read_csv("../Data/car_fuel_efficiency.csv")

In [313]:
df = df[[
    'engine_displacement',
    'horsepower',
    'vehicle_weight',
    'model_year',
    'fuel_efficiency_mpg'
]]

In [314]:
df.isnull().sum()

engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64

In [315]:
for col in df.columns:
    print(col)
    print(df[col].unique()[:5])
    print(df[col].nunique())
    print()

engine_displacement
[170 130 220 210 190]
36

horsepower
[159.  97.  78.  nan 140.]
192

vehicle_weight
[3413.43375861 3149.66493422 3079.03899737 2542.39240183 3460.87098999]
9704

model_year
[2003 2007 2018 2009 2008]
24

fuel_efficiency_mpg
[13.23172891 13.68821744 14.246341   16.9127356  12.48836912]
9704



In [316]:
df.horsepower.describe()

count    8996.000000
mean      149.657292
std        29.879555
min        37.000000
25%       130.000000
50%       149.000000
75%       170.000000
max       271.000000
Name: horsepower, dtype: float64

In [317]:
# Shuffle the DataFrame with seed 42
df = df.sample(frac=1, random_state=9).reset_index(drop=True)

In [318]:
df.horsepower.mean()

np.float64(149.65729212983547)

In [319]:
df

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
0,240,136.0,4050.512290,2006,10.304700
1,190,141.0,3195.866942,2017,13.479434
2,200,184.0,3006.164370,2017,16.190653
3,200,,2947.143980,2022,15.446503
4,250,132.0,3114.371978,2001,14.472172
...,...,...,...,...,...
9699,130,190.0,2929.265698,2018,16.408894
9700,270,127.0,3489.985764,2017,12.502945
9701,240,177.0,3348.840052,2010,13.245621
9702,250,150.0,2985.704630,2022,15.215270


In [320]:
df['horsepower'] = df['horsepower'].fillna(0)

In [321]:
df

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
0,240,136.0,4050.512290,2006,10.304700
1,190,141.0,3195.866942,2017,13.479434
2,200,184.0,3006.164370,2017,16.190653
3,200,0.0,2947.143980,2022,15.446503
4,250,132.0,3114.371978,2001,14.472172
...,...,...,...,...,...
9699,130,190.0,2929.265698,2018,16.408894
9700,270,127.0,3489.985764,2017,12.502945
9701,240,177.0,3348.840052,2010,13.245621
9702,250,150.0,2985.704630,2022,15.215270


In [322]:
n = len(df)
n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n- (n_val+n_test)


In [323]:
n, n_val+ n_test+ n_train

(9704, 9704)

In [324]:
df.iloc[[0, 1, 2]]

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
0,240,136.0,4050.51229,2006,10.3047
1,190,141.0,3195.866942,2017,13.479434
2,200,184.0,3006.16437,2017,16.190653


In [325]:
df_train = df.iloc[n_train:]
df_val = df.iloc[n_train: n_train+ n_val]
df_test = df.iloc[n_train+n_val:]



In [326]:
df_train

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
5824,180,175.0,2775.381743,2014,15.931657
5825,160,166.0,3542.655183,2012,13.130355
5826,250,173.0,1354.787120,2007,22.858156
5827,260,75.0,2531.997079,2005,16.399893
5828,250,175.0,2609.369103,2004,17.464552
...,...,...,...,...,...
9699,130,190.0,2929.265698,2018,16.408894
9700,270,127.0,3489.985764,2017,12.502945
9701,240,177.0,3348.840052,2010,13.245621
9702,250,150.0,2985.704630,2022,15.215270


In [327]:
# df_val_zero = df_val['horsepower'].fillna(0)
# df_test_zero = df_test['horsepower'].fillna(0)
# df_train_zero = df_train['horsepower'].fillna(0)

# df_val_mean = df_val['horsepower'].fillna(149.657292)
# df_test_mean = df_test['horsepower'].fillna(149.657292)
# df_train_mean = df_train['horsepower'].fillna(149.657292)

In [328]:
df_train.columns

Index(['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year',
       'fuel_efficiency_mpg'],
      dtype='object')

In [329]:


y_train_orig = df_train.fuel_efficiency_mpg.values
y_val_orig = df_val.fuel_efficiency_mpg.values
y_test_orig = df_val.fuel_efficiency_mpg.values

y_train = np.log1p(df_train.fuel_efficiency_mpg.values)
y_val = np.log1p(df_val.fuel_efficiency_mpg.values)
y_test = np.log1p(df_val.fuel_efficiency_mpg.values)


del df_train['fuel_efficiency_mpg']
del df_val['fuel_efficiency_mpg']
del df_test['fuel_efficiency_mpg']

In [330]:


def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]



In [331]:
base = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']

In [332]:


def prepare_X(df, f):
    df_num = df[base]
    df_num = df_num.fillna(f)
    X = df_num.values
    return X



In [333]:
X_train = prepare_X(df_train, f)
w_0, w = train_linear_regression(X_train, y_train)

In [334]:


y_pred = w_0 + X_train.dot(w)



## RMSE

In [335]:
def rmse(y, y_pred):
    error = y-y_pred
    se = error ** 2
    mse = se.mean()
    return np.sqrt(mse)

In [336]:
rmse(y_train, y_pred)

np.float64(0.038879474264375824)

In [337]:


X_val = prepare_X(df_val)
y_pred = w_0 + X_val.dot(w)



In [338]:


rmse(y_val, y_pred)



np.float64(0.03859510526930455)

In [339]:
def train_linear_regression_reg(X, y, r=0.0):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    reg = r * np.eye(XTX.shape[0])
    XTX = XTX + reg

    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

In [340]:


X_train = prepare_X(df_train)



In [341]:


for r in [0, 0.001, 0.01, 0.1, 1, 10, 100]:
    w_0, w = train_linear_regression_reg(X_train, y_train, r=r)
    print('%5s, %.2f, %.2f, %.2f' % (r, w_0, w[1], w[2]))





    0, 3.62, 0.00, -0.00
0.001, 3.54, 0.00, -0.00
 0.01, 2.93, 0.00, -0.00
  0.1, 1.09, 0.00, -0.00
    1, 0.15, 0.00, -0.00
   10, 0.02, 0.00, -0.00
  100, 0.00, 0.00, -0.00


In [342]:
X_train = prepare_X(df_train)
w_0, w = train_linear_regression_reg(X_train, y_train, r=0)

y_pred = w_0 + X_train.dot(w)
print('train', rmse(y_train, y_pred))

X_val = prepare_X(df_val)
y_pred = w_0 + X_val.dot(w)
print('val', rmse(y_val, y_pred))

train 0.038879474264375824
val 0.03859510526930455


In [343]:
X_train = prepare_X(df_train)
w_0, w = train_linear_regression_reg(X_train, y_train, r=0.01)

y_pred = w_0 + X_train.dot(w)
print('train', rmse(y_train, y_pred))

X_val = prepare_X(df_val)
y_pred = w_0 + X_val.dot(w)
print('val', rmse(y_val, y_pred))

train 0.03894596509366817
val 0.038639966179458966


In [344]:
X_train = prepare_X(df_train)
X_val = prepare_X(df_val)

for r in [0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10, 100]:
    w_0, w = train_linear_regression_reg(X_train, y_train, r=r)
    y_pred = w_0 + X_val.dot(w)
    print('%6s' %r, round(rmse(y_val, y_pred), 3))


 1e-06 0.039
0.0001 0.039
 0.001 0.039
  0.01 0.039
   0.1 0.039
     1 0.04
     5 0.04
    10 0.04
   100 0.04


In [345]:
X_test = prepare_X(df_test)
y_pred = w_0 + X_test.dot(w)
print('test:', rmse(y_test, y_pred))

test: 0.2267282632640925


In [346]:
df = pd.read_csv("../Data/car_fuel_efficiency.csv")

In [347]:
df = df[[
    'engine_displacement',
    'horsepower',
    'vehicle_weight',
    'model_year',
    'fuel_efficiency_mpg'
]]

In [348]:
df['horsepower'] = df['horsepower'].fillna(df.horsepower.mean())

In [349]:
# Shuffle the DataFrame with seed 42
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [350]:
n = len(df)
n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = int(n * 0.6)


In [351]:
df_train = df.iloc[n_train:]
df_val = df.iloc[n_train: n_train+ n_val]
df_test = df.iloc[n_train+n_val:]

In [352]:


y_train_orig = df_train.fuel_efficiency_mpg.values
y_val_orig = df_val.fuel_efficiency_mpg.values
y_test_orig = df_val.fuel_efficiency_mpg.values

y_train = np.log1p(df_train.fuel_efficiency_mpg.values)
y_val = np.log1p(df_val.fuel_efficiency_mpg.values)
y_test = np.log1p(df_val.fuel_efficiency_mpg.values)

del df_train['fuel_efficiency_mpg']
del df_val['fuel_efficiency_mpg']
del df_test['fuel_efficiency_mpg']

In [353]:
X_train = prepare_X(df_train)
w_0, w = train_linear_regression(X_train, y_train)

In [354]:
y_pred = w_0 + X_train.dot(w)

In [355]:
rmse(y_train, y_pred)

np.float64(0.036067282910292466)

In [356]:

X_val = prepare_X(df_val)
y_pred = w_0 + X_val.dot(w)

In [357]:
rmse(y_val, y_pred)

np.float64(0.03732890159944149)

In [358]:
X_train = prepare_X(df_train)

In [359]:

for r in [0, 0.001, 0.01, 0.1, 1, 10, 100]:
    w_0, w = train_linear_regression_reg(X_train, y_train, r=r)
    print('%5s, %.2f, %.2f, %.2f' % (r, w_0, w[1], w[2]))
    y_pred = w_0 + X_train.dot(w)
    print('train', rmse(y_train, y_pred))

    X_val = prepare_X(df_val)
    y_pred = w_0 + X_val.dot(w)
    print('val', rmse(y_val, y_pred))

    0, 3.54, 0.00, -0.00
train 0.036067282910292466
val 0.03732890159944149
0.001, 3.45, 0.00, -0.00
train 0.036068288768327364
val 0.037327828861128294
 0.01, 2.86, 0.00, -0.00
train 0.03613619398943273
val 0.037379419813634525
  0.1, 1.05, 0.00, -0.00
train 0.03698849629510014
val 0.03817104624641738
    1, 0.14, 0.00, -0.00
train 0.03776735444438928
val 0.03891495746073317
   10, 0.01, 0.00, -0.00
train 0.0378956901031735
val 0.039038191547834944
  100, 0.00, 0.00, -0.00
train 0.03790931101763537
val 0.0390512790694323


In [360]:
X_train = prepare_X(df_train)
w_0, w = train_linear_regression_reg(X_train, y_train, r=0)

y_pred = w_0 + X_train.dot(w)
print('train', rmse(y_train, y_pred))

X_val = prepare_X(df_val)
y_pred = w_0 + X_val.dot(w)
print('val', rmse(y_val, y_pred))

train 0.036067282910292466
val 0.03732890159944149
