In [103]:
import numpy as np
import pandas as pd

In [104]:
df = pd.read_csv('car_fuel_efficiency.csv')
df = df[['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year', 'fuel_efficiency_mpg']]

In [105]:
df

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
0,170,159.0,3413.433759,2003,13.231729
1,130,97.0,3149.664934,2007,13.688217
2,170,78.0,3079.038997,2018,14.246341
3,220,,2542.392402,2009,16.912736
4,210,140.0,3460.870990,2009,12.488369
...,...,...,...,...,...
9699,140,164.0,2981.107371,2013,15.101802
9700,180,154.0,2439.525729,2004,17.962326
9701,220,138.0,2583.471318,2008,17.186587
9702,230,177.0,2905.527390,2011,15.331551


### QUESTION 1

In [106]:
df.isnull().sum()

engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64

### QUESTION 2

In [107]:
df.horsepower.median()

np.float64(149.0)

### QUESTION 3

In [108]:
shuffled_df = df.sample(frac = 1, random_state = 42)

shuffled_df

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
483,220,144.0,2535.887591,2009,16.642943
7506,160,141.0,2741.170484,2019,16.298377
8795,230,155.0,2471.880237,2017,18.591822
1688,150,206.0,3748.164469,2015,11.818843
6217,300,111.0,2135.716359,2006,19.402209
...,...,...,...,...,...
5734,210,163.0,1972.029124,2011,19.961672
5191,160,126.0,3011.588014,2009,14.651056
5390,290,187.0,2440.508039,2019,18.404435
860,260,129.0,1865.404480,2019,20.502460


In [109]:
n = len(shuffled_df)

n_test = int(0.2 * n)
n_validation = int(0.2 * n)
n_train = n - n_test - n_validation

n, n_train, n_test, n_validation

(9704, 5824, 1940, 1940)

In [110]:
train_df = shuffled_df.iloc[:n_train]
validation_df = shuffled_df.iloc[n_train:n_train + n_test]
test_df = shuffled_df.iloc[n_train + n_test:]

len(train_df), len(validation_df), len(test_df)

(5824, 1940, 1940)

In [111]:
y_train = np.log1p(train_df.fuel_efficiency_mpg.values)
y_validation = np.log1p(validation_df.fuel_efficiency_mpg.values)
y_test = np.log1p(test_df.fuel_efficiency_mpg.values)

del train_df['fuel_efficiency_mpg']
del validation_df['fuel_efficiency_mpg']
del test_df['fuel_efficiency_mpg']

y_train, y_validation, y_test

(array([2.8703359 , 2.85061269, 2.97511223, ..., 2.92279789, 2.99201277,
        2.94261017], shape=(5824,)),
 array([2.79522685, 2.74734667, 2.58377354, ..., 2.68192023, 2.69011317,
        2.82233377], shape=(1940,)),
 array([2.82163803, 2.72604557, 2.54540897, ..., 2.96550163, 3.06816735,
        2.88011043], shape=(1940,)))

In [112]:
def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    
    return w_full[0], w_full[1:]

In [113]:
def rmse(y, y_pred):
    error = y_pred - y
    mse = (error ** 2).mean()
    
    return np.sqrt(mse)

### FILL WITH ZEROS

In [114]:
train_df.dtypes

engine_displacement      int64
horsepower             float64
vehicle_weight         float64
model_year               int64
dtype: object

In [115]:
base = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']

def prepare_X(df):
    df_num = df[base]
    df_num = df_num.fillna(0)
    X = df_num.values
    return X

In [116]:
X_train = prepare_X(train_df)
X_validation = prepare_X(validation_df)
# X_test = prepare_X(test_df)

In [117]:
w0, w = train_linear_regression(X_train, y_train)
w0 + X_train.dot(w)

array([2.90981589, 2.84278709, 2.93310627, ..., 2.92329662, 3.00305539,
       2.94404905], shape=(5824,))

In [118]:
y_prediction = w0 + X_validation.dot(w)

score = rmse(y_validation, y_prediction)
round(score, 2)

np.float64(0.04)

### FILL WITH MEAN

In [119]:
train_df.isnull().sum()

engine_displacement      0
horsepower             429
vehicle_weight           0
model_year               0
dtype: int64

In [127]:
def prepare_X_m(df):
    train_mean = X_train.mean()
    
    df_num = df[base]
    df_num = df_num.fillna(train_mean)
    X = df_num.values
    
    return X

In [128]:
X_train_m = prepare_X_m(train_df)

w0_m, w_m = train_linear_regression(X_train_m, y_train)
w0_m + X_train_m.dot(w_m)

array([2.90799649, 2.84185237, 2.92905193, ..., 2.9199751 , 2.99901547,
       2.94051176], shape=(5824,))

In [129]:
y_prediction_m = w0_m + X_validation.dot(w_m)

score_m = rmse(y_validation, y_prediction_m)
round(score_m, 2)

np.float64(0.04)

### QUESTION 4

In [124]:
def train_linear_regression_reg(X, y, r=0.01):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    reg = r * np.eye(XTX.shape[0])
    XTX = XTX + reg

    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

In [158]:
X_train_reg = prepare_X(train_df)

w0_reg, w_reg = train_linear_regression_reg(X_train_reg, y_train, r=0)
w0_reg + X_train_reg.dot(w_reg)

array([2.90799649, 2.84185237, 2.92905193, ..., 2.9199751 , 2.99901547,
       2.94051176], shape=(5824,))

In [159]:
y_prediction_reg = w0_reg + X_validation.dot(w_reg)

score_reg = rmse(y_validation, y_prediction_reg)
#round(score_reg, 2)
score_reg

np.float64(0.041743405758080075)

### QUESTION 6

In [169]:
shuffled_df = df.sample(frac = 1, random_state = 9)

train_df = shuffled_df.iloc[:n_train]
validation_df = shuffled_df.iloc[n_train:n_train + n_test]
full_df = pd.concat([train_df, validation_df])
test_df = shuffled_df.iloc[n_train + n_test:]

y_full = np.concatenate([y_train, y_validation])

In [168]:
X_train_full = prepare_X(full_df)
w0_full, w_full = train_linear_regression_reg(X_train_full, y_full, r = 0.001)

w0_full + X_train_full.dot(w_full)

array([2.75298844, 2.75672542, 2.75746576, ..., 2.75626902, 2.75938058,
       2.75814461], shape=(7764,))

In [171]:
y_prediction_full = w0_full + X_train_full.dot(w_full)

score_full = rmse(y_full, y_prediction_full)
round(score_full, 2)

np.float64(0.17)