In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the raw dataset from the working directory and preview its first rows.
car_df = pd.read_csv(filepath_or_buffer="car_fuel_efficiency.csv")
car_df.head(n=5)

In [None]:
# Keep only the four model inputs plus the regression target.
car_df_new = car_df.loc[
    :,
    [
        'engine_displacement',
        'horsepower',
        'vehicle_weight',
        'model_year',
        'fuel_efficiency_mpg',
    ],
]

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# Distribution of the target variable, 30 bins.
sns.histplot(data=car_df_new, x='fuel_efficiency_mpg', bins=30)

In [None]:
# Number of missing values per column.
car_df_new.isna().sum()

In [None]:
# Shuffle the row order reproducibly.
# Shuffling `car_df_new.values` in place is unreliable: the array returned by
# `.values` may be a temporary copy (the frame itself is then left unshuffled)
# or, under pandas Copy-on-Write, a read-only view. Shuffle positions instead
# and reindex the frame with them.
np.random.seed(42)
shuffled_idx = np.arange(len(car_df))
np.random.shuffle(shuffled_idx)
# Rebuild from `car_df` (same columns) so re-running this cell always yields
# the same permutation instead of permuting an already-shuffled frame.
car_df_new = car_df[list(car_df_new.columns)].iloc[shuffled_idx]

In [None]:
# 60% / 20% split sizes; the test set takes whatever rows remain.
n = len(car_df_new)
n_train, n_val = int(n * 0.6), int(n * 0.2)
n_test = n - (n_train + n_val)

In [None]:
# Cut the frame into three contiguous partitions and renumber each
# partition's index from zero.
df_train = car_df_new.iloc[:n_train].reset_index(drop=True)
df_val = car_df_new.iloc[n_train:n_train + n_val].reset_index(drop=True)
df_test = car_df_new.iloc[n_train + n_val:].reset_index(drop=True)

In [None]:
# Target vectors for each partition as NumPy arrays.
y_train, y_val, y_test = (
    part['fuel_efficiency_mpg'].values
    for part in (df_train, df_val, df_test)
)

In [None]:
def train_linear_regression(X, y):
    """Fit ordinary least squares via the normal equations.

    A column of ones is prepended to ``X`` so the first fitted weight acts as
    the intercept. The Gram matrix is inverted explicitly with
    ``np.linalg.inv``.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        Target values, one per row of ``X``.

    Returns
    -------
    tuple
        ``(w0, w)`` where ``w0`` is the intercept and ``w`` holds the
        remaining weights in the column order of ``X``.
    """
    features = np.array(X)
    target = np.array(y)

    # Prepend the bias column.
    bias_column = np.ones(features.shape[0])
    design = np.column_stack([bias_column, features])

    gram = design.T @ design
    weights = np.linalg.inv(gram) @ design.T @ target
    return weights[0], weights[1:]

In [None]:
# Training feature matrix with missing values replaced by 0.
x_train = (
    df_train.loc[:, ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']]
    .fillna(value=0)
    .to_numpy()
)

In [None]:
# Fit the unregularised model on the zero-filled training matrix.
w0, w = train_linear_regression(X=x_train, y=y_train)

In [None]:
# Predictions on the training rows: intercept plus weighted features.
y_pred = x_train.dot(w) + w0

In [None]:
# Overlay the predicted and actual target distributions on the same axes.
sns.histplot(data=y_pred)
sns.histplot(data=y_train)

In [None]:
def rmse(y, y_pred):
    """Root mean squared error between ``y`` and ``y_pred``, rounded to 2 decimals."""
    mean_sq_err = ((y - y_pred) ** 2).mean()
    root = np.sqrt(mean_sq_err)
    return round(root, 2)

In [None]:
# Training-set error of the model fitted above.
rmse(y=y_train, y_pred=y_pred)

In [None]:
# Same fit as before, but missing values are replaced by each column's
# training mean instead of zero.
cols = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']
X = df_train.loc[:, cols]
x_train_n = X.fillna(value=X.mean()).to_numpy()

w0, w = train_linear_regression(X=x_train_n, y=y_train)

y_pred = x_train_n.dot(w) + w0

In [None]:
# Training-set error of the mean-imputed model.
rmse(y=y_train, y_pred=y_pred)

In [None]:

def prepare_X(df, cols, type):
    """Select feature columns from ``df`` and impute their missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    cols : list
        Column labels to keep, in output order.
    type : str
        ``'mean'`` fills each selected column with that column's own mean,
        ``'zero'`` fills with 0. Any other value leaves missing values as-is.

    Returns
    -------
    numpy.ndarray
        The selected (and possibly imputed) columns as an array.
    """
    df_num = df[cols]
    if type == 'mean':
        # Average only the selected columns: taking the mean of the whole
        # frame fails when `df` carries non-numeric columns and computes
        # statistics for columns that are discarded anyway.
        df_num = df_num.fillna(df_num.mean())
    elif type == 'zero':
        df_num = df_num.fillna(0)
    return df_num.values

In [None]:
# Validation RMSE of the unregularised model.
# prepare_X takes the feature list and the fill strategy as required
# arguments; calling it with only the frame raises TypeError.
# NOTE(review): 'zero' is assumed here; pass 'mean' to evaluate the
# mean-imputation variant instead.
x_train_n = prepare_X(df_train, cols, 'zero')
w0, w = train_linear_regression(x_train_n, y_train)

X_val = prepare_X(df_val, cols, 'zero')
y_pred = w0 + X_val.dot(w)

rmse(y_val, y_pred)

In [None]:
def train_linear_regression_reg(X, y, r=0.01):
    """Fit linear regression with ``r`` added to the Gram matrix diagonal.

    A column of ones is prepended to ``X`` so the first weight is the
    intercept. ``r`` is added to every diagonal entry of the Gram matrix,
    including the one belonging to the intercept column.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        Target values.
    r : float, default 0.01
        Amount added to the diagonal before inversion.

    Returns
    -------
    tuple
        ``(w0, w)``: the intercept and the remaining weights.
    """
    features = np.array(X)
    target = np.array(y)
    design = np.column_stack([np.ones(features.shape[0]), features])

    gram = design.T.dot(design)
    penalised = gram + r * np.eye(gram.shape[0])

    weights = np.linalg.inv(penalised).dot(design.T).dot(target)
    return weights[0], weights[1:]

In [None]:
# Sweep the regularisation strength and report the validation RMSE.
# prepare_X requires the feature list and fill strategy (the one-argument
# call raises TypeError). The matrices do not depend on `r`, so they are
# built once outside the loop.
x_train_n = prepare_X(df_train, cols, 'zero')
X_val = prepare_X(df_val, cols, 'zero')

for r in [0, 0.01, 1, 10, 100]:
    w0, w = train_linear_regression_reg(x_train_n, y_train, r)

    y_pred = w0 + X_val.dot(w)

    score = rmse(y_val, y_pred)

    print(r, w0, score)

In [None]:
# Root mean squared error (unrounded)
def rmse(y, y_pred):
    """Return the root mean squared error between ``y`` and ``y_pred``."""
    residual = y_pred - y
    return np.sqrt((residual * residual).mean())

In [None]:
def prepare_X(df, feature, fillnan_with):
    """Return ``df`` as an array with missing values in ``feature`` filled.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; a copy is modified, the caller's frame is untouched.
    feature : label or list of labels
        Column(s) whose missing values are filled.
    fillnan_with : str
        ``'mean'`` fills with the column mean, ``'zero'`` fills with 0.
        Any other value leaves the data unchanged.

    Returns
    -------
    numpy.ndarray
        All columns of the (possibly imputed) copy.
    """
    df_copy = df.copy()
    # Assign the filled column back explicitly. Calling
    # `df_copy[feature].fillna(..., inplace=True)` acts on an intermediate
    # object and is not guaranteed to update `df_copy` (it has no effect
    # under pandas Copy-on-Write).
    if fillnan_with == 'mean':
        df_copy[feature] = df_copy[feature].fillna(value=df_copy[feature].mean())
    elif fillnan_with == 'zero':
        df_copy[feature] = df_copy[feature].fillna(value=0)

    return df_copy.values

In [None]:


def split_data(df, target_column, train_size = 0.6,
               val_size = 0.2, seed = 42):
    """Shuffle ``df`` and split it into train/val/test features and targets.

    Row positions are shuffled with NumPy's global generator seeded by
    ``seed``. The first ``int(train_size * n)`` shuffled rows form the
    training set, the next ``int(val_size * n)`` the validation set, and the
    remainder the test set. The feature columns are fixed to engine
    displacement, horsepower, vehicle weight and model year; missing
    ``horsepower`` values are filled with zero through ``prepare_X``.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset, including the target column.
    target_column : str
        Name of the column to predict.
    train_size, val_size : float
        Fractions of rows for the training and validation sets.
    seed : int
        Seed passed to ``np.random.seed`` before shuffling.

    Returns
    -------
    tuple of dict
        ``(X, Y)``, each keyed by ``'train'``, ``'val'`` and ``'test'``;
        ``X`` holds feature arrays and ``Y`` the matching target arrays.
    """
    n = len(df)
    n_train = int(train_size*n)
    n_val = int(val_size*n)

    # Reproducible permutation of row positions.
    np.random.seed(seed)
    idx = np.arange(n)
    np.random.shuffle(idx)
    df_shuffled = df.iloc[idx]

    col = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']

    # Positional boundaries of each partition within the shuffled frame.
    spans = {
        'train': slice(None, n_train),
        'val': slice(n_train, n_train + n_val),
        'test': slice(n_train + n_val, None),
    }

    X = {}
    Y = {}
    for name, span in spans.items():
        part = df_shuffled.iloc[span].copy()
        # Detach the target from the features.
        Y[name] = part.pop(target_column).values
        # Fill missing values with zeros
        X[name] = prepare_X(part[col], 'horsepower', 'zero')

    return X,Y

In [None]:
# Measure how much the validation RMSE varies with the shuffle seed.
seeds = list(range(10))

errors = []
for seed in seeds:
    X, Y = split_data(df=car_df, target_column='fuel_efficiency_mpg', seed=seed)
    w0, w = train_linear_regression(X['train'], Y['train'])

    Y_pred = X['val'].dot(w) + w0
    error = rmse(Y['val'], Y_pred)

    print('%10s' % seed, round(error, 3))
    errors.append(error)

print('Std =', round(np.std(errors), 3))

In [None]:
# Re-split with seed 9 for the final evaluation.
X, Y = split_data(car_df, 'fuel_efficiency_mpg', seed=9)

In [None]:
# Stack the training and validation partitions row-wise for the final fit.
X_train = np.concatenate((X['train'], X['val']), axis=0)
Y_train = np.concatenate((Y['train'], Y['val']), axis=0)

In [None]:

def train_linear_regression_reg(X, y, r=0.001):
    """Regularised least-squares fit via the normal equations.

    Prepends an intercept column of ones to ``X``, adds ``r`` to every
    diagonal element of the Gram matrix (the intercept's entry included),
    inverts it, and returns the fitted weights.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        Target values.
    r : float, default 0.001
        Amount added to the Gram matrix diagonal.

    Returns
    -------
    tuple
        ``(w0, w)``: the intercept and the per-feature weights.
    """
    X = np.array(X)
    y = np.array(y)

    with_bias = np.column_stack([np.ones(X.shape[0]), X])
    gram = with_bias.T @ with_bias
    ridge_term = r * np.eye(gram.shape[0])

    solution = np.linalg.inv(gram + ridge_term) @ with_bias.T @ y
    intercept, coefficients = solution[0], solution[1:]
    return intercept, coefficients

In [None]:
# Fit on train + validation, then score the held-out test partition.
w0, w = train_linear_regression_reg(X_train, Y_train, r=0.001)

Y_pred = X['test'].dot(w) + w0

score = rmse(Y['test'], Y_pred)
print('RMSE on test set = ', round(score, 4))