In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


In [3]:
data=pd.read_csv('car_fuel_efficiency.csv')

In [4]:
df=pd.DataFrame(data)

In [4]:
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [7]:
df.columns

Index(['engine_displacement', 'num_cylinders', 'horsepower', 'vehicle_weight',
       'acceleration', 'model_year', 'origin', 'fuel_type', 'drivetrain',
       'num_doors', 'fuel_efficiency_mpg'],
      dtype='object')

In [7]:
cols=['engine_displacement','horsepower', 'vehicle_weight','model_year','fuel_efficiency_mpg']
df=df[cols]
df

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
0,170,159.0,3413.433759,2003,13.231729
1,130,97.0,3149.664934,2007,13.688217
2,170,78.0,3079.038997,2018,14.246341
3,220,,2542.392402,2009,16.912736
4,210,140.0,3460.870990,2009,12.488369
...,...,...,...,...,...
9699,140,164.0,2981.107371,2013,15.101802
9700,180,154.0,2439.525729,2004,17.962326
9701,220,138.0,2583.471318,2008,17.186587
9702,230,177.0,2905.527390,2011,15.331551


In [10]:
df.isna().sum()

engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64

In [11]:
median_hp=df.horsepower.median()
print(median_hp)

149.0


In [8]:
np.random.seed(42)
n=len(df)
print('length of df:',n)


n_test=int(0.2*n)
n_val=int(0.2*n)
n_train=n-(n_val+n_test)
print('split data length:',n_train,n_val,n_test)

idx=np.arange(n)
np.random.shuffle(idx)
df_shuffled=df.iloc[idx]
df_shuffled.head()

df_train= df_shuffled.iloc[:n_train].copy()
df_val=df_shuffled.iloc[n_train:n_train+n_val].copy()
df_test=df_shuffled.iloc[n_train+n_val:].copy()

print(len(df_train),len(df_val),len(df_test))


length of df: 9704
split data length: 5824 1940 1940
5824 1940 1940


In [9]:
df_train.iloc[10]

engine_displacement     240.000000
horsepower              159.000000
vehicle_weight         2916.987921
model_year             2001.000000
fuel_efficiency_mpg      15.066416
Name: 311, dtype: float64

In [10]:
def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])
    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    return w_full[0], w_full[1:]

In [15]:
base = ['engine_displacement','horsepower', 'vehicle_weight','model_year','fuel_efficiency_mpg']
target = 'fuel_efficiency_mpg'


In [None]:
X_train = df_train[base].fillna(0).values
y_train = df_train[target].values

w0, w = train_linear_regression(X_train, y_train)

X_val = df_val[base].fillna(0).values
y_val = df_val[target].values

y_pred = w0 + X_val.dot(w)
rmse_0 = round(np.sqrt(mean_squared_error(y_val, y_pred)), 2)

print("RMSE (fillna 0):", rmse_0)


RMSE (fillna 0): 0.0


In [None]:

mean_hp = df_train['horsepower'].mean()
X_train = df_train[base].fillna({'horsepower': mean_hp}).values
y_train = df_train[target].values

w0, w = train_linear_regression(X_train, y_train)


X_val = df_val[base].fillna({'horsepower': mean_hp}).values
y_val = df_val[target].values

y_pred = w0 + X_val.dot(w)
rmse_mean = round(np.sqrt(mean_squared_error(y_val, y_pred)), 2)

print("RMSE (fillna mean):", rmse_mean)



RMSE (fillna mean): 0.0


In [None]:
if rmse_0 < rmse_mean:
    print("Filling with 0 gives better RMSE")
else:
    print("Filling with mean gives better RMSE")


✅ Filling with mean gives better RMSE


In [None]:
def train_linear_regression_reg(X, y, r=0.0):
    X = np.column_stack([np.ones(len(X)), X]) 
    XTX = X.T.dot(X)
    XTX = XTX + r * np.eye(XTX.shape[0])       
    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    return w_full[0], w_full[1:]

X_train = df_train[base].fillna(0).values
y_train = df_train[target].values

X_val = df_val[base].fillna(0).values
y_val = df_val[target].values

r_values = [0, 0.01, 0.1, 1, 5, 10, 100]
scores = []

for r in r_values:
    w0, w = train_linear_regression_reg(X_train, y_train, r=r)
    y_pred = w0 + X_val.dot(w)
    rmse = round(np.sqrt(mean_squared_error(y_val, y_pred)), 2)
    scores.append((r, rmse))

scores

best_r, best_rmse = min(scores, key=lambda x: (x[1], x[0]))

print("RMSE scores:", scores)
print("✅ Best r:", best_r)
print("✅ Best RMSE:", best_rmse)

RMSE scores: [(0, np.float64(0.0)), (0.01, np.float64(0.0)), (0.1, np.float64(0.0)), (1, np.float64(0.0)), (5, np.float64(0.0)), (10, np.float64(0.0)), (100, np.float64(0.03))]
✅ Best r: 0
✅ Best RMSE: 0.0


In [None]:
def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])
    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    return w_full[0], w_full[1:]


seeds = [0,1,2,3,4,5,6,7,8,9]
rmse_scores = []

    

X_train = df_train[base].fillna(0).values
y_train = df_train[target].values
X_val   = df_val[base].fillna(0).values
y_val   = df_val[target].values


w0, w = train_linear_regression(X_train, y_train)
y_pred = w0 + X_val.dot(w)

rmse = np.sqrt(mean_squared_error(y_val, y_pred))
rmse_scores.append(rmse)


std_rmse = round(np.std(rmse_scores), 3)
print("RMSE scores for all seeds:", [round(x,2) for x in rmse_scores])
print("Standard deviation of RMSEs:", std_rmse)

RMSE scores for all seeds: [np.float64(0.0)]
Standard deviation of RMSEs: 0.0
