# Imports & Data Load

In [50]:
import pandas as pd
import numpy as np

In [2]:
# Read the car fuel-efficiency CSV straight from GitHub into a DataFrame
DATA_URL = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv'
data = pd.read_csv(DATA_URL)

# Q1 -> pandas version

In [None]:
# Q1 -> version string of the installed pandas (this run reported '2.3.2')
pd.__version__

'2.3.2'

as shown, pandas version is '2.3.2'

# Q2 -> Records Count

In [None]:
# Q2 -> shape is (rows, columns); the first entry is the record count (9704 in this run)
data.shape

(9704, 11)

as shown, there are 9704 rows (records) in the dataframe

# Q3 -> Fuel types

In [20]:
# Q3 -> number of distinct fuel_type values; Series.unique() keeps NaN as its own
# entry, so a missing value (if any) would be counted as a separate category
len(data['fuel_type'].unique())

2

as shown, there are 2 unique fuel types

# Q4 -> Missing Values

In [None]:
# Q4 -> count the missing (NaN) entries in every column; the columns with a
# non-zero count are the ones that contain missing values
missing_per_column = data.isna().sum()
missing_per_column

engine_displacement      0
num_cylinders          482
horsepower             708
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64

as shown, there are 4 columns containing missing values.

# Q5 -> Max Fuel Efficiency

In [21]:
# number of rows for each distinct value of 'origin'
data['origin'].value_counts()

origin
Europe    3254
Asia      3247
USA       3203
Name: count, dtype: int64

In [None]:
# Keep only the rows whose origin is 'Asia', as an independent copy of `data`
# (this frame is reused later for the final question)
from_asia = data['origin'].eq('Asia')
data_asia = data.loc[from_asia].copy()

In [37]:
# Q5 -> largest fuel_efficiency_mpg among the Asia-origin rows
data_asia['fuel_efficiency_mpg'].max()

23.759122836520497

As shown above, the maximum fuel efficiency for cars from Asia is approximately 23.76 mpg (23.759…).

# Q6 -> median value of horsepower

In [28]:
# Q6 -> median and most frequent value of horsepower on the original data.
# mode() returns a Series (there can be several modes); [0] takes its first entry.
horsepower = data['horsepower']
median_hp, mode_hp = horsepower.median(), horsepower.mode()[0]
print(f"median horsepower: {median_hp}, mode horsepower: {mode_hp}")

median horsepower: 149.0, mode horsepower: 152.0


In [32]:
# Replace the missing horsepower entries with the most frequent value (mode_hp).
# assign() returns a new DataFrame, so the original `data` keeps its NaNs.
data_filled = data.assign(horsepower=data['horsepower'].fillna(mode_hp))

In [35]:
# median of horsepower in the filled copy (missing values replaced by the mode)
updated_median_hp = data_filled['horsepower'].median()
print(f"updated median horsepower: {updated_median_hp}")

updated median horsepower: 152.0


In [34]:
# True when the median after filling is larger than the original median
updated_median_hp > median_hp

True

As shown, after filling the missing rows with the most frequent value (the mode, 152.0), the median of the 'horsepower' column has increased compared to the original data (from 149.0 to 152.0).

# Q7 -> Sum of Weights
(and reuses the `data_asia` that we created in Q5)

In [46]:
# Build X: the vehicle_weight and model_year columns of the first 7 rows of
# data_asia, converted to a NumPy array
filtered_data = data_asia[['vehicle_weight', 'model_year']].head(7)
X = filtered_data.to_numpy()

In [47]:
# transpose of X (rows and columns swapped)
X_T = np.transpose(X)

In [64]:
# matrix-matrix product X^T X (np.matmul is the function form of the @ operator)
XTX = np.matmul(X_T, X)

In [65]:
# display X^T X (2x2: one row/column per selected feature)
XTX

array([[62248334.33150762, 41431216.5073268 ],
       [41431216.5073268 , 28373339.        ]])

In [66]:
# invert XTX to obtain (X^T X)^-1
XTX_inv = np.linalg.inv(XTX)

In [55]:
# target vector y: one value for each of the 7 rows of X
y = [
    1100, 1300, 800, 900,
    1000, 1100, 1200,
]

In [69]:
# weights w = (X^T X)^-1 X^T y, evaluated left to right in two steps.
# Mathematically this equals XTX_inv @ (X_T @ y), because matrix multiplication is associative.
XTX_inv_XT = XTX_inv @ X_T
w = XTX_inv_XT @ y

In [70]:
# display the weight vector (one weight per column of X)
w

array([0.01386421, 0.5049067 ])

In [71]:
# sum of the weights; use NumPy's own reduction instead of Python's built-in sum(),
# which walks the array element by element in the interpreter
w.sum()

np.float64(0.5187709081074023)

As shown above, the sum of the weights is approximately `0.519` (≈ `0.52` when rounded to two decimals; `0.51` only if truncated).