<a href="https://colab.research.google.com/github/oleksiyo/machine-learning-zoomcamp/blob/master/cohorts/2025/01-intro/Homework_01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import packages

In [3]:
import pandas as pd
import numpy as np
import matplotlib
import seaborn as sns

# Downloading the dataset

In [4]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv

--2025-10-01 14:16:46--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 874188 (854K) [text/plain]
Saving to: ‘car_fuel_efficiency.csv.1’


2025-10-01 14:16:48 (17.9 MB/s) - ‘car_fuel_efficiency.csv.1’ saved [874188/874188]



# Loading the dataset as a DataFrame

In [5]:
df = pd.read_csv("/content/car_fuel_efficiency.csv")
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


# Q1. Pandas version

In [8]:
pd.__version__

'2.2.2'

# Q2. Records count

In [6]:
rows, cols = df.shape
print(f"The dataset contains {rows} records and {cols} features.")

The dataset contains 9704 records and 11 features.


# Q3. Fuel types

In [7]:
fuel_types = df['fuel_type'].unique()
print(f"There are {len(fuel_types)} fuel types in the dataset: {', '.join(fuel_types)}")

There are 2 fuel types in the dataset: Gasoline, Diesel


# Q4. Missing values

In [8]:
nan_counts = df.isna().sum()
nan_counts[nan_counts > 0]
print(f"The dataset contains {len(nan_counts[nan_counts > 0])} missing values in total.")

The dataset contains 4 missing values in total.


# Q5. Max fuel efficiency

In [19]:
df_asia = df[df['origin'] == 'Asia']
max_eff = df_asia['fuel_efficiency_mpg'].max()
print(f"The maximum fuel efficiency for cars from Asia is {max_eff:.4f} mpg.")

The maximum fuel efficiency for cars from Asia is 23.7591 mpg.


# Q6. Median value of horsepower

## Calculate the initial median of the horsepower column

In [10]:
median_before = df['horsepower'].median()
print("Median before filling:", median_before)

Median before filling: 149.0


## Find the most frequent value (mode) of the horsepower column

In [11]:
most_freq = df['horsepower'].mode()[0]
print("Most frequent horsepower:", most_freq)

Most frequent horsepower: 152.0


## Fill missing values in the horsepower column with the most frequent value

In [13]:
df['horsepower'] = df['horsepower'].fillna(most_freq)

## Calculate the median of the horsepower column again after filling missing values

In [14]:
median_after = df['horsepower'].median()
print("Median after filling:", median_after)

Median after filling: 152.0


In [20]:
if median_after > median_before:
    answer_q6 = "Yes, it increased"
elif median_after < median_before:
    answer_q6 = "Yes, it decreased"
else:
    answer_q6 = "No"

print("Q6 - Median horsepower BEFORE fill:", median_before)
print("Q6 - Median horsepower AFTER  fill:", median_after)
print("Q6 - Answer:", answer_q6)

Q6 - Median horsepower BEFORE fill: 149.0
Q6 - Median horsepower AFTER  fill: 152.0
Q6 - Answer: Yes, it increased


# Q7. Sum of weights

In [21]:
col_origin = "origin"
col_weight = "vehicle_weight"
col_year   = "model_year"

# Normalize origin values
if df[col_origin].dtype == object:
    df[col_origin] = df[col_origin].str.strip().str.title()

# Ensure numeric types (just in case)
df[col_weight] = pd.to_numeric(df[col_weight], errors="coerce")
df[col_year]   = pd.to_numeric(df[col_year], errors="coerce")

# Select Asia cars, required columns, first 7 rows
subset = df.loc[df[col_origin].eq("Asia"), [col_weight, col_year]].dropna().head(7).copy()

# Safety check: must have 7 rows
if len(subset) < 7:
    raise ValueError(f"Expected at least 7 Asia rows, got {len(subset)}.")

# Build X
X = subset.to_numpy()

# Compute X^T X and invert (fallback to pseudo-inverse if singular)
XTX = X.T @ X
try:
    XTX_inv = np.linalg.inv(XTX)
except np.linalg.LinAlgError:
    XTX_inv = np.linalg.pinv(XTX)

# Given y
y = np.array([1100, 1300, 800, 900, 1000, 1100, 1200], dtype=float)

# Compute w and the sum of its elements
w = XTX_inv @ X.T @ y
sum_w = float(w.sum())

print("Subset used (first 7 Asia rows):")
print(subset.to_string(index=False))
print("\nw =", w)
print("Sum of weights =", sum_w)
# Expected (multiple-choice): about 0.51

Subset used (first 7 Asia rows):
 vehicle_weight  model_year
    2714.219310        2016
    2783.868974        2010
    3582.687368        2007
    2231.808142        2011
    2659.431451        2016
    2844.227534        2014
    3761.994038        2019

w = [0.01386421 0.5049067 ]
Sum of weights = 0.5187709081074016
