# MLZC_HOMEWORK_WK1

## SETUP

In [11]:
import sys
import urllib.request
from pathlib import Path

import numpy as np
import pandas as pd

# Report the interpreter and library versions in a single call. The "\n" that
# ends the banner plus the "\n" separator leaves one blank line before the
# version lines.
print(
    "✅ Libraries loaded\n",
    f"Python version: {sys.version.split()[0]}",
    f"Pandas version: {pd.__version__}",
    f"NumPy version:  {np.__version__}",
    sep="\n",
)

✅ Libraries loaded

Python version: 3.12.3
Pandas version: 2.3.2
NumPy version:  2.3.2


## DATA

In [12]:
# Remote location of the homework dataset and the local file it is saved to.
url = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv'
filename = 'car_fuel_efficiency.csv'

# Only touch the network when the file is not on disk yet, so that
# "Restart Kernel -> Run All" works offline and does not overwrite the local
# copy on every run. Delete the file to force a fresh download.
if Path(filename).exists():
    print(f"✅ Already present, skipping download: {filename}")
else:
    urllib.request.urlretrieve(url, filename)
    print(f"✅ Downloaded: {filename}")

✅ Downloaded: car_fuel_efficiency.csv


In [13]:
# Read the downloaded CSV into the working frame used by every question below.
df = pd.read_csv('car_fuel_efficiency.csv')

# Structural overview: size, column names, dtypes and per-column null counts.
n_rows, n_cols = df.shape
print(f"Shape: {n_rows} rows × {n_cols} columns")
print(f"\nColumns: {list(df.columns)}")
print(f"\nData types:\n{df.dtypes}")
print(f"\nMissing values:\n{df.isna().sum()}")

# Last expression of the cell, so the notebook renders the first five rows.
df.head()

Shape: 9704 rows × 11 columns

Columns: ['engine_displacement', 'num_cylinders', 'horsepower', 'vehicle_weight', 'acceleration', 'model_year', 'origin', 'fuel_type', 'drivetrain', 'num_doors', 'fuel_efficiency_mpg']

Data types:
engine_displacement      int64
num_cylinders          float64
horsepower             float64
vehicle_weight         float64
acceleration           float64
model_year               int64
origin                  object
fuel_type               object
drivetrain              object
num_doors              float64
fuel_efficiency_mpg    float64
dtype: object

Missing values:
engine_displacement      0
num_cylinders          482
horsepower             708
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64


Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


## QUESTIONS

In [14]:
# Q1: What version of Pandas?
# pd.__version__ is already a string, so it can be passed to print directly.
print("Answer:", pd.__version__)

Answer: 2.3.2


In [15]:
# Q2: How many records are in the dataset?
# The first entry of the shape tuple is the row count.
num_records = df.shape[0]
print("Number of records:", num_records)

Number of records: 9704


In [16]:
# Q3: How many unique fuel types?
fuel_type_col = df['fuel_type']
num_fuel_types = fuel_type_col.nunique()

# Show the count first, then the distinct labels themselves.
print(
    f"Number of fuel types: {num_fuel_types}",
    f"Fuel types: {fuel_type_col.unique()}",
    sep="\n",
)

Number of fuel types: 2
Fuel types: ['Gasoline' 'Diesel']


In [17]:
# Q4: How many columns have missing values?
# Null count per column, and a boolean mask of the columns with at least one.
missing_per_column = df.isna().sum()
has_missing = missing_per_column > 0
columns_with_missing = has_missing.sum()

print(f"Columns with missing values: {columns_with_missing}")
print("\nDetails:")
print(missing_per_column[has_missing])

Columns with missing values: 4

Details:
num_cylinders    482
horsepower       708
acceleration     930
num_doors        502
dtype: int64


In [18]:
# Q5: Maximum fuel efficiency for Asian cars
# Keep only the rows whose origin is Asia, then take the largest mpg value.
is_asia = df['origin'].eq('Asia')
asia_cars = df.loc[is_asia]
max_fuel_efficiency = asia_cars['fuel_efficiency_mpg'].max()

print(f"Max fuel efficiency (Asia): {max_fuel_efficiency}")

Max fuel efficiency (Asia): 23.759122836520497


In [19]:
# Q6: Median horsepower analysis
#
# The filled values go into a separate Series instead of overwriting
# df['horsepower']. Overwriting made this cell non-idempotent: a second run
# would read the already-filled column, see identical "before" and "after"
# medians, and print "Answer: No". It also changed what the missing-value
# cells above report if they were re-run afterwards.
horsepower = df['horsepower']

# Median BEFORE filling (NaN values are skipped by default)
median_before = horsepower.median()
print(f"1. Median (before): {median_before}")

# Most frequent value (mode); if several values tie, [0] takes the first one
# returned by .mode()
mode_value = horsepower.mode()[0]
print(f"2. Mode (most frequent): {mode_value}")

# Fill missing values with mode -- into a new Series, df stays untouched
horsepower_filled = horsepower.fillna(mode_value)

# Median AFTER filling
median_after = horsepower_filled.median()
print(f"4. Median (after): {median_after}")

# Compare
print(f"\nChanged? {median_before != median_after}")
if median_after > median_before:
    print("Answer: Yes, it increased")
elif median_after < median_before:
    print("Answer: Yes, it decreased")
else:
    print("Answer: No")

1. Median (before): 149.0
2. Mode (most frequent): 152.0
4. Median (after): 152.0

Changed? True
Answer: Yes, it increased


In [20]:
# Q7: Sum of weights (linear regression)
#
# Solves the normal equation w = (X^T X)^-1 X^T y on a 7-row feature matrix
# and reports the sum of the resulting weights.

# Rows with origin Asia, restricted to the two feature columns
asia_data = df.loc[df['origin'] == 'Asia', ['vehicle_weight', 'model_year']]

# Keep the first 7 of those rows and convert them to a NumPy array
first_7 = asia_data.iloc[:7]
X = first_7.to_numpy()
print("X (first 7 Asia cars):", X, sep="\n")

# Gram matrix X^T X
XTX = X.T @ X
print(f"\nXTX:\n{XTX}")

# Its inverse
XTX_inv = np.linalg.inv(XTX)
print(f"\nXTX inverse:\n{XTX_inv}")

# Target vector given by the exercise
y = np.array([1100, 1300, 800, 900, 1000, 1100, 1200])
print(f"\ny: {y}")

# Weights: the inverse times X^T, then times y (evaluated left to right)
w = XTX_inv @ X.T @ y
print(f"\nw (weights): {w}")

# Final answer: total of all weight components
sum_w = np.sum(w)
print(f"\n✅ Sum of weights: {sum_w}")
print(f"   Rounded: {sum_w:.2f}")

X (first 7 Asia cars):
[[2714.21930965 2016.        ]
 [2783.86897424 2010.        ]
 [3582.68736772 2007.        ]
 [2231.8081416  2011.        ]
 [2659.43145076 2016.        ]
 [2844.22753389 2014.        ]
 [3761.99403819 2019.        ]]

XTX:
[[62248334.33150762 41431216.5073268 ]
 [41431216.5073268  28373339.        ]]

XTX inverse:
[[ 5.71497081e-07 -8.34509443e-07]
 [-8.34509443e-07  1.25380877e-06]]

y: [1100 1300  800  900 1000 1100 1200]

w (weights): [0.01386421 0.5049067 ]

✅ Sum of weights: 0.5187709081074016
   Rounded: 0.52
