# Libraries and data load

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv')

In [3]:
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


# Questions

1. Pandas ver.

In [4]:
print(pd.__version__)

2.3.1


2. record count

In [5]:
df.shape

(9704, 11)

3. Fuel types 

In [6]:
print('Fuel Types')
print('----------')

for ftype in df.fuel_type.unique():
    print(ftype)

print('\nDistinct Fuel Types:', df.fuel_type.nunique())

Fuel Types
----------
Gasoline
Diesel

Distinct Fuel Types: 2


4. Missing values

In [7]:
null_counts_per_column = df.isnull().sum()
print(null_counts_per_column)

engine_displacement      0
num_cylinders          482
horsepower             708
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64


5. Max fuel efficiency

In [8]:
print('Max Fuel Efficiency:', df[df.origin == 'Asia'].fuel_efficiency_mpg.max())

Max Fuel Efficiency: 23.759122836520497


6. Median Horsepower

In [9]:
print('Median HP:', df.horsepower.median())
print('Most Frequent HP:', df.horsepower.mode()[0])

Median HP: 149.0
Most Frequent HP: 152.0


In [10]:
df.horsepower.fillna(df.horsepower.mode()[0], inplace=True)
print('Median HP after fillna:', df.horsepower.median())

Median HP after fillna: 152.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.horsepower.fillna(df.horsepower.mode()[0], inplace=True)


7. Sum of weights

In [11]:
# Filter to Asia; keep only weight and year; select first 7 values
df_asia = df[df.origin == 'Asia'][['vehicle_weight', 'model_year']][:7]
df_asia

Unnamed: 0,vehicle_weight,model_year
8,2714.21931,2016
12,2783.868974,2010
14,3582.687368,2007
20,2231.808142,2011
21,2659.431451,2016
34,2844.227534,2014
38,3761.994038,2019


In [12]:
# Convert to NumPy array
X = df_asia.to_numpy()

In [13]:
# Gram matrix = X * TX
XTX = X.T @ X
XTX_inv = np.linalg.inv(XTX)

In [14]:
# Define Y and get coefficients
y = [1100, 1300, 800, 900, 1000, 1100, 1200]
w = XTX_inv @ X.T @ y

In [15]:
# Get sum of weights
w_sum = w.sum()
print("Sum of weights:", w_sum)

Sum of weights: 0.5187709081074016
