In [1]:
# Load libraries
import pandas as pd
import numpy as np

In [2]:
# Load data: read the car fuel-efficiency CSV straight from its remote URL
df = pd.read_csv(
    'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv'
)

In [3]:
# Visualise data: a bare expression on the last line of a cell is rendered as its output
df

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.870990,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369
...,...,...,...,...,...,...,...,...,...,...,...
9699,140,5.0,164.0,2981.107371,17.3,2013,Europe,Diesel,Front-wheel drive,,15.101802
9700,180,,154.0,2439.525729,15.0,2004,USA,Gasoline,All-wheel drive,0.0,17.962326
9701,220,2.0,138.0,2583.471318,15.1,2008,USA,Diesel,All-wheel drive,-1.0,17.186587
9702,230,4.0,177.0,2905.527390,19.4,2011,USA,Diesel,Front-wheel drive,1.0,15.331551


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,num_doors,fuel_efficiency_mpg
count,9704.0,9222.0,8996.0,9704.0,8774.0,9704.0,9202.0,9704.0
mean,199.708368,3.962481,149.657292,3001.280993,15.021928,2011.484027,-0.006412,14.985243
std,49.455319,1.999323,29.879555,497.89486,2.510339,6.659808,1.048162,2.556468
min,10.0,0.0,37.0,952.681761,6.0,2000.0,-4.0,6.200971
25%,170.0,3.0,130.0,2666.248985,13.3,2006.0,-1.0,13.267459
50%,200.0,4.0,149.0,2993.226296,15.0,2012.0,0.0,15.006037
75%,230.0,5.0,170.0,3334.957039,16.7,2017.0,1.0,16.707965
max,380.0,13.0,271.0,4739.077089,24.3,2023.0,4.0,25.967222


In [5]:
# Keep the column labels of df under a reusable name
all_variables = df.columns

In [6]:
# Q1. Pandas version
# Version string of the pandas package imported as pd
pd.__version__

'2.3.1'

In [7]:
# Q2. Records count
# The length of a DataFrame is its number of rows
len(df)

9704

In [8]:
# Q3. Fuel types
# Number of distinct values in the fuel_type column
df['fuel_type'].nunique()

2

In [9]:
# Q4. Missing values
# Boolean mask over the columns: True where a column has at least one null entry
has_missing = df.isnull().sum() > 0
# Count the column labels selected by the mask
all_variables[has_missing].nunique()

4

In [10]:
# Q5. Max fuel efficiency
# Select the fuel_efficiency_mpg values of the rows whose origin is 'Asia', then take the largest
df.loc[df['origin'] == 'Asia', 'fuel_efficiency_mpg'].max()

np.float64(23.759122836520497)

In [11]:
# Q6. Median value of horsepower

## Q6.1 Find the median value of the horsepower column in the dataset.
# nanmedian ignores the missing (NaN) entries of the column
np.nanmedian(df['horsepower'])

np.float64(149.0)

In [12]:
## 6.2 Next, calculate the most frequent value of the same horsepower column.
# Frequency table of horsepower; the most frequent value is listed first
df['horsepower'].value_counts()

horsepower
152.0    142
145.0    141
151.0    134
148.0    130
141.0    130
        ... 
40.0       1
57.0       1
245.0      1
252.0      1
61.0       1
Name: count, Length: 192, dtype: int64

In [13]:
## 6.3 Use the fillna method to fill the missing values in the horsepower column with the most frequent value from the previous step.
# value_counts() orders the values by descending frequency, so the first index label is the most frequent one.
most_frequent_value = df.horsepower.value_counts().index[0]
# Pass a {column: value} mapping so that only horsepower is filled. A bare scalar
# would also overwrite the missing entries of every other column with this number.
df_NAfilled = df.fillna({'horsepower': most_frequent_value})

In [14]:
## 6.4 Now, calculate the median value of horsepower once again. Has it changed?
# Median of the horsepower column taken from the filled frame
np.nanmedian(df_NAfilled['horsepower'])

np.float64(152.0)

In [15]:
## Has it changed?
# Compare the median of the raw column with the median after filling its gaps with the most frequent value
hp_raw = df['horsepower']
hp_filled = hp_raw.fillna(hp_raw.value_counts().index[0])
np.nanmedian(hp_raw) != np.nanmedian(hp_filled)

np.True_

In [16]:
## Has it increased?
# True when the median after filling the gaps with the most frequent value is larger than the raw median
hp_raw = df['horsepower']
hp_filled = hp_raw.fillna(hp_raw.value_counts().index[0])
np.nanmedian(hp_raw) < np.nanmedian(hp_filled)

np.True_

In [17]:
## Has it decreased?
# True when the median after filling the gaps with the most frequent value is smaller than the raw median
hp_raw = df['horsepower']
hp_filled = hp_raw.fillna(hp_raw.value_counts().index[0])
np.nanmedian(hp_raw) > np.nanmedian(hp_filled)

np.False_

In [18]:
# Q7. Sum of weights

## 7.1 Select all the cars from Asia
# Row selection with a boolean mask on the origin column
is_asia = df['origin'] == 'Asia'
df_ASIAcars = df.loc[is_asia]

In [19]:
## 7.2 Select only columns vehicle_weight and model_year
## 7.3 Select the first 7 values
## 7.4 Get the underlying NumPy array. Let's call it X.

# .to_numpy() performs step 7.4: without it X would remain a DataFrame rather than an array.
X = (
    df_ASIAcars
    .loc[:, ['vehicle_weight', 'model_year']]
    .iloc[:7]
    .to_numpy()
)

In [20]:
## 7.5 Compute matrix-matrix multiplication between the transpose of X and X. To get the transpose, use X.T. Let's call the result XTX.
# The @ operator performs the same matrix product as .dot
XTX = X.T @ X

In [21]:
## 7.6 Invert XTX.
# Matrix inverse of XTX via NumPy's linear-algebra module
XTX_inverted = np.linalg.inv(XTX)

In [22]:
## 7.7 Create an array y with values [1100, 1300, 800, 900, 1000, 1100, 1200].
# Seven values, one per row selected for X
y = np.array(
    [1100, 1300, 800, 900, 1000, 1100, 1200]
)

In [23]:
## 7.8 Multiply the inverse of XTX with the transpose of X, and then multiply the result by y. Call the result w.
# First product: inverse of XTX times the transpose of X
inverse_times_XT = XTX_inverted.dot(X.T)
# Second product: apply the result to y
w = inverse_times_XT.dot(y)

In [24]:
## 7.9 What's the sum of all the elements of the result?
# Add up every element of w
w.sum()

np.float64(0.5187709081074016)