In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# 1. numpy version
# Bare last expression: the notebook displays the installed NumPy version string.
np.__version__

'1.19.5'

In [3]:
# 2. pandas version
# Bare last expression: the notebook displays the installed pandas version string.
pd.__version__

'1.1.5'

In [4]:
# Pull in the car price data.
# The path is relative, so the CSV is looked up from the current working directory.
# info() then prints a summary of the frame: column names, non-null counts and dtypes,
# which is the quickest way to spot which columns contain missing values.
car_df = pd.read_csv('car_data.csv')
car_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11914 entries, 0 to 11913
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Make               11914 non-null  object 
 1   Model              11914 non-null  object 
 2   Year               11914 non-null  int64  
 3   Engine Fuel Type   11911 non-null  object 
 4   Engine HP          11845 non-null  float64
 5   Engine Cylinders   11884 non-null  float64
 6   Transmission Type  11914 non-null  object 
 7   Driven_Wheels      11914 non-null  object 
 8   Number of Doors    11908 non-null  float64
 9   Market Category    8172 non-null   object 
 10  Vehicle Size       11914 non-null  object 
 11  Vehicle Style      11914 non-null  object 
 12  highway MPG        11914 non-null  int64  
 13  city mpg           11914 non-null  int64  
 14  Popularity         11914 non-null  int64  
 15  MSRP               11914 non-null  int64  
dtypes: float64(3), int64(5), object(8)

In [5]:
# 3. Mean MSRP over every row whose Make is BMW
bmws = car_df.loc[car_df['Make'].eq('BMW')]
bmw_avg_price = bmws['MSRP'].mean()
print('The average price of BMWs is ${:0.2f}'.format(bmw_avg_price))

The average price of BMWs is $61546.76


In [6]:
# 4. Among cars with Year 2015 or later (2015 itself is included by the >= test),
#    count the rows with a missing value.
#    NOTE: only the 'Engine HP' column is inspected, even though the printed
#    message speaks of "missing values" in general.
cars_post_2015 = car_df.loc[car_df['Year'].ge(2015)]
n_missing_engine_hp = cars_post_2015['Engine HP'].isna().sum()
print('{0} cars after 2015 have missing values'.format(n_missing_engine_hp))

51 cars after 2015 have missing values


In [7]:
# 5. Mean of the Engine HP column, rounded to the nearest whole number.
#    avg_engine_hp is reused below as the fill value for missing Engine HP.
engine_hp_mean = car_df['Engine HP'].mean()
avg_engine_hp = round(engine_hp_mean)
print('The average Engine HP is {0}'.format(avg_engine_hp))

The average Engine HP is 249


In [8]:
# Using fillna: build a separate frame in which missing Engine HP values are
# replaced by the rounded average computed above, then re-check the mean.
#
# The fill is done with a non-inplace DataFrame.fillna and a column -> value
# mapping. Calling .fillna(..., inplace=True) on a selected column
# (cars_fillna['Engine HP']) is a chained in-place operation: newer pandas
# versions warn about it, and with Copy-on-Write enabled it no longer updates
# the parent frame at all. fillna returns a new frame, so car_df is left
# untouched and no explicit .copy() is required.
cars_fillna = car_df.fillna({'Engine HP': avg_engine_hp})
filled_engine_hp_mean = cars_fillna['Engine HP'].mean()
print('The average Engine HP (with missing data filled) is {0}'.format(round(filled_engine_hp_mean)))


The average Engine HP (with missing data filled) is 249


In [9]:
# 6. Subset, de-duplicate, convert to a NumPy array, then do the matrix algebra.
#
# The row filter (Make == 'Rolls-Royce'), the column selection and the
# de-duplication are done in one chained expression that returns a new frame.
# This avoids drop_duplicates(inplace=True) on a frame derived from a filtered
# slice, which can trigger SettingWithCopyWarning and makes the cell harder to
# reason about.
feature_cols = ['Engine HP', 'Engine Cylinders', 'highway MPG']
rr = (
    car_df.loc[car_df['Make'] == 'Rolls-Royce', feature_cols]
    .drop_duplicates()
)

# Feature matrix: one row per distinct Rolls-Royce configuration.
X = rr.to_numpy()

# X^T X: a square matrix with one row/column per selected feature.
XTX = X.T @ X

# Inverse of X^T X (np.linalg.inv raises LinAlgError if the matrix is singular).
inverted_xtx = np.linalg.inv(XTX)

# Sum of all the elements of the inverse.
print('The sum of the elements of the inverted xtx array is {}'.format(np.sum(inverted_xtx)))

The sum of the elements of the inverted xtx array is 0.032212320677486125


In [10]:
# Sanity-check the dimensions of each matrix used above, in the order
# X, X transposed, X^T X and its inverse.
for matrix in (X, X.T, XTX, inverted_xtx):
    print(matrix.shape)

(7, 3)
(3, 7)
(3, 3)
(3, 3)


In [11]:
# 7. Create the target vector y, multiply the INVERSE of XTX by X.T,
#    then multiply that product by y:  w = (X^T X)^-1 X^T y
y = [1000, 1100, 900, 1200, 1000, 850, 1300]
xtx_inv_xt = inverted_xtx @ X.T
w = xtx_inv_xt @ y
print(w)
print('The first element of w is {0}'.format(w[0]))

[ 0.19989598 31.02612262 31.65378877]
The first element of w is 0.19989598183188978
