In [19]:
import numpy as np
import pandas as pd

# Removes the limit for the number of displayed columns
pd.set_option("display.max_columns", None)
# Sets the limit for the number of displayed rows
pd.set_option("display.max_rows", 100)

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# import and filter warnings
import warnings
warnings.filterwarnings("ignore")

In [7]:
# read the car price data csv
df = pd.read_csv("car-price-data.csv")
# show the first 5 rows
df.head()

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [12]:
# format the column names to be lowercase and replace spaces with underscores
df.columns = df.columns.str.lower().str.replace(" ", "_")

# show the column names
df.columns

Index(['make', 'model', 'year', 'engine_fuel_type', 'engine_hp',
       'engine_cylinders', 'transmission_type', 'driven_wheels',
       'number_of_doors', 'market_category', 'vehicle_size', 'vehicle_style',
       'highway_mpg', 'city_mpg', 'popularity', 'msrp'],
      dtype='object')

In [18]:
# convert object column data to lowercase with spaces as underscores
strings = list(df.dtypes[df.dtypes == "object"].index)
for col in strings:
    df[col] = df[col].str.lower().str.replace(" ", "_")

# print the head of object columns
df[strings].head()

Unnamed: 0,make,model,engine_fuel_type,transmission_type,driven_wheels,market_category,vehicle_size,vehicle_style
0,bmw,1_series_m,premium_unleaded_(required),manual,rear_wheel_drive,"factory_tuner,luxury,high-performance",compact,coupe
1,bmw,1_series,premium_unleaded_(required),manual,rear_wheel_drive,"luxury,performance",compact,convertible
2,bmw,1_series,premium_unleaded_(required),manual,rear_wheel_drive,"luxury,high-performance",compact,coupe
3,bmw,1_series,premium_unleaded_(required),manual,rear_wheel_drive,"luxury,performance",compact,coupe
4,bmw,1_series,premium_unleaded_(required),manual,rear_wheel_drive,luxury,compact,convertible


In [20]:
# print 5 unique values and total count of unique values for each column
for col in df.columns:
    print(f"{col}: {df[col].nunique()} unique values")
    print(df[col].unique()[:5])
    print()

make: 48 unique values
['bmw' 'audi' 'fiat' 'mercedes-benz' 'chrysler']

model: 914 unique values
['1_series_m' '1_series' '100' '124_spider' '190-class']

year: 28 unique values
[2011 2012 2013 1992 1993]

engine_fuel_type: 10 unique values
['premium_unleaded_(required)' 'regular_unleaded'
 'premium_unleaded_(recommended)' 'flex-fuel_(unleaded/e85)' 'diesel']

engine_hp: 356 unique values
[335. 300. 230. 320. 172.]

engine_cylinders: 9 unique values
[ 6.  4.  5.  8. 12.]

transmission_type: 5 unique values
['manual' 'automatic' 'automated_manual' 'direct_drive' 'unknown']

driven_wheels: 4 unique values
['rear_wheel_drive' 'front_wheel_drive' 'all_wheel_drive'
 'four_wheel_drive']

number_of_doors: 3 unique values
[ 2.  4.  3. nan]

market_category: 71 unique values
['factory_tuner,luxury,high-performance' 'luxury,performance'
 'luxury,high-performance' 'luxury' 'performance']

vehicle_size: 3 unique values
['compact' 'midsize' 'large']

vehicle_style: 16 unique values
['coupe' 'conve