In [11]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from scipy import stats as st
import math

In [12]:
# Load the vehicle listings from CSV and report the outcome.
# A missing file is reported in a readable form and then re-raised: every later
# cell needs `vehicles_df`, so continuing would only postpone the failure to a
# less helpful NameError.
file_path = "../vehicles_us.csv"

try:
    vehicles_df = pd.read_csv(file_path)
except FileNotFoundError as error_msg:
    print(f"Error reading file: {error_msg}. Try again!")
    raise
else:
    print(f"The file at path: [{file_path}] was imported.")
    print("The import was saved to the variable: [vehicles_df]")

The file at path: [../vehicles_us.csv] was imported.
The import was saved to the variable: [vehicles_df]


In [13]:
# Print a structural summary of the frame: index range, per-column non-null
# counts and dtypes, and memory usage. Used here to spot columns with gaps.
vehicles_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  int64  
 1   model_year    47906 non-null  float64
 2   model         51525 non-null  object 
 3   condition     51525 non-null  object 
 4   cylinders     46265 non-null  float64
 5   fuel          51525 non-null  object 
 6   odometer      43633 non-null  float64
 7   transmission  51525 non-null  object 
 8   type          51525 non-null  object 
 9   paint_color   42258 non-null  object 
 10  is_4wd        25572 non-null  float64
 11  date_posted   51525 non-null  object 
 12  days_listed   51525 non-null  int64  
dtypes: float64(4), int64(2), object(7)
memory usage: 5.1+ MB


**Observations**

The dataset comprises 51,525 entries. Within it, several columns contain missing values, including:

- `model_year`
- `cylinders`
- `odometer`
- `paint_color`
- `is_4wd`

For the analysis at hand, a standardized approach will be adopted to handle these missing values:

- The missing values in the columns `model_year`, `cylinders`, and `odometer` will be substituted with their respective statistical means, rounded to the nearest whole number.
- The missing values in the `paint_color` column will be filled with the categorical value `"unknown"`.
- The missing values in the `is_4wd` column will be filled with a zero. From an analytical standpoint, I would prefer to misclassify cars that have four-wheel drive as not having it than the other way around.


In [14]:
# Fill the missing values found above.
# - Numeric columns receive their own mean, rounded to a whole number.
# - `paint_color` receives the placeholder category "unknown".
# - `is_4wd` receives 0.
# No blanket try/except here: a failure (for example a misspelled column name)
# should surface with its real traceback rather than a generic message.
mean_filled_columns = ["model_year", "cylinders", "odometer"]
constant_fill_values = {"paint_color": "unknown", "is_4wd": 0}

for column in mean_filled_columns:
    rounded_mean = round(vehicles_df[column].mean())
    vehicles_df[column] = vehicles_df[column].fillna(rounded_mean)

for column, fill_value in constant_fill_values.items():
    vehicles_df[column] = vehicles_df[column].fillna(fill_value)

# Confirm that no gaps remain in any of the treated columns.
for column in [*mean_filled_columns, *constant_fill_values]:
    print(f"Missing values for '{column}': {vehicles_df[column].isna().sum()}")

Missing values for 'model_year': 0
Missing values for 'cylinders': 0
Missing values for 'odometer': 0
Missing values for 'paint_color': 0
Missing values for 'is_4wd': 0


In [15]:
# Cast each listed column to an integer dtype, one column at a time.
# This relies on the columns containing no missing values at this point.
columns_to_convert = ['model_year', 'cylinders', 'odometer', 'is_4wd']
for column_name in columns_to_convert:
    vehicles_df[column_name] = vehicles_df[column_name].astype(int)

In [16]:
# Feature engineering dates based on the date_posted column.
# Parse the posting date, then expose its year, month and day as separate columns.
vehicles_df['date_posted'] = pd.to_datetime(vehicles_df['date_posted'])
vehicles_df['year_posted'] = vehicles_df['date_posted'].dt.year
vehicles_df['month_posted'] = vehicles_df['date_posted'].dt.month
vehicles_df['day_posted'] = vehicles_df['date_posted'].dt.day

# Separate out the make and model so analysis can be done on both.
# The original `model` text is split at its first space: the first word becomes
# `make`, the remainder stays in `model`.
# Guarded so the cell can be re-run safely: splitting an already-split `model`
# would otherwise overwrite `make` with the first word of the model name.
if 'make' not in vehicles_df.columns:
    vehicles_df[['make', 'model']] = vehicles_df['model'].str.split(' ', n=1, expand=True)
    # Place `make` and `model` side by side at column positions 2 and 3.
    vehicles_df.insert(2, 'make', vehicles_df.pop('make'))
    vehicles_df.insert(3, 'model', vehicles_df.pop('model'))

In [17]:
# Create an ID column based on index. If the index is reset later, the ID will still map back to original.
# DataFrame.insert raises ValueError when the column already exists, so the
# guard keeps this cell safe to re-run.
if 'id' not in vehicles_df.columns:
    vehicles_df.insert(0, 'id', vehicles_df.index)

In [18]:
# Display the frame as the cell's result to inspect the columns after the steps above.
vehicles_df

Unnamed: 0,id,price,model_year,make,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed,year_posted,month_posted,day_posted
0,0,9400,2011,bmw,x5,good,6,gas,145000,automatic,SUV,unknown,1,2018-06-23,19,2018,6,23
1,1,25500,2010,ford,f-150,good,6,gas,88705,automatic,pickup,white,1,2018-10-19,50,2018,10,19
2,2,5500,2013,hyundai,sonata,like new,4,gas,110000,automatic,sedan,red,0,2019-02-07,79,2019,2,7
3,3,1500,2003,ford,f-150,fair,8,gas,115553,automatic,pickup,unknown,0,2019-03-22,9,2019,3,22
4,4,14900,2017,chrysler,200,excellent,4,gas,80903,automatic,sedan,black,0,2019-04-02,28,2019,4,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51520,51520,9249,2013,nissan,maxima,like new,6,gas,88136,automatic,sedan,black,0,2018-10-03,37,2018,10,3
51521,51521,2700,2002,honda,civic,salvage,4,gas,181500,automatic,sedan,white,0,2018-11-14,22,2018,11,14
51522,51522,3950,2009,hyundai,sonata,excellent,4,gas,128000,automatic,sedan,blue,0,2018-11-15,32,2018,11,15
51523,51523,7455,2013,toyota,corolla,good,4,gas,139573,automatic,sedan,black,0,2018-07-02,71,2018,7,2
