## Import Libraries

In [None]:
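# Dependencies used by this notebook (install once if missing; %pip targets the kernel's environment):
# %pip install pandas plotly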

In [None]:
import pandas as pd # type: ignore
import plotly.express as px # type: ignore

import plotly.io as pio

## Load Data

In [2]:
# Read the vehicle listings CSV into a DataFrame.
# The path is relative, so it is resolved against the kernel's working directory.
df = pd.read_csv(filepath_or_buffer='../vehicles_us.csv')

## Exploratory Data Analysis

In [4]:
# Basic info: row count, column names, non-null counts, dtypes and memory usage.
# Comparing each non-null count with the total row count shows which columns
# have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  int64  
 1   model_year    47906 non-null  float64
 2   model         51525 non-null  object 
 3   condition     51525 non-null  object 
 4   cylinders     46265 non-null  float64
 5   fuel          51525 non-null  object 
 6   odometer      43633 non-null  float64
 7   transmission  51525 non-null  object 
 8   type          51525 non-null  object 
 9   paint_color   42258 non-null  object 
 10  is_4wd        25572 non-null  float64
 11  date_posted   51525 non-null  object 
 12  days_listed   51525 non-null  int64  
dtypes: float64(4), int64(2), object(7)
memory usage: 5.1+ MB


In [5]:
# Peek at the first five listings to see what the raw values look like
df.head(n=5)


Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed
0,9400,2011.0,bmw x5,good,6.0,gas,145000.0,automatic,SUV,,1.0,2018-06-23,19
1,25500,,ford f-150,good,6.0,gas,88705.0,automatic,pickup,white,1.0,2018-10-19,50
2,5500,2013.0,hyundai sonata,like new,4.0,gas,110000.0,automatic,sedan,red,,2019-02-07,79
3,1500,2003.0,ford f-150,fair,8.0,gas,,automatic,pickup,,,2019-03-22,9
4,14900,2017.0,chrysler 200,excellent,4.0,gas,80903.0,automatic,sedan,black,,2019-04-02,28


In [6]:

# Shape of the data as a (rows, columns) tuple
print("Shape of the dataset:", df.shape)

Shape of the dataset: (51525, 13)


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# NOTE(review): is_4wd is 1.0 wherever it is present, so a missing value
# presumably means "not 4WD" -- confirm before imputing.
df.describe()

Unnamed: 0,price,model_year,cylinders,odometer,is_4wd,days_listed
count,51525.0,47906.0,46265.0,43633.0,25572.0,51525.0
mean,12132.46492,2009.75047,6.125235,115553.461738,1.0,39.55476
std,10040.803015,6.282065,1.66036,65094.611341,0.0,28.20427
min,1.0,1908.0,3.0,0.0,1.0,0.0
25%,5000.0,2006.0,4.0,70000.0,1.0,19.0
50%,9000.0,2011.0,6.0,113000.0,1.0,33.0
75%,16839.0,2014.0,8.0,155000.0,1.0,53.0
max,375000.0,2019.0,12.0,990000.0,1.0,271.0


In [8]:
# Count the missing entries in every column
df.isna().sum()


price               0
model_year       3619
model               0
condition           0
cylinders        5260
fuel                0
odometer         7892
transmission        0
type                0
paint_color      9267
is_4wd          25953
date_posted         0
days_listed         0
dtype: int64

In [9]:

# Column labels of the dataset (the names used as x/y fields in the plots below)
print(df.columns)

Index(['price', 'model_year', 'model', 'condition', 'cylinders', 'fuel',
       'odometer', 'transmission', 'type', 'paint_color', 'is_4wd',
       'date_posted', 'days_listed'],
      dtype='object')


## Visualizations

In [10]:
# Select Plotly's 'browser' renderer as the default for every later fig.show() call
pio.renderers.default = 'browser'  # figures open in the web browser


In [13]:

# Histogram for Price Distribution.
# The default renderer is already set to 'browser' in the cell directly above,
# so it is not configured a second time here.
fig = px.histogram(df, x='price', title='Distribution of Car Prices')
fig.show()

In [12]:
# Mileage vs Price: one point per listing, odometer reading on x and price on y
fig = px.scatter(
    data_frame=df,
    x='odometer',
    y='price',
    title='Odometer (Mileage) vs Price',
)
fig.show()

In [14]:
# Average Price by Model.
# The grouping key is the `model` column, so each bar is one model string
# (e.g. "ford f-150"), not a manufacturer.
avg_price_by_model = df.groupby('model')['price'].mean().reset_index()

fig = px.bar(avg_price_by_model,
             x='model', y='price', title='Average Price by Model')
fig.show()

In [15]:
# How many listings have no model year? These rows cannot be given an age.
print(df['model_year'].isna().sum())

# Year the ages are measured against (the assumed "current" year for this analysis)
REFERENCE_YEAR = 2023

# Drop the listings without a model year and derive `age` in one chained step.
# .assign() returns a new frame, so the column is never written into the
# filtered result of dropna() -- the pattern that can trigger pandas'
# SettingWithCopyWarning.
# NOTE(review): the previewed listings were posted in 2018-2019, so measuring age
# against the posting year may be more meaningful -- confirm before changing.
df = (
    df.dropna(subset=['model_year'])
      .assign(age=lambda listings: REFERENCE_YEAR - listings['model_year'])
)

# Now plot Age vs Price
fig = px.scatter(df, x='age', y='price', title='Car Age vs Price')
fig.show()



3619
