**Importing Library and loading Dataset**

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score


df = pd.read_csv("/content/vehicles.csv", engine='python', on_bad_lines='warn')


  df = pd.read_csv("/content/vehicles.csv", engine='python', on_bad_lines='warn')


In [12]:
df.head() # show first five rows of dataset

Unnamed: 0,id,url,region,region_url,price,year,manufacturer,model,condition,cylinders,...,size,type,paint_color,image_url,description,county,state,lat,long,posting_date
0,7222695916,https://prescott.craigslist.org/cto/d/prescott...,prescott,https://prescott.craigslist.org,6000,,,,,,...,,,,,,,az,,,
1,7218891961,https://fayar.craigslist.org/ctd/d/bentonville...,fayetteville,https://fayar.craigslist.org,11900,,,,,,...,,,,,,,ar,,,
2,7221797935,https://keys.craigslist.org/cto/d/summerland-k...,florida keys,https://keys.craigslist.org,21000,,,,,,...,,,,,,,fl,,,
3,7222270760,https://worcester.craigslist.org/cto/d/west-br...,worcester / central MA,https://worcester.craigslist.org,1500,,,,,,...,,,,,,,ma,,,
4,7210384030,https://greensboro.craigslist.org/cto/d/trinit...,greensboro,https://greensboro.craigslist.org,4900,,,,,,...,,,,,,,nc,,,


In [13]:
df.shape # show no. of rows and colummns

(314, 26)

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 314 entries, 0 to 313
Data columns (total 26 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            314 non-null    int64  
 1   url           314 non-null    object 
 2   region        314 non-null    object 
 3   region_url    314 non-null    object 
 4   price         314 non-null    int64  
 5   year          286 non-null    float64
 6   manufacturer  274 non-null    object 
 7   model         283 non-null    object 
 8   condition     211 non-null    object 
 9   cylinders     174 non-null    object 
 10  fuel          284 non-null    object 
 11  odometer      287 non-null    float64
 12  title_status  273 non-null    object 
 13  transmission  287 non-null    object 
 14  VIN           197 non-null    object 
 15  drive         179 non-null    object 
 16  size          42 non-null     object 
 17  type          234 non-null    object 
 18  paint_color   218 non-null    

In [15]:
df.describe()

Unnamed: 0,id,price,year,odometer,county,lat,long
count,314.0,314.0,286.0,287.0,0.0,283.0,283.0
mean,7304623000.0,3165142.0,2011.412587,81057.466899,,32.972135,-86.031692
std,27118690.0,55735440.0,10.8193,87360.035397,,0.765893,0.784507
min,7208550000.0,0.0,1954.0,21.0,,28.0091,-87.782814
25%,7306676000.0,6725.0,2008.0,20796.0,,32.59,-86.783493
50%,7315550000.0,21125.0,2015.0,55251.0,,32.6454,-85.484447
75%,7316243000.0,29990.0,2018.0,130474.0,,33.455361,-85.48
max,7316878000.0,987654300.0,2021.0,999999.0,,36.1379,-82.5034


**checking and handling Missing values**

In [16]:
# checking for null values
df.isnull().sum()

Unnamed: 0,0
id,0
url,0
region,0
region_url,0
price,0
year,28
manufacturer,40
model,31
condition,103
cylinders,140


In [17]:
# this below columns are not useful for prediction so we remove it from our dataset
columns_remove=[
    'id','url','region_url','image_url','posting_date',
    'description','county','VIN','lat','long','size'
  ]
df.drop(columns=columns_remove,inplace=True)

In [18]:
df.head()

Unnamed: 0,region,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,type,paint_color,state
0,prescott,6000,,,,,,,,,,,,,az
1,fayetteville,11900,,,,,,,,,,,,,ar
2,florida keys,21000,,,,,,,,,,,,,fl
3,worcester / central MA,1500,,,,,,,,,,,,,ma
4,greensboro,4900,,,,,,,,,,,,,nc


In [19]:
df.dropna(subset=['year','odometer','state'],inplace=True) # drop rows which contain null values

In [21]:
df['drive'].value_counts()

Unnamed: 0_level_0,count
drive,Unnamed: 1_level_1
4wd,63
fwd,61
rwd,54


In [22]:

df['cylinders'].value_counts()

Unnamed: 0_level_0,count
cylinders,Unnamed: 1_level_1
6 cylinders,93
8 cylinders,47
4 cylinders,30
5 cylinders,2
other,1


In [23]:
# fill null values with 'unknown' because this columns are useful for predicitng the car price
cat_cols=['manufacturer','model','condition','cylinders','fuel',
        'title_status','transmission','drive','type','paint_color']

for col in cat_cols:
    df[col]=df[col].fillna('unknown')

In [24]:
df.head()

Unnamed: 0,region,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,type,paint_color,state
27,auburn,33590,2014.0,gmc,sierra 1500 crew cab slt,good,8 cylinders,gas,57923.0,clean,other,unknown,pickup,white,al
28,auburn,22590,2010.0,chevrolet,silverado 1500,good,8 cylinders,gas,71229.0,clean,other,unknown,pickup,blue,al
29,auburn,39590,2020.0,chevrolet,silverado 1500 crew,good,8 cylinders,gas,19160.0,clean,other,unknown,pickup,red,al
30,auburn,30990,2017.0,toyota,tundra double cab sr,good,8 cylinders,gas,41124.0,clean,other,unknown,pickup,red,al
31,auburn,15000,2013.0,ford,f-150 xlt,excellent,6 cylinders,gas,128000.0,clean,automatic,rwd,truck,black,al


In [25]:
#convert year into age for particular car

df['car_age']=2025-df['year']
df.drop('year',axis=1,inplace=True)

In [27]:
df['model'].value_counts()

Unnamed: 0_level_0,count
model,Unnamed: 1_level_1
wrangler unlimited sport,6
unknown,4
s60 t6 r-design sedan 4d,4
tacoma,3
colorado extended cab,3
...,...
e-class,1
grand cherokee,1
cavalier,1
expedition el,1


In [28]:
#Reduce Rare Model Names
top_models=df['model'].value_counts().nlargest(30).index
df['model']=df['model'].apply(lambda x: x if x in top_models else 'other')

In [29]:
#Remove entries with price=0 or unrealistic (e.g >$1M)
df=df[(df['price']>500)&(df['price']<=1000000)]

In [30]:
# Apply log transformation to stabilize variance
df['log_price'] = np.log1p(df['price'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['log_price'] = np.log1p(df['price'])


In [31]:
# Select relevant features
features = ['car_age', 'odometer', 'state', 'manufacturer', 'model', 'condition',
            'cylinders', 'fuel', 'title_status', 'transmission', 'drive',
            'type', 'paint_color']

X = df[features]
y = df['log_price']


**Split the data into train and test**

In [32]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


**Preprocessing & Modeling Pipeline (Ridge and Lasso)**

In [33]:
# Split columns
cat_features = X.select_dtypes(include='object').columns
num_features = X.select_dtypes(include=['int64', 'float64']).columns

# Numeric pipeline
num_pipeline = Pipeline([
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())
])

# Categorical pipeline
cat_pipeline = Pipeline([
    ('imputer',SimpleImputer(strategy='constant',fill_value='unknown')),
    ('encoder',OneHotEncoder(handle_unknown='ignore'))
])

# Combine both
preprocessor=ColumnTransformer([
    ('num',num_pipeline,num_features),
    ('cat',cat_pipeline,cat_features)
])

**Train Ridge and Lasso Models**

In [34]:
ridge_model = Pipeline([
    ('preprocessing', preprocessor),
    ('regressor', Ridge(alpha=1.0))
])

lasso_model = Pipeline([
    ('preprocessing', preprocessor),
    ('regressor', Lasso(alpha=0.1))
])

ridge_model.fit(X_train, y_train)
lasso_model.fit(X_train, y_train)


**Evaluate Models**

In [35]:
def evaluate_model(model, name):
    y_pred_log =model.predict(X_test)
    y_pred =np.expm1(y_pred_log)
    y_actual=np.expm1(y_test)

    rmse = np.sqrt(mean_squared_error(y_actual, y_pred))
    r2 = r2_score(y_actual, y_pred)

    print(f"{name} Model Performance:")
    print(f"  RMSE: ${rmse:,.2f}")
    print(f"  R2 Score: {r2:.4f}")
    print()

evaluate_model(ridge_model, "Ridge Regression")
evaluate_model(lasso_model, "Lasso Regression")


Ridge Regression Model Performance:
  RMSE: $5,907.72
  R2 Score: 0.7785

Lasso Regression Model Performance:
  RMSE: $7,801.81
  R2 Score: 0.6137



**Predict Price for New Car**

In [36]:
sample_car = pd.DataFrame([{
    'car_age': 5,
    'odometer': 40000,
    'state': 'ca',
    'manufacturer': 'bmw',
    'model': '3 series',
    'condition': 'excellent',
    'cylinders': '4 cylinders',
    'fuel': 'gas',
    'title_status': 'clean',
    'transmission': 'automatic',
    'drive': 'rwd',
    'type': 'sedan',
    'paint_color': 'black'
}])

pred_log = ridge_model.predict(sample_car)
pred_price = np.expm1(pred_log[0])
print(f"Predicted Car Price: ${pred_price:,.2f}")


Predicted Car Price: $25,914.91
