# Cleaning CarMax Web-Scraped Data

#### Here we clean the scraped CarMax data so that it can be joined to the Kaggle data

In [18]:
import pandas as pd

# Load the raw scraped CarMax listings.
car_data = pd.read_csv('Raw_CarMax.csv')

# Strip dollar signs, commas, asterisks and spaces from 'Car Price', then
# convert the result to a numeric dtype so the column can be compared and
# joined as numbers rather than as text.
# The pattern is a raw-string character class: inside [...] the characters
# '$' and '*' are literal, so no backslash escapes are needed (backslash
# escapes such as '\$' in a normal string literal trigger an invalid-escape
# warning on recent Python versions).
# Series.replace (rather than .str.replace) leaves non-string values alone,
# so this also works if pandas already parsed the column as numeric.
car_data['Car Price'] = pd.to_numeric(
    car_data['Car Price'].replace(r'[$,* ]', '', regex=True)
)

# Show the frame after the price clean-up.
display(car_data)

Unnamed: 0,Car Make,Car Model,Car Trim,Car Year,Car Mileage,Car Price
0,Acura,TLX,,2015,101K mi,16998
1,Acura,TLX,,2015,91K mi,18998
2,Scion,FR-S,,2014,110K mi,17998
3,Chevrolet,Camaro,LT,2017,83K mi,19998
4,Dodge,Challenger,SXT,2019,41K mi,24998
...,...,...,...,...,...,...
1192,Ford,Mustang,Bullitt,2020,6K mi,45998
1193,Dodge,Challenger,SXT,2022,57K mi,24998
1194,Mazda,MX-5,Miata Grand Touring,2022,12K mi,28998
1195,Ford,Mustang,GT,2024,6K mi,42998


In [19]:
# Turn values such as '101K mi' into a comma-grouped mileage string:
#   1. drop the 'K mi' suffix,
#   2. parse the remainder as a float and scale by 1000 to get miles,
#   3. render with thousands separators and no decimals (e.g. '101,000').
# The column is left as text here; the next cell converts it to integers.
car_data['Car Mileage'] = (
    car_data['Car Mileage']
    .str.replace('K mi', '')
    .astype(float)
    .mul(1000)
    .map('{:,.0f}'.format)
)

# Show the frame with the reformatted mileage column.
display(car_data)

Unnamed: 0,Car Make,Car Model,Car Trim,Car Year,Car Mileage,Car Price
0,Acura,TLX,,2015,101000,16998
1,Acura,TLX,,2015,91000,18998
2,Scion,FR-S,,2014,110000,17998
3,Chevrolet,Camaro,LT,2017,83000,19998
4,Dodge,Challenger,SXT,2019,41000,24998
...,...,...,...,...,...,...
1192,Ford,Mustang,Bullitt,2020,6000,45998
1193,Dodge,Challenger,SXT,2022,57000,24998
1194,Mazda,MX-5,Miata Grand Touring,2022,12000,28998
1195,Ford,Mustang,GT,2024,6000,42998


In [20]:
# Convert 'Car Mileage' to integers.
# The column normally arrives here as comma-grouped text (e.g. '101,000'),
# so the commas are stripped first. The dtype guard keeps the cell safe to
# re-run: once the column is already numeric the .str accessor would raise.
# regex=False makes the comma a literal match regardless of pandas version.
if not pd.api.types.is_numeric_dtype(car_data['Car Mileage']):
    car_data['Car Mileage'] = car_data['Car Mileage'].str.replace(',', '', regex=False)
car_data['Car Mileage'] = car_data['Car Mileage'].astype(int)

# Drop the 'Car Trim' column. errors='ignore' makes this a no-op when the
# column has already been removed (e.g. when the cell is executed twice),
# and rebinding instead of inplace=True avoids mutating the frame in place.
car_data = car_data.drop(columns='Car Trim', errors='ignore')

# Show the cleaned frame.
display(car_data)

Unnamed: 0,Car Make,Car Model,Car Year,Car Mileage,Car Price
0,Acura,TLX,2015,101000,16998
1,Acura,TLX,2015,91000,18998
2,Scion,FR-S,2014,110000,17998
3,Chevrolet,Camaro,2017,83000,19998
4,Dodge,Challenger,2019,41000,24998
...,...,...,...,...,...
1192,Ford,Mustang,2020,6000,45998
1193,Dodge,Challenger,2022,57000,24998
1194,Mazda,MX-5,2022,12000,28998
1195,Ford,Mustang,2024,6000,42998


In [21]:
# Write the cleaned frame to disk without the pandas row index, then
# print a short confirmation message.
car_data.to_csv(path_or_buf='Cleaned_CarMax.csv', index=False)
print("Data saved to 'Cleaned_CarMax.csv'")

Data saved to 'Cleaned_CarMax.csv'
