# Kelly Blue Book Part 2 Cleaning:

In [169]:
from bs4 import BeautifulSoup
import requests
import json
import time
import pandas as pd 
import re 

pd.reset_option('display.max_rows')
pd.reset_option('display.max_columns')
pd.reset_option('display.width')

#had to restart on a new notebook from an error on the original. This is a continution of the cleaning. 

In [416]:
kbb = pd.read_csv("kbb.csv")
kbb_cleaned = pd.read_csv("kbb_cleaned.csv")

kbb_cleaned

Unnamed: 0,name,price,mileage,engine_type,transmission_type,color,drivewheel_configuration
0,Used 2019 Acura RDX FWD w/ Technology Package,29999,30288,4-Cylinder Turbo,Automatic,Black,2 Wheel Drive - Front
1,Certified 2021 Acura TLX SH-AWD w/ Technology ...,33983,18121,4-Cylinder Turbo,Automatic,Purple,All Wheel Drive
2,Used 2020 Acura RDX,34997,41174,4-Cylinder Turbo,Automatic,Gray,All wheel drive
3,Used 2019 Acura MDX SH-AWD w/ A-SPEC Package,31993,39008,6-Cylinder,Automatic,Blue,All wheel drive
4,Used 2017 Acura RDX FWD w/ Advance Package,23923,56546,6-Cylinder,Automatic,Black,2 wheel drive - front
...,...,...,...,...,...,...,...
35566,Certified 2022 Volkswagen Tiguan SE R-Line,0,25210,4-Cylinder Turbo,Automatic,White,2 wheel drive - front
35567,Certified 2023 Volkswagen Taos SE w/ Panoramic...,28778,7637,4-Cylinder Turbo,Automatic,White,All wheel drive
35568,Certified 2019 Volkswagen Tiguan S w/ 3rd Row ...,17768,67724,4-Cylinder Turbo,Automatic,Black,All wheel drive
35569,Certified 2023 Volkswagen Taos SE w/ Panoramic...,27936,8522,4-Cylinder Turbo,Automatic,Gray,2 wheel drive - front


### Cleaning Mileage column:

In [417]:
#stripping comma from mileage column
kbb_cleaned["mileage"] = kbb_cleaned["mileage"].str.replace(",","")

kbb_cleaned["mileage"]

0        30288
1        18121
2        41174
3        39008
4        56546
         ...  
35566    25210
35567     7637
35568    67724
35569     8522
35570    61997
Name: mileage, Length: 35571, dtype: object

In [418]:
#searching for none numbers in mileage column
pd.set_option('display.max_rows', None)

column_name = 'mileage'

# Use boolean indexing to locate non-numeric values in the specified column
non_numeric_values = kbb_cleaned[~kbb_cleaned[column_name].apply(lambda x: str(x).isnumeric())]

# Print the rows with non-numeric values in the specified column
print(non_numeric_values)

                                                    name   price mileage  \
19419  Certified 2023 Land Rover Range Rover Sport SE...  112937     NaN   
19446  Certified 2023 Land Rover Range Rover Sport SE...  112937     NaN   
19473  Certified 2023 Land Rover Range Rover Sport SE...  112937     NaN   
19500  Certified 2023 Land Rover Range Rover Sport SE...  112937     NaN   
19527  Certified 2023 Land Rover Range Rover Sport SE...  112937     NaN   
19554  Certified 2023 Land Rover Range Rover Sport SE...  112937     NaN   
19581  Certified 2023 Land Rover Range Rover Sport SE...  112937     NaN   
19608  Certified 2023 Land Rover Range Rover Sport SE...  112937     NaN   
19635  Certified 2023 Land Rover Range Rover Sport SE...  112937     NaN   
19662  Certified 2023 Land Rover Range Rover Sport SE...  112937     NaN   
19689  Certified 2023 Land Rover Range Rover Sport SE...  112937     NaN   
19716  Certified 2023 Land Rover Range Rover Sport SE...  112937     NaN   
19743  Certi

In [419]:
# filling nan value with 0 and converting the column to int. the mileage is unknown on these listings and not 
# provided by the dealership

kbb_cleaned["mileage"] = kbb_cleaned["mileage"].fillna(0)

kbb_cleaned["mileage"] = kbb_cleaned["mileage"].astype(int)
kbb_cleaned["mileage"].dtype


dtype('int32')

### Converting price column to integer:

In [420]:
# converting price column to integer type. 
kbb_cleaned["price"] = kbb_cleaned["price"].astype(int)

kbb_cleaned["price"].dtype

dtype('int32')

### Cleaning color column: 

In [421]:
#before tackling the name column I will look at the last columns to clean color and drivewheel. 
# looking below we can combine the 2 off whites and combine the unavailable and unknown
kbb_cleaned['color'].value_counts()

White        8012
Black        7869
Gray         5765
Blue         4245
Silver       3659
Red          2950
Off White     596
Green         398
Brown         318
Orange        261
Beige         251
Burgundy      185
Gold          122
Offwhite       95
Unavail        58
Yellow         48
Unknown         6
Charcoal        2
Tan             2
Purple          1
Turquoise       1
Name: color, dtype: int64

In [422]:
kbb_cleaned["color"] = kbb_cleaned["color"].str.replace("Offwhite", "Off White")
kbb_cleaned["color"] = kbb_cleaned["color"].str.replace("Unavail", "Unknown")

kbb_cleaned['color'].value_counts()

White        8012
Black        7869
Gray         5765
Blue         4245
Silver       3659
Red          2950
Off White     691
Green         398
Brown         318
Orange        261
Beige         251
Burgundy      185
Gold          122
Unknown        64
Yellow         48
Charcoal        2
Tan             2
Purple          1
Turquoise       1
Name: color, dtype: int64

In [None]:
kbb_cleaned['color'] = kbb_cleaned['color'].fillna('Unknown')

kbb_cleaned['color'].unique()

### Converting drivertrain column: 

In [423]:
#simplfying column name
kbb_cleaned["drivetrain"] = kbb_cleaned["drivewheel_configuration"]

kbb_cleaned["drivetrain"].value_counts()

All wheel drive                       10381
2 wheel drive - front                 10258
2 wheel drive - rear                   5644
4 wheel drive                          4343
All Wheel Drive                        1904
2 Wheel Drive - Front                  1417
4 Wheel Drive                           826
2 Wheel Drive - Rear                    793
4 Wheel Drive - Rear Wheel Default        3
Name: drivetrain, dtype: int64

Note: consolidating drivetrain types to simple format. 

|            drivetrain | acroynm |      meaning     |
|-----------------------|---------|------------------|
| 2 wheel drive - front |   FWD   | front wheel drive|
| 2 Wheel Drive - Front |   FWD   | front wheel drive|
| 2 Wheel Drive - Rear  |   RWD   | rear wheel drive |
| 2 wheel drive - rear  |   RWD   | rear wheel drive |
| All Wheel Drive       |   AWD   | all wheel drive  |
| All wheel drive       |   AWD   | all wheel drive  |
| 4 wheel drive         |   4WD   | four wheel drive |
| 4 Wheel Drive         |   4WD   | four wheel drive |
| 4 Wheel Drive - Rear Wheel Default     |   4WD_RWB   | this is a 4wd with a rear wheel default bias which can be changed  |

In [424]:
#consolidating drivetrain types down to 5. 

kbb_cleaned["drivetrain"] = kbb_cleaned["drivetrain"].str.replace("All wheel drive", "AWD")
kbb_cleaned["drivetrain"] = kbb_cleaned["drivetrain"].str.replace("All Wheel Drive", "AWD")
kbb_cleaned["drivetrain"] = kbb_cleaned["drivetrain"].str.replace("2 wheel drive - front", "FWD")
kbb_cleaned["drivetrain"] = kbb_cleaned["drivetrain"].str.replace("2 Wheel Drive - Front", "FWD")
kbb_cleaned["drivetrain"] = kbb_cleaned["drivetrain"].str.replace("2 wheel drive - rear", "RWD")
kbb_cleaned["drivetrain"] = kbb_cleaned["drivetrain"].str.replace("2 Wheel Drive - Rear", "RWD")
kbb_cleaned["drivetrain"] = kbb_cleaned["drivetrain"].str.replace("4 wheel drive", "4WD")
kbb_cleaned["drivetrain"] = kbb_cleaned["drivetrain"].str.replace("4 Wheel Drive", "4WD")
kbb_cleaned["drivetrain"] = kbb_cleaned["drivetrain"].str.replace("4 Wheel Drive - Rear Wheel Default", "4WD_RWD")
kbb_cleaned["drivetrain"] = kbb_cleaned["drivetrain"].str.replace(" - Rear Wheel Default", "_RWB")


kbb_cleaned["drivetrain"].value_counts()

AWD        12285
FWD        11675
RWD         6437
4WD         5169
4WD_RWB        3
Name: drivetrain, dtype: int64

In [425]:
kbb_cleaned.head(15)

Unnamed: 0,name,price,mileage,engine_type,transmission_type,color,drivewheel_configuration,drivetrain
0,Used 2019 Acura RDX FWD w/ Technology Package,29999,30288,4-Cylinder Turbo,Automatic,Black,2 Wheel Drive - Front,FWD
1,Certified 2021 Acura TLX SH-AWD w/ Technology ...,33983,18121,4-Cylinder Turbo,Automatic,Purple,All Wheel Drive,AWD
2,Used 2020 Acura RDX,34997,41174,4-Cylinder Turbo,Automatic,Gray,All wheel drive,AWD
3,Used 2019 Acura MDX SH-AWD w/ A-SPEC Package,31993,39008,6-Cylinder,Automatic,Blue,All wheel drive,AWD
4,Used 2017 Acura RDX FWD w/ Advance Package,23923,56546,6-Cylinder,Automatic,Black,2 wheel drive - front,FWD
5,Used 2023 Acura RDX FWD w/ A-Spec Package,44895,2148,4-Cylinder Turbo,Automatic,Red,2 wheel drive - front,FWD
6,Used 2019 Acura RDX FWD w/ Advance Package,29550,43275,4-Cylinder Turbo,Automatic,White,2 wheel drive - front,FWD
7,Certified 2020 Acura ILX w/ Premium Package,23989,38101,4-Cylinder,Automatic,Black,2 Wheel Drive - Front,FWD
8,Used 2019 Acura RDX FWD w/ Advance Package,30987,39761,4-Cylinder Turbo,Automatic,Blue,2 wheel drive - front,FWD
9,Used 2019 Acura RDX FWD w/ Technology Package,26999,58664,4-Cylinder Turbo,Automatic,Gray,2 wheel drive - front,FWD


### Cleaning the name column: 

#### The name column I will split the columns into the following:  
##### Used Vehicle Classification, Year, Make, Model, Packages. 


In [426]:
#stripped and extracted certified and used. 

kbb_cleaned["Used Vehicle Classification"] = kbb_cleaned["name"].str.extract('(Used|Certified)', expand=False)
kbb_cleaned["name"] = kbb_cleaned["name"].str.strip('(Used|Certified)')
kbb_cleaned.head(15)

Unnamed: 0,name,price,mileage,engine_type,transmission_type,color,drivewheel_configuration,drivetrain,Used Vehicle Classification
0,2019 Acura RDX FWD w/ Technology Packag,29999,30288,4-Cylinder Turbo,Automatic,Black,2 Wheel Drive - Front,FWD,Used
1,2021 Acura TLX SH-AWD w/ Technology Packag,33983,18121,4-Cylinder Turbo,Automatic,Purple,All Wheel Drive,AWD,Certified
2,2020 Acura RDX,34997,41174,4-Cylinder Turbo,Automatic,Gray,All wheel drive,AWD,Used
3,2019 Acura MDX SH-AWD w/ A-SPEC Packag,31993,39008,6-Cylinder,Automatic,Blue,All wheel drive,AWD,Used
4,2017 Acura RDX FWD w/ Advance Packag,23923,56546,6-Cylinder,Automatic,Black,2 wheel drive - front,FWD,Used
5,2023 Acura RDX FWD w/ A-Spec Packag,44895,2148,4-Cylinder Turbo,Automatic,Red,2 wheel drive - front,FWD,Used
6,2019 Acura RDX FWD w/ Advance Packag,29550,43275,4-Cylinder Turbo,Automatic,White,2 wheel drive - front,FWD,Used
7,2020 Acura ILX w/ Premium Packag,23989,38101,4-Cylinder,Automatic,Black,2 Wheel Drive - Front,FWD,Certified
8,2019 Acura RDX FWD w/ Advance Packag,30987,39761,4-Cylinder Turbo,Automatic,Blue,2 wheel drive - front,FWD,Used
9,2019 Acura RDX FWD w/ Technology Packag,26999,58664,4-Cylinder Turbo,Automatic,Gray,2 wheel drive - front,FWD,Used


In [427]:
#extracting year and stripping blank white space 
kbb_cleaned['year'] = kbb_cleaned['name'].str.extract(r'(\d{4})')
kbb_cleaned['name'] = kbb_cleaned['name'].str.strip("")
kbb_cleaned.head()

Unnamed: 0,name,price,mileage,engine_type,transmission_type,color,drivewheel_configuration,drivetrain,Used Vehicle Classification,year
0,2019 Acura RDX FWD w/ Technology Packag,29999,30288,4-Cylinder Turbo,Automatic,Black,2 Wheel Drive - Front,FWD,Used,2019
1,2021 Acura TLX SH-AWD w/ Technology Packag,33983,18121,4-Cylinder Turbo,Automatic,Purple,All Wheel Drive,AWD,Certified,2021
2,2020 Acura RDX,34997,41174,4-Cylinder Turbo,Automatic,Gray,All wheel drive,AWD,Used,2020
3,2019 Acura MDX SH-AWD w/ A-SPEC Packag,31993,39008,6-Cylinder,Automatic,Blue,All wheel drive,AWD,Used,2019
4,2017 Acura RDX FWD w/ Advance Packag,23923,56546,6-Cylinder,Automatic,Black,2 wheel drive - front,FWD,Used,2017


In [428]:
# removing year and any possible white space

kbb_cleaned['name'] = kbb_cleaned['name'].str.replace(r'(\d{4})', "")
kbb_cleaned['name'] = kbb_cleaned['name'].str.strip()
kbb_cleaned.head()

  kbb_cleaned['name'] = kbb_cleaned['name'].str.replace(r'(\d{4})', "")


Unnamed: 0,name,price,mileage,engine_type,transmission_type,color,drivewheel_configuration,drivetrain,Used Vehicle Classification,year
0,Acura RDX FWD w/ Technology Packag,29999,30288,4-Cylinder Turbo,Automatic,Black,2 Wheel Drive - Front,FWD,Used,2019
1,Acura TLX SH-AWD w/ Technology Packag,33983,18121,4-Cylinder Turbo,Automatic,Purple,All Wheel Drive,AWD,Certified,2021
2,Acura RDX,34997,41174,4-Cylinder Turbo,Automatic,Gray,All wheel drive,AWD,Used,2020
3,Acura MDX SH-AWD w/ A-SPEC Packag,31993,39008,6-Cylinder,Automatic,Blue,All wheel drive,AWD,Used,2019
4,Acura RDX FWD w/ Advance Packag,23923,56546,6-Cylinder,Automatic,Black,2 wheel drive - front,FWD,Used,2017


In [429]:
# removing Romeo from Alfa Romeo and removing Rover from Land Rover so i can properly clean the name column 
kbb_cleaned['name'] = kbb_cleaned['name'].str.replace('Romeo','')
kbb_cleaned['name'] = kbb_cleaned['name'].str.replace('Rover', '')


In [430]:
#creating make and model column 
kbb_cleaned['make'] = kbb_cleaned['name'].str.split().str[0]
kbb_cleaned['model'] = kbb_cleaned['name'].str.split().str[1]

kbb_cleaned.head(15)
kbb_cleaned.tail(15)

Unnamed: 0,name,price,mileage,engine_type,transmission_type,color,drivewheel_configuration,drivetrain,Used Vehicle Classification,year,make,model
35556,Volkswagen Passat 2.0T SE,17551,60043,4-Cylinder Turbo,Automatic,Gray,2 wheel drive - front,FWD,Certified,2021,Volkswagen,Passat
35557,Volkswagen Tiguan SE R-Lin,21810,70394,4-Cylinder Turbo,Automatic,Gray,2 wheel drive - front,FWD,Certified,2020,Volkswagen,Tiguan
35558,Volkswagen Atlas SE,24923,62110,6-Cylinder,Automatic,Blue,All wheel drive,AWD,Certified,2021,Volkswagen,Atlas
35559,Volkswagen Tiguan SE R-Lin,22883,65594,4-Cylinder Turbo,Automatic,Black,2 wheel drive - front,FWD,Certified,2020,Volkswagen,Tiguan
35560,Volkswagen Tiguan SE,21715,35007,4-Cylinder Turbo,Automatic,White,2 wheel drive - front,FWD,Certified,2020,Volkswagen,Tiguan
35561,Volkswagen Passat 2.0T SE,17923,62815,4-Cylinder Turbo,Automatic,Black,2 wheel drive - front,FWD,Certified,2021,Volkswagen,Passat
35562,Volkswagen Tiguan S,21056,21551,4-Cylinder Turbo,Automatic,Blue,2 wheel drive - front,FWD,Certified,2020,Volkswagen,Tiguan
35563,Volkswagen Atlas SEL Premium,34166,24730,6-Cylinder,Automatic,Black,All wheel drive,AWD,Certified,2020,Volkswagen,Atlas
35564,Volkswagen Tiguan S,22345,24208,4-Cylinder Turbo,Automatic,Black,2 wheel drive - front,FWD,Certified,2022,Volkswagen,Tiguan
35565,Volkswagen Arteon SEL,42517,4837,4-Cylinder Turbo,Automatic,Black,All wheel drive,AWD,Certified,2023,Volkswagen,Arteon


In [431]:
#stripping make and model and renaming column 
kbb_cleaned['name'] = kbb_cleaned['name'].str.split(n=2).str[2:].str.join(' ')

kbb_cleaned = kbb_cleaned.rename(columns={'name': 'packages'})

# adding back correct names 
kbb_cleaned['make'] = kbb_cleaned['make'].str.replace('Alfa', 'Alfa Romeo')
kbb_cleaned['make'] = kbb_cleaned['make'].str.replace('Land', 'Land Rover')


kbb_cleaned.head(10)

Unnamed: 0,packages,price,mileage,engine_type,transmission_type,color,drivewheel_configuration,drivetrain,Used Vehicle Classification,year,make,model
0,FWD w/ Technology Packag,29999,30288,4-Cylinder Turbo,Automatic,Black,2 Wheel Drive - Front,FWD,Used,2019,Acura,RDX
1,SH-AWD w/ Technology Packag,33983,18121,4-Cylinder Turbo,Automatic,Purple,All Wheel Drive,AWD,Certified,2021,Acura,TLX
2,,34997,41174,4-Cylinder Turbo,Automatic,Gray,All wheel drive,AWD,Used,2020,Acura,RDX
3,SH-AWD w/ A-SPEC Packag,31993,39008,6-Cylinder,Automatic,Blue,All wheel drive,AWD,Used,2019,Acura,MDX
4,FWD w/ Advance Packag,23923,56546,6-Cylinder,Automatic,Black,2 wheel drive - front,FWD,Used,2017,Acura,RDX
5,FWD w/ A-Spec Packag,44895,2148,4-Cylinder Turbo,Automatic,Red,2 wheel drive - front,FWD,Used,2023,Acura,RDX
6,FWD w/ Advance Packag,29550,43275,4-Cylinder Turbo,Automatic,White,2 wheel drive - front,FWD,Used,2019,Acura,RDX
7,w/ Premium Packag,23989,38101,4-Cylinder,Automatic,Black,2 Wheel Drive - Front,FWD,Certified,2020,Acura,ILX
8,FWD w/ Advance Packag,30987,39761,4-Cylinder Turbo,Automatic,Blue,2 wheel drive - front,FWD,Used,2019,Acura,RDX
9,FWD w/ Technology Packag,26999,58664,4-Cylinder Turbo,Automatic,Gray,2 wheel drive - front,FWD,Used,2019,Acura,RDX


In [432]:
#reorganizing the dataframe
kbb_cleaned = kbb_cleaned[['Used Vehicle Classification', 'year','make','model','packages', 'price', 'mileage', 'engine_type', 'transmission_type','drivetrain', 'color']]

kbb_cleaned.head(15)

Unnamed: 0,Used Vehicle Classification,year,make,model,packages,price,mileage,engine_type,transmission_type,drivetrain,color
0,Used,2019,Acura,RDX,FWD w/ Technology Packag,29999,30288,4-Cylinder Turbo,Automatic,FWD,Black
1,Certified,2021,Acura,TLX,SH-AWD w/ Technology Packag,33983,18121,4-Cylinder Turbo,Automatic,AWD,Purple
2,Used,2020,Acura,RDX,,34997,41174,4-Cylinder Turbo,Automatic,AWD,Gray
3,Used,2019,Acura,MDX,SH-AWD w/ A-SPEC Packag,31993,39008,6-Cylinder,Automatic,AWD,Blue
4,Used,2017,Acura,RDX,FWD w/ Advance Packag,23923,56546,6-Cylinder,Automatic,FWD,Black
5,Used,2023,Acura,RDX,FWD w/ A-Spec Packag,44895,2148,4-Cylinder Turbo,Automatic,FWD,Red
6,Used,2019,Acura,RDX,FWD w/ Advance Packag,29550,43275,4-Cylinder Turbo,Automatic,FWD,White
7,Certified,2020,Acura,ILX,w/ Premium Packag,23989,38101,4-Cylinder,Automatic,FWD,Black
8,Used,2019,Acura,RDX,FWD w/ Advance Packag,30987,39761,4-Cylinder Turbo,Automatic,FWD,Blue
9,Used,2019,Acura,RDX,FWD w/ Technology Packag,26999,58664,4-Cylinder Turbo,Automatic,FWD,Gray


### Creating vehicle type column: 

In [433]:
#viewing car models to create vehicle type column. 
kbb_cleaned['model'].unique()

array(['RDX', 'TLX', 'MDX', 'ILX', 'Integra', 'RLX', 'Stelvio', 'Giulia',
       '4C', 'A5', 'A4', 'Q8', 'A7', 'Q5', 'Q3', 'e-tron', 'Q7', 'S4',
       'Q4', 'SQ5', 'SQ8', 'RS', 'A3', 'A8', 'A6', 'R8', 'SQ7', 'S7',
       'X5', 'M2', 'M4', 'X2', 'X3', '430', '530', '330i', '440i', 'X1',
       '530i', '430i', '740', '540i', '740i', 'X4', 'X7', '750i', 'i3',
       '530e', 'i8', 'M550i', '228i', '320i', 'iX', '330e', 'X6', '228',
       'M5', 'M340i', 'M850i', '230i', '745e', '540', 'M8', 'Regal',
       'Encore', 'Envision', 'Enclave', 'LaCrosse', 'Cascada', 'CT5',
       'XT5', 'Escalade', 'CT4', 'XT6', 'XT4', 'CTS', 'CT6', 'ATS',
       'Silverado', 'Malibu', 'Equinox', 'Corvette', 'Camaro', 'Tahoe',
       'Blazer', 'TrailBlazer', 'Trax', 'Traverse', 'Colorado', 'Express',
       'Cruze', 'Impala', 'Pacifica', '300', 'Voyager', '200', 'Charger',
       'Challenger', 'Durango', 'Grand', 'Journey', 'Escape', 'Fusion',
       'Mustang', 'F150', 'F250', 'E-450', 'Expedition', 'Transit',

In [441]:
#creating mapping for vehicle type and car model 
car_type_mapping = {
    'RDX': 'SUV', 
    'TLX': 'Sedan',
    'MDX': 'SUV',
    'ILX': 'Sedan', 
    'Integra': 'Sedan', 
    'RLX': 'Sedan', 
    'Stelvio': 'SUV',
    'Giulia': 'Sedan',
    '4C': 'Coupe',
    'A5': 'Coupe', 
    'A4': 'Sedan',
    'Q8': 'SUV', 
    'A7': 'Sedan', 
    'Q5': 'SUV', 
    'Q3': 'SUV', 
    'e-tron': 'Sedan', 
    'Q7': 'SUV', 
    'S4': 'Sedan', 
    'Q4': 'SUV', 
    'SQ5': 'SUV', 
    'SQ8': 'SUV',
    'RS': 'Coupe', 
    'A3': 'Sedan', 
    'A8': 'Sedan', 
    'A6': 'Sedan', 
    'R8': 'Coupe', 
    'SQ7': 'SUV', 
    'S7': 'Sedan', 
    'X5': 'SUV', 
    'M2': 'Coupe', 
    'M4': 'Coupe', 
    'X2': 'SUV',
    'X3': 'SUV', 
    '430': 'Coupe', 
    '530': 'Sedan', 
    '330i': 'Sedan', 
    '440i': 'Coupe', 
    'X1': 'SUV', 
    '530i': 'Sedan', 
    '430i': 'Coupe', 
    '740': 'Sedan',
    '540i': 'Sedan', 
    '740i': 'Sedan', 
    'X4': 'SUV', 
    'X7': 'SUV', 
    '750i': 'Sedan', 
    'i3': 'Coupe', 
    '530e': 'Sedan', 
    'i8': 'Coupe', 
    'M550i': 'Sedan',
    '228i': 'Coupe', 
    '320i': 'Sedan', 
    'iX': 'SUV', 
    '330e': 'Sedan', 
    'X6': 'SUV', 
    '228': 'Coupe', 
    'M5': 'Sedan', 
    'M340i': 'Sedan', 
    'M850i': 'Sedan',
    '230i': 'Coupe', 
    '745e': 'Sedan', 
    '540': 'Sedan', 
    'M8': 'Coupe', 
    'Regal': 'Sedan', 
    'Encore': 'SUV', 
    'Envision': 'SUV',
    'Enclave': 'SUV', 
    'LaCrosse': 'Sedan', 
    'Cascada': 'Coupe', 
    'CT5': 'Sedan', 
    'XT5': 'SUV', 
    'Escalade': 'SUV', 
    'CT4': 'Sedan',
    'XT6': 'SUV', 
    'XT4': 'SUV', 
    'CTS': 'Sedan', 
    'CT6': 'Sedan', 
    'ATS': 'Sedan', 
    'Silverado':'Pick up',
    'Malibu': 'Sedan',
    'Equinox': 'SUV', 
    'Corvette': 'Coupe', 
    'Camaro': 'Coupe', 
    'Tahoe': 'SUV', 
    'Blazer': 'SUV', 
    'TrailBlazer': 'SUV',
    'Trax': 'SUV', 
    'Traverse': 'SUV', 
    'Colorado':'Pick up', 
    'Express': 'Van', 
    'Cruze': 'Sedan', 
    'Impala': 'Sedan',
    'Pacifica':'Mini van', 
    '300': 'Sedan', 
    'Voyager': 'Mini van', 
    '200': 'Sedan', 
    'Charger': 'Sedan', 
    'Challenger': 'Coupe',
    'Durango': 'SUV', 
    'Grand': 'Mini van', 
    'Journey': 'Mini van', 
    'Escape': 'SUV', 
    'Fusion': 'Sedan', 
    'Mustang': 'Coupe',
    'F150':'Pick up', 
    'F250':'Pick up', 
    'E-450': 'Van', 
    'Expedition': 'SUV', 
    'Transit': 'Van', 
    'Maverick':'Pick up',
    'F350':'Pick up', 
    'Ranger':'Pick up', 
    'Explorer': 'SUV', 
    'Edge': 'SUV', 
    'Bronco': 'SUV', 
    'Focus': 'Sedan', 
    'Fiesta': 'Sedan',
    'EcoSport': 'SUV', 
    'Taurus': 'Sedan', 
    'GV80': 'SUV', 
    'G80': 'Sedan', 
    'G70': 'Sedan', 
    'GV60': 'SUV', 
    'GV70': 'SUV', 
    'G90': 'Sedan',
    'Acadia': 'SUV', 
    'Yukon': 'SUV', 
    'Canyon':'Pick up', 
    'Sierra':'Pick up', 
    'Terrain': 'SUV', 
    'Savana': 'Van',
    'Civic': 'Sedan', 
    'HR-V': 'SUV', 
    'Odyssey':'Mini van', 
    'CR-V': 'SUV', 
    'Ridgeline':'Pick up', 
    'Pilot': 'SUV',
    'Insight': 'Sedan', 
    'Accord': 'Coupe', 
    'Passport': 'SUV', 
    'Fit': 'Sedan', 
    'Tucson': 'SUV', 
    'Palisade': 'SUV',
    'Ioniq': 'SUV', 
    'Elantra': 'Sedan', 
    'Sonata': 'Sedan', 
    'Santa': 'SUV', 
    'Kona': 'SUV', 
    'Venue': 'SUV', 
    'Accent': 'Sedan',
    'Veloster': 'Coupe', 
    'Velo': 'Coupe', 
    'QX60': 'SUV', 
    'QX50': 'SUV', 
    'QX80': 'SUV', 
    'Q60': 'Coupe', 
    'QX55': 'SUV', 
    'QX70': 'SUV', 
    'Q50': 'Sedan', 
    'QX30': 'SUV', 
    'F-TYPE': 'Coupe', 
    'XJ': 'Sedan',
    'F-PACE': 'SUV', 
    'I-PACE': 'SUV', 
    'E-PACE': 'SUV', 
    'XE': 'Sedan', 
    'XF': 'Sedan', 
    'Wrangler': 'SUV', 
    'Gladiator': 'SUV',
    'Compass': 'SUV', 
    'Cherokee': 'SUV', 
    'Patriot': 'SUV', 
    'Renegade': 'SUV', 
    'Wagoneer': 'SUV', 
    'Soul': 'SUV',
    'Optima': 'Sedan', 
    'Sorento': 'SUV', 
    'K5': 'Sedan', 
    'Sportage': 'SUV', 
    'Stinger': 'Sedan', 
    'Forte': 'Sedan', 
    'EV6': 'SUV',
    'Seltos': 'SUV', 
    'Carnival': 'Mini van', 
    'Telluride': 'SUV', 
    'Rio': 'Sedan', 
    'K900': 'Sedan', 
    'Cadenza': 'Sedan',
    'Niro': 'SUV', 
    'Range': 'SUV', 
    'Defender': 'SUV', 
    'Discovery': 'SUV', 
    'RX': 'SUV', 
    'GS': 'Sedan', 
    'NX': 'SUV', 
    'ES': 'Sedan',
    'LC': 'Coupe', 
    'GX': 'SUV', 
    'IS': 'Sedan', 
    'UX': 'SUV', 
    'LX': 'SUV', 
    'LS': 'Sedan', 
    'RC': 'Coupe', 
    'Navigator': 'SUV', 
    'MKX': 'Sedan',
    'Corsair': 'Coupe', 
    'Nautilus': 'SUV', 
    'Aviator': 'SUV', 
    'MKZ': 'Sedan', 
    'Continental': 'Sedan', 
    'MKC': 'SUV',
    'MKT': 'SUV', 
    'Quattroporte': 'Sedan', 
    'Levante': 'SUV', 
    'Ghibli': 'Sedan', 
    'GranTurismo': 'Coupe',
    'Grecale': 'SUV', 
    'MC20': 'Coupe', 
    'Ghibl': 'Sedan', 
    'Levan': 'SUV', 
    'MAZDA3': 'Sedan', 
    'CX-9': 'SUV', 
    'CX-5': 'SUV',
    'MAZDA6': 'Sedan', 
    'CX-30': 'SUV', 
    'CX-90': 'SUV', 
    'CX-3': 'SUV', 
    'CX-50': 'SUV', 
    'MX-5': 'Coupe', 
    'GLB': 'SUV', 
    'E': 'Sedan',
    'GLC': 'SUV', 
    'GLA': 'SUV', 
    'GLS': 'SUV', 
    'GLE': 'SUV', 
    'C': 'Sedan', 
    'A': 'Sedan', 
    'CLA': 'Coupe', 
    'G': 'SUV', 
    'Maybach': 'Sedan', 
    'SLC': 'Coupe',
    'Sprinter': 'Van', 
    'S': 'Sedan', 
    'CLS': 'Sedan', 
    'AMG': 'Coupe', 
    'SL': 'Coupe', 
    'Cooper': 'Coupe', 
    'Outlander': 'SUV',
    'Lancer': 'Sedan', 
    'Eclipse':'SUV', 
    'Mirage': 'Sedan', 
    'Mirag': 'Sedan', 
    'Rogue': 'SUV', 
    'Murano': 'SUV',
    'Armada': 'SUV', 
    'Ariya': 'SUV', 
    'Maxima': 'Sedan', 
    '370Z': 'Coupe',
    'Sentra': 'Sedan', 
    'Leaf': 'SUV', 
    'Titan':'Pick up',
    'Versa': 'Sedan', 
    'Pathfinder': 'SUV', 
    'Kicks': 'SUV', 
    'Altima': 'Sedan', 
    'Frontier':'Pick up', 
    'NV': 'Van',
    'NV200': 'Van', 
    'Macan': 'SUV', 
    'Cayenne': 'SUV', 
    'Cayenn': 'SUV', 
    'Taycan': 'Sedan', 
    '911': 'Coupe', 
    '718': 'Coupe',
    'Panamera': 'Sedan', 
    'Big':'Pick up', 
    'Rebel':'Pick up', 
    'TRX':'Pick up', 
    'Classic':'Pick up', 
    'Laramie':'Pick up',
    'Tradesman':'Pick up', 
    'Longhorn':'Pick up', 
    'Laram':'Pick up', 
    'Lone':'Pick up', 
    'Lim':'Pick up', 
    'ProMaster': 'Van' ,
    'Power':'Pick up', 
    'Limited':'Pick up', 
    'SLT':'Pick up', 
    'Ascent': 'SUV', 
    'Outback': 'SUV', 
    'Legacy': 'Sedan',
    'Impreza': 'Sedan', 
    'Crosstrek': 'SUV', 
    'BRZ': 'Coupe', 
    'Forester': 'SUV', 
    'WRX': 'Sedan', 
    'Solterra': 'SUV',
    'Model': 'Sedan', 
    'Tundra':'Pick up', 
    'Tacoma':'Pick up', 
    'Camry': 'Sedan', 
    '4Runner': 'SUV', 
    'Supra': 'Coupe', 
    'RAV4': 'SUV',
    'Highlander': 'SUV', 
    'Corolla': 'Sedan', 
    'Sienna': 'Mini van', 
    'C-HR': 'SUV', 
    'Mira': 'Sedan', 
    'Venza': 'SUV', 
    'Land':'SUV',
    'Priu': 'Sedan', 
    'Prius': 'Sedan', 
    'GTI': 'Sedan', 
    'Atlas': 'SUV', 
    'Tiguan': 'SUV', 
    'Passat': 'Sedan', 
    'Jetta': 'Sedan',
    'Arteon': 'Sedan', 
    'Taos': 'SUV', 
    'Golf': 'Sedan', 
    'ID.4': 'SUV'

}

In [443]:
#mapping and checking vehicle type 
kbb_cleaned['vehicle_type'] = kbb_cleaned['model'].map(car_type_mapping)

kbb_cleaned['vehicle_type'].value_counts()

SUV         20388
Sedan        7838
Pick up      3040
Coupe        2852
Mini van     1353
Van           100
Name: vehicle_type, dtype: int64

In [445]:
#saving file checkpoint due to inclement weather incase I lose power. 
kbb_cleaned.to_csv('kbb_cleaned.csv')

### Fixing car model names: 

The car models with 2 words were split betwen model and package columns. For example Land Cruiser was split into make = Land and packages starts with Cruiser. I noticed this while mapping for the new vehicle type column. 

In [487]:
# pulling Audi model name that is incorrect 
audi_rs = kbb_cleaned[kbb_cleaned['model'] == 'RS']

print(audi_rs)

Empty DataFrame
Columns: [Used Vehicle Classification, year, make, model, packages, price, mileage, engine_type, transmission_type, drivetrain, color, vehicle_type]
Index: []


In [488]:
#fixing model name and removing it from packages

kbb_cleaned.loc[2431, 'model'] = kbb_cleaned.loc[2431, 'model'] = "RS7"
kbb_cleaned.loc[2484, 'model'] = kbb_cleaned.loc[2484, 'model'] = "RSQ8"
kbb_cleaned.loc[2734, 'model'] = kbb_cleaned.loc[2734, 'model'] = "RS7"

kbb_cleaned.at[2431, 'packages'] = kbb_cleaned.at[2431, 'packages'].replace("7","")
kbb_cleaned.at[2484, 'packages'] = kbb_cleaned.at[2484, 'packages'].replace("Q8","")
kbb_cleaned.at[2734, 'packages'] = kbb_cleaned.at[2734, 'packages'].replace("7","")
print(kbb_cleaned.loc[2431])

Used Vehicle Classification                                Used
year                                                       2021
make                                                       Audi
model                                                       RS7
packages                        Sportback w/ Black Optic Packag
price                                                     99950
mileage                                                   10031
engine_type                                    8-Cylinder Turbo
transmission_type                                     Automatic
drivetrain                                                  AWD
color                                                    Silver
vehicle_type                                              Coupe
Name: 2431, dtype: object


In [509]:
#fixing toyota land cruiser
toyo_land = kbb_cleaned[kbb_cleaned['model'] == 'Land']
print(toyo_land)

kbb_cleaned.loc[34033, 'model'] = kbb_cleaned.loc[34033, 'model'] = 'Land Cruiser'
kbb_cleaned.loc[34033]

Empty DataFrame
Columns: [Used Vehicle Classification, year, make, model, packages, price, mileage, engine_type, transmission_type, drivetrain, color, vehicle_type]
Index: []


Used Vehicle Classification            Used
year                                   2018
make                                 Toyota
model                          Land Cruiser
packages                                Cru
price                                 65495
mileage                               68511
engine_type                      8-Cylinder
transmission_type                 Automatic
drivetrain                              4WD
color                                 White
vehicle_type                            SUV
Name: 34033, dtype: object

In [510]:
kbb_cleaned['model'] = kbb_cleaned['model'].replace("Range", "Range Rover")

for index, row in kbb_cleaned.iterrows():
    
    if not pd.isna(row['packages']) and row['packages'].strip():
    # Split the "packages" column to get the first word
        first_word = row['packages'].split()[0]
    
    # Check if the first word is "Velar"
    if first_word == 'Velar':
        # Update the "model" column by adding "Velar"
        kbb_cleaned.at[index, 'model'] += ' Velar'

for index, row in kbb_cleaned.iterrows():
    
    if not pd.isna(row['packages']) and row['packages'].strip():
    # Split the "packages" column to get the first word
        first_word = row['packages'].split()[0]
    
    # Check if the first word is "Evoque"
        if first_word == 'Evoque':
        # Update the "model" column by adding "Evoque"
            kbb_cleaned.at[index, 'model'] += ' Evoque'
    
for index, row in kbb_cleaned.iterrows():
    
    if not pd.isna(row['packages']) and row['packages'].strip():
    # Split the "packages" column to get the first word
        first_word = row['packages'].split()[0]
    
    # Check if the first word is "Evoqu"
    if first_word == 'Evoqu':
        # Update the "model" column by adding "Evoque"
        kbb_cleaned.at[index, 'model'] += ' Evoque'


In [515]:
# fixed mitsubishi mirag to mitsubishi mirage
kbb_cleaned.loc[27776, 'model'] = kbb_cleaned.loc[27776, 'model'].replace('Mirag', 'Mirage')

kbb_cleaned.loc[27776]

Used Vehicle Classification          Used
year                                 2021
make                           Mitsubishi
model                              Mirage
packages                                 
price                               14900
mileage                             54025
engine_type                    3-Cylinder
transmission_type               Automatic
drivetrain                            FWD
color                                Gray
vehicle_type                        Sedan
Name: 27776, dtype: object

In [521]:
# fixed porsche cayenn to porsche cayenne 

kbb_cleaned[kbb_cleaned['model'] == "Cayenn"]

kbb_cleaned['model'] = kbb_cleaned['model'].str.replace('Cayenn', 'Cayenne')

In [553]:
# corrected model names for Ram trucks 
kbb_cleaned['model'] = kbb_cleaned['model'].replace("Big", "Big Horn")
kbb_cleaned['model'] = kbb_cleaned['model'].replace("Lone", "Lone Star")
kbb_cleaned['model'] = kbb_cleaned['model'].replace("Lim", "Limited")

# corrected toyota prius typo
kbb_cleaned['model'] = kbb_cleaned['model'].replace("Priu", "Prius")

In [554]:
for index, row in kbb_cleaned.iterrows():
    
    if not pd.isna(row['packages']) and row['packages'].strip():
    # Split the "packages" column to get the first word
        first_word = row['packages'].split()[0]
    
        if first_word == '3':
            # Check if the make is "Tesla" and the model is "Model"
            if row['make'] == 'Tesla' and row['model'] == 'Model':
                # Update the "model" column by adding " 3"
                kbb_cleaned.at[index, 'model'] += ' 3'

for index, row in kbb_cleaned.iterrows():
    
    if not pd.isna(row['packages']) and row['packages'].strip():
    # Split the "packages" column to get the first word
        first_word = row['packages'].split()[0]
    
        if first_word == 'S':
            # Check if the make is "Tesla" and the model is "Model"
            if row['make'] == 'Tesla' and row['model'] == 'Model':
                # Update the "model" column by adding "S"
                kbb_cleaned.at[index, 'model'] += ' S'

    
for index, row in kbb_cleaned.iterrows():
    
    if not pd.isna(row['packages']) and row['packages'].strip():
    # Split the "packages" column to get the first word
        first_word = row['packages'].split()[0]
    
        if first_word == 'X':
            # Check if the make is "Tesla" and the model is "Model"
            if row['make'] == 'Tesla' and row['model'] == 'Model':
                # Update the "model" column by adding " X"
                kbb_cleaned.at[index, 'model'] += ' X'


for index, row in kbb_cleaned.iterrows():
    
    if not pd.isna(row['packages']) and row['packages'].strip():
    # Split the "packages" column to get the first word
        first_word = row['packages'].split()[0]
    
        if first_word == 'Y':
            # Check if the make is "Tesla" and the model is "Model"
            if row['make'] == 'Tesla' and row['model'] == 'Model':
                # Update the "model" column by adding " Y"
                kbb_cleaned.at[index, 'model'] += ' Y'

In [556]:
# checking model names for accuracy 
kbb_cleaned['model'].unique()

array(['RDX', 'TLX', 'MDX', 'ILX', 'Integra', 'RLX', 'Stelvio', 'Giulia',
       '4C', 'A5', 'A4', 'Q8', 'A7', 'Q5', 'Q3', 'e-tron', 'Q7', 'S4',
       'Q4', 'SQ5', 'SQ8', 'RS7', 'A3', 'RSQ8', 'A8', 'A6', 'R8', 'SQ7',
       'S7', 'X5', 'M2', 'M4', 'X2', 'X3', '430', '530', '330i', '440i',
       'X1', '530i', '430i', '740', '540i', '740i', 'X4', 'X7', '750i',
       'i3', '530e', 'i8', 'M550i', '228i', '320i', 'iX', '330e', 'X6',
       '228', 'M5', 'M340i', 'M850i', '230i', '745e', '540', 'M8',
       'Regal', 'Encore', 'Envision', 'Enclave', 'LaCrosse', 'Cascada',
       'CT5', 'XT5', 'Escalade', 'CT4', 'XT6', 'XT4', 'CTS', 'CT6', 'ATS',
       'Silverado', 'Malibu', 'Equinox', 'Corvette', 'Camaro', 'Tahoe',
       'Blazer', 'TrailBlazer', 'Trax', 'Traverse', 'Colorado', 'Express',
       'Cruze', 'Impala', 'Pacifica', '300', 'Voyager', '200', 'Charger',
       'Challenger', 'Durango', 'Grand', 'Journey', 'Escape', 'Fusion',
       'Mustang', 'F150', 'F250', 'E-450', 'Expedition', '

### Handling 0 values in the price column: 

In [576]:
# exploring price at 0 by manufacturer
kbb_cleaned[kbb_cleaned['price'] == 0].groupby('make').count()

Unnamed: 0_level_0,Used Vehicle Classification,year,model,packages,price,mileage,engine_type,transmission_type,drivetrain,color,vehicle_type
make,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Acura,1,1,1,1,1,1,1,1,1,1,1
Audi,1,1,1,1,1,1,1,1,1,1,1
BMW,1,1,1,1,1,1,1,1,1,1,1
Buick,1,1,1,1,1,1,1,1,1,1,1
Chrysler,80,80,80,80,80,80,80,80,80,80,80
Dodge,40,40,40,40,40,40,40,40,40,40,40
Ford,321,321,321,321,321,321,321,321,321,321,321
GMC,41,41,41,41,41,41,41,41,41,41,41
Honda,2,2,2,2,2,2,2,2,2,2,2
Hyundai,1,1,1,1,1,1,1,1,1,1,1


The 0 values will pull down the average of each vehicle. 40 missing values account for about 3.7% of the data. 
Ford being the largest at 321 accounts for roughly 30% of the ford dataset. The reason the cars dont have a value is generally because they are not listed by the dealership on kelly blue book. To handle this lets explore and see what I can do with the Ford brand first as this is a large portion. 

In [581]:
#extracting just ford 
ford_cars = kbb_cleaned[kbb_cleaned['make']== 'Ford']

#seperating out the 0 values
ford_cars_zero = ford_cars[ford_cars['price'] == 0]

#viewing the missing amounts in each type
ford_cars_zero['model'].value_counts()

F150          120
Expedition     80
Transit        40
Maverick       40
Edge           40
F350            1
Name: model, dtype: int64

In [601]:
#extracting F150 models
f150 = ford_cars[ford_cars['model'] == 'F150'] 

#extracting all above 0 
f1500 = f150[f150['price'] > 0]

#looking at mean median mode 
f150_average = f1500['price'].mean()
f150_median = f1500['price'].median()
f150_mode = f1500['price'].mode()

print('median:', f150_median, 'average:', f150_average)
print ()
print('mode')
print(f150_mode)

median: 34800.0 average: 35867.87242798354

mode
0    20931
1    32995
2    34800
3    35859
4    46995
Name: price, dtype: int32


In [616]:
# looking at the the range and standard deviation ill replace the 0's with the median value of 34,800
f1500['price'].describe()

count      243.000000
mean     35867.872428
std      10795.622289
min       8995.000000
25%      32995.000000
50%      34800.000000
75%      37888.500000
max      74977.000000
Name: price, dtype: float64

In [645]:
kbb_cleaned['price'] = kbb_cleaned['price'].astype(int)

In [646]:
#updating with median 
kbb_cleaned.loc[(kbb_cleaned['make'] == 'Ford') & (kbb_cleaned['model'] == 'F-150') & (kbb_cleaned['price'] == 0), 'price'] = 34800

#checking to make sure it updated. 
kbb_cleaned[(kbb_cleaned['model'] == 'F150') & (kbb_cleaned['price'] > 0)].count()



Used Vehicle Classification    363
year                           363
make                           363
model                          363
packages                       363
price                          363
mileage                        363
engine_type                    363
transmission_type              363
drivetrain                     363
color                          363
vehicle_type                   363
dtype: int64

In [650]:
#extracting Expedition models
fe = ford_cars[ford_cars['model'] == 'Expedition'] 

#extracting all above 0 
fe_exp = fe[fe['price'] > 0]

print(fe_exp.count())

#looking at mean median mode std and quartile ranges
fe_exp_average = fe_exp['price'].mean()
fe_exp_median = fe_exp['price'].median()
fe_exp_mode = fe_exp['price'].mode()

print('median:', fe_exp_median, 'average:', fe_exp_average)
print ()
print('mode')
print(fe_exp_mode)
print()
print(fe_exp.describe())

Used Vehicle Classification    86
year                           86
make                           86
model                          86
packages                       86
price                          86
mileage                        86
engine_type                    86
transmission_type              86
drivetrain                     86
color                          86
vehicle_type                   86
dtype: int64
median: 53310.0 average: 60209.232558139534

mode
0    53310
1    69995
Name: price, dtype: int32

              price        mileage
count     86.000000      86.000000
mean   60209.232558   19841.058140
std     9830.530164   19622.521644
min    33447.000000    3042.000000
25%    53310.000000    3042.000000
50%    53310.000000   29207.000000
75%    69995.000000   29207.000000
max    69995.000000  107719.000000


In [652]:
#updating expedtion with the median 

#updating with median 
kbb_cleaned.loc[(kbb_cleaned['model'] == 'Expedition') & (kbb_cleaned['price'] == 0), 'price'] = 53310 

#checking to make sure it updated. 
kbb_cleaned[(kbb_cleaned['model'] == 'Expedition') & (kbb_cleaned['price'] > 0)].count()

Used Vehicle Classification    166
year                           166
make                           166
model                          166
packages                       166
price                          166
mileage                        166
engine_type                    166
transmission_type              166
drivetrain                     166
color                          166
vehicle_type                   166
dtype: int64

In [655]:
#extracting Expedition models
transit = ford_cars[ford_cars['model'] == 'Transit'] 

print(transit.count())

#extracting all above 0 
transit_greater = transit[transit['price'] > 0]

print(transit_greater.count())

#looking at mean median mode std and quartile ranges
transit_average = transit_greater['price'].mean()
transit_median = transit_greater['price'].median()
transit_mode = transit_greater['price'].mode()

print('median:', fe_exp_median, 'average:', fe_exp_average)
print ()
print('mode')
print(fe_exp_mode)
print()
print(fe_exp.describe())


Used Vehicle Classification    42
year                           42
make                           42
model                          42
packages                       42
price                          42
mileage                        42
engine_type                    42
transmission_type              42
drivetrain                     42
color                          42
vehicle_type                   42
dtype: int64
Used Vehicle Classification    2
year                           2
make                           2
model                          2
packages                       2
price                          2
mileage                        2
engine_type                    2
transmission_type              2
drivetrain                     2
color                          2
vehicle_type                   2
dtype: int64
median: 53310.0 average: 60209.232558139534

mode
0    53310
1    69995
Name: price, dtype: int32

              price        mileage
count     86.000000      86.000000
m

#### Notes: 

This is not a good outcome. The transit van has 42 for sale and only 2 have a price. This is not a good sample size to use. I think the best choice for this one is to completely drop the transit van from the dataset. Since pricing is not available originally  using only 2 will not be a good representation. I am okay with dropping this vehicle because it is a utility van and not a common vehicle and my purpose is mostly for regular vehicles and not commercial use.  

In [664]:
#dropping transit 
kbb_cleaned = kbb_cleaned[kbb_cleaned['model'] != 'Transit']

#checking to make sure it dropped
kbb_cleaned[kbb_cleaned['model'] == "Transit"]

Unnamed: 0,Used Vehicle Classification,year,make,model,packages,price,mileage,engine_type,transmission_type,drivetrain,color,vehicle_type


In [668]:
ford_cars

Unnamed: 0,Used Vehicle Classification,year,make,model,packages,price,mileage,engine_type,transmission_type,drivetrain,color,vehicle_type
9690,Used,2020,Ford,Escape,SE,21500,36828,3-Cylinder Turbo,Automatic,FWD,Gray,SUV
9691,Used,2020,Ford,Fusion,Titanium,19686,62183,4-Cylinder Turbo,Automatic,FWD,White,Sedan
9692,Used,2022,Ford,Mustang,Mach-E Selec,33767,22300,Electric,Automatic,RWD,Gray,Coupe
9693,Used,2021,Ford,F150,XLT,35859,44435,8-Cylinder,Automatic,4WD,White,Pick up
9694,Used,2019,Ford,F150,XLT w/ Equipment Group 302A Luxury,34800,37203,6-Cylinder Turbo,Automatic,4WD,Gray,Pick up
9695,Used,2021,Ford,Mustang,Mach 1,49851,1318,8-Cylinder,Manual,RWD,Orange,Coupe
9696,Used,2019,Ford,F250,Laria,55972,62147,8-Cylinder Turbo,Automatic,4WD,Black,Pick up
9697,Used,2019,Ford,E-450,and Econoline 450 Super Duty w/ Power Windows ...,35990,57057,10-Cylinder,Automatic,RWD,White,Van
9698,Used,2020,Ford,Expedition,Max Platinum,53310,29207,6-Cylinder Turbo,Automatic,4WD,Black,SUV
9699,Used,2017,Ford,F150,XLT,20931,129857,8-Cylinder,Automatic,RWD,Silver,Pick up


In [713]:
#extracting Maverick models
maverick = ford_cars[ford_cars['model'] == 'Maverick'] 

print(maverick.count())

#extracting all above 0 
maverick_greater = maverick[maverick['price'] > 0]

print(maverick_greater.count())

#looking at mean median mode std and quartile ranges
maverick_average = maverick_greater['price'].mean()
maverick_median = maverick_greater['price'].median()
maverick_mode = maverick_greater['price'].mode()

print('median:', maverick_median, 'average:', maverick_average)
print ()
print('mode')
print(maverick_mode)
print()
print(maverick_greater.describe())

Used Vehicle Classification    40
year                           40
make                           40
model                          40
packages                       40
price                          40
mileage                        40
engine_type                    40
transmission_type              40
drivetrain                     40
color                          40
vehicle_type                   40
dtype: int64
Used Vehicle Classification    0
year                           0
make                           0
model                          0
packages                       0
price                          0
mileage                        0
engine_type                    0
transmission_type              0
drivetrain                     0
color                          0
vehicle_type                   0
dtype: int64
median: nan average: nan

mode
Series([], Name: price, dtype: int32)

       price  mileage
count    0.0      0.0
mean     NaN      NaN
std      NaN      NaN
min      NaN

#### Note:

It seems the webscrape picked up the same model Maverick 40 times with no value. This also will be dropped. 

In [673]:
kbb_cleaned = kbb_cleaned[kbb_cleaned['model'] != 'Maverick']

kbb_cleaned[kbb_cleaned['model'] == "Maverick"]

Unnamed: 0,Used Vehicle Classification,year,make,model,packages,price,mileage,engine_type,transmission_type,drivetrain,color,vehicle_type


In [679]:
#extracting Expedition models
edge = ford_cars[ford_cars['model'] == 'Edge'] 

print(edge.count())
print(edge)

#extracting all above 0 
edge_greater = edge[edge['price'] > 0].count()

print(edge_greater)

Used Vehicle Classification    49
year                           49
make                           49
model                          49
packages                       49
price                          49
mileage                        49
engine_type                    49
transmission_type              49
drivetrain                     49
color                          49
vehicle_type                   49
dtype: int64
      Used Vehicle Classification  year  make model  \
9715                         Used  2019  Ford  Edge   
9724                         Used  2022  Ford  Edge   
9742                         Used  2019  Ford  Edge   
9769                         Used  2019  Ford  Edge   
9778                         Used  2020  Ford  Edge   
9796                         Used  2019  Ford  Edge   
9823                         Used  2019  Ford  Edge   
9850                         Used  2019  Ford  Edge   
9877                         Used  2019  Ford  Edge   
9904                         

#### Notes: 

I have 49 Ford Edges and 9 only with a value the other 40 again are duplicates. 
Going to drop the duplicates and pull the estimated value for the 10th edge. With the mileage and trim it looks like it averages around 22,000 dollars. Ill update the 0 models with the value of 22,000 and than drop the duplicates. It will limit my size to 10 vehicles but its better than having 49 with 40 of them being 22,000. 


In [691]:
#changing price 
kbb_cleaned.loc[(kbb_cleaned['model'] == 'Edge') & (kbb_cleaned['price'] == 0), 'price'] = 22000

#dropping duplicates
kbb_cleaned.loc[(kbb_cleaned['model'] == 'Edge') & (kbb_cleaned['price'] == 22000)] = kbb_cleaned.loc[(kbb_cleaned['model'] == 'Edge') & (kbb_cleaned['price'] == 22000)].drop_duplicates()


In [693]:
kbb_cleaned.loc[(kbb_cleaned['model'] == 'Edge')].count()

Used Vehicle Classification    10
year                           10
make                           10
model                          10
packages                       10
price                          10
mileage                        10
engine_type                    10
transmission_type              10
drivetrain                     10
color                          10
vehicle_type                   10
dtype: int64

In [726]:
#checking duplicates again 

# exploring price at 0 by manufacturer
kbb_cleaned[kbb_cleaned['price'] == 0].groupby('make').count()

Unnamed: 0_level_0,Used Vehicle Classification,year,model,packages,price,mileage,engine_type,transmission_type,drivetrain,color,vehicle_type
make,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Acura,1,1,1,1,1,1,1,1,1,1,1
Audi,1,1,1,1,1,1,1,1,1,1,1
BMW,1,1,1,1,1,1,1,1,1,1,1
Buick,1,1,1,1,1,1,1,1,1,1,1
Chrysler,40,40,40,40,40,40,40,40,40,40,40
Dodge,40,40,40,40,40,40,40,40,40,40,40
Ford,1,1,1,1,1,1,1,1,1,1,1
GMC,41,41,41,41,41,41,41,41,41,41,41
Honda,2,2,2,2,2,2,2,2,2,2,2
Hyundai,1,1,1,1,1,1,1,1,1,1,1


In [727]:
#extracting just chrysler
chry_cars = kbb_cleaned[kbb_cleaned['make']== 'Chrysler']

print(chry_cars.count())
#seperating out the 0 values
chry_cars_zero = chry_cars[chry_cars['price'] == 0]

#viewing the missing amounts in each type
chry_cars_zero['model'].value_counts()

Used Vehicle Classification    1080
year                           1080
make                           1080
model                          1080
packages                       1080
price                          1080
mileage                        1080
engine_type                    1080
transmission_type              1080
drivetrain                     1080
color                          1080
vehicle_type                   1080
dtype: int64


Pacifica    40
Name: model, dtype: int64

#### Notes: 

Im going to continue to go through the ones with 40+ missing values to clean the dataset further

In [728]:
#extracting chrysler 300 models with 0 price 
threehundred = chry_cars_zero[chry_cars_zero['model'] == '300'] 

print(threehundred.count())

#extracting all above 0 
threehundred_greater = chry_cars[(chry_cars['model'] == '300') & (chry_cars['price'] > 0)]

print(threehundred_greater.count())

#looking at mean median mode std and quartile ranges
threehundred_average = threehundred_greater['price'].mean()
threehundred_median = threehundred_greater['price'].median()
threehundred_mode = threehundred_greater['price'].mode()

print('median:', threehundred_median)
print ()
print('mode')
print(threehundred_mode)
print()
print(threehundred_greater.describe())

Used Vehicle Classification    0
year                           0
make                           0
model                          0
packages                       0
price                          0
mileage                        0
engine_type                    0
transmission_type              0
drivetrain                     0
color                          0
vehicle_type                   0
dtype: int64
Used Vehicle Classification    270
year                           270
make                           270
model                          270
packages                       270
price                          270
mileage                        270
engine_type                    270
transmission_type              270
drivetrain                     270
color                          270
vehicle_type                   270
dtype: int64
median: 25523.5

mode
0    17899.0
1    21863.0
2    23772.0
3    25732.0
4    27993.0
5    53310.0
Name: price, dtype: float64

              price        mi

In [729]:
#updating chrysler 300 with the median 

#updating with median 
kbb_cleaned.loc[(kbb_cleaned['make'] == 'Chrysler') & (kbb_cleaned['model'] == '300') & (kbb_cleaned['price'] == 0), 'price'] = 53310 

#checking to make sure it updated. 
kbb_cleaned[(kbb_cleaned['make'] == 'Chrysler') & (kbb_cleaned['model'] == '300') & (kbb_cleaned['price'] > 0)].count()

Used Vehicle Classification    270
year                           270
make                           270
model                          270
packages                       270
price                          270
mileage                        270
engine_type                    270
transmission_type              270
drivetrain                     270
color                          270
vehicle_type                   270
dtype: int64

In [731]:
#extracting chrysler pacifica models with 0 price 
threehundred = chry_cars_zero[chry_cars_zero['model'] == 'Pacifica'] 

print(threehundred.count())

#extracting all above 0 
threehundred_greater = chry_cars[(chry_cars['model'] == 'Pacifica') & (chry_cars['price'] > 0)]

print(threehundred_greater.count())

#looking at mean median mode std and quartile ranges
threehundred_average = threehundred_greater['price'].mean()
threehundred_median = threehundred_greater['price'].median()
threehundred_mode = threehundred_greater['price'].mode()

print('median:', threehundred_median)
print ()
print('mode')
print(threehundred_mode)
print()
print(threehundred_greater.describe())

Used Vehicle Classification    40
year                           40
make                           40
model                          40
packages                       40
price                          40
mileage                        40
engine_type                    40
transmission_type              40
drivetrain                     40
color                          40
vehicle_type                   40
dtype: int64
Used Vehicle Classification    673
year                           673
make                           673
model                          673
packages                       673
price                          673
mileage                        673
engine_type                    673
transmission_type              673
drivetrain                     673
color                          673
vehicle_type                   673
dtype: int64
median: 24868.0

mode
0     16565.0
1     17878.0
2     18352.0
3     19800.0
4     19998.0
5     20525.0
6     21990.0
7     24498.0
8     24868.

In [733]:
#updating chrysler Pacifica with the median 

#updating with median 
kbb_cleaned.loc[(kbb_cleaned['make'] == 'Chrysler') & (kbb_cleaned['model'] == 'Pacifica') & (kbb_cleaned['price'] == 0), 'price'] = 24868 

#checking to make sure it updated. 
kbb_cleaned[(kbb_cleaned['make'] == 'Chrysler') & (kbb_cleaned['model'] == 'Pacifica') & (kbb_cleaned['price'] > 0)].count()

Used Vehicle Classification    713
year                           713
make                           713
model                          713
packages                       713
price                          713
mileage                        713
engine_type                    713
transmission_type              713
drivetrain                     713
color                          713
vehicle_type                   713
dtype: int64

In [735]:
#extracting just Dodge
dodge_cars = kbb_cleaned[kbb_cleaned['make']== 'Dodge']

print(dodge_cars.count())
#seperating out the 0 values
dodge_cars_zero = dodge_cars[dodge_cars['price'] == 0]

#viewing the missing amounts in each type
dodge_cars_zero['model'].value_counts()

Used Vehicle Classification    1080
year                           1080
make                           1080
model                          1080
packages                       1080
price                          1080
mileage                        1080
engine_type                    1080
transmission_type              1080
drivetrain                     1080
color                          1080
vehicle_type                   1080
dtype: int64


Charger    40
Name: model, dtype: int64

In [739]:
#extracting all above 0 
dodge_greater = dodge_cars[(dodge_cars['model'] == 'Charger') & (dodge_cars['price'] > 0)]

print(dodge_greater.count())

#looking at mean median mode std and quartile ranges
dodge_median = dodge_greater['price'].median()
dodge_mode = dodge_greater['price'].mode()

print()
print('Median:', dodge_median)
print ()
print('Mode')
print(dodge_mode)
print()
print(dodge_greater.describe())

Used Vehicle Classification    365
year                           365
make                           365
model                          365
packages                       365
price                          365
mileage                        365
engine_type                    365
transmission_type              365
drivetrain                     365
color                          365
vehicle_type                   365
dtype: int64

Median: 34594.0

Mode
0    23995.0
Name: price, dtype: float64

              price       mileage
count    365.000000    365.000000
mean   37927.635616  11777.915068
std    13059.938825  12552.136766
min    21627.000000    120.000000
25%    28793.000000   4871.000000
50%    34594.000000   7499.000000
75%    52895.000000  16853.000000
max    98990.000000  85290.000000


In [740]:
#updating dodge charger with the median 

kbb_cleaned.loc[(kbb_cleaned['make'] == 'Dodge') & (kbb_cleaned['model'] == 'Charger') & (kbb_cleaned['price'] == 0), 'price'] = 34594 

#checking to make sure it updated. 
kbb_cleaned[(kbb_cleaned['make'] == 'Dodge') & (kbb_cleaned['model'] == 'Charger') & (kbb_cleaned['price'] > 0)].count()

Used Vehicle Classification    405
year                           405
make                           405
model                          405
packages                       405
price                          405
mileage                        405
engine_type                    405
transmission_type              405
drivetrain                     405
color                          405
vehicle_type                   405
dtype: int64

In [742]:
#extracting just GMC
gmc_cars = kbb_cleaned[kbb_cleaned['make']== 'GMC']

print(gmc_cars.count())

#seperating out the 0 values
gmc_cars_zero = gmc_cars[gmc_cars['price'] == 0]

#viewing the missing amounts in each type
gmc_cars_zero['model'].value_counts()

Used Vehicle Classification    1080
year                           1080
make                           1080
model                          1080
packages                       1080
price                          1080
mileage                        1080
engine_type                    1080
transmission_type              1080
drivetrain                     1080
color                          1080
vehicle_type                   1080
dtype: int64


Sierra    41
Name: model, dtype: int64

In [744]:
#extracting all Sierra above 0 
gmc_greater = gmc_cars[(gmc_cars['model'] == 'Sierra') & (gmc_cars['price'] > 0)]

print(gmc_greater.count())

#looking at mean median mode std and quartile ranges
gmc_median = gmc_greater['price'].median()
gmc_mode = gmc_greater['price'].mode()

print()
print('Median:', gmc_median)
print ()
print('Mode')
print(gmc_mode)
print()
print(gmc_greater.describe())

Used Vehicle Classification    446
year                           446
make                           446
model                          446
packages                       446
price                          446
mileage                        446
engine_type                    446
transmission_type              446
drivetrain                     446
color                          446
vehicle_type                   446
dtype: int64

Median: 46534.0

Mode
0    21500.0
1    35563.0
2    41500.0
3    42500.0
4    45294.0
5    47774.0
6    49500.0
7    52405.0
8    69872.0
9    77777.0
Name: price, dtype: float64

              price        mileage
count    446.000000     446.000000
mean   48220.715247   51771.639013
std    14970.510675   26752.587135
min    21500.000000    3091.000000
25%    41500.000000   34882.000000
50%    46534.000000   54104.000000
75%    52405.000000   77916.000000
max    88495.000000  136300.000000


In [746]:
#updating gmc sierra with the median 

kbb_cleaned.loc[(kbb_cleaned['make'] == 'GMC') & (kbb_cleaned['model'] == 'Sierra') & (kbb_cleaned['price'] == 0), 'price'] = 46534 

#checking to make sure it updated. 
kbb_cleaned[(kbb_cleaned['make'] == 'GMC') & (kbb_cleaned['model'] == 'Sierra') & (kbb_cleaned['price'] > 0)].count()

Used Vehicle Classification    487
year                           487
make                           487
model                          487
packages                       487
price                          487
mileage                        487
engine_type                    487
transmission_type              487
drivetrain                     487
color                          487
vehicle_type                   487
dtype: int64

In [748]:
#extracting just Jeep
jeep_cars = kbb_cleaned[kbb_cleaned['make']== 'Jeep']

print(jeep_cars.count())

#seperating out the 0 values
jeep_cars_zero = jeep_cars[jeep_cars['price'] == 0]

#viewing the missing amounts in each type
jeep_cars_zero['model'].value_counts()

Used Vehicle Classification    1080
year                           1080
make                           1080
model                          1080
packages                       1080
price                          1080
mileage                        1080
engine_type                    1080
transmission_type              1080
drivetrain                     1080
color                          1080
vehicle_type                   1080
dtype: int64


Compass     40
Cherokee     1
Name: model, dtype: int64

In [753]:
#extracting all compass above 0 
jeep_greater = jeep_cars[(jeep_cars['model'] == 'Compass') & (jeep_cars['price'] > 0)]

print(jeep_greater.count())

#looking at mean median mode std and quartile ranges
jeep_median = jeep_greater['price'].median()
jeep_mode = jeep_greater['price'].mode()

print()
print('Median:', jeep_median)
print ()
print('Mode')
print(jeep_mode)
print()
print(jeep_greater.describe())

Used Vehicle Classification    89
year                           89
make                           89
model                          89
packages                       89
price                          89
mileage                        89
engine_type                    89
transmission_type              89
drivetrain                     89
color                          89
vehicle_type                   89
dtype: int64

Median: 22578.0

Mode
0    19481.0
1    35394.0
Name: price, dtype: float64

              price       mileage
count     89.000000     89.000000
mean   26971.561798  38307.179775
std     7756.446060  31986.137861
min    18491.000000   4566.000000
25%    19481.000000   4566.000000
50%    22578.000000  54082.000000
75%    35394.000000  69495.000000
max    35394.000000  72433.000000


In [755]:
#updating jeep compass with the median 

kbb_cleaned.loc[(kbb_cleaned['make'] == 'Jeep') & (kbb_cleaned['model'] == 'Compass') & (kbb_cleaned['price'] == 0), 'price'] = 22578

#checking to make sure it updated. 
kbb_cleaned[(kbb_cleaned['make'] == 'Jeep') & (kbb_cleaned['model'] == 'Compass') & (kbb_cleaned['price'] > 0)].count()

Used Vehicle Classification    129
year                           129
make                           129
model                          129
packages                       129
price                          129
mileage                        129
engine_type                    129
transmission_type              129
drivetrain                     129
color                          129
vehicle_type                   129
dtype: int64

In [757]:
#extracting just Lincoln
lincoln_cars = kbb_cleaned[kbb_cleaned['make']== 'Lincoln']

print(lincoln_cars.count())

#seperating out the 0 values
lincoln_cars_zero = lincoln_cars[lincoln_cars['price'] == 0]

#viewing the missing amounts in each type
lincoln_cars_zero['model'].value_counts()

Used Vehicle Classification    1080
year                           1080
make                           1080
model                          1080
packages                       1080
price                          1080
mileage                        1080
engine_type                    1080
transmission_type              1080
drivetrain                     1080
color                          1080
vehicle_type                   1080
dtype: int64


Corsair    40
Name: model, dtype: int64

In [760]:
#extracting all corsair above 0 
lincoln_greater = lincoln_cars[(lincoln_cars['model'] == 'Corsair') & (lincoln_cars['price'] > 0)]

print(lincoln_greater.count())

#looking at mean median mode std and quartile ranges
lincoln_median = lincoln_greater['price'].median()
lincoln_mode = lincoln_greater['price'].mode()

print()
print('Median:', lincoln_median)
print ()
print('Mode')
print(lincoln_mode)
print()
print(lincoln_greater.describe())

Used Vehicle Classification    103
year                           103
make                           103
model                          103
packages                       103
price                          103
mileage                        103
engine_type                    103
transmission_type              103
drivetrain                     103
color                          103
vehicle_type                   103
dtype: int64

Median: 28985.0

Mode
0    28097.0
1    28985.0
Name: price, dtype: float64

              price       mileage
count    103.000000    103.000000
mean   29732.378641  33119.242718
std     3288.545501   8420.188489
min    26000.000000    499.000000
25%    28097.000000  34089.000000
50%    28985.000000  34089.000000
75%    28985.000000  35185.000000
max    46500.000000  78571.000000


In [762]:
#updating lincoln corsair with the median 

kbb_cleaned.loc[(kbb_cleaned['make'] == 'Lincoln') & (kbb_cleaned['model'] == 'Corsair') & (kbb_cleaned['price'] == 0), 'price'] = 28985

#checking to make sure it updated. 
kbb_cleaned[(kbb_cleaned['make'] == 'Lincoln') & (kbb_cleaned['model'] == 'Corsair') & (kbb_cleaned['price'] > 0)].count()

Used Vehicle Classification    143
year                           143
make                           143
model                          143
packages                       143
price                          143
mileage                        143
engine_type                    143
transmission_type              143
drivetrain                     143
color                          143
vehicle_type                   143
dtype: int64

In [764]:
#checking remaining duplicates again 

# exploring price at 0 by manufacturer
kbb_cleaned[kbb_cleaned['price'] == 0].groupby('make').count()

#the small counts will be left alone this they should not impact the average value as much. I will continue with the 4 last models  

Unnamed: 0_level_0,Used Vehicle Classification,year,model,packages,price,mileage,engine_type,transmission_type,drivetrain,color,vehicle_type
make,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Acura,1,1,1,1,1,1,1,1,1,1,1
Audi,1,1,1,1,1,1,1,1,1,1,1
BMW,1,1,1,1,1,1,1,1,1,1,1
Buick,1,1,1,1,1,1,1,1,1,1,1
Ford,1,1,1,1,1,1,1,1,1,1,1
Honda,2,2,2,2,2,2,2,2,2,2,2
Hyundai,1,1,1,1,1,1,1,1,1,1,1
INFINITI,1,1,1,1,1,1,1,1,1,1,1
Jeep,1,1,1,1,1,1,1,1,1,1,1
Kia,1,1,1,1,1,1,1,1,1,1,1


In [766]:
#extracting just Porsche
porsche_cars = kbb_cleaned[kbb_cleaned['make']== 'Porsche']

print(porsche_cars.count())

#seperating out the 0 values
porsche_cars_zero = porsche_cars[porsche_cars['price'] == 0]

#viewing the missing amounts in each type
porsche_cars_zero['model'].value_counts()

Used Vehicle Classification    1080
year                           1080
make                           1080
model                          1080
packages                       1080
price                          1080
mileage                        1080
engine_type                    1080
transmission_type              1080
drivetrain                     1078
color                          1080
vehicle_type                   1080
dtype: int64


Macan      40
Cayenne    40
Taycan      1
Name: model, dtype: int64

In [768]:
#extracting all Macan above 0 
porsche_greater = porsche_cars[(porsche_cars['model'] == 'Macan') & (porsche_cars['price'] > 0)]

print(porsche_greater.count())

#looking at mean median mode std and quartile ranges
porsche_median = porsche_greater['price'].median()
porsche_mode = porsche_greater['price'].mode()

print()
print('Median:', porsche_median)
print ()
print('Mode')
print(porsche_mode)
print()
print(porsche_greater.describe())

Used Vehicle Classification    519
year                           519
make                           519
model                          519
packages                       519
price                          519
mileage                        519
engine_type                    519
transmission_type              519
drivetrain                     519
color                          519
vehicle_type                   519
dtype: int64

Median: 62295.0

Mode
0     30980.0
1     33000.0
2     40997.0
3     54995.0
4     61738.0
5     62295.0
6     62772.0
7     63294.0
8     63500.0
9     66439.0
10    69993.0
11    99000.0
Name: price, dtype: float64

              price       mileage
count    519.000000    519.000000
mean   58549.358382  16234.495183
std    17203.465558  22046.039613
min    29499.000000   2427.000000
25%    41519.000000   4290.000000
50%    62295.000000   5916.000000
75%    63500.000000  19547.000000
max    99000.000000  83190.000000


In [770]:
#updating macan with the median 

kbb_cleaned.loc[(kbb_cleaned['make'] == 'Porsche') & (kbb_cleaned['model'] == 'Macan') & (kbb_cleaned['price'] == 0), 'price'] = 62295

#checking to make sure it updated. 
kbb_cleaned[(kbb_cleaned['make'] == 'Porsche') & (kbb_cleaned['model'] == 'Macan') & (kbb_cleaned['price'] > 0)].count()

Used Vehicle Classification    559
year                           559
make                           559
model                          559
packages                       559
price                          559
mileage                        559
engine_type                    559
transmission_type              559
drivetrain                     559
color                          559
vehicle_type                   559
dtype: int64

In [772]:
#extracting all Cayenne above 0 
cayenne_greater = porsche_cars[(porsche_cars['model'] == 'Cayenne') & (porsche_cars['price'] > 0)]

print(cayenne_greater.count())

#looking at mean median mode std and quartile ranges
cayenne_median = cayenne_greater['price'].median()
cayenne_mode = cayenne_greater['price'].mode()

print()
print('Median:', cayenne_median)
print ()
print('Mode')
print(cayenne_mode)
print()
print(cayenne_greater.describe())

Used Vehicle Classification    89
year                           89
make                           89
model                          89
packages                       89
price                          89
mileage                        89
engine_type                    89
transmission_type              89
drivetrain                     89
color                          89
vehicle_type                   89
dtype: int64

Median: 52796.0

Mode
0    46190.0
1    83110.0
Name: price, dtype: float64

              price       mileage
count     89.000000     89.000000
mean   64108.741573  26804.044944
std    18222.513596  21493.643289
min    44430.000000   4712.000000
25%    46190.000000   4712.000000
50%    52796.000000  46472.000000
75%    83110.000000  46472.000000
max    83110.000000  65707.000000


In [774]:
#updating macan with the median 

kbb_cleaned.loc[(kbb_cleaned['make'] == 'Porsche') & (kbb_cleaned['model'] == 'Cayenne') & (kbb_cleaned['price'] == 0), 'price'] = 52796

#checking to make sure it updated. 
kbb_cleaned[(kbb_cleaned['make'] == 'Porsche') & (kbb_cleaned['model'] == 'Cayenne') & (kbb_cleaned['price'] > 0)].count()

Used Vehicle Classification    129
year                           129
make                           129
model                          129
packages                       129
price                          129
mileage                        129
engine_type                    129
transmission_type              129
drivetrain                     129
color                          129
vehicle_type                   129
dtype: int64

In [777]:
#extracting just Ram
ram_cars = kbb_cleaned[kbb_cleaned['make']== 'RAM']

print(ram_cars.count())

#seperating out the 0 values
ram_cars_zero = ram_cars[ram_cars['price'] == 0]

#viewing the missing amounts in each type
ram_cars_zero['model'].value_counts()

Used Vehicle Classification    1080
year                           1080
make                           1080
model                          1080
packages                       1080
price                          1080
mileage                        1080
engine_type                    1080
transmission_type              1080
drivetrain                     1080
color                          1080
vehicle_type                   1080
dtype: int64


Tradesman    40
Laram        40
Name: model, dtype: int64

In [779]:
#extracting all Tradesman above 0 
ram_greater = ram_cars[(ram_cars['model'] == 'Tradesman') & (ram_cars['price'] > 0)]

print(ram_greater.count())

#looking at mean median mode std and quartile ranges
ram_median = ram_greater['price'].median()
ram_mode = ram_greater['price'].mode()

print()
print('Median:', ram_median)
print ()
print('Mode')
print(ram_mode)
print()
print(ram_greater.describe())

Used Vehicle Classification    92
year                           92
make                           92
model                          92
packages                       92
price                          92
mileage                        92
engine_type                    92
transmission_type              92
drivetrain                     92
color                          92
vehicle_type                   92
dtype: int64

Median: 39894.0

Mode
0    29995.0
1    39993.0
Name: price, dtype: float64

              price        mileage
count     92.000000      92.000000
mean   35531.380435   39836.130435
std     5576.407499   15875.265675
min    28417.000000    3532.000000
25%    29995.000000   29942.000000
50%    39894.000000   42119.500000
75%    39993.000000   46926.000000
max    53985.000000  107440.000000


In [782]:
#updating Tradesman with the median 

kbb_cleaned.loc[(kbb_cleaned['make'] == 'RAM') & (kbb_cleaned['model'] == 'Tradesman') & (kbb_cleaned['price'] == 0), 'price'] = 39894

#checking to make sure it updated. 
kbb_cleaned[(kbb_cleaned['make'] == 'RAM') & (kbb_cleaned['model'] == 'Tradesman') & (kbb_cleaned['price'] > 0)].count()

Used Vehicle Classification    132
year                           132
make                           132
model                          132
packages                       132
price                          132
mileage                        132
engine_type                    132
transmission_type              132
drivetrain                     132
color                          132
vehicle_type                   132
dtype: int64

In [784]:
#extracting all Laram above 0 
laram_greater = ram_cars[(ram_cars['model'] == 'Laram') & (ram_cars['price'] > 0)]

print(laram_greater.count())

#looking at mean median mode std and quartile ranges
laram_median = laram_greater['price'].median()
laram_mode = laram_greater['price'].mode()

print()
print('Median:', laram_median)
print ()
print('Mode')
print(laram_mode)
print()
print(laram_greater.describe())

Used Vehicle Classification    94
year                           94
make                           94
model                          94
packages                       94
price                          94
mileage                        94
engine_type                    94
transmission_type              94
drivetrain                     94
color                          94
vehicle_type                   94
dtype: int64

Median: 58990.0

Mode
0    58990.0
1    65986.0
Name: price, dtype: float64

              price        mileage
count     94.000000      94.000000
mean   60186.127660   44767.329787
std     7342.139717   23651.206415
min    35999.000000       9.000000
25%    58990.000000   22960.000000
50%    58990.000000   46209.500000
75%    65986.000000   65127.000000
max    67113.000000  110423.000000


In [786]:
#updating Laram with the median 

kbb_cleaned.loc[(kbb_cleaned['make'] == 'RAM') & (kbb_cleaned['model'] == 'Laram') & (kbb_cleaned['price'] == 0), 'price'] = 58990

#checking to make sure it updated. 
kbb_cleaned[(kbb_cleaned['make'] == 'RAM') & (kbb_cleaned['model'] == 'Laram') & (kbb_cleaned['price'] > 0)].count()

Used Vehicle Classification    134
year                           134
make                           134
model                          134
packages                       134
price                          134
mileage                        134
engine_type                    134
transmission_type              134
drivetrain                     134
color                          134
vehicle_type                   134
dtype: int64

In [789]:
#extracting just Tesla
tesla_cars = kbb_cleaned[kbb_cleaned['make']== 'Tesla']

print(tesla_cars.count())

#seperating out the 0 values
tesla_cars_zero = tesla_cars[tesla_cars['price'] == 0]

#viewing the missing amounts in each type
tesla_cars_zero['model'].value_counts()

Used Vehicle Classification    1080
year                           1080
make                           1080
model                          1080
packages                       1080
price                          1080
mileage                        1080
engine_type                    1080
transmission_type              1080
drivetrain                     1080
color                          1080
vehicle_type                   1080
dtype: int64


ModelY    41
Name: model, dtype: int64

In [791]:
#extracting all model Y  above 0 
tesla_greater = tesla_cars[(tesla_cars['model'] == 'ModelY') & (tesla_cars['price'] > 0)]

print(tesla_greater.count())

#looking at mean median mode std and quartile ranges
tesla_median = tesla_greater['price'].median()
tesla_mode = tesla_greater['price'].mode()

print()
print('Median:', tesla_median)
print ()
print('Mode')
print(tesla_mode)
print()
print(tesla_greater.describe())


Used Vehicle Classification    231
year                           231
make                           231
model                          231
packages                       231
price                          231
mileage                        231
engine_type                    231
transmission_type              231
drivetrain                     231
color                          231
vehicle_type                   231
dtype: int64

Median: 41999.0

Mode
0    45999.0
Name: price, dtype: float64

              price       mileage
count    231.000000    231.000000
mean   42939.826840  36406.584416
std     3229.496097  14279.657290
min    37504.000000   5158.000000
25%    41000.000000  30590.000000
50%    41999.000000  36249.000000
75%    45999.000000  38743.000000
max    54599.000000  62727.000000


In [795]:
#updating ModelY with the median 

kbb_cleaned.loc[(kbb_cleaned['make'] == 'Tesla') & (kbb_cleaned['model'] == 'ModelY') & (kbb_cleaned['price'] == 0), 'price'] = 41999

#checking to make sure it updated. 
kbb_cleaned[(kbb_cleaned['make'] == 'Tesla') & (kbb_cleaned['model'] == 'ModelY') & (kbb_cleaned['price'] > 0)].count()


Used Vehicle Classification    272
year                           272
make                           272
model                          272
packages                       272
price                          272
mileage                        272
engine_type                    272
transmission_type              272
drivetrain                     272
color                          272
vehicle_type                   272
dtype: int64

In [797]:
#extracting just Volkswagen
vw_cars = kbb_cleaned[kbb_cleaned['make']== 'Volkswagen']

print(vw_cars.count())

#seperating out the 0 values
vw_cars_zero = vw_cars[vw_cars['price'] == 0]

#viewing the missing amounts in each type
vw_cars_zero['model'].value_counts()


Used Vehicle Classification    1080
year                           1080
make                           1080
model                          1080
packages                       1080
price                          1080
mileage                        1080
engine_type                    1080
transmission_type              1080
drivetrain                     1080
color                          1080
vehicle_type                   1080
dtype: int64


Jetta     40
Tiguan    40
Passat     1
Name: model, dtype: int64

In [799]:
#extracting all Jetta above 0 
jetta_greater = vw_cars[(vw_cars['model'] == 'Jetta') & (vw_cars['price'] > 0)]

print(jetta_greater.count())

#looking at mean median mode std and quartile ranges
jetta_median = jetta_greater['price'].median()
jetta_mode = jetta_greater['price'].mode()

print()
print('Median:', jetta_median)
print ()
print('Mode')
print(jetta_mode)
print()
print(jetta_greater.describe())


Used Vehicle Classification    27
year                           27
make                           27
model                          27
packages                       27
price                          27
mileage                        27
engine_type                    27
transmission_type              27
drivetrain                     27
color                          27
vehicle_type                   27
dtype: int64

Median: 20000.0

Mode
0    19991.0
Name: price, dtype: float64

              price       mileage
count     27.000000     27.000000
mean   20718.259259  42445.185185
std     3304.466378  25034.774718
min    15599.000000   1431.000000
25%    18939.000000  20321.000000
50%    20000.000000  41567.000000
75%    22424.500000  62538.000000
max    29914.000000  88753.000000


#### Notes: 

The jetta has 67 vehicles in the data set 40 of which have no price. 27 do have a price. 
If I add any value to the jetta this will alter the values significantly. The mode is 19991 and the median is 20,000. the average is 20718. The Standaard deviation is about 3,300 which is not too bad of a range. Lets see the mileage on the missing vehicles. 


In [803]:
jetta_zero = vw_cars_zero[vw_cars_zero['model'] == 'Jetta']
print(jetta_zero)
#looking at it its the same vehicle so I will update the value and drop the duplicates. the median is about the average of with the cars with similar mileage. they range between 19795 and 20,300

      Used Vehicle Classification  year        make  model packages  price  \
34501                   Certified  2017  Volkswagen  Jetta       SE    0.0   
34528                   Certified  2017  Volkswagen  Jetta       SE    0.0   
34555                   Certified  2017  Volkswagen  Jetta       SE    0.0   
34582                   Certified  2017  Volkswagen  Jetta       SE    0.0   
34609                   Certified  2017  Volkswagen  Jetta       SE    0.0   
34636                   Certified  2017  Volkswagen  Jetta       SE    0.0   
34663                   Certified  2017  Volkswagen  Jetta       SE    0.0   
34690                   Certified  2017  Volkswagen  Jetta       SE    0.0   
34717                   Certified  2017  Volkswagen  Jetta       SE    0.0   
34744                   Certified  2017  Volkswagen  Jetta       SE    0.0   
34771                   Certified  2017  Volkswagen  Jetta       SE    0.0   
34798                   Certified  2017  Volkswagen  Jetta      

In [805]:
#updating Jetta with the median 

kbb_cleaned.loc[(kbb_cleaned['make'] == 'Volkswagen') & (kbb_cleaned['model'] == 'Jetta') & (kbb_cleaned['price'] == 0), 'price'] = 20000

#dropping the duplicates
kbb_cleaned.loc[(kbb_cleaned['model'] == 'Jetta') & (kbb_cleaned['price'] == 20000)] = kbb_cleaned.loc[(kbb_cleaned['model'] == 'Jetta') & (kbb_cleaned['price'] == 20000)].drop_duplicates()

#checking to make sure it updated. 
kbb_cleaned[(kbb_cleaned['make'] == 'Volkswagen') & (kbb_cleaned['model'] == 'Jetta') & (kbb_cleaned['price'] > 0)].count()

Used Vehicle Classification    28
year                           28
make                           28
model                          28
packages                       28
price                          28
mileage                        28
engine_type                    28
transmission_type              28
drivetrain                     28
color                          28
vehicle_type                   28
dtype: int64

In [808]:
#extracting all Tiguan above 0 
vw_greater = vw_cars[(vw_cars['model'] == 'Tiguan') & (vw_cars['price'] > 0)]

print(vw_greater.count())

#looking at mean median mode std and quartile ranges
vw_median = vw_greater['price'].median()
vw_mode = vw_greater['price'].mode()

print()
print('Median:', vw_median)
print ()
print('Mode')
print(vw_mode)
print()
print(vw_greater.describe())

Used Vehicle Classification    472
year                           472
make                           472
model                          472
packages                       472
price                          472
mileage                        472
engine_type                    472
transmission_type              472
drivetrain                     472
color                          472
vehicle_type                   472
dtype: int64

Median: 22883.0

Mode
0     17768.0
1     21056.0
2     21715.0
3     21810.0
4     22345.0
5     22883.0
6     24989.0
7     27987.0
8     30157.0
9     32986.0
10    35093.0
Name: price, dtype: float64

              price       mileage
count    472.000000    472.000000
mean   25256.423729  32515.105932
std     5199.315576  24023.319817
min    16195.000000   3928.000000
25%    21715.000000   9952.000000
50%    22883.000000  24208.000000
75%    30157.000000  65594.000000
max    35093.000000  98907.000000


In [810]:
#updating Tiguan with the median 

kbb_cleaned.loc[(kbb_cleaned['make'] == 'Volkswagen') & (kbb_cleaned['model'] == 'Tiguan') & (kbb_cleaned['price'] == 0), 'price'] = 22883

#checking to make sure it updated. 
kbb_cleaned[(kbb_cleaned['make'] == 'Volkswagen') & (kbb_cleaned['model'] == 'Tiguan') & (kbb_cleaned['price'] > 0)].count()

Used Vehicle Classification    512
year                           512
make                           512
model                          512
packages                       512
price                          512
mileage                        512
engine_type                    512
transmission_type              512
drivetrain                     512
color                          512
vehicle_type                   512
dtype: int64

In [813]:
kbb_cleaned.to_csv('kbb_cleaned.csv')

### Notes: 

#### Now the data is cleaned and in good shape. I will use power bi to build a dashboard for the cleaned dataset. Before I go ahead with the project in Power Bi, I really want to recapture the Ford Maverick since it was dropped from the dataset. I feel like missing this model will be doing a diservice. I am not really concerned about the Transit van since its generally a work van.  I also noticed I did not pull any Volvo vehicles in my original scrape. So i would like to bring volvo in as well. 

#### This will result in 2 datasets that I will need to merge together. 

In [6]:
# Base URL for the search results
base_url = "https://www.kbb.com/cars-for-sale/used/ford/maverick/safety-harbor-fl/?endYear=2023&isNewSearch=true&marketExtension=include&numRecords=24&searchRadius=0&sortBy=relevance&startYear=2017&zip=34695&page="

# Number of pages to scrape
num_pages = 2

# Create empty lists to store the scraped data
car_details_list = []

# Loop through each page
for page_number in range(1, num_pages + 1):
    # Construct the URL for the current page
    url = f"{base_url}{page_number}"

    # Send a GET request to the URL
    response = requests.get(url)
    if response.status_code == 200:
        # Parse the HTML content
        html_content = response.content
        soup = BeautifulSoup(html_content, 'html.parser')
        
        # Find and process car listings on this page
        car_listings = soup.find_all('script', {'data-cmp': 'lstgSchema'})
        
            # Process each car listing


        for car_listing in car_listings:
            car_data = json.loads(car_listing.contents[0])
            
            car_details = {
                "name": car_data.get("name", ""),
                "price": car_data.get("offers", {}).get("price", ""),
                "mileage": car_data.get("mileageFromOdometer", {}).get("value", ""),
                "engine_type": car_data.get("vehicleEngine", ""),
                "transmission_type": car_data.get("vehicleTransmission", ""),
                "color": car_data.get("color", ""),
                "drivewheel_configuration": car_data.get("driveWheelConfiguration", "")
            }
            
            car_details_list.append(car_details)

        print(f"Finished scraping page {page_number}")
    else:
        print(f"Failed to retrieve page {page_number}")
    
    # Add a 10-second delay before making the next request
    time.sleep(10)

print("Scraping complete!")

df_maverick = pd.DataFrame(car_details_list)
df_maverick.to_csv("maverick_csv.csv")


Finished scraping page 1
Finished scraping page 2
Scraping complete!


### Note: 

The webscrape picked up 4 cars with no price. Checking the website they say call dealer for price. Since I have a large sample of 53 i will just drop these 4. Before I clean I will webscrape volvo and adjust my time interval for sleep. I will randomize it between 10 and 15 seconds and I will adjust my scrape to skip 0 values. 

In [818]:
import random

# Base URL for the search results
base_url = "https://www.kbb.com/cars-for-sale/used/volvo/safety-harbor-fl/?endYear=2023&isNewSearch=true&marketExtension=include&numRecords=24&searchRadius=0&sortBy=relevance&startYear=2017&zip=34695&page="

# Number of pages to scrape
num_pages = 40

# Create an empty list to store the scraped data
car_details_list = []

# Loop through each page
for page_number in range(1, num_pages + 1):
    # Construct the URL for the current page
    url = f"{base_url}{page_number}"

    # Send a GET request to the URL
    response = requests.get(url)
    if response.status_code == 200:
        # Parse the HTML content
        html_content = response.content
        soup = BeautifulSoup(html_content, 'html.parser')

        # Find and process car listings on this page
        car_listings = soup.find_all('script', {'data-cmp': 'lstgSchema'})

        # Process each car listing
        for car_listing in car_listings:
            car_data = json.loads(car_listing.contents[0])

            # Check if the "price" field is present and not empty
            if "offers" in car_data and "price" in car_data["offers"] and car_data["offers"]["price"]:
                car_details = {
                    "name": car_data.get("name", ""),
                    "price": car_data["offers"]["price"],
                    "mileage": car_data.get("mileageFromOdometer", {}).get("value", ""),
                    "engine_type": car_data.get("vehicleEngine", ""),
                    "transmission_type": car_data.get("vehicleTransmission", ""),
                    "color": car_data.get("color", ""),
                    "drivewheel_configuration": car_data.get("driveWheelConfiguration", "")
                }

                car_details_list.append(car_details)

        print(f"Finished scraping page {page_number}")
    else:
        print(f"Failed to retrieve page {page_number}")

    # Add a random sleep interval between 10 and 15 seconds
    sleep_interval = random.uniform(10, 15)
    print(f"Sleeping for {sleep_interval:.2f} seconds...")
    time.sleep(sleep_interval)

print("Scraping complete!")

df_volvo = pd.DataFrame(car_details_list)
df_volvo.to_csv("volvo_csv.csv")


Finished scraping page 1
Sleeping for 10.47 seconds...
Finished scraping page 2
Sleeping for 14.09 seconds...
Finished scraping page 3
Sleeping for 14.91 seconds...
Finished scraping page 4
Sleeping for 10.30 seconds...
Finished scraping page 5
Sleeping for 13.55 seconds...
Finished scraping page 6
Sleeping for 11.07 seconds...
Finished scraping page 7
Sleeping for 14.08 seconds...
Finished scraping page 8
Sleeping for 11.37 seconds...
Finished scraping page 9
Sleeping for 10.29 seconds...
Finished scraping page 10
Sleeping for 11.71 seconds...
Finished scraping page 11
Sleeping for 10.00 seconds...
Finished scraping page 12
Sleeping for 14.32 seconds...
Finished scraping page 13
Sleeping for 11.43 seconds...
Finished scraping page 14
Sleeping for 11.47 seconds...
Finished scraping page 15
Sleeping for 14.99 seconds...
Finished scraping page 16
Sleeping for 10.09 seconds...
Finished scraping page 17
Sleeping for 11.73 seconds...
Finished scraping page 18
Sleeping for 12.02 seconds...
F

In [483]:
#had to restart workbook
df_volvo = pd.read_csv('volvo_csv.csv')
df_maverick = pd.read_csv('maverick_csv.csv')

In [484]:
df_volvo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1080 entries, 0 to 1079
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Unnamed: 0                1080 non-null   int64 
 1   name                      1080 non-null   object
 2   price                     1080 non-null   int64 
 3   mileage                   1080 non-null   object
 4   engine_type               1080 non-null   object
 5   transmission_type         1080 non-null   object
 6   color                     1038 non-null   object
 7   drivewheel_configuration  1080 non-null   object
dtypes: int64(2), object(6)
memory usage: 67.6+ KB


### Note: Cleaning and reshaping the maverick and volvo dataset to match the kbb_cleaned dataset. 

In [485]:
df_maverick.columns

Index(['Unnamed: 0', 'Used Vehicle Classification', 'year', 'make', 'model',
       'packages', 'price', 'mileage', 'engine_type', 'transmission_type',
       'drivetrain', 'color'],
      dtype='object')

In [486]:
df_maverick['transmission_type'].value_counts()

Automatic    50
Name: transmission_type, dtype: int64

In [487]:
#cleaning transmission
df_maverick['transmission_type'] = df_maverick['transmission_type'].replace({"8-Speed Automatic", "Continuously Variable Automatic"}, "Automatic")
df_maverick['transmission_type'].value_counts()

Automatic    50
Name: transmission_type, dtype: int64

In [488]:
df_maverick['color'].value_counts()

Blue       17
Black       9
Gray        9
Unknown     4
Silver      4
White       3
Red         2
Orange      2
Name: color, dtype: int64

In [489]:
#cleaning color column
df_maverick['color'] = df_maverick['color'].replace({"","Unavail"}, "Unknown")
df_maverick['color'].value_counts()

Blue       17
Black       9
Gray        9
Unknown     4
Silver      4
White       3
Red         2
Orange      2
Name: color, dtype: int64

In [490]:
#creating drivetrain 
replacement_dict = {
    "All wheel drive": "AWD",
    "All Wheel Drive": "AWD",
    "2 wheel drive - front": "FWD",
    "2 Wheel Drive - Front": "FWD"
}

df_maverick["drivetrain"] = df_maverick["drivewheel_configuration"].replace(replacement_dict)
df_maverick['drivetrain'].value_counts()

KeyError: 'drivewheel_configuration'

In [None]:
# cleaning mileage column 
df_maverick['mileage'] = df_maverick['mileage'].str.replace(",","")
df_maverick['mileage'] = df_maverick['mileage'].astype(int)

In [None]:
df_maverick['price'] = df_maverick['price'].astype(int)
df_maverick['price'][df_maverick['price'] == 0]

In [None]:
#dropping 0 value
dropping = [2,24,29,51]

df_maverick = df_maverick.drop(2)
df_maverick = df_maverick.drop(24)
df_maverick = df_maverick.drop(29)
df_maverick = df_maverick.drop(51)

df_maverick = df_maverick.reset_index(drop=True)

df_maverick.count()

In [None]:
#unnamed column wont drop with reset index 
df_maverick = df_maverick.drop(columns=["Unnamed: 0"])

In [None]:
#checking df 
df_maverick

In [None]:
#extracting vehicle classification 
df_maverick["Used Vehicle Classification"] = df_maverick["name"].str.extract('(Used|Certified)', expand=False)
df_maverick["name"] = df_maverick["name"].str.strip('(Used|Certified)')
df_maverick.head(15)


In [None]:
#extracting year from name 
df_maverick['year'] = df_maverick['name'].str.extract(r'(\d{4})')
df_maverick['name'] = df_maverick['name'].str.strip("")
df_maverick.head()


In [None]:
#removing year from name column 
df_maverick['name'] = df_maverick['name'].str.replace(r'(\d{4})', "")
df_maverick['name'] = df_maverick['name'].str.strip()
df_maverick.head()


In [None]:
#creating make and model column 
df_maverick['make'] = df_maverick['name'].str.split().str[0]
df_maverick['model'] = df_maverick['name'].str.split().str[1]

#stripping make and model and renaming column 
df_maverick['name'] = df_maverick['name'].str.split(n=2).str[2:].str.join(' ')

df_maverick = df_maverick.rename(columns={'name': 'packages'})

maverick_mapping = {"maverick": "Pick up"}

#mapping and checking vehicle type 
df_maverick['vehicle_type'] = df_maverick['model'].map(maverick_mapping)



In [None]:
#reorganizing the dataframe
df_maverick = df_maverick[['Used Vehicle Classification', 'year','make','model','packages', 'price', 'mileage', 'engine_type', 'transmission_type','drivetrain', 'color']]


In [None]:
df_maverick['color'] = df_maverick['color'].fillna('Unknown')

In [None]:
df_maverick

In [None]:
df_maverick.to_csv("maverick_csv.csv")

In [493]:
df_volvo.head()

Unnamed: 0.1,Unnamed: 0,name,price,mileage,engine_type,transmission_type,color,drivewheel_configuration
0,0,Used 2022 Volvo XC60 T8 R-Design,55497,15462,Hybrid,8-Speed Automatic,White,All Wheel Drive
1,1,Certified 2020 Volvo XC40 T5 Momentum,32968,27709,4-Cylinder Turbo,8-Speed Automatic,Black,All Wheel Drive
2,2,Used 2020 Volvo XC60 T6 Momentum,29990,57215,4-Cylinder Turbo,8-Speed Automatic,Black,All Wheel Drive
3,3,Used 2018 Volvo S90 T5 Momentum,27991,52303,4-Cylinder Turbo,8-Speed Automatic,Brown,All Wheel Drive
4,4,Used 2018 Volvo XC60 T5 Momentum w/ Vision Pac...,32999,36000,4-Cylinder Turbo,8-Speed Automatic,White,All Wheel Drive


In [495]:
#identifying the transmission types 
df_volvo['transmission_type'].value_counts()

8-Speed Automatic    624
Automatic            414
Single-Speed          42
Name: transmission_type, dtype: int64

In [496]:
#consolidating transmission types 
df_volvo['transmission_type'] = df_volvo['transmission_type'].replace({"8-Speed Automatic", "Single-Speed"}, "Automatic")
df_volvo['transmission_type'].value_counts()

Automatic    1080
Name: transmission_type, dtype: int64

In [497]:
#checking color column 
df_volvo['color'].value_counts(dropna=False)

White     444
Blue      210
Black     180
Gray       62
Silver     55
Red        46
NaN        42
Brown      40
Gold        1
Name: color, dtype: int64

In [498]:
#converting 42 Nan to unknown 
df_volvo['color'] = df_volvo['color'].fillna('Unknown')
df_volvo['color'].value_counts(dropna=False)

White      444
Blue       210
Black      180
Gray        62
Silver      55
Red         46
Unknown     42
Brown       40
Gold         1
Name: color, dtype: int64

In [499]:
df_volvo['drivewheel_configuration'].value_counts(dropna=False)

All Wheel Drive          563
All wheel drive          378
2 Wheel Drive - Front     85
2 wheel drive - front     54
Name: drivewheel_configuration, dtype: int64

In [500]:
#creating drivetrain 
replacement_dict = {
    "All wheel drive": "AWD",
    "All Wheel Drive": "AWD",
    "2 wheel drive - front": "FWD",
    "2 Wheel Drive - Front": "FWD"
}

df_volvo["drivetrain"] = df_volvo["drivewheel_configuration"].replace(replacement_dict)
df_volvo['drivetrain'].value_counts()

AWD    941
FWD    139
Name: drivetrain, dtype: int64

In [501]:
#removing comma and converting to integer 
df_volvo['mileage'] = df_volvo['mileage'].str.replace(',','')

df_volvo['mileage'] = df_volvo['mileage'].astype(int)

In [502]:
#checking column 
df_volvo['mileage'].head()

0    15462
1    27709
2    57215
3    52303
4    36000
Name: mileage, dtype: int32

In [503]:
#checking for nan values 
df_volvo['mileage'].isna().sum()

0

In [504]:
#checking if any Nan values 
df_volvo['price'].isna().sum()

0

In [505]:
#exploring engine type for consolidation 
df_volvo['engine_type'].value_counts()

4-Cylinder Turbo         898
Hybrid                    86
Electric                  42
Plug-in Hybrid            36
Gas / Electric Hybrid     18
Name: engine_type, dtype: int64

In [506]:
#replacing 1 hybrid type 
replacement_dict_engine = {
    "Gas / Electric Hybrid": "Hybrid"
  
}

df_volvo["engine_type"] = df_volvo["engine_type"].replace(replacement_dict_engine)


In [507]:
#extracting vehicle classification 
df_volvo["Used Vehicle Classification"] = df_volvo["name"].str.extract('(Used|Certified)', expand=False)
df_volvo["name"] = df_volvo["name"].str.strip('(Used|Certified)')
df_volvo.head(15)

Unnamed: 0.1,Unnamed: 0,name,price,mileage,engine_type,transmission_type,color,drivewheel_configuration,drivetrain,Used Vehicle Classification
0,0,2022 Volvo XC60 T8 R-Design,55497,15462,Hybrid,Automatic,White,All Wheel Drive,AWD,Used
1,1,2020 Volvo XC40 T5 Momentum,32968,27709,4-Cylinder Turbo,Automatic,Black,All Wheel Drive,AWD,Certified
2,2,2020 Volvo XC60 T6 Momentum,29990,57215,4-Cylinder Turbo,Automatic,Black,All Wheel Drive,AWD,Used
3,3,2018 Volvo S90 T5 Momentum,27991,52303,4-Cylinder Turbo,Automatic,Brown,All Wheel Drive,AWD,Used
4,4,2018 Volvo XC60 T5 Momentum w/ Vision Packag,32999,36000,4-Cylinder Turbo,Automatic,White,All Wheel Drive,AWD,Used
5,5,2022 Volvo XC60 B5 Momentum w/ Climate Packag,37990,11818,4-Cylinder Turbo,Automatic,White,All Wheel Drive,AWD,Used
6,6,2021 Volvo XC90 T8 Inscription Expression,48717,23783,Hybrid,Automatic,Black,All Wheel Drive,AWD,Used
7,7,2018 Volvo XC60 T6 Momentum w/ Convenience Pa...,26274,50729,4-Cylinder Turbo,Automatic,Gray,All Wheel Drive,AWD,Used
8,8,2019 Volvo XC40 T5 R-Design,26499,60384,4-Cylinder Turbo,Automatic,Red,All Wheel Drive,AWD,Used
9,9,2020 Volvo XC60 T5 Momentum,28999,27269,4-Cylinder Turbo,Automatic,White,2 Wheel Drive - Front,FWD,Used


In [508]:
#extracting year from name 
df_volvo['year'] = df_volvo['name'].str.extract(r'(\d{4})')
df_volvo['name'] = df_volvo['name'].str.strip("")
df_volvo.head()

Unnamed: 0.1,Unnamed: 0,name,price,mileage,engine_type,transmission_type,color,drivewheel_configuration,drivetrain,Used Vehicle Classification,year
0,0,2022 Volvo XC60 T8 R-Design,55497,15462,Hybrid,Automatic,White,All Wheel Drive,AWD,Used,2022
1,1,2020 Volvo XC40 T5 Momentum,32968,27709,4-Cylinder Turbo,Automatic,Black,All Wheel Drive,AWD,Certified,2020
2,2,2020 Volvo XC60 T6 Momentum,29990,57215,4-Cylinder Turbo,Automatic,Black,All Wheel Drive,AWD,Used,2020
3,3,2018 Volvo S90 T5 Momentum,27991,52303,4-Cylinder Turbo,Automatic,Brown,All Wheel Drive,AWD,Used,2018
4,4,2018 Volvo XC60 T5 Momentum w/ Vision Packag,32999,36000,4-Cylinder Turbo,Automatic,White,All Wheel Drive,AWD,Used,2018


In [509]:
#removing year from name column 
df_volvo['name'] = df_volvo['name'].str.replace(r'(\d{4})', "")
df_volvo['name'] = df_volvo['name'].str.strip()
df_volvo.head()


  df_volvo['name'] = df_volvo['name'].str.replace(r'(\d{4})', "")


Unnamed: 0.1,Unnamed: 0,name,price,mileage,engine_type,transmission_type,color,drivewheel_configuration,drivetrain,Used Vehicle Classification,year
0,0,Volvo XC60 T8 R-Design,55497,15462,Hybrid,Automatic,White,All Wheel Drive,AWD,Used,2022
1,1,Volvo XC40 T5 Momentum,32968,27709,4-Cylinder Turbo,Automatic,Black,All Wheel Drive,AWD,Certified,2020
2,2,Volvo XC60 T6 Momentum,29990,57215,4-Cylinder Turbo,Automatic,Black,All Wheel Drive,AWD,Used,2020
3,3,Volvo S90 T5 Momentum,27991,52303,4-Cylinder Turbo,Automatic,Brown,All Wheel Drive,AWD,Used,2018
4,4,Volvo XC60 T5 Momentum w/ Vision Packag,32999,36000,4-Cylinder Turbo,Automatic,White,All Wheel Drive,AWD,Used,2018


In [510]:
#creating make and model column 
df_volvo['make'] = df_volvo['name'].str.split().str[0]
df_volvo['model'] = df_volvo['name'].str.split().str[1]

#stripping make and model and renaming column 
df_volvo['name'] = df_volvo['name'].str.split(n=2).str[2:].str.join(' ')

df_volvo = df_volvo.rename(columns={'name': 'packages'})

In [511]:
df_volvo['model'].value_counts()

XC90    393
XC60    312
XC40    185
S90      52
S60      50
V60      46
C40      41
V90       1
Name: model, dtype: int64

In [512]:
volvo_mapping = {"XC90": "SUV", "XC60": "SUV", "XC40":"SUV", "S90": "Sedan", "S60": "Sedan", "V60":"Sedan", "C40": "SUV", "V90": "Sedan"}

#mapping and checking vehicle type 
df_maverick['vehicle_type'] = df_maverick['model'].map(volvo_mapping)

In [514]:
#reorganizing the dataframe
df_volvo = df_volvo[['Used Vehicle Classification', 'year','make','model','packages', 'price', 'mileage', 'engine_type', 'transmission_type','drivetrain', 'color']]

In [515]:
df_volvo.head()

Unnamed: 0,Used Vehicle Classification,year,make,model,packages,price,mileage,engine_type,transmission_type,drivetrain,color
0,Used,2022,Volvo,XC60,T8 R-Design,55497,15462,Hybrid,Automatic,AWD,White
1,Certified,2020,Volvo,XC40,T5 Momentum,32968,27709,4-Cylinder Turbo,Automatic,AWD,Black
2,Used,2020,Volvo,XC60,T6 Momentum,29990,57215,4-Cylinder Turbo,Automatic,AWD,Black
3,Used,2018,Volvo,S90,T5 Momentum,27991,52303,4-Cylinder Turbo,Automatic,AWD,Brown
4,Used,2018,Volvo,XC60,T5 Momentum w/ Vision Packag,32999,36000,4-Cylinder Turbo,Automatic,AWD,White


In [518]:
df_volvo.to_csv("volvo_csv.csv")

In [519]:
kbb_cleaned = pd.read_csv("kbb_cleaned.csv")

df_combined = [kbb_cleaned, df_maverick, df_volvo]

kbb_cleaned_final = pd.concat(df_combined)

In [521]:
kbb_cleaned_final['make'].value_counts()

Acura            1080
Jeep             1080
Toyota           1080
Tesla            1080
Subaru           1080
RAM              1080
Porsche          1080
Nissan           1080
MINI             1080
Mercedes-Benz    1080
MAZDA            1080
Maserati         1080
Lincoln          1080
Lexus            1080
Land Rover       1080
Alfa Romeo       1080
Kia              1080
Jaguar           1080
Dodge            1080
Audi             1080
BMW              1080
Chevrolet        1080
Chrysler         1080
INFINITI         1080
Volvo            1080
Genesis          1080
GMC              1080
Honda            1080
Hyundai          1080
Buick            1077
Cadillac         1053
Volkswagen       1041
Mitsubishi       1041
Ford             1009
Name: make, dtype: int64

In [522]:
kbb_ford_cleaned = kbb_cleaned_final[kbb_cleaned_final['make'] == 'Ford']

kbb_ford_cleaned['model'].value_counts()

F150          363
Expedition    166
Mustang       123
F250           85
Maverick       50
Explorer       50
Fusion         45
Ranger         44
F350           44
Escape         13
Edge           10
Bronco         10
EcoSport        2
E-450           1
Focus           1
Fiesta          1
Taurus          1
Name: model, dtype: int64

In [523]:
kbb_cleaned_final.to_csv("kbb_final_csv.csv")

In [None]:
kbb['make'].value_counts(dropna=False)
# Drop rows with NaN values in the "make" column
kbb_cleaned = kbb.dropna(subset=['make'])

In [None]:
kbb_cleaned['make'].value_counts(dropna=False)

In [None]:
kbb_cleaned.to_csv("kbb_final_csv.csv")