In [47]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [48]:
# Accumulators for the scraped listing fields: one list each for the car's
# name, its mileage and its price. The scraping cell appends to all three.
car_name, car_mileage, car_price = [], [], []

In [49]:
# Scrape the TrueCar used Dodge Charger search results (North Haven, CT, 50-mile radius)
# one results page at a time, appending each listing's name, mileage and price to
# car_name, car_mileage and car_price. A field that is missing from a listing card is
# recorded as 'n/a', so the three lists always stay the same length.

# Search URL without a page number; the page number is added per request below.
search_url = "https://www.truecar.com/used-cars-for-sale/listings/dodge/charger/location-north-haven-ct/?excludeExpandedDelivery=true&searchRadius=50"

# Browser-like User-Agent sent with every request.
request_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'}

# Result pages to fetch. range(1, 2) is page 1 only; raise the upper bound to scrape more pages.
pages_to_scrape = range(1, 2)


def _text_or_na(card, tag, attrs=None):
    """Return the text of the first `tag` element matching `attrs` inside `card`, or 'n/a' if none is found."""
    element = card.find(tag, attrs or {})
    if element is None:
        return 'n/a'
    return element.get_text()


for page_number in pages_to_scrape:
    # The page number must travel in its own query parameter. Appending it straight onto
    # the URL turned 'searchRadius=50' into 'searchRadius=501' and never selected a page.
    # NOTE(review): 'page' is the parameter TrueCar's pagination links use - confirm on the site.
    website_url = search_url + "&page=" + str(page_number)

    # Make the request; the timeout stops the cell from hanging on a stalled connection,
    # and raise_for_status() reports a blocked or failed request instead of silently
    # producing an empty table.
    web_request = requests.get(website_url, headers=request_headers, timeout=30)
    web_request.raise_for_status()

    # Parse the returned HTML
    soup = BeautifulSoup(web_request.content, 'html.parser')

    # One <div> with this class list per listing card on the page
    results_list = soup.find_all('div', {'class': 'flex w-full flex-col h-full'})

    # A separate loop variable keeps the page counter from being overwritten.
    for listing in results_list:
        car_name.append(_text_or_na(listing, 'h2'))
        car_mileage.append(_text_or_na(listing, 'div', {'data-test': 'vehicleMileage'}))
        car_price.append(_text_or_na(listing, 'span', {'data-test': 'vehicleCardPricingPrice'}))


In [50]:
# Assemble the three scraped lists into a single table, one column per list.
car_info = pd.DataFrame(dict(Name=car_name, Mileage=car_mileage, Price=car_price))

In [51]:
# Show the first rows of the freshly built DataFrame (head() with no argument)
car_info.head()

Unnamed: 0,Name,Mileage,Price
0,Used 2022 DodgeCharger SXT RWD,"58,269 miles","$14,088"
1,Used 2023 DodgeCharger GT RWD,"51,533 miles","$20,294"
2,Used 2020 DodgeCharger SXT RWD,"90,521 miles","$13,000"
3,Used 2016 DodgeCharger SE RWD,"26,400 miles","$14,995"
4,Used 2023 DodgeCharger GT RWD,"55,230 miles","$16,471"


In [52]:
# Clean the scraped Mileage and Price text so each cell holds only digits.
# 'n/a' placeholders contain none of the removed characters and pass through unchanged.
# Every replacement is a literal (regex=False) match, so '$' is not read as a regex anchor.

# Mileage: "58,269 miles" -> "58269"
car_info['Mileage'] = (
    car_info['Mileage']
    .str.replace('miles', '', regex=False)  # drop the unit
    .str.replace('k', '000', regex=False)   # expand a 'k' suffix, e.g. "12k" -> "12000"
    .str.replace(',', '', regex=False)      # drop the thousands separator (previously left in place)
    .str.strip()                            # drop the space that sat in front of 'miles'
)

# Price: "$14,088" -> "14088"
car_info['Price'] = (
    car_info['Price']
    .str.replace('$', '', regex=False)      # drop the currency symbol
    .str.replace(',', '', regex=False)      # drop the thousands separator
)

In [53]:
# Preview the first rows to check the cleaned Mileage and Price columns
car_info.head()

Unnamed: 0,Name,Mileage,Price
0,Used 2022 DodgeCharger SXT RWD,58269,14088
1,Used 2023 DodgeCharger GT RWD,51533,20294
2,Used 2020 DodgeCharger SXT RWD,90521,13000
3,Used 2016 DodgeCharger SE RWD,26400,14995
4,Used 2023 DodgeCharger GT RWD,55230,16471


In [54]:
# The 'Name' text packs several facts into one string. Break it into four pieces
# with a single regular expression:
#   (\w+)        stock type, e.g. "Used"
#   (\d{4})      four-digit year, e.g. "2022"
#   ([A-Za-z]+)  the run of letters after the year, e.g. "DodgeCharger"
#   (.*)         whatever remains, e.g. " SXT RWD"
name_pattern = r'(\w+)\s(\d{4})\s([A-Za-z]+)(.*)'
name_parts = car_info['Name'].str.extract(name_pattern)
car_info[['Stock type', 'Year', 'Brand', 'Rest']] = name_parts


In [55]:
# Preview the first rows to check the new Stock type, Year, Brand and Rest columns
car_info.head()

Unnamed: 0,Name,Mileage,Price,Stock type,Year,Brand,Rest
0,Used 2022 DodgeCharger SXT RWD,58269,14088,Used,2022,DodgeCharger,SXT RWD
1,Used 2023 DodgeCharger GT RWD,51533,20294,Used,2023,DodgeCharger,GT RWD
2,Used 2020 DodgeCharger SXT RWD,90521,13000,Used,2020,DodgeCharger,SXT RWD
3,Used 2016 DodgeCharger SE RWD,26400,14995,Used,2016,DodgeCharger,SE RWD
4,Used 2023 DodgeCharger GT RWD,55230,16471,Used,2023,DodgeCharger,GT RWD


In [56]:
# In this data set the 'Brand' column holds the make and model run together ("DodgeCharger").
# NOTE: your data may look different; if it does, do not run this cell. Inspect your data first.
#
# The pattern splits the string at its last capital letter:
#   ([A-Za-z]+)       make: the letters before that capital
#   ([A-Z][a-zA-Z]*)  model: that capital and the letters after it
make_model_pattern = r'([A-Za-z]+)([A-Z][a-zA-Z]*)'
make_model = car_info['Brand'].str.extract(make_model_pattern)
car_info[['Make', 'Model']] = make_model

car_info

Unnamed: 0,Name,Mileage,Price,Stock type,Year,Brand,Rest,Make,Model
0,Used 2022 DodgeCharger SXT RWD,58269,14088,Used,2022,DodgeCharger,SXT RWD,Dodge,Charger
1,Used 2023 DodgeCharger GT RWD,51533,20294,Used,2023,DodgeCharger,GT RWD,Dodge,Charger
2,Used 2020 DodgeCharger SXT RWD,90521,13000,Used,2020,DodgeCharger,SXT RWD,Dodge,Charger
3,Used 2016 DodgeCharger SE RWD,26400,14995,Used,2016,DodgeCharger,SE RWD,Dodge,Charger
4,Used 2023 DodgeCharger GT RWD,55230,16471,Used,2023,DodgeCharger,GT RWD,Dodge,Charger
5,Used 2023 DodgeCharger SXT RWD,137,26190,Used,2023,DodgeCharger,SXT RWD,Dodge,Charger
6,Used 2022 DodgeCharger SXT RWD,58269,14088,Used,2022,DodgeCharger,SXT RWD,Dodge,Charger
7,Used 2023 DodgeCharger GT RWD,56630,16804,Used,2023,DodgeCharger,GT RWD,Dodge,Charger
8,Used 2023 DodgeCharger GT RWD,50480,17741,Used,2023,DodgeCharger,GT RWD,Dodge,Charger
9,Used 2022 DodgeCharger SXT RWD,48223,17985,Used,2022,DodgeCharger,SXT RWD,Dodge,Charger


In [57]:
# Rows whose 'Brand' text could not be split (for example a listing whose name was
# recorded as 'n/a') have NaN in 'Make' and 'Model'. Every listing comes from a
# Dodge Charger search, so fill those gaps with that make and model. The previous
# 'Toyota' / 'Tacoma' fill values belonged to a different search and would have
# mislabelled these rows.
car_info = car_info.fillna({'Make': 'Dodge', 'Model': 'Charger'})

# Display the DataFrame
car_info.head()

Unnamed: 0,Name,Mileage,Price,Stock type,Year,Brand,Rest,Make,Model
0,Used 2022 DodgeCharger SXT RWD,58269,14088,Used,2022,DodgeCharger,SXT RWD,Dodge,Charger
1,Used 2023 DodgeCharger GT RWD,51533,20294,Used,2023,DodgeCharger,GT RWD,Dodge,Charger
2,Used 2020 DodgeCharger SXT RWD,90521,13000,Used,2020,DodgeCharger,SXT RWD,Dodge,Charger
3,Used 2016 DodgeCharger SE RWD,26400,14995,Used,2016,DodgeCharger,SE RWD,Dodge,Charger
4,Used 2023 DodgeCharger GT RWD,55230,16471,Used,2023,DodgeCharger,GT RWD,Dodge,Charger


In [58]:
# 'Name', 'Rest' and 'Brand' are no longer needed now that their contents live in
# separate columns, so remove all three in a single call.
car_info = car_info.drop(columns=["Name", "Rest", "Brand"])

In [59]:
# Display the whole DataFrame with the columns that remain after the drop
car_info

Unnamed: 0,Mileage,Price,Stock type,Year,Make,Model
0,58269,14088,Used,2022,Dodge,Charger
1,51533,20294,Used,2023,Dodge,Charger
2,90521,13000,Used,2020,Dodge,Charger
3,26400,14995,Used,2016,Dodge,Charger
4,55230,16471,Used,2023,Dodge,Charger
5,137,26190,Used,2023,Dodge,Charger
6,58269,14088,Used,2022,Dodge,Charger
7,56630,16804,Used,2023,Dodge,Charger
8,50480,17741,Used,2023,Dodge,Charger
9,48223,17985,Used,2022,Dodge,Charger


In [60]:
# Optionally write the DataFrame to an Excel workbook (path is relative to the
# working directory); index=False leaves the row index out of the file
car_info.to_excel('multiple_pages.xlsx', index = False)

In [61]:
# Optionally write the DataFrame to a CSV file (path is relative to the
# working directory); index=False leaves the row index out of the file
car_info.to_csv('multiple_pages.csv', index = False)