# Web Data Extraction / Web scraper - INFO7390 Assignment01

This project focuses on scraping data from the web. The website used for this exercise is www.autotrader.com, one of the largest American online marketplaces for car buyers and sellers.

Data has been scraped for multiple vehicle types, for example Sedan, SUV, Truck and Luxury. Beautiful Soup is used to parse the HTML content of the web pages and to extract the relevant data from it.



## Importing Dependencies

In [9]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from typing import List, Dict
from pandas import DataFrame

## Defining Constants

In [10]:
# Vehicle categories whose listings are scraped
all_vehicles = [
    'sedan',
    'truck',
    'suv-crossover',
    'coupe',
    'hatchback',
    'van-minivan',
    'convertible',
    'wagon',
    'awd-4wd',
    'luxury',
    'hybrid-electric',
    'commercial',
]
# Column names expected in the final dataframe, in output order
col_list = [
    'vehicle_name',
    'vehicle_type',
    'image_url',
    'listing',
    'description',
    'miles',
    'price',
    'price_comment',
    'owner',
    'owner_contact',
]

## Function Definition

In [11]:
def _text_or_none(element):
  """Return the whitespace-stripped text of a parsed element, or None when the lookup found nothing."""
  return element.get_text(strip=True) if element is not None else None


def extract_data(vehicle_list: List, data_dict: Dict, vehicle_type: str) -> Dict:
  """
  This function extracts the required information out of the scraped HTML content.
  Exactly one value is appended to every column for every listing card; a field
  that is missing from a card is recorded as None so that all columns stay the same length.

  vehicle_list: Parsed HTML elements (one per listing card) supporting find / find_all
  data_dict: Dictionary with the expected column names as keys and lists as values; it is updated in place
  vehicle_type: Type of vehicle the cards belong to, stored as-is in the 'vehicle_type' column
  returns: The same data_dict object, with the scraped values appended to its lists
  """
  # Looping through each listing card and fetching the relevant details like vehicle name, price etc.
  for vehicle in vehicle_list:
    # vehicle name
    data_dict['vehicle_name'].append(
        _text_or_none(vehicle.find('h3', class_='text-bold text-size-300 link-unstyled')))
    # vehicle_type
    data_dict['vehicle_type'].append(vehicle_type)
    # image url
    # NOTE(review): the notebook output shows far fewer image_url values than rows; lazily loaded
    # images may keep their URL in another attribute - confirm against the live markup.
    image_element = vehicle.find('img', class_='image-vertically-aligned')
    image_url = image_element.attrs.get('src') if image_element is not None else None
    data_dict['image_url'].append(image_url)
    # newly listed
    data_dict['listing'].append(
        _text_or_none(vehicle.find('span', class_='text-accent text-bold text-antialiased margin-right-2')))
    # description
    data_dict['description'].append(
        _text_or_none(vehicle.find('ul', class_='list list-inline display-inline margin-bottom-0 margin-top-2 text-size-200')))
    # miles: the mileage has no dedicated class, so take the first bold span whose text mentions "miles"
    bold_texts = (span.get_text(strip=True) for span in vehicle.find_all('span', class_='text-bold'))
    miles_text = next((text for text in bold_texts if 'miles' in text.lower()), None)
    data_dict['miles'].append(miles_text)
    # price
    data_dict['price'].append(
        _text_or_none(vehicle.find('span', class_='first-price text-ultra-bold')))
    # price comment
    data_dict['price_comment'].append(
        _text_or_none(vehicle.find('div', class_='ribbon-content-right')))
    # owner
    data_dict['owner'].append(
        _text_or_none(vehicle.find('div', class_='text-bold text-subdued')))
    # owner's contact: identified by its data-cmp attribute rather than by a class
    data_dict['owner_contact'].append(
        _text_or_none(vehicle.find(lambda tag: tag.name == 'span' and tag.get('data-cmp') == 'phoneNumber')))
  return data_dict

In [12]:
def search_and_extract(all_vehicles: List[str], col_list: List[str]) -> DataFrame:
  """
  This function requests the listing page of each vehicle type, scrapes the listing cards
  and stores the collected data in a dataframe.

  all_vehicles: List of all vehicle types to be scraped; each one is used as the last URL path segment
  col_list: List of column names expected in the final dataframe
  returns: Dataframe with the scraped information; it has the requested columns and no rows
           when all_vehicles is empty
  raises: Any requests exception (e.g. a timeout or connection error) propagates to the caller
  """
  import warnings  # local import so this cell does not depend on an extra top-level import

  # Building a dictionary with keys as column names and values as empty lists, filled while scraping
  vehicle_data = {col: [] for col in col_list}
  # Looping through every vehicle type to access its HTML content
  for vehicle in all_vehicles:
    url = f'https://www.autotrader.com/cars-for-sale/{vehicle}'
    # Sending the HTTP request; without a timeout an unresponsive server would block forever
    response = requests.get(url, timeout=30)
    if not response.ok:
      # An error page carries no listing cards, so report it instead of silently parsing it
      warnings.warn(f'Skipping {vehicle!r}: HTTP {response.status_code} returned by {url}')
      continue
    # Parsing HTML content
    soup = BeautifulSoup(response.text, 'html.parser')
    # Collecting all elements holding individual vehicle information
    vehicle_list = soup.find_all('div', class_='item-card row display-flex align-items-stretch flex-column')
    # Extracting relevant information from HTML content
    vehicle_data = extract_data(vehicle_list, vehicle_data, vehicle)
  # Building the dataframe once, after the loop, so it also exists when nothing was scraped
  return pd.DataFrame(vehicle_data)

## Function Calling

In [13]:
# Calling the function to scrape every vehicle type and collect the results in one dataframe
df_vehicle = search_and_extract(all_vehicles,col_list)

## Dataframe Row Count

In [14]:
# count() reports the number of non-null values per column, so columns with missing data show less than the row total
df_vehicle.count()

vehicle_name     336
vehicle_type     336
image_url         24
listing          245
description      336
miles            336
price            336
price_comment    127
owner            336
owner_contact    331
dtype: int64

## Dataframe Display

In [15]:
# Displaying the scraped dataframe
df_vehicle

Unnamed: 0,vehicle_name,vehicle_type,image_url,listing,description,miles,price,price_comment,owner,owner_contact
0,Used 2017 Honda Civic EX,sedan,https://images.autotrader.com/scaler/408/306/h...,,Compact Sedan31 City / 40 Highway,"67,572 miles",18888,GREAT PRICE,Victory Honda of San Bruno,(650) 515-3002
1,Used 2007 Toyota Camry LE,sedan,,,Midsize Sedan,"153,212 miles",7500,,Kearny Mesa Hyundai,1 (844) 567-1751
2,Used 2018 Chevrolet Malibu LT,sedan,,,Midsize Sedan27 City / 36 Highway,"57,481 miles",17999,GOOD PRICE,Drive Smart Auto Sales,(513) 713-0606
3,Used 2022 Mercedes-Benz S 580 4MATIC Sedan,sedan,https://images.autotrader.com/scaler/408/306/h...,Newly Listed,16 City / 25 Highway,"3,106 miles",109988,,Mall of Georgia MINI,(470) 655-0791
4,Used 2023 Dodge Charger SRT Hellcat,sedan,,Newly Listed,Fullsize Sedan12 City / 21 Highway,33 miles,109988,,Mall of Georgia MINI,(470) 655-0791
...,...,...,...,...,...,...,...,...,...,...
331,Used 2021 Cadillac Escalade ESV Sport Platinum,commercial,,Newly Listed,,"17,011 miles",103997,,Rick Hendrick Chrysler Dodge Jeep Ram Duluth,(470) 394-0338
332,Used 2024 GMC Sierra 2500 Denali,commercial,,Newly Listed,,"6,382 miles",97757,,Buick GMC Cadillac Fort Walton Beach,(850) 357-8915
333,Used 2024 BMW M4 Competition,commercial,,Newly Listed,,474 miles,96990,,Jenkins Chevrolet of Venice,(941) 584-4243
334,Used 2024 Chevrolet Silverado 3500 LTZ,commercial,,Newly Listed,,"6,706 miles",96888,,Rick Hendrick Chevrolet Buford,(470) 655-2409


## Storing the Data

In [16]:
# Storing the data in CSV format
# NOTE: to_csv also writes the dataframe index as an unnamed first column by default; pass index=False to omit it
df_vehicle.to_csv('vehicle_webscraped.csv')

## References

- https://tedboy.github.io/bs4_doc
- https://medium.com/ymedialabs-innovation/web-scraping-using-beautiful-soup-and-selenium-for-dynamic-page-2f8ad15efe25
