# **Data Science Project** (Web Scraping & Analysis from Cars.com)

- This project demonstrates data scraping, cleaning, and analysis of used car listings from Cars.com using Python. It is designed in Jupyter Lab and focuses on extracting real-world automotive data and generating insights using data science tools.

In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [None]:
# Cars.com search-results URL. The query string asks for stock_type=cpo,
# make mercedes_benz and maximum_distance=20, with the zip parameter left empty.
website = 'https://www.cars.com/shopping/results/?stock_type=cpo&makes%5B%5D=mercedes_benz&models%5B%5D=&list_price_max=&maximum_distance=20&zip='

In [None]:
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Retry policy: up to 3 connection retries with a backoff factor of 1.
retry = Retry(connect=3, backoff_factor=1)
adapter = HTTPAdapter(max_retries=retry)

# Session that routes both HTTPS and HTTP traffic through the retrying adapter.
session = requests.Session()
session.mount('https://', adapter)
session.mount('http://', adapter)

# Browser-like User-Agent header to pass along with requests.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

In [None]:
# Send the request to the website. `response` is reset first so that a failed
# request cannot leave an undefined (or stale, from an earlier run) response
# behind for the cells that follow.
response = None
try:
    # timeout=30 bounds both the connect phase and the read phase.
    response = session.get(website, headers=headers, timeout=30)
    print(response.status_code)  # Optional: status check
except requests.exceptions.Timeout:
    # Timeout is the common base class of ConnectTimeout and ReadTimeout, so a
    # slow connect is reported the same way as a slow read.
    print("Request timeout ho gaya.")


#response.status_code

200


In [None]:
# Parse the raw response body with the 'html.parser' backend.
soup = BeautifulSoup(response.content, 'html.parser')

In [None]:
# Collect every <div> whose CSS class is 'vehicle-card' (one per listing).
results = soup.find_all('div', class_='vehicle-card')

In [None]:
# Number of listing cards found on the page.
len(results)

22

In [None]:
# results[0]

In [None]:
# Name
# Mileage
# Dealer Name
# Rating
# Rating Count
# Price
# Location

In [None]:
# Name: text of the first <h2> in the first card (raw text keeps its surrounding whitespace).
results[0].find('h2').get_text()

'\n    2022 Mercedes-Benz GLS 450 4MATIC\n  '

In [None]:
# Rating: the `rating` attribute of the card's <spark-rating> element,
# or 'N/A' when the element or the attribute is missing.
(results[0].find('spark-rating') or {}).get('rating', 'N/A').strip()

'4.8'

In [None]:
# Review count: text of the <span> with class 'sds-rating__link', e.g. '(852 reviews)'.
results[0].find('span', {'class':'sds-rating__link'}).get_text()

'(852 reviews)'

In [None]:
# Price: text of the <span> with class 'primary-price' (raw text includes newlines and '$').
results[0].find('span', {'class':'primary-price'}).get_text()

'\n  $51,490\n'

In [None]:
# Dealer name: text of the <div> with class 'dealer-name', with outer whitespace stripped.
results[0].find('div', {'class':'dealer-name'}).get_text().strip()

'Mercedes-Benz of Kansas City South'

In [None]:
# Mileage: text of the <div> with class 'mileage', e.g. '70,498 mi.'.
results[0].find('div', {'class':'mileage'}).get_text()

'70,498 mi.'

In [None]:
# Location: stripped text of the <div> whose data-qa attribute is 'miles-from-user'.
results[0].find('div', {'data-qa': 'miles-from-user'}).get_text(strip=True)

'Kansas City, MO'

In [None]:
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# One list per output column. Every listing card appends exactly one value to
# each list, so the columns stay aligned row by row.
name = []
rating = []
review_count = []
price = []
dealer_name = []
mileage = []
location = []

# Build the retrying session once and reuse it for every page instead of
# recreating it on each iteration.
session = requests.Session()
retry = Retry(connect=3, backoff_factor=1)
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)
session.mount('https://', adapter)

# Headers to look like a browser
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
}


def _card_text(card, tag, attrs=None, strip=False, default='n/a'):
    """Return the text of the first element in ``card`` matching ``tag``/``attrs``.

    ``strip`` is forwarded to ``get_text``. ``default`` is returned when the
    card contains no matching element.
    """
    node = card.find(tag, attrs or {})
    if node is None:
        return default
    return node.get_text(strip=strip)


for i in range(1, 500):   # At most 499 pages; lower the upper bound for a shorter run

    # Results URL for page number i
    website = 'https://www.cars.com/shopping/results/?page='+ str(i) +'&page_size=20&dealer_id=&list_price_max=&list_price_min=&makes[]=mercedes_benz&maximum_distance=20&mileage_max=&sort=best_match_desc&stock_type=cpo&year_max=&year_min=&zip='

    # Actually fetch this page. Without this request the loop would keep
    # re-parsing whatever `response` was left over from an earlier cell and
    # produce the same listings on every iteration.
    try:
        response = session.get(website, headers=headers, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as exc:
        # Skip a page that cannot be fetched rather than aborting the run.
        print('Page', i, 'skipped:', exc)
        continue

    # soup object
    soup = BeautifulSoup(response.content, 'html.parser')

    # results
    results = soup.find_all('div', {'class' : 'vehicle-card'})

    # A page without any cards is treated as the end of the listings.
    # NOTE(review): assumes the site serves an empty page past the last one.
    if not results:
        break

    # loop through results
    for result in results:

        # name
        name.append(_card_text(result, 'h2'))

        # rating (stored as an attribute rather than as element text)
        rating_tag = result.find('spark-rating')
        if rating_tag is not None and rating_tag.has_attr('rating'):
            rating.append(rating_tag['rating'].strip())
        else:
            rating.append('n/a')

        # review_count
        review_count.append(_card_text(result, 'span', {'class':'sds-rating__link'}))

        # price
        price.append(_card_text(result, 'span', {'class':'primary-price'}))

        # dealer_name
        dealer_name.append(_card_text(result, 'div', {'class':'dealer-name'}).strip())

        # mileage
        mileage.append(_card_text(result, 'div', {'class':'mileage'}))

        # location
        location.append(_card_text(result, 'div', {'data-qa': 'miles-from-user'}, strip=True))

In [None]:
# Assemble the per-field lists into a DataFrame, one column per field,
# pairing each column label with its list in display order.
car_dealer = pd.DataFrame(dict(zip(
    ['Name', 'Rating', 'Review Count', '$ Price', 'Dealer Name', 'Mileage', 'Location'],
    [name, rating, review_count, price, dealer_name, mileage, location],
)))

In [None]:
# Display the scraped table before any cleaning.
car_dealer

Unnamed: 0,Name,Rating,Review Count,$ Price,Dealer Name,Mileage,Location
0,\n 2022 Mercedes-Benz GLS 450 4MATIC\n,4.8,(852 reviews),"\n $51,490\n",Mercedes-Benz of Kansas City South,"70,498 mi.","Kansas City, MO"
1,\n 2022 Mercedes-Benz E-Class 4MATIC\n,4.5,(454 reviews),"\n $47,479\n",Sullivan-Parkhill,"38,594 mi.","Champaign, IL"
2,\n 2023 Mercedes-Benz E-Class 4MATIC\n,4.8,"(5,063 reviews)","\n $57,000\n",Jackie Cooper Mercedes Benz,"11,254 mi.","Tulsa, OK"
3,\n 2023 Mercedes-Benz E-Class E 350\n,4.5,"(1,950 reviews)","\n $43,655\n",Mercedes-Benz of Houston North,"22,909 mi.","Houston, TX"
4,\n 2022 Mercedes-Benz S-Class 4MATIC\n,4.4,"(1,433 reviews)","\n $71,998\n",Mercedes-Benz of Pompano,"31,921 mi.","Pompano Beach, FL"
...,...,...,...,...,...,...,...
10973,\n 2023 Mercedes-Benz E-Class E 450 4MATIC\n,4.8,"(1,517 reviews)","\n $58,094\n",Mercedes-Benz of Littleton,"16,382 mi.","Littleton, CO"
10974,\n 2021 Mercedes-Benz S-Class S 560 4MATIC\n,4.6,"(1,071 reviews)","\n $83,988\n",RBM of Atlanta,"20,366 mi.","Atlanta, GA"
10975,\n 2021 Mercedes-Benz S-Class S 580 4MATIC\n,4.5,"(1,747 reviews)","\n $76,994\n",Mercedes-Benz Of Henderson,"17,742 mi.","Henderson, NV"
10976,\n 2020 Mercedes-Benz E-Class 4MATIC\n,5.0,"(6,418 reviews)","\n $36,999\n",RBM of Alpharetta,"15,051 mi.","Alpharetta, GA"


In [None]:
# Save the raw (uncleaned) table without the index column.
# Note: the save after the cleaning steps writes to this same file name and replaces this copy.
car_dealer.to_csv('a_car_data.csv', index=False)

In [None]:
# Data cleaning, then save to CSV

In [None]:
# Remove \n from the Name column
# Remove "reviews", parentheses and commas from the Review Count column
# Remove \n, "$" and commas from the $ Price column
# Remove "mi." and commas from the Mileage column
# Remove the comma from the Location column (optional)

In [None]:
# Clean the Name column: drop newline characters and also the indentation
# whitespace that surrounds the scraped title (removing '\n' alone leaves the
# leading and trailing spaces in place). Non-string values pass through unchanged.
car_dealer['Name'] = car_dealer['Name'].apply(
    lambda x: x.replace('\n', '').strip() if isinstance(x, str) else x
)

In [None]:
def _clean_review_count(value):
    """Reduce text like '(1,950 reviews)' to '1950'; pass non-strings through."""
    if not isinstance(value, str):
        return value
    digits = value.replace('reviews', '')
    for unwanted in ('(', ')', ','):
        digits = digits.replace(unwanted, '')
    return digits.strip()


car_dealer['Review Count'] = car_dealer['Review Count'].apply(_clean_review_count)

In [None]:
# Clean the $ Price column: drop newlines, thousands separators and the dollar
# sign. Non-string values (e.g. NaN) pass through unchanged instead of raising
# AttributeError, matching the guard used by the other column-cleaning cells.
car_dealer['$ Price'] = car_dealer['$ Price'].apply(
    lambda x: x.replace('\n', '').replace(',', '').replace('$', '').strip() if isinstance(x, str) else x
)

In [None]:
def _clean_mileage(value):
    """Reduce text like '70,498 mi.' to '70498'; pass non-strings through."""
    if isinstance(value, str):
        return value.replace('mi.', '').replace(',', '').strip()
    return value


car_dealer['Mileage'] = car_dealer['Mileage'].apply(_clean_mileage)

In [None]:
# Drop commas from Location (e.g. 'Tulsa, OK' -> 'Tulsa OK'); non-strings pass through.
car_dealer['Location'] = car_dealer['Location'].map(
    lambda place: place.replace(',', '') if isinstance(place, str) else place
)

In [None]:
# Display the table after the cleaning steps.
car_dealer

Unnamed: 0,Name,Rating,Review Count,$ Price,Dealer Name,Mileage,Location
0,2022 Mercedes-Benz GLS 450 4MATIC,4.8,852,51490,Mercedes-Benz of Kansas City South,70498,Kansas City MO
1,2022 Mercedes-Benz E-Class 4MATIC,4.5,454,47479,Sullivan-Parkhill,38594,Champaign IL
2,2023 Mercedes-Benz E-Class 4MATIC,4.8,5063,57000,Jackie Cooper Mercedes Benz,11254,Tulsa OK
3,2023 Mercedes-Benz E-Class E 350,4.5,1950,43655,Mercedes-Benz of Houston North,22909,Houston TX
4,2022 Mercedes-Benz S-Class 4MATIC,4.4,1433,71998,Mercedes-Benz of Pompano,31921,Pompano Beach FL
...,...,...,...,...,...,...,...
10973,2023 Mercedes-Benz E-Class E 450 4MATIC,4.8,1517,58094,Mercedes-Benz of Littleton,16382,Littleton CO
10974,2021 Mercedes-Benz S-Class S 560 4MATIC,4.6,1071,83988,RBM of Atlanta,20366,Atlanta GA
10975,2021 Mercedes-Benz S-Class S 580 4MATIC,4.5,1747,76994,Mercedes-Benz Of Henderson,17742,Henderson NV
10976,2020 Mercedes-Benz E-Class 4MATIC,5.0,6418,36999,RBM of Alpharetta,15051,Alpharetta GA


In [None]:
# Save the cleaned table without the index column (same file name as the earlier raw save).
car_dealer.to_csv('a_car_data.csv', index=False)

In [None]:
# Completion message marking the end of the notebook run.
print("Alhumdulillah Done (^_^)")

Alhumdulillah Done (^_^)
