<a href="https://colab.research.google.com/github/miguel-peralta/cars_ista322/blob/main/cars.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cars Relational Databases
ISTA 322 Final Project, Spring 2024 <br>
Miguel Candido Aurora Peralta <br>
## Extract
### KBB Web Scraping



In [1]:
# Imports
import requests
from bs4 import BeautifulSoup
import pandas as pd

First we need to get a list of all of the URLs for the car models on KBB so that more information can be extracted from those pages.

In [2]:
def get_html_doc(url):
  '''Returns the HTML document for the given URL'''
  # requests HTML document for URL
  response = requests.get(url)
  # returns text
  return response.text

In [3]:
def get_kbb_df():
  '''
  Returns a dataframe containing the url, make, model, and year from the
  relative URLs listed on the car models list pages.
  Returns:
    car_info (DataFrame): make, model, year, and url of each model
  '''
  # Create dataframe and lists to store info
  car_info = pd.DataFrame()
  urls = []
  make = []
  model = []
  year = []
  base_url = 'https://www.kbb.com'

  for page in ['new', 'used']:
    url = f'https://www.kbb.com/car-make-model-list/{page}'
    # Create HTML object from url
    html = get_html_doc(url)
    soup = BeautifulSoup(html, 'html.parser')
    # Create list to store relative URLS from the page
    links = []
    # Get all links from the page (the links to models all have the same style)
    for link in soup.find_all('a', attrs={'style':"padding:12px 8px;display:inline-block"}):
      # Add links to the list
      links.append(link.get('href'))
    # Split links using / as delimeter and add information to lists
    for car in links:
      urls.append(base_url+car)
      link_split = car.split('/')
      make.append(link_split[1])
      model.append(link_split[2])
      year.append(link_split[3])

  # Use lists to populate dataframe
  car_info['url'] = urls
  car_info['make'] = make
  car_info['model'] = model
  car_info['year'] = year

  return car_info


In [4]:
kbb = get_kbb_df()

In [5]:
def get_styles_urls(url):
    '''
    Given the URL to a year's model of a car, returns a list of the urls to the
    styles of that model. If there is no style information available, returns a
    1-element list with just the model page URL.
    Args:
      url (string): url to a year's model of a car
    Returns:
      styles (list): list of urls for that model's styles
    '''
    # Create HTML object from url
    url = 'https://www.kbb.com/audi/a3/2022/'
    base_url = 'https://www.kbb.com/'
    html = get_html_doc(url)
    soup = BeautifulSoup(html, 'html.parser')
    styles = []
    # The elements containing the style links are always 220px wide
    for style in soup.find_all('a', attrs={'width': '220px'}):
      # Add links to the list
      styles.append(base_url+style.get('href'))
    if len(styles) < 1:
      styles.append(url)
    return styles

In [6]:
kbb['styles'] = kbb['url'].apply(lambda url: get_styles_urls(url))

In [7]:
kbb.loc[0, 'styles']

['https://www.kbb.com//audi/a3/2022/premium-plus-sedan-4d/',
 'https://www.kbb.com//audi/a3/2022/premium-sedan-4d/',
 'https://www.kbb.com//audi/a3/2022/prestige-sedan-4d/']

#### Create make table

In [8]:
make = pd.DataFrame(columns = ['make'])
make_list = sorted(kbb['make'].unique())
make['make'] = make_list
make = make.reset_index()
make = make.rename(columns={'index':'make_id'})

In [9]:
print(make.head())

   make_id      make
0        0     acura
1        1      audi
2        2   bentley
3        3       bmw
4        4  cadillac


#### Creating model table

In [10]:
# model_id, make_id, year, styles
model = pd.DataFrame(columns=['make', 'model_name', 'year', 'styles'])
model['make'] = kbb['make']
model['model_name'] = kbb['model']
model['year'] = kbb['year']
model['styles'] = kbb['styles']

In [11]:
model = pd.merge(model, make, on='make', how='left')
model = model.drop(columns=['make'])
model = model.reset_index()
model = model.rename(columns={'index':'model_id'})

In [38]:
styles = ['https://www.kbb.com/audi/a3/2022/premium-plus-sedan-4d/']
df = pd.DataFrame(columns = ['style_name', 'city_fuel_econ', 'hwy_fuel_econ',
                              'combo_fuel_econ', 'fueltype', 'seating', 'hp',
                              'cylinders', 'engine_size_l',
                              'cargo_space_cubic_ft'])
for style in styles:
  style_html = get_html_doc(style)
  # Find the div with direction="stacked"
  soup = BeautifulSoup(style_html, 'html.parser')

  # Find all div elements with direction='stacked'
  stacked_divs = soup.find_all('div')
  print(stacked_divs)
  # Iterate over each stacked div and extract text
  for div in stacked_divs:
      text = div.get_text()
      print(text)



[<div id="__next"><style data-emotion="css-global 1wl4cwt">body{font-family:'Open Sans',sans-serif,Tahoma,Arial;font-size:14px;margin:0;min-height:100%;}html{scroll-behavior:auto;}</style><style data-emotion="css-global dh2fmr">body{background:#1e3b6f;background:#fff;border:0;box-sizing:border-box;font-weight:normal;height:100%;letter-spacing:0;line-height:1.5;padding:0;position:relative;vertical-align:baseline;}body::before{background-color:#fff;bottom:0;content:'';display:block;left:50%;margin-left:-499px;max-width:100vw;position:absolute;top:0;width:998px;z-index:-1;}</style><style data-emotion="css-global fku50h">*,*::before,*::after{box-sizing:border-box;}html{-webkit-tap-highlight-color:rgba(0,92,176,.2);}body{font-family:Open Sans,sans-serif,Tahoma,Arial,sans-serif,Tahoma,Arial;font-size:14px;margin:0;min-height:100%;background-color:#fff;border:0;box-sizing:border-box;font-weight:normal;height:100%;letter-spacing:0;line-height:24px;padding:0;position:relative;vertical-align:bas

In [None]:
model.head()

In [59]:
test = model.loc[0, 'styles']

In [60]:
test[0].split('/')

['https:',
 '',
 'www.kbb.com',
 'audi',
 'a3',
 '2022',
 '',
 'audi',
 'a3',
 '2022',
 'premium-plus-sedan-4d',
 '']

In [None]:
styles = pd.DataFrame(columns=['model_id', 'style_name', 'city_fuel_econ',
                      'hwy_fuel_econ', 'combo_fuel_econ', 'fueltype_id',
                      'seating', 'hp', 'cylinders', 'engine_size_l',
                      'cargo_spce_cubic_ft'])
for i in model.index:


In [None]:
def create_style_table(kbb):
  styles = []
  for i in kbb.index:


In [None]:
def create_make_table(kbb):


In [None]:
make = create_make_table(kbb)

In [None]:
def create_models_table(kbb, make):
  models = pd.DataFrame(columns = ['make_id', 'year', 'styles'])
  for i in kbb.index:
    models = kbb

Next we need to retrieve the ends of the urls for the styles for each model of car. Some cars that are too new don't have any styles listed yet. In this case, we will just return an empty array as all of the information that would be contained on the individual style pages is already on the model page.