# Part 1

In [192]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

In [193]:
def get_city_state(state_city_txt):
    city = 'N/A'
    state = 'N/A'
    if 'call' in state_city_txt.lower():
        return 'N/A', 'N/A'
    city_state = re.sub(r'([^\s\w]|_)+', '', state_city_txt).split()
    if len(city_state) > 1:
        state = city_state[-1]
        city = ' '.join(city_state[:-1])
    else:
        city = city_state[0]
    
    return city, state

In [194]:
def handle_car_rows(car_rows):
    car_list = []
    for car in car_rows:
        single_car = {}
        if car.find('a', class_='result-title') != None:
            single_car['Url'] = car.find('a', class_='result-title')['href']
        else:
            single_car['Url'] = 'N/A'
        if car.find('a', class_='result-title') != None:
            single_car['Title'] = car.find('a', class_='result-title').get_text()
        else:
            single_car['Title'] = 'N/A'
        if car.find('span', class_='result-price') != None:
            single_car['Price'] = car.find('span', class_='result-price').get_text()[1:]
        else:
            single_car['Price'] = 'N/A'
        if car.find('span', class_='result-hood') != None:
            single_car['City'], single_car['State'] = get_city_state(car.find('span', class_='result-hood').get_text())
        else:
            single_car['City'], single_car['State'] = 'N/A', 'N/A'
        car_list.append(single_car)
    return car_list

In [195]:
page = requests.get("https://northmiss.craigslist.org/search/cta?s=0")
page.status_code

200

In [196]:
soup = BeautifulSoup(page.content, 'html.parser')

In [197]:
car_rows = soup.find_all('li', class_='result-row')

In [198]:
all_cars = handle_car_rows(car_rows)

In [199]:
car_df = pd.DataFrame.from_dict(all_cars)

In [200]:
car_df.to_csv('part_1.tsv', sep='\t')

# Part 2

In [201]:
s_val = 0
has_next = True
all_page_cars = []

In [202]:
while has_next:
    page = requests.get("https://northmiss.craigslist.org/search/cta?s=" + str(s_val))
    if page.status_code == 200:
        soup = BeautifulSoup(page.content, 'html.parser')
        car_rows = soup.find_all('li', class_='result-row')
        if len(car_rows) > 0:
            all_page_cars += handle_car_rows(car_rows)
            s_val += 120
        else:
            has_next = False
    else:
        has_next = False

In [203]:
all_page_cars_df = pd.DataFrame.from_dict(all_page_cars)
all_page_cars_df.to_csv('part_2.tsv', sep='\t')

# Part 3

In [242]:
all_car_infos = []

In [None]:
for index, value in all_page_cars_df.iterrows():
    car_url = value['Url']
    page_rq = requests.get(car_url)
    if page_rq.status_code == 200:
        each_car = {}
        each_car['Url'], each_car['Time'], each_car['Num_Image'], each_car['Description'], each_car['Year'], each_car['Make_Model'], each_car['Condition'], each_car['Cylinders'], each_car['Drive'], each_car['Fuel'], each_car['Odometer'], each_car['Paint_Color'], each_car['Size'], each_car['Title_Status'], each_car['Transmission'], each_car['VIN'] = ['N/A'] * 16
        soup = BeautifulSoup(page_rq.content, 'html.parser')
        each_car['Url'] = car_url
        if soup.find('time', class_='date timeago') != None:
            posted_date = soup.find('time', class_='date timeago').text
            each_car['Time'] = posted_date.strip()
        if soup.find('span', class_='slider-info') != None:
            each_car['Num_Image'] = int(soup.find('span', class_='slider-info').text.split()[-1])
        if soup.find('section', id='postingbody') != None:
            description = soup.find('section', id='postingbody').text
            each_car['Description'] = description.strip().replace("\n", "").replace("\t","").replace("QR Code Link to This Post","")
        attributes = soup.find_all('p', class_='attrgroup')
        if len(attributes) > 0:
            if attributes[0].find('span') != None:
                title_year = attributes[0].find('span').text.split()
                if len(title_year) > 0:
                    each_car['Year'] = int(title_year[0])
                    each_car['Make_Model'] = ' '.join(title_year[1:])
            if len(attributes) > 1:
                other_attrs = attributes[1].find_all('span')
                if len(other_attrs) > 0:
                    for att in other_attrs:
                        att_text = att.text
                        param_name = att_text.split(":")[0].title()
                        param_val = ' '.join(att_text.split(":")[1:])
                        if len(param_name.split()) > 0:
                            param_name = '_'.join(param_name.split())
                        if param_name.lower() == 'vin':
                            param_name = "VIN"
                        each_car[param_name] = param_val
        all_car_infos.append(each_car)

In [209]:
all_car_info_df = pd.DataFrame.from_dict(all_car_infos)

In [None]:
all_car_info_df.drop(['Cryptocurrency_Ok', 'Type'], axis=1)

In [210]:
all_car_info_df.to_csv('part_3.tsv', sep='\t', encoding='utf-8')

## Bonus

In [252]:
all_car_info_df = pd.read_csv('part_3.tsv', sep='\t', encoding='utf-8')

In [255]:
all_car_info_df = all_car_info_df.drop(['Unnamed: 0'], axis=1)

In [256]:
merged = pd.merge(all_page_cars_df, all_car_info_df, on='Url', how='outer')

In [257]:
merged.shape

(2306, 20)

In [217]:
merged.to_csv('part_3.tsv', sep='\t', encoding='utf-8')

(2296, 5)

In [218]:
all_car_info_df.shape

(2296, 19)

In [220]:
all_car_info_df.columns

Index(['Condition', 'Cryptocurrency_Ok', 'Cylinders', 'Description', 'Drive',
       'Fuel', 'Make_Model', 'Num_Image', 'Odometer', 'Paint_Color', 'Size',
       'Time', 'Title_Status', 'Transmission', 'Type', 'Url', 'VIN', 'Vin',
       'Year'],
      dtype='object')

In [None]:
Url, Time, Num_Image, Description, Year, Make_Model, Condition, Cylinders, Drive, Fuel, Odometer, Paint_Color, Size, Title_Status, Transmission, VIN