In [92]:
from requests import get
import requests
from requests.packages.urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
import random
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np
import datetime as dt
import csv 
import psycopg2
import time
from sklearn.feature_extraction.text import TfidfVectorizer

from helper_funcs.urls_to_soup_objects import *
from helper_funcs.clean_three_or_more_prices import *
from helper_funcs.clean_two_prices import *
from helper_funcs.soup_objects_to_df import *
from helper_funcs.dropping_funcs import *

In [2]:
# Create a Session and Retry object to manage the quota Craigslist imposes on HTTP get requests within a certain time period 
session = requests.Session()
retry = Retry(connect=5, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)
session.mount('https://', adapter)

# Extracting Craigslist Data
## Get all state/region names

In [3]:
# Parse URL that contains all regions of Craigslist
all_sites_response = session.get('https://craigslist.org/about/sites')
all_sites_soup = BeautifulSoup(all_sites_response.text, 'html.parser')

# Extract part of webpage corresponding to regions in the US
us_sites = all_sites_soup.body.section.div.next_sibling.next_sibling.next_sibling.next_sibling.next_sibling.next_sibling

# Extract HTML tags corresponding to the state name and region
states_tags = us_sites.find_all('h4')
regions_tags = us_sites.find_all('ul')

In [4]:
# Create dictionary which maps a state_name (key) to a list of regions in that state (values)
state_to_region_dict = get_state_to_region_dict(states_tags, regions_tags)

## Get URL for each individual posting in a state/region combo

In [8]:
all_urls = process_and_get_urls(state_to_region_dict)

Process started at Sat Mar  5 17:30:05 2022


Total Progress:   0%|          | 0/52 [00:00<?, ?it/s]

Currently extracting URLs for 9 regions in Alabama:   0%|          | 0/9 [00:00<?, ?it/s]

Currently extracting URLs for 4 regions in Alaska:   0%|          | 0/4 [00:00<?, ?it/s]

Currently extracting URLs for 8 regions in Arizona:   0%|          | 0/8 [00:00<?, ?it/s]

Currently extracting URLs for 5 regions in Arkansas:   0%|          | 0/5 [00:00<?, ?it/s]

Currently extracting URLs for 28 regions in California:   0%|          | 0/28 [00:00<?, ?it/s]

Currently extracting URLs for 8 regions in Colorado:   0%|          | 0/8 [00:00<?, ?it/s]

Currently extracting URLs for 4 regions in Connecticut:   0%|          | 0/4 [00:00<?, ?it/s]

Currently extracting URLs for 1 regions in Delaware:   0%|          | 0/1 [00:00<?, ?it/s]

Currently extracting URLs for 1 regions in District of Columbia:   0%|          | 0/1 [00:00<?, ?it/s]

Currently extracting URLs for 24 regions in Florida:   0%|          | 0/24 [00:00<?, ?it/s]

Currently extracting URLs for 11 regions in Georgia:   0%|          | 0/11 [00:00<?, ?it/s]

Currently extracting URLs for 1 regions in Hawaii:   0%|          | 0/1 [00:00<?, ?it/s]

Currently extracting URLs for 4 regions in Idaho:   0%|          | 0/4 [00:00<?, ?it/s]

Currently extracting URLs for 11 regions in Illinois:   0%|          | 0/11 [00:00<?, ?it/s]

Currently extracting URLs for 10 regions in Indiana:   0%|          | 0/10 [00:00<?, ?it/s]

Currently extracting URLs for 11 regions in Iowa:   0%|          | 0/11 [00:00<?, ?it/s]

Currently extracting URLs for 8 regions in Kansas:   0%|          | 0/8 [00:00<?, ?it/s]

Currently extracting URLs for 6 regions in Kentucky:   0%|          | 0/6 [00:00<?, ?it/s]

Currently extracting URLs for 8 regions in Louisiana:   0%|          | 0/8 [00:00<?, ?it/s]

Currently extracting URLs for 1 regions in Maine:   0%|          | 0/1 [00:00<?, ?it/s]

Currently extracting URLs for 6 regions in Maryland:   0%|          | 0/6 [00:00<?, ?it/s]

Currently extracting URLs for 5 regions in Massachusetts:   0%|          | 0/5 [00:00<?, ?it/s]

Currently extracting URLs for 18 regions in Michigan:   0%|          | 0/18 [00:00<?, ?it/s]

Currently extracting URLs for 8 regions in Minnesota:   0%|          | 0/8 [00:00<?, ?it/s]

Currently extracting URLs for 6 regions in Mississippi:   0%|          | 0/6 [00:00<?, ?it/s]

Currently extracting URLs for 9 regions in Missouri:   0%|          | 0/9 [00:00<?, ?it/s]

Currently extracting URLs for 8 regions in Montana:   0%|          | 0/8 [00:00<?, ?it/s]

Currently extracting URLs for 5 regions in Nebraska:   0%|          | 0/5 [00:00<?, ?it/s]

Currently extracting URLs for 3 regions in Nevada:   0%|          | 0/3 [00:00<?, ?it/s]

Currently extracting URLs for 1 regions in New Hampshire:   0%|          | 0/1 [00:00<?, ?it/s]

Currently extracting URLs for 4 regions in New Jersey:   0%|          | 0/4 [00:00<?, ?it/s]

Currently extracting URLs for 6 regions in New Mexico:   0%|          | 0/6 [00:00<?, ?it/s]

Currently extracting URLs for 20 regions in New York:   0%|          | 0/20 [00:00<?, ?it/s]

Currently extracting URLs for 12 regions in North Carolina:   0%|          | 0/12 [00:00<?, ?it/s]

Currently extracting URLs for 4 regions in North Dakota:   0%|          | 0/4 [00:00<?, ?it/s]

Currently extracting URLs for 15 regions in Ohio:   0%|          | 0/15 [00:00<?, ?it/s]

Currently extracting URLs for 5 regions in Oklahoma:   0%|          | 0/5 [00:00<?, ?it/s]

Currently extracting URLs for 10 regions in Oregon:   0%|          | 0/10 [00:00<?, ?it/s]

Currently extracting URLs for 15 regions in Pennsylvania:   0%|          | 0/15 [00:00<?, ?it/s]

Currently extracting URLs for 1 regions in Rhode Island:   0%|          | 0/1 [00:00<?, ?it/s]

Currently extracting URLs for 6 regions in South Carolina:   0%|          | 0/6 [00:00<?, ?it/s]

Currently extracting URLs for 5 regions in South Dakota:   0%|          | 0/5 [00:00<?, ?it/s]

Currently extracting URLs for 8 regions in Tennessee:   0%|          | 0/8 [00:00<?, ?it/s]

Currently extracting URLs for 27 regions in Texas:   0%|          | 0/27 [00:00<?, ?it/s]

Currently extracting URLs for 5 regions in Utah:   0%|          | 0/5 [00:00<?, ?it/s]

Currently extracting URLs for 1 regions in Vermont:   0%|          | 0/1 [00:00<?, ?it/s]

Currently extracting URLs for 11 regions in Virginia:   0%|          | 0/11 [00:00<?, ?it/s]

Currently extracting URLs for 10 regions in Washington:   0%|          | 0/10 [00:00<?, ?it/s]

Currently extracting URLs for 8 regions in West Virginia:   0%|          | 0/8 [00:00<?, ?it/s]

Currently extracting URLs for 11 regions in Wisconsin:   0%|          | 0/11 [00:00<?, ?it/s]

Currently extracting URLs for 1 regions in Wyoming:   0%|          | 0/1 [00:00<?, ?it/s]

Currently extracting URLs for 3 regions in Territories:   0%|          | 0/3 [00:00<?, ?it/s]

URLs of search pages finished extracting at Sat Mar  5 17:56:09 2022
Total process time: 1564.1507849693298


Extracting URLs:   0%|          | 0/416 [00:00<?, ?it/s]

In [9]:
# # Calculate how many posts in total are to be scraped for countdown timer

# num_regions = len(all_urls)

# num_posts = 0
# for state_and_region in all_urls:
#     num_posts += len(all_urls[state_and_region])

## Getting soup object response for each individual post in a state/region combo

In [10]:
soup_objects = convert_urls_to_soup_objs(all_urls)

Total Progress:   0%|          | 0/416 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Alabama: auburn:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Alabama: bham:   0%|          | 0/6 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Alabama: dothan:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Alabama: shoals: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Alabama: gadsden:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Alabama: huntsville:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Alabama: mobile:   0%|          | 0/2 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Alabama: montgomery:   0%|          | 0/3 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Alabama: tuscaloosa:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Alaska: anchorage:   0%|          | 0/4 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Alaska: fairbanks: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Alaska: kenai:   0%|          | 0/2 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Alaska: juneau:   0%|          | 0/2 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Arizona: flagstaff:   0%|          | 0/4 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Arizona: mohave: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Arizona: phoenix:   0%|          | 0/78 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Arizona: prescott:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Arizona: showlow:   0%|          | 0/3 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Arizona: sierravista: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Arizona: tucson:   0%|          | 0/9 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Arizona: yuma: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Arkansas: fayar:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Arkansas: fortsmith:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Arkansas: jonesboro:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Arkansas: littlerock:   0%|          | 0/4 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Arkansas: texarkana: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in California: bakersfield:   0%|          | 0/2 [00:00<?, ?it/s]

Creating Soup Objects for each posting in California: chico:   0%|          | 0/3 [00:00<?, ?it/s]

Creating Soup Objects for each posting in California: fresno:   0%|          | 0/15 [00:00<?, ?it/s]

Creating Soup Objects for each posting in California: goldcountry:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in California: hanford:   0%|          | 0/2 [00:00<?, ?it/s]

Creating Soup Objects for each posting in California: humboldt:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in California: imperial:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in California: inlandempire:   0%|          | 0/40 [00:00<?, ?it/s]

Creating Soup Objects for each posting in California: losangeles:   0%|          | 0/322 [00:00<?, ?it/s]

Creating Soup Objects for each posting in California: mendocino:   0%|          | 0/3 [00:00<?, ?it/s]

Creating Soup Objects for each posting in California: merced:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in California: modesto:   0%|          | 0/4 [00:00<?, ?it/s]

Creating Soup Objects for each posting in California: monterey:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in California: orangecounty:   0%|          | 0/156 [00:00<?, ?it/s]

Creating Soup Objects for each posting in California: palmsprings:   0%|          | 0/5 [00:00<?, ?it/s]

Creating Soup Objects for each posting in California: redding:   0%|          | 0/4 [00:00<?, ?it/s]

Creating Soup Objects for each posting in California: sacramento:   0%|          | 0/78 [00:00<?, ?it/s]

Creating Soup Objects for each posting in California: sandiego:   0%|          | 0/173 [00:00<?, ?it/s]

Creating Soup Objects for each posting in California: sfbay:   0%|          | 0/409 [00:00<?, ?it/s]

Creating Soup Objects for each posting in California: slo:   0%|          | 0/5 [00:00<?, ?it/s]

Creating Soup Objects for each posting in California: santabarbara:   0%|          | 0/26 [00:00<?, ?it/s]

Creating Soup Objects for each posting in California: santamaria:   0%|          | 0/2 [00:00<?, ?it/s]

Creating Soup Objects for each posting in California: siskiyou:   0%|          | 0/2 [00:00<?, ?it/s]

Creating Soup Objects for each posting in California: stockton:   0%|          | 0/4 [00:00<?, ?it/s]

Creating Soup Objects for each posting in California: susanville:   0%|          | 0/3 [00:00<?, ?it/s]

Creating Soup Objects for each posting in California: ventura:   0%|          | 0/16 [00:00<?, ?it/s]

Creating Soup Objects for each posting in California: visalia:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in California: yubasutter:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Colorado: boulder:   0%|          | 0/16 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Colorado: cosprings:   0%|          | 0/8 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Colorado: denver:   0%|          | 0/72 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Colorado: eastco: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Colorado: fortcollins:   0%|          | 0/5 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Colorado: rockies:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Colorado: pueblo:   0%|          | 0/3 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Colorado: westslope:   0%|          | 0/2 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Connecticut: newlondon:   0%|          | 0/5 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Connecticut: hartford:   0%|          | 0/33 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Connecticut: newhaven:   0%|          | 0/5 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Connecticut: nwct:   0%|          | 0/3 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Delaware: delaware:   0%|          | 0/15 [00:00<?, ?it/s]

Creating Soup Objects for each posting in District of Columbia: washingtondc:   0%|          | 0/163 [00:00<?,…

Creating Soup Objects for each posting in Florida: miami:   0%|          | 0/156 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Florida: daytona: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Florida: keys: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Florida: fortmyers:   0%|          | 0/10 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Florida: gainesville:   0%|          | 0/3 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Florida: cfl: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Florida: jacksonville:   0%|          | 0/16 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Florida: lakeland:   0%|          | 0/3 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Florida: lakecity: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Florida: ocala:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Florida: okaloosa: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Florida: orlando:   0%|          | 0/29 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Florida: panamacity: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Florida: pensacola:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Florida: sarasota:   0%|          | 0/3 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Florida: spacecoast:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Florida: staugustine:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Florida: tallahassee:   0%|          | 0/8 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Florida: tampa:   0%|          | 0/34 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Florida: treasure:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Georgia: albanyga: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Georgia: athensga:   0%|          | 0/2 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Georgia: atlanta:   0%|          | 0/101 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Georgia: augusta: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Georgia: brunswick: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Georgia: columbusga: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Georgia: macon: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Georgia: nwga:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Georgia: savannah: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Georgia: statesboro: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Georgia: valdosta: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Hawaii: honolulu:   0%|          | 0/39 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Idaho: boise:   0%|          | 0/11 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Idaho: eastidaho:   0%|          | 0/2 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Idaho: lewiston: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Idaho: twinfalls:   0%|          | 0/4 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Illinois: bn: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Illinois: chambana:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Illinois: chicago:   0%|          | 0/88 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Illinois: decatur: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Illinois: lasalle: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Illinois: mattoon: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Illinois: peoria:   0%|          | 0/2 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Illinois: rockford: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Illinois: carbondale: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Illinois: springfieldil:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Illinois: quincy: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Indiana: bloomington:   0%|          | 0/2 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Indiana: evansville: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Indiana: fortwayne:   0%|          | 0/4 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Indiana: indianapolis:   0%|          | 0/9 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Indiana: kokomo: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Indiana: tippecanoe: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Indiana: muncie: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Indiana: richmondin:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Indiana: southbend: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Indiana: terrehaute: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Iowa: ames:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Iowa: cedarrapids:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Iowa: desmoines:   0%|          | 0/6 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Iowa: dubuque: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Iowa: fortdodge: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Iowa: iowacity: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Iowa: masoncity:   0%|          | 0/2 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Iowa: quadcities:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Iowa: siouxcity: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Iowa: ottumwa:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Iowa: waterloo: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Kansas: lawrence:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Kansas: ksu: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Kansas: nwks:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Kansas: salina: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Kansas: seks: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Kansas: swks: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Kansas: topeka: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Kansas: wichita:   0%|          | 0/3 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Kentucky: bgky: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Kentucky: eastky:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Kentucky: lexington: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Kentucky: louisville:   0%|          | 0/5 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Kentucky: owensboro: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Kentucky: westky:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Louisiana: batonrouge:   0%|          | 0/2 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Louisiana: cenla: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Louisiana: houma: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Louisiana: lafayette:   0%|          | 0/2 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Louisiana: lakecharles: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Louisiana: monroe:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Louisiana: neworleans:   0%|          | 0/11 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Louisiana: shreveport: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Maine: maine:   0%|          | 0/10 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Maryland: annapolis:   0%|          | 0/5 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Maryland: baltimore:   0%|          | 0/54 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Maryland: easternshore:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Maryland: frederick: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Maryland: smd: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Maryland: westmd:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Massachusetts: boston:   0%|          | 0/160 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Massachusetts: capecod:   0%|          | 0/3 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Massachusetts: southcoast: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Massachusetts: westernmass:   0%|          | 0/6 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Massachusetts: worcester:   0%|          | 0/3 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Michigan: annarbor:   0%|          | 0/10 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Michigan: battlecreek: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Michigan: centralmich:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Michigan: detroit:   0%|          | 0/28 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Michigan: flint: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Michigan: grandrapids:   0%|          | 0/4 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Michigan: holland: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Michigan: jxn:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Michigan: kalamazoo: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Michigan: lansing: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Michigan: monroemi:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Michigan: muskegon: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Michigan: nmi:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Michigan: porthuron: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Michigan: saginaw: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Michigan: swmi:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Michigan: thumb: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Michigan: up: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Minnesota: bemidji: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Minnesota: brainerd:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Minnesota: duluth:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Minnesota: mankato: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Minnesota: minneapolis:   0%|          | 0/47 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Minnesota: rmn: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Minnesota: marshall: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Minnesota: stcloud:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Mississippi: gulfport:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Mississippi: hattiesburg: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Mississippi: jackson:   0%|          | 0/2 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Mississippi: meridian: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Mississippi: northmiss:   0%|          | 0/2 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Mississippi: natchez: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Missouri: columbiamo: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Missouri: joplin: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Missouri: kansascity:   0%|          | 0/8 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Missouri: kirksville: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Missouri: loz: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Missouri: semo: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Missouri: springfield:   0%|          | 0/2 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Missouri: stjoseph: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Missouri: stlouis:   0%|          | 0/34 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Montana: billings: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Montana: bozeman:   0%|          | 0/2 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Montana: butte:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Montana: greatfalls:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Montana: helena: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Montana: kalispell: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Montana: missoula:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Montana: montana:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Nebraska: grandisland: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Nebraska: lincoln:   0%|          | 0/2 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Nebraska: northplatte:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Nebraska: omaha:   0%|          | 0/2 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Nebraska: scottsbluff:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Nevada: elko:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Nevada: lasvegas:   0%|          | 0/47 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Nevada: reno:   0%|          | 0/12 [00:00<?, ?it/s]

Creating Soup Objects for each posting in New Hampshire: nh:   0%|          | 0/22 [00:00<?, ?it/s]

Creating Soup Objects for each posting in New Jersey: cnj:   0%|          | 0/58 [00:00<?, ?it/s]

Creating Soup Objects for each posting in New Jersey: jerseyshore:   0%|          | 0/9 [00:00<?, ?it/s]

Creating Soup Objects for each posting in New Jersey: newjersey:   0%|          | 0/41 [00:00<?, ?it/s]

Creating Soup Objects for each posting in New Jersey: southjersey:   0%|          | 0/11 [00:00<?, ?it/s]

Creating Soup Objects for each posting in New Mexico: albuquerque:   0%|          | 0/13 [00:00<?, ?it/s]

Creating Soup Objects for each posting in New Mexico: clovis: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in New Mexico: farmington:   0%|          | 0/2 [00:00<?, ?it/s]

Creating Soup Objects for each posting in New Mexico: lascruces: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in New Mexico: roswell:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in New Mexico: santafe:   0%|          | 0/4 [00:00<?, ?it/s]

Creating Soup Objects for each posting in New York: albany:   0%|          | 0/3 [00:00<?, ?it/s]

Creating Soup Objects for each posting in New York: binghamton:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in New York: buffalo:   0%|          | 0/8 [00:00<?, ?it/s]

Creating Soup Objects for each posting in New York: catskills: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in New York: chautauqua: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in New York: elmira: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in New York: fingerlakes: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in New York: glensfalls: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in New York: hudsonvalley:   0%|          | 0/2 [00:00<?, ?it/s]

Creating Soup Objects for each posting in New York: ithaca:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in New York: longisland:   0%|          | 0/50 [00:00<?, ?it/s]

Creating Soup Objects for each posting in New York: newyork:   0%|          | 0/480 [00:00<?, ?it/s]

Creating Soup Objects for each posting in New York: oneonta: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in New York: plattsburgh: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in New York: potsdam: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in New York: rochester:   0%|          | 0/3 [00:00<?, ?it/s]

Creating Soup Objects for each posting in New York: syracuse:   0%|          | 0/3 [00:00<?, ?it/s]

Creating Soup Objects for each posting in New York: twintiers: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in New York: utica: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in New York: watertown: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in North Carolina: asheville:   0%|          | 0/2 [00:00<?, ?it/s]

Creating Soup Objects for each posting in North Carolina: boone: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in North Carolina: charlotte:   0%|          | 0/48 [00:00<?, ?it/s]

Creating Soup Objects for each posting in North Carolina: eastnc: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in North Carolina: fayetteville:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in North Carolina: greensboro:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in North Carolina: hickory: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in North Carolina: onslow: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in North Carolina: outerbanks: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in North Carolina: raleigh:   0%|          | 0/63 [00:00<?, ?it/s]

Creating Soup Objects for each posting in North Carolina: wilmington:   0%|          | 0/2 [00:00<?, ?it/s]

Creating Soup Objects for each posting in North Carolina: winstonsalem:   0%|          | 0/2 [00:00<?, ?it/s]

Creating Soup Objects for each posting in North Dakota: bismarck: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in North Dakota: fargo:   0%|          | 0/5 [00:00<?, ?it/s]

Creating Soup Objects for each posting in North Dakota: grandforks: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in North Dakota: nd:   0%|          | 0/3 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Ohio: akroncanton:   0%|          | 0/2 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Ohio: ashtabula: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Ohio: athensohio: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Ohio: chillicothe: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Ohio: cincinnati:   0%|          | 0/15 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Ohio: cleveland:   0%|          | 0/24 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Ohio: columbus:   0%|          | 0/16 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Ohio: dayton:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Ohio: limaohio: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Ohio: mansfield: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Ohio: sandusky: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Ohio: toledo:   0%|          | 0/2 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Ohio: tuscarawas: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Ohio: youngstown: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Ohio: zanesville: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Oklahoma: lawton: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Oklahoma: enid: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Oklahoma: oklahomacity:   0%|          | 0/6 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Oklahoma: stillwater: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Oklahoma: tulsa:   0%|          | 0/5 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Oregon: bend:   0%|          | 0/2 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Oregon: corvallis:   0%|          | 0/2 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Oregon: eastoregon:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Oregon: eugene:   0%|          | 0/8 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Oregon: klamath: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Oregon: medford:   0%|          | 0/4 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Oregon: oregoncoast: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Oregon: portland:   0%|          | 0/94 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Oregon: roseburg: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Oregon: salem:   0%|          | 0/2 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Pennsylvania: altoona: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Pennsylvania: chambersburg: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Pennsylvania: erie: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Pennsylvania: harrisburg:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Pennsylvania: lancaster: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Pennsylvania: allentown:   0%|          | 0/2 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Pennsylvania: meadville: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Pennsylvania: philadelphia:   0%|          | 0/116 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Pennsylvania: pittsburgh:   0%|          | 0/18 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Pennsylvania: poconos: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Pennsylvania: reading: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Pennsylvania: scranton:   0%|          | 0/2 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Pennsylvania: pennstate:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Pennsylvania: williamsport: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Pennsylvania: york:   0%|          | 0/2 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Rhode Island: providence:   0%|          | 0/18 [00:00<?, ?it/s]

Creating Soup Objects for each posting in South Carolina: charleston:   0%|          | 0/3 [00:00<?, ?it/s]

Creating Soup Objects for each posting in South Carolina: columbia:   0%|          | 0/2 [00:00<?, ?it/s]

Creating Soup Objects for each posting in South Carolina: florencesc:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in South Carolina: greenville:   0%|          | 0/2 [00:00<?, ?it/s]

Creating Soup Objects for each posting in South Carolina: hiltonhead:   0%|          | 0/2 [00:00<?, ?it/s]

Creating Soup Objects for each posting in South Carolina: myrtlebeach:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in South Dakota: nesd: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in South Dakota: csd:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in South Dakota: rapidcity:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in South Dakota: siouxfalls:   0%|          | 0/3 [00:00<?, ?it/s]

Creating Soup Objects for each posting in South Dakota: sd: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Tennessee: chattanooga:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Tennessee: clarksville: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Tennessee: cookeville: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Tennessee: jacksontn: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Tennessee: knoxville:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Tennessee: memphis:   0%|          | 0/7 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Tennessee: nashville:   0%|          | 0/35 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Tennessee: tricities: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Texas: abilene: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Texas: amarillo: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Texas: austin:   0%|          | 0/98 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Texas: beaumont: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Texas: brownsville: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Texas: collegestation:   0%|          | 0/2 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Texas: corpuschristi:   0%|          | 0/2 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Texas: dallas:   0%|          | 0/133 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Texas: nacogdoches: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Texas: delrio: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Texas: elpaso:   0%|          | 0/9 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Texas: galveston: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Texas: houston:   0%|          | 0/131 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Texas: killeen: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Texas: laredo: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Texas: lubbock:   0%|          | 0/2 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Texas: mcallen: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Texas: odessa:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Texas: sanangelo: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Texas: sanantonio:   0%|          | 0/61 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Texas: sanmarcos: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Texas: bigbend: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Texas: texoma: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Texas: easttexas: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Texas: victoriatx: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Texas: waco:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Texas: wichitafalls: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Utah: logan: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Utah: ogden: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Utah: provo:   0%|          | 0/3 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Utah: saltlakecity:   0%|          | 0/13 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Utah: stgeorge:   0%|          | 0/2 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Vermont: vermont:   0%|          | 0/8 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Virginia: charlottesville:   0%|          | 0/2 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Virginia: danville: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Virginia: fredericksburg: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Virginia: norfolk:   0%|          | 0/4 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Virginia: harrisonburg: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Virginia: lynchburg: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Virginia: blacksburg: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Virginia: richmond:   0%|          | 0/21 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Virginia: roanoke:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Virginia: swva:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Virginia: winchester: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Washington: bellingham:   0%|          | 0/2 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Washington: kpr:   0%|          | 0/16 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Washington: moseslake: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Washington: olympic:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Washington: pullman:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Washington: seattle:   0%|          | 0/131 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Washington: skagit:   0%|          | 0/2 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Washington: spokane:   0%|          | 0/17 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Washington: wenatchee:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Washington: yakima:   0%|          | 0/2 [00:00<?, ?it/s]

Creating Soup Objects for each posting in West Virginia: charlestonwv: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in West Virginia: martinsburg: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in West Virginia: huntington: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in West Virginia: morgantown:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in West Virginia: wheeling:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in West Virginia: parkersburg: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in West Virginia: swv:   0%|          | 0/2 [00:00<?, ?it/s]

Creating Soup Objects for each posting in West Virginia: wv: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Wisconsin: appleton: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Wisconsin: eauclaire:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Wisconsin: greenbay:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Wisconsin: janesville: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Wisconsin: racine: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Wisconsin: lacrosse: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Wisconsin: madison:   0%|          | 0/6 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Wisconsin: milwaukee: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Wisconsin: northernwi: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Wisconsin: sheboygan: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Wisconsin: wausau:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Wyoming: wyoming:   0%|          | 0/6 [00:00<?, ?it/s]

Creating Soup Objects for each posting in Territories: micronesia: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Territories: puertorico: 0it [00:00, ?it/s]

Creating Soup Objects for each posting in Territories: virgin:   0%|          | 0/1 [00:00<?, ?it/s]

## Pre-Processing

### Extracting post information from each soup_object

In [11]:
concat_df = extract_post_features(soup_objects)

df shape: (4705, 10)


### Adding more detailed location information from US Census data

In [82]:
# # Add US_region division for eastern us, western us, etc., using census data to classify each region

# census_regions = pd.read_csv('./census-regions/us_census_regions.csv')
# concat_df_w_regions = concat_df.merge(right=census_regions[['State','Region','Division']], how='left', left_on='state', right_on='State')

# concat_df_w_regions.drop(labels='State', axis=1, inplace=True)
# concat_df_w_regions.rename(columns={'Region':'US_region', "Division":"US_division"}, inplace=True)

# concat_df_w_regions.head()

Unnamed: 0,date_posted,link,price,city,subregion,region,state,post_text,price_list,posts_scraped_on,US_region,US_division
0,2022-03-02T15:36:36-0600,https://auburn.craigslist.org/lss/d/alexander-...,,no city found,no subregion found,auburn,Alabama,\n\n\n\n\nOur tutors are real full-time teache...,[],2022-03-05,South,East South Central
1,2022-03-02T07:35:07-0600,https://bham.craigslist.org/lss/d/docena-math-...,46.0,Birmingham,no subregion found,"birmingham,al",Alabama,\n\n\n\n\nBlakeTutoring.com\n\nOVER 450 five-s...,"[57, 35]",2022-03-05,South,East South Central
2,2022-02-08T12:43:35-0600,https://bham.craigslist.org/lss/d/birmingham-s...,,Birmingham,no subregion found,"birmingham,al",Alabama,\n\n\n\n\nI am a current student at UAB School...,[],2022-03-05,South,East South Central
3,2022-02-08T09:39:12-0600,https://bham.craigslist.org/lss/d/docena-math-...,46.0,Birmingham,no subregion found,"birmingham,al",Alabama,\n\n\n\n\nBlakeTutoring.com\n\nOVER 450 five-s...,"[57, 35]",2022-03-05,South,East South Central
4,2022-02-15T16:41:22-0600,https://bham.craigslist.org/lss/d/birmingham-5...,,Online,no subregion found,"birmingham,al",Alabama,\n\n\n\n\nTAMMY CHANG TEST PREP ⭐️⭐️⭐️⭐️⭐️- Le...,[],2022-03-05,South,East South Central


In [84]:
# concat_df_w_regions[concat_df_w_regions['US_region'].isna()==True]

Unnamed: 0,date_posted,link,price,city,subregion,region,state,post_text,price_list,posts_scraped_on,US_region,US_division
4704,2022-01-31T13:42:05-0400,https://virgin.craigslist.org/sks/d/michael-ch...,,St. Thomas,no subregion found,virginislands,Territories,"\n\n\n\n\nTutor for high school, middle school...",[],2022-03-05,,


### Dropping Duplicate posts

In [85]:
# Count duplicates.
concat_df['post_text'].duplicated().value_counts()

True     3439
False    1266
Name: post_text, dtype: int64

In [13]:
df_exact_txt_dropped = drop_exact_duplicates(concat_df)

Number of rows before dropping duplicates: 4705
Number of rows before after duplicates: 1266
A difference of 3439 rows.


### Dropping posts that are above a certain similarity threshold.  

Many posts on Craigslist are from the same person, who changes the text of the post slightly to avoid being flagged and removed.  If a post has a similarity_ratio of 1, it's identical to another post in the df.  All posts with a similarity_ratio >= similarity threshold will be dropped.  In theory, this should leave us with a df that has no more duplicates of any kind and each row represents a unique post.

In [14]:
df_similar_txt_dropped = drop_posts_with_similar_text(df_exact_txt_dropped, similarity_threshold=0.63)

  return asarray(a).ndim
  arr_value = np.asarray(value)


Number of rows before dropping duplicates: 1266
Number of rows before after duplicates: 693
A difference of 573 rows.


### Dropping posts that contained no prices, which aren't helpful for our analysis

In [15]:
df_with_prices = drop_posts_without_prices(df_similar_txt_dropped)

Number of rows before dropping duplicates: 693
Number of rows before after duplicates: 302
A difference of 391 rows.


In [16]:
unique_posts_count = len(df_similar_txt_dropped)
post_with_prices_count = len(df_with_prices)
num_posts = len(concat_df)

percent_unique = unique_posts_count / num_posts * 100
percent_with_prices = post_with_prices_count / num_posts * 100

print(F"Out of {num_posts} posts, there were {unique_posts_count} that were unique, or {percent_unique:.2f}%.")
print(F"Out of those, there were {post_with_prices_count} posts that had prices included.")

print(F"Only {percent_with_prices:.2f}% of the posts that we scraped remain.")

Out of 4705 posts, there were 693 that were unique, or 14.73%.
Out of those, there were 302 posts that had prices included.
Only 6.42% of the posts that we scraped remain.


### Extracting complete.

# *Transforming* Craigslist data: Post-processing

## Are there any posts that might need manual cleaning?  This would include:
* Posts that had 3 or more prices and `price` was marked as null
* Posts where the price wasn't able to convert from `str` -> `int` and `price` was marked as null during pre-processing

There are the entries that were marked as `Null`.  Let's investigate them manually:

In [17]:
df_null_prices = df_with_prices[df_with_prices['price'].isnull()==True]
#df_null_prices[['price', 'price_list']]

Unnamed: 0,price,price_list
2,,"[50, 10, 50]"
7,,"[30, 35, 45]"
9,,"[50, 100, 135]"
10,,"[25, 28, 30, 30]"
14,,"[20, 25, 30, 40]"
20,,"[35, 40, 55, 55]"
21,,"[40, 40, 40]"
26,,"[30, 45, 60]"
27,,"[25, 30, 50, 50]"
28,,"[30, 50, 30]"


In [18]:
posts_with_mult_prices = df_null_prices.shape[0]
print(F"There were {posts_with_mult_prices} posts with price marked null.")

There were 44 posts with price marked null.


In [27]:
# Store posts with null prices to CSV to manually inspect later

date_of_html_request = str(dt.date.today())

#df_null_prices = df_null_prices.drop(columns=['len_of_price_list'])
df_null_prices.to_csv('./posts_to_investigate/{}_posts_with_null_prices.csv'.format(date_of_html_request), index=False)

In [28]:
# Inspect links manually, one by one, to decide what to do about price information
with pd.option_context('display.max_colwidth', None):
  x=3
  #display(df_with_prices.iloc[x]['post_text'])
  display(df_with_prices.iloc[x]['link'])
  display(df_with_prices.iloc[x]['price'])

'https://anchorage.craigslist.org/lss/d/anchorage-experienced-interpersonal/7444915138.html'

10.0

### Cleaning posts with three or more prices manually - distilling down to one price

We distill posts that had more complicated text that involved three or more prices, such as :

* $40$/hr, $50$/1.5hr, $60$/2hr
  * Complicated pricing schedule
* $40$/hr but $10$ additional per person, if a group session is desired
  * Group rates
* $30$/hr Science, $40$/hr math, come and try a first session for the reduced price of $20$.
  * Special offers

into a single price.  Other posts repeated their prices multiple times, so we distill those down to a single price as well.

In [53]:
df_with_prices= clean_3_plus_prices(df_with_prices)

#### Checking results - Are there any posts that were marked as needing to be cleaned that we missed?

In [54]:
num_still_null = len(df_with_prices[df_with_prices['price'].isnull()==True])

if num_still_null==0:
    print("There are no posts with null prices still needing cleaning.")
else:
    print(F"There are {num_still_null} posts that need cleaning.")

There are 26 posts that need cleaning.


### Checking Posts that have two prices listed to see if averaging them is reasonable

In [55]:
df_with_prices[df_with_prices['len_of_price_list']==2][['price','price_list']]

Unnamed: 0,price,price_list
0,46.0,"[57, 35]"
4,27.0,"[29, 25]"
5,37.5,"[50, 25]"
8,37.5,"[45, 30]"
12,30.0,"[25, 35]"
...,...,...
277,42.0,"[39, 45]"
278,50.0,"[55, 45]"
284,42.5,"[40, 45]"
295,35.0,"[65, 5]"


In [56]:
# Inspect posts manually, one by one
with pd.option_context('display.max_colwidth', None):
  x=136
  #display(df_with_prices.iloc[x]['post_text'])
  display(df_with_prices.iloc[x]['link'])
  display(df_with_prices.iloc[x]['post_text'])
  display(df_with_prices.iloc[x]['price'])

'https://cosprings.craigslist.org/lss/d/colorado-springs-tutor-college/7445984357.html'

"\n\n\n\n\nTutoring services offered***** Experience 8+ years.\n\nAvailable to tutor online, library, your house, school, local library/cafe. . . Based out of Cimarron Hills, Colorado Springs. Willing to travel a bit out of town if needed. \n\nI am a 28 year old female with a Bachelor's in Biomedical Engineering and Management. Graduated cum laude at University of California, Irvine. Can tutor any subject, any age/level, from geometry to spanish to calculus to literature and study skills /time management... I also tutor test prep (SATs/ACTSs) in all subjects (5+ years experience) !! Let's raise your score! I got a perfect score in math and top 1% in other subjects.\n\nCan help with military standardized tests too such as ASVAB and GED prep.\n\nElementary to to college level tutoring available as well, experience with all age students including those with special needs too. \n\nAbout me:\nI am a California native that just moved back to Colorado. I spent the past years teaching English 

65.0

#### Manually cleaning certain postings that had two prices listed

While averaging is helpful for some posts, it doesn't apply to all of them.  The clean_two_prices() function is meant to update our data with correct pricing information in the posting where using an average to deal with the two prices in the post isn't ideal.

In [69]:
df_with_prices = clean_two_prices(df_with_prices)

## Investigating posts with extreme prices.  Are there any price outliers that we need to clean?

Prices >= 100 or <= 20 are what I would consider to be extreme prices.  Let's investigate them.

In [70]:
df_with_prices[(df_with_prices['price']>=100) | (df_with_prices['price']<=20)][['price', 'post_text', 'price_list']] 

Unnamed: 0,price,post_text,price_list
3,10.0,"\n\n\n\n\nHello, I am an experienced math tuto...",[10]
17,20.0,"\n\n\n\n\nLocal, Experienced, Knowledgeable. \...",[20]
24,17.5,"\n\n\n\n\nHello, my name is Neil, and I have b...","[30, 5]"
25,15.0,\n\n\n\n\nNeed homework help? Is your child be...,[15]
29,20.0,\n\n\n\n\nI have over 8 years of tutoring expe...,[20]
30,20.0,\n\n\n\n\nText 2133408660 or register at peerl...,[20]
47,100.0,\n\n\n\n\nHello! My name is Connor and I've be...,[100]
51,120.0,"\n\n\n\n\nG'day! My name's Daniel, and I'm a f...",[120]
52,150.0,\n\n\n\n\nG'day! My name is Daniel. I graduate...,"[200, 100]"
62,20.0,\n\n\n\n\nUC Irvine educated math and science ...,[20]


In [71]:
# Manually inspect these posts one by one
with pd.option_context('display.max_colwidth', None):
  x=40
  #display(df_with_prices.iloc[x]['post_text'])
  display(df_with_prices.iloc[x]['link'])
  display(df_with_prices.iloc[x]['post_text'])
  display(df_with_prices.iloc[x]['price'])

'https://losangeles.craigslist.org/sgv/lss/d/pasadena-tutor-available-for-math-engr/7446573475.html'

"\n\n\n\n\nDegreed Engineer (BSEE/MSEE) available to aid you in multiple areas which you may need help with while we all try to recover from the lockdown disaster.\n\nClasses are =finally= beginning to startup again and, sadly, many people are WAY behind in their studies.  Thus, this is a good time to catch up with all that missed school work and studying for tests.\n\nI usually look for tutoring gigs for Adult / College / High-School (with parental approval) for misc MATH classes ranging from pre-algebra through trigonometry and calculus with a smattering of linear algebra, statistics and other things - you will need to ask.  I can also tutor other topics in the Engineering curriculum - for special classes please let me know the class and book ahead of time.\n\nAlso available for possible Testing, Debugging and Troubleshooting of your Electronic Design or your pet project if it is within my wheelhouse.  Good at thinking logically and tracking down intricate problems.  I can layout a p

50.0

### Dropping posts with extreme prices that aren't relevant

In [72]:
# This ad is for poker tutoring/coaching, not really what I'm competing against, so we drop it.  He also mentions he tutors math in this post, but he has a separate post, that we've captured, which has his math tutoring pricing information.
australia_daniel_idx = df_with_prices[df_with_prices['post_text'].str.contains("I'm available as a dealer if you need one", regex=False)==True].index

df_with_prices.drop(labels=australia_daniel_idx, inplace=True)
df_with_prices = df_with_prices.reset_index(drop=True)

### Correcting pricing information for posts with extreme prices

In [73]:
# This ad says $50/hr but then mentions a prepay plan for $160 for 4 hours.  Since these are the only two prices in the post, our code averages them, so we set the correct price to $50
google_maps_idx = df_with_prices[df_with_prices['post_text'].str.contains("willing to travel if Google Maps", regex=False)==True].index

try:
    df_with_prices.iloc[google_maps_idx, price_col_idx] = 50

except:
    print("Issue with google_maps_idx and iloc.")
    pass 

Issue with google_maps_idx and iloc.


In [74]:
# This ad says $45/hr for high school or college, but then mentions a $35 for middle school.  Since these are the only two prices in the post, our code averages them, so we set the correct price to $45, since I primarily tutor high school or college students.
rancho_penasquitos_idx = df_with_prices[df_with_prices['post_text'].str.contains("Rancho Penasquitos (Park Village Neighborhood)", regex=False)==True].index

try:
    df_with_prices.iloc[rancho_penasquitos_idx, price_col_idx] = 45

except:
    print("Issue with rancho_penasquitos_idx and iloc.")
    pass 

Issue with rancho_penasquitos_idx and iloc.


### Transforming Complete

# *Load* - Saving results

### Store results locally as CSV files

In [75]:
date_of_html_request = str(dt.date.today())

# Drop unnecessary columns.
df_for_sql = df_with_prices.drop(labels=['link', 'price_list', 'len_of_price_list'], axis=1)

# In order for psycopg2 to parse our CSV file correctly later, we need to escape all new line characters by adding an additional \ in front of \n.
df_for_sql['post_text'] = df_for_sql['post_text'].str.replace('\n', '\\n')

# Store cleaned data as CSV file in preparation for importing to SQL database
df_for_sql.to_csv("./csv_files/{}_all_regions_with_prices.csv".format(date_of_html_request), index=False, sep=';')

# Store original data, before we applied any cleaning to it, in case it's needed for something later on.
concat_df.to_csv("./csv_files/{}_all_regions_posts.csv".format(date_of_html_request), index=False)

In [76]:
df_similar_txt_dropped.to_csv('./csv_files/{}_all_regions_no_dups.csv'.format(date_of_html_request), index=False, sep=';')

### Importing into PostgreSQL database

In [77]:
# Establish connection to PSQL database
conn = psycopg2.connect("host=localhost dbname=rancher user=rancher port=5430")

# Instantiate a cursor object
cur = conn.cursor()

# Use cursor object to create a database for storing the information we scraped and cleaned, if one doesn't already exist.
cur.execute("""    
    CREATE TABLE IF NOT EXISTS cl_tutoring(
    id SERIAL primary key,
    date_scraped date,
    price decimal,
    city text,
    subregion text,
    region text,
    state text,
    post_text text,
    date_posted timestamp
);
""")

# Commit changes to database
conn.commit()

In [78]:
# Instantiate a new cursor object
cur = conn.cursor()

# Copy data from our CSV file into database.  
### Note, we can use the ; separator freely because we replaced all instances of semicolons in post_text to commas during the preprocessing stage, ensuring that psycopg2 won't misinterpret a semicolon in the body of a post as a separator.
### Also, we must specify null="" because Python represents null values as an empty string when writing to a CSV file and psycopg2 needs to know how null values are represented in the CSV file in order to properly insert null values into the database
with open('./csv_files/' + str(date_of_html_request) + '_all_regions_with_prices.csv', 'r') as file:
    next(file) # Skip the header row
    cur.copy_from(file, 'cl_tutoring', sep=';', null="", columns=('date_posted', 'price', 'city', 'subregion', 'region', 'state', 'post_text', 'date_scraped'))
    
# Commit changes to database
conn.commit()

### Done!!!