In [1]:
from requests import get
import requests
from requests.packages.urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
import random
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np
import datetime as dt
import csv 
import psycopg2
import time

In [2]:
# Craigslist regions to scrape: the ten largest metropolitan areas by
# population, plus Sacramento because it is nearby.  Names use underscores in
# place of spaces.
regions_to_scrape = [
    "sf_bay_area",
    "new_york",
    "los_angeles",
    "sacramento",
    "chicago",
    "san_diego",
    "houston",
    "phoenix",
    "philadelphia",
    "dallas",
    "san_antonio",
]

# Extract Craigslist Data

In [3]:
# Create a Session whose adapter retries automatically, to cope with the quota
# Craigslist imposes on HTTP GET requests within a certain time period.
session = requests.Session()
retry = Retry(
    connect=3,                                   # retry failed connections up to 3 times
    status=3,                                    # retry rate-limit / server-error responses up to 3 times
    status_forcelist=(429, 500, 502, 503, 504),  # without this, a 429 "too many requests" is never retried
    backoff_factor=0.5,                          # exponential back-off between attempts
    raise_on_status=False,                       # after the last retry, hand back the response instead of raising
)
adapter = HTTPAdapter(max_retries=retry)
# Use the same retry policy for both URL schemes.
for scheme in ('http://', 'https://'):
    session.mount(scheme, adapter)

In [4]:
# Walk through each region in regions_to_scrape and fetch the HTML page for a
# search for "math tutor" in the services section.  The raw responses are kept
# in response_dict, keyed by region name.

response_dict = {}
sleep_timer = 10      # seconds to wait before every request, to avoid a ban
request_timeout = 30  # seconds before a single request is abandoned instead of hanging forever
num_regions = len(regions_to_scrape)
current_time = dt.datetime.now()
finish_time = current_time + dt.timedelta(seconds=num_regions * sleep_timer)

print(F"Current time is {current_time.strftime('%H:%M:%S')}")
print(F"Process will finish at {finish_time.strftime('%H:%M:%S')}")

for count, region in enumerate(regions_to_scrape):
    # Impose a timer to help prevent too many HTTP requests that would result
    # in a ban.
    time_remaining = (num_regions - count) * sleep_timer
    print(F'Time remaining: {time_remaining} seconds.')
    time.sleep(sleep_timer)

    # The subdomain is the region name without underscores.
    current_region = region.replace('_', '')
    search_url = ('https://' + current_region
                  + '.craigslist.org/d/services/search/bbb?query=math%20tutor&sort=rel')
    try:
        current_response = session.get(search_url, timeout=request_timeout)
    except requests.RequestException as error:
        # One unreachable region should not throw away the regions already fetched.
        outcome = F" request failed ({error})."
    else:
        response_dict[region] = current_response
        if not current_response.ok:
            # A blocked or rate-limited page parses to zero posts later on, so
            # make the problem visible here.
            print(F"Warning: {current_region} returned HTTP status {current_response.status_code}.")
        outcome = " response received."

    print()
    if count != num_regions - 1:
        print(current_region + outcome)
        print()
        print("Waiting for next response...")
    else:
        print(current_region + outcome + "  Process completed.")

Current time is 17:31:03
Process will finish at 17:32:53
Time remaining: 110 seconds.

sfbayarea response received.

Waiting for next response...
Time remaining: 100 seconds.

newyork response received.

Waiting for next response...
Time remaining: 90 seconds.

losangeles response received.

Waiting for next response...
Time remaining: 80 seconds.

sacramento response received.

Waiting for next response...
Time remaining: 70 seconds.

chicago response received.

Waiting for next response...
Time remaining: 60 seconds.

sandiego response received.

Waiting for next response...
Time remaining: 50 seconds.

houston response received.

Waiting for next response...
Time remaining: 40 seconds.

phoenix response received.

Waiting for next response...
Time remaining: 30 seconds.

philadelphia response received.

Waiting for next response...
Time remaining: 20 seconds.

dallas response received.

Waiting for next response...
Time remaining: 10 seconds.

sanantonio response received.  Process 

In [5]:
# For every region, parse the search-results page fetched earlier and collect
# the individual postings (the <li class="result-row"> elements) it lists.
posts_dict = {
    region: BeautifulSoup(response.text, 'html.parser').find_all('li', class_='result-row')
    for region, response in response_dict.items()
}

In [6]:
# Download every individual posting once and keep the parsed pages in memory
# (soup_objects_dict, keyed by region), so the same requests never have to be
# sent to Craigslist again.
soup_objects_dict = {}

min_delay, max_delay = 5, 10  # seconds to wait before each request
max_posts_per_region = 120    # presumably the most results one search page lists — TODO confirm
request_timeout = 30          # seconds before a single request is abandoned

current_time = dt.datetime.now()
num_seconds = len(posts_dict) * max_posts_per_region * max_delay
max_finish_time = current_time + dt.timedelta(seconds=num_seconds)

print(F"Current time is {current_time.strftime('%H:%M:%S')}")
print(F"Process will finish by {max_finish_time.strftime('%H:%M:%S')}")
print()

for count, region in enumerate(posts_dict, start=1):
    soup_objects_list = []
    # Posts are numbered from 1 so the progress message matches the post that
    # is actually being handled.
    for post_number, post in enumerate(posts_dict[region], start=1):
        # This is a long process, so report progress on every 10th post.
        if post_number % 10 == 0:
            print(F"Post number {post_number} in {region} is being extracted.")

        anchor = post.a
        current_link = anchor.get('href') if anchor is not None else None
        if not current_link:
            print(F"Post number {post_number} in {region} has no link; skipped.")
            continue

        # Wait between 5 and 10 seconds before each request, again to help
        # prevent getting banned for too many HTTP requests.
        time.sleep(random.randint(min_delay, max_delay))
        try:
            response_object = session.get(current_link, timeout=request_timeout)
        except requests.RequestException as error:
            # Losing one post is better than losing hours of scraping.
            print(F"Post number {post_number} in {region} could not be fetched: {error}")
            continue
        soup_objects_list.append(BeautifulSoup(response_object.text, 'html.parser'))

    soup_objects_dict[region] = soup_objects_list
    print()
    if count != len(posts_dict):
        print(F"Soup objects for {region} acquired.  Waiting for next region...")
        print()
    else:
        print(F"Soup objects for {region} acquired.  Process complete.")

Current time is 17:33:02
Process will finish by 21:13:02

Post number 10 in sf_bay_area is being extracted.
Post number 20 in sf_bay_area is being extracted.
Post number 30 in sf_bay_area is being extracted.
Post number 40 in sf_bay_area is being extracted.
Post number 50 in sf_bay_area is being extracted.
Post number 60 in sf_bay_area is being extracted.
Post number 70 in sf_bay_area is being extracted.
Post number 80 in sf_bay_area is being extracted.
Post number 90 in sf_bay_area is being extracted.
Post number 100 in sf_bay_area is being extracted.
Post number 110 in sf_bay_area is being extracted.

Soup objects for sf_bay_area acquired.  Waiting for next region...
Post number 10 in new_york is being extracted.
Post number 20 in new_york is being extracted.
Post number 30 in new_york is being extracted.
Post number 40 in new_york is being extracted.
Post number 50 in new_york is being extracted.
Post number 60 in new_york is being extracted.
Post number 70 in new_york is being extr

## Pre-processing Craigslist Data

In [7]:
df_list = []          # one DataFrame per search region, concatenated later
error_list_text = []  # soup objects whose posting body could not be read
error_list_links = [] # soup objects whose URL could not be read

# Column order of every per-region DataFrame.
output_columns = ['date_posted', 'link', 'price', 'city', 'subregion', 'region',
                  'post_text', 'price_list']

# A price is a dollar sign followed by a run of digits and thousands separators.
# NOTE(review): the match stops at the first other character, so a range such
# as "$30-40" is captured as "$30" only and the hyphen split below never fires.
# Confirm whether ranges should be captured before changing the pattern.
price_pattern = re.compile(r'\$[,\d]+')
# Characters removed from a matched price before it is converted to an int.
price_noise = str.maketrans('', '', '$/!h.),>')


def _get_post_region(soup):
    """Return the post's region crumb as lower_snake_case, or None if it is missing."""
    try:
        crumb = soup.find_all('li', class_='crumb area')[0].find('a').get_text()
    except (IndexError, AttributeError):
        return None
    return crumb.replace(' ', '_').lower()


def _get_body_text(soup):
    """Return the cleaned posting body, or None if the post has no body section."""
    try:
        text = soup.find('section', id='postingbody').get_text()
    except AttributeError:
        return None
    # ';' is reserved as the delimiter when the CSV is copied into SQL later.
    text = text.replace(';', ',')
    # Boilerplate that caused trouble in at least one post.
    text = text.replace('QR Code Link to This Post', '')
    text = text.replace(u'\xa0', u' ')
    return text


def _convert_prices(old_prices, text):
    """Turn the raw '$...' matches of one post into numbers (NaN if unparseable)."""
    new_prices = []
    for price in old_prices:
        price = price.translate(price_noise).rstrip()
        # Some tutors give prices as a range, ie '$30-40'; split on the hyphen
        # so each end can be used individually.
        for p in price.split('-'):
            if len(p) == 0:
                continue
            try:
                new_prices.append(int(p))
            except ValueError:
                # Show which price failed and the post it came from so the
                # issue can be isolated and fixed.
                print(F'Error converting this price: {p}')
                print(old_prices)
                print()
                print('Here is the text of the post:')
                print()
                print(text)
                print('-'*50)
                print()
                # Use NaN so the process can finish.
                new_prices.append(np.nan)
    return new_prices


def _summarize_prices(new_prices):
    """Collapse one post's prices into the value stored in the 'price' column."""
    if len(new_prices) == 0:
        return 'None'                  # no price listed
    if len(new_prices) == 1:
        return new_prices[0]           # a single price is used as is
    if len(new_prices) == 2:
        return np.average(new_prices)  # two prices are averaged (helps with ranges)
    return new_prices                  # three or more: kept for manual inspection


def _get_city(soup):
    """Return the title-cased city of a post, 'Online', or 'None' if absent."""
    try:
        city = soup.find(class_='postingtitletext').small.get_text()
    except AttributeError:
        return 'None'
    # Posters must pick a city from a fixed list but can then type any city
    # they want; many type "online", so capture those explicitly.
    if 'online' in city.lower():
        return 'Online'
    return city.strip().replace('(', '').replace(')', '').title()


def _get_subregion(soup):
    """Return the title-cased Craigslist subregion of a post, or 'None' if absent."""
    try:
        return soup.find_all('li', class_='crumb subarea')[0].find('a').get_text().title()
    except (IndexError, AttributeError):
        return 'None'


def _get_date_posted(soup):
    """Return the 'datetime' attribute of the post's <time> tag, or 'None'."""
    try:
        return soup.find('time')['datetime']
    except (TypeError, KeyError):
        return 'None'


# Walk through each region that contains a list of soup objects corresponding
# to the search of services for math tutors.
for search_region in soup_objects_dict:
    # One dict per accepted post; building rows together keeps the columns
    # aligned even when a field is missing.
    records = []

    for soup in soup_objects_dict[search_region]:
        try:
            link = soup.find("meta", property="og:url")['content']
        except (TypeError, KeyError):
            # Keep the soup object for later inspection and use 'None' to
            # filter the post out below.
            link = 'None'
            error_list_links.append(soup)
            print("Couldn't get link")

        post_region = _get_post_region(soup)
        if post_region is None:
            print("Couldn't get region")

        # Only let posts through that have a link and whose own region matches
        # the region that was searched.  Some search results come from
        # neighboring areas, which leads to duplicates in areas like Los
        # Angeles and San Diego; this weeds those out.
        if post_region != search_region or link == 'None':
            continue

        text = _get_body_text(soup)
        if text is None:
            error_list_text.append(soup)
            print("Couldn't get text")
            post_text = 'None'
            # Search an empty string so prices from the previous post are not
            # attributed to this one.
            text = ''
        else:
            post_text = text

        # Raw matches are kept too, in case issues need to be isolated later.
        old_prices = price_pattern.findall(text)
        new_prices = _convert_prices(old_prices, text)

        records.append({
            'date_posted': _get_date_posted(soup),
            'link': link,
            'price': _summarize_prices(new_prices),
            'city': _get_city(soup),
            'subregion': _get_subregion(soup),
            'region': post_region,
            'post_text': post_text,
            'price_list': old_prices,
        })

    # Store the results for this region; the per-region frames are
    # concatenated into one larger df later.
    temp_df = pd.DataFrame(records, columns=output_columns)
    df_list.append(temp_df)

In [8]:
# How many posts failed while reading the body text, and how many while reading the URL.
tuple(len(failed_posts) for failed_posts in (error_list_text, error_list_links))

(0, 0)

In [9]:
# Stack the per-region frames into a single frame with a fresh 0..n-1 index,
# then show (rows, columns).
df = pd.concat(objs=df_list, ignore_index=True)
df.shape

(855, 8)

In [10]:
# Today's date (YYYY-MM-DD) labels the output, so tutoring prices can be
# tracked over time across scrapes.
date_of_html_request = dt.date.today().isoformat()
df['posts_scraped_on'] = date_of_html_request

# Count how many posts repeat the text of an earlier post.
is_duplicate_text = df['post_text'].duplicated()
is_duplicate_text.value_counts()

True     495
False    360
Name: post_text, dtype: int64

### Dropping Duplicates or posts that contained no prices

In [11]:
# Row labels of every post whose text repeats an earlier post; drop those rows
# and renumber the remainder from zero.
duplicate_indices = df.index[df['post_text'].duplicated()]
df_no_dups = df.drop(index=duplicate_indices).reset_index(drop=True)

In [12]:
# Keep only posts whose price is not the 'None' placeholder, renumbered from zero.
has_price = df_no_dups['price'] != 'None'
df_with_prices = df_no_dups.loc[has_price].reset_index(drop=True)

In [13]:
# Summarize how much data survives de-duplication and the price filter.
post_with_prices_count = len(df_with_prices)
num_posts = len(df_no_dups)

# Guard against an empty scrape so the summary does not raise ZeroDivisionError.
percent_with_prices = post_with_prices_count / num_posts * 100 if num_posts else 0.0

print(F"There were {post_with_prices_count} posts that had prices included and weren't duplicates.")
# The percentage is relative to the de-duplicated posts, not to every scraped post.
print(F"Only {percent_with_prices:.2f}% of the unique posts that we scraped remain.")

There were 138 posts that had prices included and weren't duplicates.
Only 38.33% of the posts that we scraped remain.


### Extracting complete.

# Transforming Craigslist data

### Are there any posts that might need manual cleaning?  This would include:
* Posts that had 3 or more prices
* Posts that were marked as being null during pre-processing

In [14]:
# Flag posts whose price is still a list of three or more values rather than a
# single number; these are investigated manually below.
df_with_prices['prices_need_cleaning'] = [
    isinstance(price, list) and len(price) >= 3
    for price in df_with_prices['price']
]

## Investigating posts that had three or more prices listed.

In [15]:
# Prices of the flagged entries (three or more prices listed); let's investigate why.
df_with_prices.loc[df_with_prices['prices_need_cleaning'] == True, 'price']

0                               [90, 60, 40]
11               [100, 115, 130, 65, 30, 60]
13                      [40, 40, 40, 40, 40]
14                              [30, 35, 45]
17                              [90, 60, 40]
20                      [40, 40, 40, 40, 40]
22                          [45, 55, 40, 50]
28                  [35, 35, 40, 40, 55, 80]
31                          [40, 40, 45, 45]
35                              [40, 40, 40]
42               [100, 115, 130, 65, 30, 60]
50                      [40, 40, 40, 40, 40]
52                          [30, 30, 60, 90]
53     [40, 80, 40, 10, 40, 30, 40, 80, 100]
56                          [25, 30, 50, 50]
59                          [40, 40, 40, 40]
68              [40, 30, 40, 30, 40, 40, 40]
83                            [50, 100, 135]
84              [40, 30, 40, 30, 40, 40, 40]
86                              [65, 55, 55]
90                          [20, 25, 30, 30]
93                              [25, 35, 20]
94        

In [16]:
# Show the full, untruncated link of one post at a time so the ad can be opened
# and its pricing judged by hand.
with pd.option_context('display.max_colwidth', None):
    post_to_inspect = df_with_prices.iloc[1]
    display(post_to_inspect['link'])

'https://sfbay.craigslist.org/eby/lss/d/oakland-3rd-year-medical-student-as/7426289294.html'

### Cleaning posts with three or more prices manually.

In [17]:
# Integer positions of the two columns that the manual-cleaning cells update.
price_col_idx, need_clean_col_idx = (
    df_with_prices.columns.get_loc(column_name)
    for column_name in ('price', 'prices_need_cleaning')
)

#### Dropping duplicates that weren't spotted earlier

These posts come up multiple times and are duplicates, but the text of each posting is changed just slightly, so Pandas is unable to detect the duplicate postings on its own, and we have to find and drop them manually.

In [18]:
# Find every copy of the kenaritutor.com ad, set the first copy to the price
# that makes the most sense, then drop all remaining copies.
# regex=False matches the text literally ('.' would otherwise match any character).
kenari_matches = df_with_prices['post_text'].str.contains('kenaritutor.com', regex=False, na=False)
kenari_tutor_idx = df_with_prices.index[kenari_matches]

if len(kenari_tutor_idx) > 0:
    # The ad says $90 in person, $60 for online, and Corona Virus pricing of
    # $40 for online weekdays; $40 per hour is used because it seems the most
    # reasonable.  prices_need_cleaning becomes False b/c the price is now clean.
    df_with_prices.loc[kenari_tutor_idx[0], ['price', 'prices_need_cleaning']] = 40, False
else:
    print('No post matched kenari_tutor_idx; nothing was updated.')

# Drop the remaining copies and reset indices.
df_with_prices = df_with_prices.drop(index=kenari_tutor_idx[1:]).reset_index(drop=True)

In [19]:
# Find every copy of the ad that lists this phone number.
park_academy_matches = df_with_prices['post_text'].str.contains('(949) 490-0872', regex=False, na=False)
park_academy_idx = df_with_prices.index[park_academy_matches]

if len(park_academy_idx) > 0:
    # The ad says $55/hr for K-12, then $65/hr for AP/Honors, as well as
    # Pre-calc, etc., so the two prices are averaged.  prices_need_cleaning
    # becomes False b/c the price is now clean.
    df_with_prices.loc[park_academy_idx[0], ['price', 'prices_need_cleaning']] = 60, False
else:
    print("No post matched park_academy_idx; nothing was updated.")

# Drop the remaining copies and reset indices.
df_with_prices = df_with_prices.drop(index=park_academy_idx[1:]).reset_index(drop=True)

In [20]:
# Find every copy of the ad that links to this YouTube channel, correct the
# price on the first copy, and drop the rest.
# regex=False matches the URL literally ('.' would otherwise match any character).
star_star_matches = df_with_prices['post_text'].str.contains(
    'https://www.youtube.com/channel/UCqhFZRmUqOAAPMQpo58TV7g', regex=False, na=False)
star_star_college_math_tutor_idx = df_with_prices.index[star_star_matches]

if len(star_star_college_math_tutor_idx) > 0:
    # The ad repeats the price of $40 over and over, so it is replaced with a
    # single instance.  prices_need_cleaning becomes False b/c the price is now clean.
    df_with_prices.loc[star_star_college_math_tutor_idx[0],
                       ['price', 'prices_need_cleaning']] = 40, False
else:
    print("No post matched star_star_college_math_tutor_idx; nothing was updated.")

# Drop the remaining copies and reset indices.
df_with_prices = (df_with_prices
                  .drop(index=star_star_college_math_tutor_idx[1:])
                  .reset_index(drop=True))

In [21]:
# Find every copy of the ad containing 'CSUSM: 342', correct the price on the
# first copy, and drop the rest.
poway_matches = df_with_prices['post_text'].str.contains('CSUSM: 342', regex=False, na=False)
poway_tutor_idx = df_with_prices.index[poway_matches]

if len(poway_tutor_idx) > 0:
    # This ad says $30 for one hour.
    df_with_prices.loc[poway_tutor_idx[0], ['price', 'prices_need_cleaning']] = 30, False
else:
    print("No post matched poway_tutor_idx; nothing was updated.")

# Drop the remaining copies and reset indices.
df_with_prices = df_with_prices.drop(index=poway_tutor_idx[1:]).reset_index(drop=True)

#### Distilling long lists of prices down to one price

Next, we distill posts that had more complicated text that involved three or more prices, such as :

* \$40/hr, \$50/1.5hr, \$60/2hr
  * Complicated pricing schedule
* \$40/hr but \$10 additional per person, if a group session is desired
  * Group rates
* \$30/hr Science, \$40/hr math, come and try a first session for the reduced price of \$20.
  * Special offers

into a single price.  Other posts repeated their prices multiple times, so we distill those down to a single price as well, then mark any of the entries we changed as being cleaned.

In [22]:
# Each ad below is located by a literal snippet of its text (regex=False), given
# one hourly price, and marked as cleaned.

# This ad mentions several prices for different subjects, but explicitly says $30 for math.
la_honda_matches = df_with_prices['post_text'].str.contains('909-640-3570', regex=False, na=False)
la_honda_idx = df_with_prices.index[la_honda_matches]

if len(la_honda_idx) > 0:
    df_with_prices.loc[la_honda_idx, ['price', 'prices_need_cleaning']] = 30, False
else:
    print("No post matched la_honda_idx; nothing was updated.")


# This ad mentions $45 for lower division college courses, which are a large
# segment of the subjects I help with, so I'm using that price to compare
# myself against.
ucb_matches = df_with_prices['post_text'].str.contains(
    'Former UC-Berkeley economics Ph.D. student and TA', regex=False, na=False)
ucb_phd_student_and_ta_idx = df_with_prices.index[ucb_matches]

if len(ucb_phd_student_and_ta_idx) > 0:
    df_with_prices.loc[ucb_phd_student_and_ta_idx, ['price', 'prices_need_cleaning']] = 45, False
else:
    print("No post matched ucb_phd_student_and_ta_idx; nothing was updated.")


# Says $40 for in person, or $45 for at home, so I took the average.
san_mateo_matches = df_with_prices['post_text'].str.contains(
    'I mainly tutor, in person, at the Downtown Redwood City, downtown San Mateo',
    regex=False, na=False)
san_mateo_tutor_idx = df_with_prices.index[san_mateo_matches]

if len(san_mateo_tutor_idx) > 0:
    df_with_prices.loc[san_mateo_tutor_idx, ['price', 'prices_need_cleaning']] = 42.5, False
else:
    print("No post matched san_mateo_tutor_idx; nothing was updated.")

In [23]:
# This ad has unusual price structuring: $100 for 80 minutes, $115 for 100
# minutes, $130 for 120 minutes.  Each is converted to a per-minute rate, the
# three are averaged, and the result is scaled to estimate one hour.
oakland_matches = df_with_prices['post_text'].str.contains(
    'I received a full scholarship to University of Cincinnati and held a 3.8 GPA through my master’s program in aerospace',
    regex=False, na=False)
oakland_exp_tutor_online_idx = df_with_prices.index[oakland_matches]

oakland_tutor_avg_rate = ((100/80) + (115/100) + (130/120)) * 60 / 3

if len(oakland_exp_tutor_online_idx) > 0:
    df_with_prices.loc[oakland_exp_tutor_online_idx,
                       ['price', 'prices_need_cleaning']] = oakland_tutor_avg_rate, False
else:
    print("No post matched oakland_exp_tutor_online_idx; nothing was updated.")


# This ad explicitly says $57 per hour.
blake_matches = df_with_prices['post_text'].str.contains(
    'BlakeTutoring.com', case=False, regex=False, na=False)
blake_tutoring_indices = df_with_prices.index[blake_matches]

if len(blake_tutoring_indices) > 0:
    # Clear the cleaning flag as well, like every other manually priced ad.
    df_with_prices.loc[blake_tutoring_indices, ['price', 'prices_need_cleaning']] = 57, False
else:
    print("No post matched blake_tutoring_indices; nothing was updated.")


# Charges $50 per hour for sessions under 3 hours.
spss_matches = df_with_prices['post_text'].str.contains(
    'Worked for 2 companies named', case=False, regex=False, na=False)
spss_tutor_idx = df_with_prices.index[spss_matches]

if len(spss_tutor_idx) > 0:
    df_with_prices.loc[spss_tutor_idx, ['price', 'prices_need_cleaning']] = 50, False
else:
    print("No post matched spss_tutor_idx; nothing was updated.")

In [24]:
# Says $25/hr for high school, $30/hr for college; went with $30/hr.
sharp_mind_matches = df_with_prices['post_text'].str.contains('(650) 398-9490', regex=False, na=False)
sharp_mind_idx = df_with_prices.index[sharp_mind_matches]

if len(sharp_mind_idx) > 0:
    df_with_prices.loc[sharp_mind_idx, ['price', 'prices_need_cleaning']] = 30, False
else:
    print("No post matched sharp_mind_idx; nothing was updated.")


# Says $50/hr.
trevor_skelly_matches = df_with_prices['post_text'].str.contains('trevorskelly', regex=False, na=False)
trevor_skelly_idx = df_with_prices.index[trevor_skelly_matches]

if len(trevor_skelly_idx) > 0:
    df_with_prices.loc[trevor_skelly_idx, ['price', 'prices_need_cleaning']] = 50, False
else:
    print("No post matched trevor_skelly_idx; nothing was updated.")

In [25]:
# $25/hr if meeting near CSU Sac, $35/hr if they drive to you, $20/hr for online.
# I chose $30/hr to split the difference between the in person prices.
# regex=False matches the text literally ('.' would otherwise match any character).
best_math_matches = df_with_prices['post_text'].str.contains('bestmathtutoring.com', regex=False, na=False)
best_math_idx = df_with_prices.index[best_math_matches]

if len(best_math_idx) > 0:
    df_with_prices.loc[best_math_idx, ['price', 'prices_need_cleaning']] = 30, False
else:
    print("No post matched best_math_idx; nothing was updated.")


# Says $60 per hour.
glasses_lady_matches = df_with_prices['post_text'].str.contains(
    "offering virtual one-on-one Math tutoring via Zoom", regex=False, na=False)
glasses_lady_idx = df_with_prices.index[glasses_lady_matches]

if len(glasses_lady_idx) > 0:
    df_with_prices.loc[glasses_lady_idx, ['price', 'prices_need_cleaning']] = 60, False
else:
    print("No post matched glasses_lady_idx; nothing was updated.")


# NOTE(review): the reasoning behind $35 for this ad is not recorded here.
ucla_grad_henry_matches = df_with_prices['post_text'].str.contains("916 390-7923", regex=False, na=False)
ucla_grad_henry_idx = df_with_prices.index[ucla_grad_henry_matches]

if len(ucla_grad_henry_idx) > 0:
    df_with_prices.loc[ucla_grad_henry_idx, ['price', 'prices_need_cleaning']] = 35, False
else:
    print("No post matched ucla_grad_henry_idx; nothing was updated.")

In [26]:
# Set the ad signed 'Peter D.' to $40/hr and mark it as cleaned.
# NOTE(review): the reasoning behind $40 for this ad is not recorded here.
# regex=False matches the trailing '.' literally instead of as "any character".
peter_d_matches = df_with_prices['post_text'].str.contains('Peter D.', regex=False, na=False)
peter_d_idx = df_with_prices.index[peter_d_matches]

if len(peter_d_idx) > 0:
    df_with_prices.loc[peter_d_idx, ['price', 'prices_need_cleaning']] = 40, False
else:
    print("No post matched peter_d_idx; nothing was updated.")

In [27]:
# $20/hr online, $30/hr in person; split the difference at $25.
austin_sabrina_matches = df_with_prices['post_text'].str.contains(
    'My girlfriend Sabrina', regex=False, na=False)
austin_sabrina_idx = df_with_prices.index[austin_sabrina_matches]

if len(austin_sabrina_idx) > 0:
    df_with_prices.loc[austin_sabrina_idx, ['price', 'prices_need_cleaning']] = 25, False
else:
    print("No post matched austin_sabrina_idx; nothing was updated.")

In [28]:
# Set the ad mentioning 'Alexander Farrell' to $25/hr and mark it as cleaned.
# NOTE(review): the reasoning behind $25 for this ad is not recorded here.
alex_farrell_matches = df_with_prices['post_text'].str.contains(
    'Alexander Farrell', regex=False, na=False)
alex_farrell_idx = df_with_prices.index[alex_farrell_matches]

if len(alex_farrell_idx) > 0:
    df_with_prices.loc[alex_farrell_idx, ['price', 'prices_need_cleaning']] = 25, False
else:
    print("No post matched alex_farrell_idx; nothing was updated.")

In [29]:
# Post says $30/hr for Precalc/Trig and $50/hr for Calculus, so I took the average.
lonzo_tutoring_matches = df_with_prices['post_text'].str.contains(
    '951-795-5027', regex=False, na=False)
lonzo_tutoring_idx = df_with_prices.index[lonzo_tutoring_matches]

if len(lonzo_tutoring_idx) > 0:
    df_with_prices.loc[lonzo_tutoring_idx, ['price', 'prices_need_cleaning']] = 40, False
else:
    print("No post matched lonzo_tutoring_idx; nothing was updated.")

In [64]:
# Post includes many prices, but states $55/hr for Precalc and $80/hr for
# Calculus, so I took the average of those prices.
aerospace_engineer_matches = df_with_prices['post_text'].str.contains(
    'undergraduate students at UC San Diego', regex=False, na=False)
aerospace_engineer_idx = df_with_prices.index[aerospace_engineer_matches]

if len(aerospace_engineer_idx) > 0:
    df_with_prices.loc[aerospace_engineer_idx,
                       ['price', 'prices_need_cleaning']] = (55 + 80)/2, False
else:
    print("No post matched aerospace_engineer_idx; nothing was updated.")

## Checking results

#### Are there any posts that were marked as needing to be cleaned that we missed?

In [37]:
# Count the posts that are still flagged as needing their prices cleaned.
still_flagged = df_with_prices['prices_need_cleaning'] == True
num_still_as_list = int(still_flagged.sum())

if num_still_as_list:
    print(F"There are {num_still_as_list} posts that still have multiple prices needing cleaning.")
else:
    print("There are no posts that had multiple prices still needing cleaning.")

There are no posts that had multiple prices still needing cleaning.


In [38]:
# Prices of the posts with three or more prices that still need cleaning.
df_with_prices.loc[df_with_prices['prices_need_cleaning'] == True, 'price']

Series([], Name: price, dtype: object)

#### Are there any posts with a price that was marked as being null during pre-processing?

In [39]:
# Count the posts whose price is null.
num_null_prices = int(df_with_prices['price'].isna().sum())

if num_null_prices:
    print(F"There are {num_null_prices} posts that have null prices.")
else:
    print("There are no posts that have null prices.")

There are no posts that have null prices.


In [40]:
# Posts whose price is NaN because it could not be parsed during pre-processing.
df_with_prices.loc[df_with_prices['price'].isna()]

Unnamed: 0,date_posted,link,price,city,subregion,region,post_text,price_list,posts_scraped_on,prices_need_cleaning


## Investigating posts with extreme prices.  Are there any price outliers that we need to clean?

Prices >= 100 or <= 20 are what I would consider to be extreme prices.  Let's flag those posts, then use the flag to locate and investigate them.

In [41]:
# Flag the posts whose price I would consider unusual: at or above 100, or at or below 20.
# The comparison is done on the whole column at once rather than element by element.
df_with_prices['price_to_investigate'] = df_with_prices['price'].ge(100) | df_with_prices['price'].le(20)

In [42]:
# Show only the columns needed to judge each flagged post.
df_with_prices.loc[df_with_prices['price_to_investigate']==True, ['price', 'post_text', 'price_list']]

Unnamed: 0,price,post_text,price_list
9,20.0,"\n\n\n\n\n""Hey there!\n\nMy name is Angel and ...",[$20]
15,20.0,\n\n\n\n\nText 2133408660 or register at peerl...,[$20]
27,200.0,\n\n\n\n\nHello! My name is Connor and I've be...,[$200]
32,20000.0,\n\n\n\n\nGMAT/GREI'm a full-time GMAT/GRE ins...,"[$20,000]"
33,19.0,\n\n\n\n\nHi! \n\nI am a certified teacher wit...,[$19]
44,15.0,\n\n\n\n\njargon free math tutor $15 all level...,[$15]
50,20.0,\n\n\n\n\nLocated in NYC. I graduated with a b...,[$20]
51,20.0,\n\n\n\n\nText 2133408660 or register at peerl...,[$20]
53,20.0,\n\n\n\n\nText 2133408660 or register at peerl...,[$20]
57,20.0,\n\n\n\n\nText 2133408660 or register at peerl...,[$20]


In [43]:
# Position of the post to read in full; change x to inspect a different row.
x=26
post_to_inspect = df_with_prices.iloc[x]

# Lift the column-width limit so the whole post text and link are shown untruncated.
with pd.option_context('display.max_colwidth', None):
    display(post_to_inspect['post_text'])
    display(post_to_inspect['link'])

'\n\n\n\n\nHello, \n\nI am an engineering working professional in the aerospace industry looking to offer math tutoring services to middle school, high school, and college students. I used to tutor undergraduate students at UC San Diego for over four years and want to continue to work with students to improve their learning outcomes and academic achievements. Please see below for more information on sessions and rates. \n\nPre-Algebra: $35/hr\nAlgebra I: $35/hr\nAlgebra II: $40/hr\nGeometry: $40/hr\nPre-Calculus: $55/hr\nCalculus I and II: $80/hr\n\nSessions will be conducted on Zoom. \n\nMy availability:\nMonday - Thursday 4pm - 9pm (PT)\nI am flexible with my schedule so please let me know what works best for the student. \n\nThanks!\n\n\n\n'

'https://sfbay.craigslist.org/sfc/lss/d/san-francisco-math-tutoring-available/7426882297.html'

### Dropping posts with extreme prices that are duplicates or aren't relevant

In [44]:
# This person mentions no prices; the only $ amount specified is how much they earned in scholarships, so we drop all instances
at_geemale_idx = df_with_prices.loc[df_with_prices['post_text'].str.contains("at geemale", regex=False, na=False)].index

# Remove the matching rows and renumber the index in one chained step.
df_with_prices = df_with_prices.drop(index=at_geemale_idx).reset_index(drop=True)

In [45]:
# Locate every copy of this ad; the first occurrence is kept and all later duplicates are dropped
ansari_tutor_idx = df_with_prices.loc[df_with_prices['post_text'].str.contains("peerlinc.com", regex=False, na=False)].index

# Slicing off position 0 keeps the first match; the index is renumbered afterwards.
df_with_prices = df_with_prices.drop(index=ansari_tutor_idx[1:]).reset_index(drop=True)

In [46]:
# Locate every copy of this ad; the first occurrence is kept and all later duplicates are dropped
ridgewood_nyc_tutor_idx = df_with_prices.loc[df_with_prices['post_text'].str.contains("(646) 326-2191", regex=False, na=False)].index

# Slicing off position 0 keeps the first match; the index is renumbered afterwards.
df_with_prices = df_with_prices.drop(index=ridgewood_nyc_tutor_idx[1:]).reset_index(drop=True)

In [47]:
# The only rate in this tutor's ad is per student for an hour-long group session, so every instance is dropped
why_exceptional_tutor_idx = df_with_prices.loc[df_with_prices['post_text'].str.contains("countless time-saving, test-taking strategies", regex=False, na=False)].index

# Remove the matching rows and renumber the index in one chained step.
df_with_prices = df_with_prices.drop(index=why_exceptional_tutor_idx).reset_index(drop=True)

In [48]:
# SAT-prep-only ad, which isn't really what I'm competing against, so every instance is dropped
study_house_idx = df_with_prices.loc[df_with_prices['post_text'].str.contains("STUDY HOUSE LLC", regex=False, na=False)].index

# Remove the matching rows and renumber the index in one chained step.
df_with_prices = df_with_prices.drop(index=study_house_idx).reset_index(drop=True)

In [49]:
# Poker tutoring/coaching ad, which isn't really what I'm competing against, so every instance is dropped.
# He also mentions math tutoring here, but his separate math tutoring post (already captured) carries that pricing information.
australia_daniel_idx = df_with_prices.loc[df_with_prices['post_text'].str.contains("I'm available as a dealer if you need one", regex=False, na=False)].index

# Remove the matching rows and renumber the index in one chained step.
df_with_prices = df_with_prices.drop(index=australia_daniel_idx).reset_index(drop=True)

In [50]:
# MCAT tutoring ad, which isn't really what I'm competing against, so every instance is dropped.
connor_MCAT_idx = df_with_prices.loc[df_with_prices['post_text'].str.contains("My name is Connor and I've been teaching the MCAT since 2016", regex=False, na=False)].index

# Remove the matching rows and renumber the index in one chained step.
df_with_prices = df_with_prices.drop(index=connor_MCAT_idx).reset_index(drop=True)

In [51]:
# No price is mentioned anywhere in this ad, so every instance is dropped.
john_baptist_nguyen_idx = df_with_prices.loc[df_with_prices['post_text'].str.contains("john-baptist-nguyen", regex=False, na=False)].index

# Remove the matching rows and renumber the index in one chained step.
df_with_prices = df_with_prices.drop(index=john_baptist_nguyen_idx).reset_index(drop=True)

### Correct pricing information for posts with extreme prices

In [52]:
# Look up the position of the 'price_to_investigate' column so the iloc-based corrections
# below can clear that flag in the same assignment that fixes the price.
price_to_investigate_col_idx = df_with_prices.columns.get_loc('price_to_investigate')

In [53]:
# This ad says $25/hr but then mentions a prepay plan for $225.  Since these are the only two prices in the post, our code averages them, so we set the correct price to $25
james_edward_nassir_idx = df_with_prices[df_with_prices['post_text'].str.contains("James Edward Nassir, EE, Educator, and Discoverer of the 5th Force", regex=False, na=False)].index

# Best-effort manual correction: set the price to 25 and clear the price_to_investigate
# flag for the matching rows (price_col_idx is defined in an earlier cell).
# NOTE(review): iloc is handed index *labels* here, which is only correct while the
# index is the default 0..n-1 range produced by the preceding reset_index calls.
try:
    df_with_prices.iloc[james_edward_nassir_idx, [price_col_idx, price_to_investigate_col_idx]] = 25, False

# Catch Exception instead of using a bare except so KeyboardInterrupt/SystemExit still
# propagate, and report the underlying error rather than hiding it.
except Exception as err:
    print("Issue with james_edward_nassir_idx and iloc.")
    print(F"{type(err).__name__}: {err}")

In [54]:
# This ad says $50/hr but then mentions a prepay plan for $160 for 4 hours.  Since these are the only two prices in the post, our code averages them, so we set the correct price to $50
google_maps_idx = df_with_prices[df_with_prices['post_text'].str.contains("willing to travel if Google Maps", regex=False, na=False)].index

# Best-effort manual correction: set the price to 50 and clear the price_to_investigate
# flag for the matching rows (price_col_idx is defined in an earlier cell).
# NOTE(review): iloc is handed index *labels* here, which is only correct while the
# index is the default 0..n-1 range produced by the preceding reset_index calls.
try:
    df_with_prices.iloc[google_maps_idx, [price_col_idx, price_to_investigate_col_idx]] = 50, False

# Catch Exception instead of using a bare except so KeyboardInterrupt/SystemExit still
# propagate, and report the underlying error rather than hiding it.
except Exception as err:
    print("Issue with google_maps_idx and iloc.")
    print(F"{type(err).__name__}: {err}")

In [55]:
# This ad says $84/hr but then mentions a rate of $125 for 1.5 hours.  Since these are the only two prices in the post, our code averages them, so we set the correct price to $84
rescue_animals_idx = df_with_prices[df_with_prices['post_text'].str.contains("TestTrainerinc", regex=False, na=False)].index

# Best-effort manual correction: set the price to 84 and clear the price_to_investigate
# flag for the matching rows (price_col_idx is defined in an earlier cell).
# NOTE(review): iloc is handed index *labels* here, which is only correct while the
# index is the default 0..n-1 range produced by the preceding reset_index calls.
try:
    df_with_prices.iloc[rescue_animals_idx, [price_col_idx, price_to_investigate_col_idx]] = 84, False

# Catch Exception instead of using a bare except so KeyboardInterrupt/SystemExit still
# propagate, and report the underlying error rather than hiding it.
except Exception as err:
    print("Issue with rescue_animals_idx and iloc.")
    print(F"{type(err).__name__}: {err}")

In [56]:
# This ad says $45/hr for high school or college, but then mentions $35 for middle school.  Since these are the only two prices in the post, our code averages them, so we set the correct price to $45, since I primarily tutor high school or college students.
rancho_penasquitos_idx = df_with_prices[df_with_prices['post_text'].str.contains("Rancho Penasquitos (Park Village Neighborhood)", regex=False, na=False)].index

# Best-effort manual correction: set the price to 45 and clear the price_to_investigate
# flag for the matching rows (price_col_idx is defined in an earlier cell).
# NOTE(review): iloc is handed index *labels* here, which is only correct while the
# index is the default 0..n-1 range produced by the preceding reset_index calls.
try:
    df_with_prices.iloc[rancho_penasquitos_idx, [price_col_idx, price_to_investigate_col_idx]] = 45, False

# Catch Exception instead of using a bare except so KeyboardInterrupt/SystemExit still
# propagate, and report the underlying error rather than hiding it.
except Exception as err:
    print("Issue with rancho_penasquitos_idx and iloc.")
    print(F"{type(err).__name__}: {err}")

## Checking results

#### Are there any posts with extreme prices that we marked which still need investigation?

In [57]:
# Posts still flagged after the corrections; their prices all agree with the posting text, so no further cleaning is needed.
df_with_prices.loc[df_with_prices['price_to_investigate']==True, ['price', 'post_text', 'price_list']]

Unnamed: 0,price,post_text,price_list
9,20.0,"\n\n\n\n\n""Hey there!\n\nMy name is Angel and ...",[$20]
30,19.0,\n\n\n\n\nHi! \n\nI am a certified teacher wit...,[$19]
41,15.0,\n\n\n\n\njargon free math tutor $15 all level...,[$15]
47,20.0,\n\n\n\n\nLocated in NYC. I graduated with a b...,[$20]
63,120.0,"\n\n\n\n\nG'day! My name's Daniel, and I'm a f...",[$120]
74,20.0,"\n\n\n\n\nHi everyone, do you need math, physi...","[$15, $25]"
78,100.0,\n\n\n\n\nTenured math professor at a major un...,[$100]
106,20.0,"\n\n\n\n\nYes, you read right! I--Mr. C--am of...","[$20, $20]"


In [58]:
# For any price that still looks off, open the post's link to decide how to handle its price information.
# The column-width limit is lifted so the URL is not truncated.
with pd.option_context('display.max_colwidth', None):
    display(df_with_prices['link'].iloc[9])

'https://sfbay.craigslist.org/sby/lss/d/palo-alto-free-hour-experienced-tutor/7423496674.html'

In [59]:
# Check the size of the results as (rows, columns) now that transforming is complete
df_with_prices.shape

(108, 11)

#### Everything looks good.  Transforming complete.

# Saving results

### Store results locally as CSV files

In [60]:
# Drop unnecessary columns.  CL links will expire after some number of days, the prices_need_cleaning and price_to_investigate columns have been manually inspected, and lastly we've distilled the multiple prices in the price_list down to a single value
df_for_sql = df_with_prices.drop(columns=['prices_need_cleaning', 'link', 'price_list', 'price_to_investigate'])

# The CSV is later loaded with psycopg2's copy_from, i.e. PostgreSQL's COPY text format, in which
# the backslash is the escape character.  Backslashes, carriage returns and newlines inside a
# post therefore have to be written as \\, \r and \n.  Backslashes are escaped first so the
# escapes added for \r and \n are not doubled.  regex=False makes every replacement a literal
# one regardless of the pandas version's default.
# NOTE(review): to_csv's default minimal quoting still wraps a field containing a double
# quote in quotes, and COPY text format does not strip them -- confirm whether that matters.
df_for_sql['post_text'] = (
    df_for_sql['post_text']
    .str.replace('\\', '\\\\', regex=False)
    .str.replace('\r', '\\r', regex=False)
    .str.replace('\n', '\\n', regex=False)
)

# Store cleaned data as CSV file in preparation for importing to SQL database
df_for_sql.to_csv("./{}_all_regions_with_prices.csv".format(date_of_html_request), index=False, sep=';')

# Store original data, before we applied any cleaning to it, in case it's needed for something later on.
df.to_csv("./{}_all_regions_posts.csv".format(date_of_html_request), index=False)

### Importing into PostgreSQL database

In [61]:
# DDL for the table that stores the scraped and cleaned posts; IF NOT EXISTS makes re-running this cell harmless.
create_table_sql = """    
    CREATE TABLE IF NOT EXISTS cl_tutoring2(
    id SERIAL primary key,
    date_scraped date,
    price decimal,
    city text,
    subregion text,
    region text,
    post_text text,
    date_posted timestamp
);
"""

# Open the connection to the local PostgreSQL database and get a cursor to run statements with
conn = psycopg2.connect("host=localhost dbname=rancher user=rancher")
cur = conn.cursor()

# Create the table, then commit so the change is persisted
cur.execute(create_table_sql)
conn.commit()

In [62]:
# Instantiate a new cursor object
cur = conn.cursor()

# Copy data from our CSV file into database.
### Note, we can use the ; separator freely because we replaced all instances of semicolons in a post to commas during the preprocessing stage, ensuring that psycopg2 won't misinterpret a semicolon in the body of a post as a separator, splitting a row in the CSV file into too many columns as a result.
### Also, we must specify null="" because Python represents null values as an empty string when writing to a CSV file and psycopg2 needs to know how null values are represented in the CSV file in order to properly insert null values into the database
### The file is opened as UTF-8 explicitly because that is what pandas' to_csv writes by default; relying on the platform default encoding can garble non-ASCII characters.
try:
    with open(str(date_of_html_request) + '_all_regions_with_prices.csv', 'r', encoding='utf-8') as file:
        next(file) # Skip the header row
        cur.copy_from(file, 'cl_tutoring2', sep=';', null="", columns=('date_posted', 'price', 'city', 'subregion', 'region', 'post_text', 'date_scraped'))

    # Commit changes to database
    conn.commit()

# If anything fails, roll back so the connection is not left in an aborted transaction
# (which would make every later statement fail), then re-raise so the error is still visible.
except Exception:
    conn.rollback()
    raise

# Scratch work