In [1]:
from requests import get
import requests
from requests.packages.urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
import random
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np
import datetime as dt
import csv 
import psycopg2
import time
from sklearn.feature_extraction.text import TfidfVectorizer




In [2]:
# I picked the 10 largest metropolitan areas by population to scrape data from, as well as Sacramento, since it's nearby to me and is another major city
regions_to_scrape = ['sf_bay_area',
                    'new_york',
                    'los_angeles',
                    'sacramento',
                    'chicago',
                    'san_diego',
                    'houston',
                    'phoenix',
                    'philadelphia',
                    'dallas',
                    'san_antonio']

num_regions = len(regions_to_scrape)

# *Extract* Craigslist Data

In [3]:
# Create a Session and Retry object to manage the quota Craigslist imposes on HTTP get requests within a certain time period 
session = requests.Session()
retry = Retry(connect=3, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)
session.mount('https://', adapter)

In [4]:
# Walk through each region in our list of regions_to_scrape to get the HTML page corresponding to a search for "math tutor" in the services section

response_dict = {}
sleep_timer = 10

for count, region in enumerate(regions_to_scrape):
    # This gets the first page of search results
    i=1
    current_region = region.replace('_', '')
    current_response = session.get('https://' + current_region + '.craigslist.org/d/services/search/bbb?query=math%20tutor&sort=rel')
    print(F"{region} {i} response received.")
    print(F"Waiting {sleep_timer} seconds...")
    print()
    
    time.sleep(sleep_timer)
    
    region_response_list = []
    region_response_list.append(current_response)

    # This gets all subsequent pages, using the next button
    is_next_button = True
    while is_next_button:
        i+=1
        try:
            next_response = current_response
            next_soup = BeautifulSoup(next_response.text, 'html.parser')
            html_suffix = next_soup.find(class_='button next').get('href')
            if html_suffix != '':
                new_button = 'https://' + current_region + '.craigslist.org' + html_suffix
                current_response = session.get(new_button)
                region_response_list.append(current_response)
                
                
                time.sleep(sleep_timer)
                print(F"{region} {i} response received.")
                print(F"Waiting {sleep_timer} seconds...")
                print()
            else:
                is_next_button = False
                print(F"Last response for {region} received.  Process completed.")
        except:
            pass
    
    # Store all search pages for math tutor
    response_dict[region] = region_response_list

sf_bay_area 1 response received.
Waiting 10 seconds...

sf_bay_area 2 response received.
Waiting 10 seconds...

sf_bay_area 3 response received.
Waiting 10 seconds...

Last response for sf_bay_area received.  Process completed.
new_york 1 response received.
Waiting 10 seconds...

new_york 2 response received.
Waiting 10 seconds...

new_york 3 response received.
Waiting 10 seconds...

Last response for new_york received.  Process completed.
los_angeles 1 response received.
Waiting 10 seconds...

los_angeles 2 response received.
Waiting 10 seconds...

Last response for los_angeles received.  Process completed.
sacramento 1 response received.
Waiting 10 seconds...

Last response for sacramento received.  Process completed.
chicago 1 response received.
Waiting 10 seconds...

Last response for chicago received.  Process completed.
san_diego 1 response received.
Waiting 10 seconds...

Last response for san_diego received.  Process completed.
houston 1 response received.
Waiting 10 seconds...

In [5]:
# Walk through each region to get a list of all individual postings for math tutoring 
# in the results page we searched up earlier.
posts_dict = {}
for region, responses in response_dict.items():
    #current_region = region
    region_posts = []
    for response in responses:
        current_html_soup = BeautifulSoup(response.text, 'html.parser')
        current_posts = current_html_soup.find_all('li', class_='result-row')
        region_posts.extend(current_posts)
    posts_dict[region] = region_posts

In [6]:
soup_objects_dict = {}

current_time = dt.datetime.now()
num_seconds = num_regions * 120 * 10
max_finish_time = current_time + dt.timedelta(seconds=num_seconds)

print(F"Current time is {current_time.strftime('%H:%M:%S')}")
print(F"Process will finish by {max_finish_time.strftime('%H:%M:%S')}")
print()

for count, region in enumerate(posts_dict, start=1):
    # Walk through each region and create a list of soup_objects to scrape from by 
    # storing them into memory.  This way we only have to send these get requests 
    # once and Craigslist doesn't ban us for sending the same https requests over 
    # and over
    soup_objects_list = []
    #link_list = []
    for i, post in enumerate(posts_dict[region]):
        # Impose a timer so that we send each get request between 5 and 10 seconds.
        # This is again to help prevent from getting banned for too many HTTP 
        # requests.
        random_int = random.randint(5,10)
        time.sleep(random_int)
        current_link = post.a.get('href')
        #link_list.append(current_link)
        response_object = session.get(current_link)
        soup_object = BeautifulSoup(response_object.text, 'html.parser')
        soup_objects_list.append(soup_object) 
        # Impose condition that every 10th post will trigger something printed
        # to the screen.  This part of the code is a long process and I wanted
        # something to help keep track of how much progress has been made
        if (i !=0) and ((i-1) % 10 == 9):
            print(F"Post number {i} in {region} is being extracted.")
    
    soup_objects_dict[region] = soup_objects_list
    if count != len(posts_dict):
        print()
        print(F"Soup objects for {region} acquired.  Waiting for next region...")
        print()
    else:
        print()
        print(F"Soup objects for {region} acquired.  Process complete.")

Current time is 23:15:05
Process will finish by 02:55:05

Post number 10 in sf_bay_area is being extracted.
Post number 20 in sf_bay_area is being extracted.
Post number 30 in sf_bay_area is being extracted.
Post number 40 in sf_bay_area is being extracted.
Post number 50 in sf_bay_area is being extracted.
Post number 60 in sf_bay_area is being extracted.
Post number 70 in sf_bay_area is being extracted.
Post number 80 in sf_bay_area is being extracted.
Post number 90 in sf_bay_area is being extracted.
Post number 100 in sf_bay_area is being extracted.
Post number 110 in sf_bay_area is being extracted.
Post number 120 in sf_bay_area is being extracted.
Post number 130 in sf_bay_area is being extracted.
Post number 140 in sf_bay_area is being extracted.
Post number 150 in sf_bay_area is being extracted.
Post number 160 in sf_bay_area is being extracted.
Post number 170 in sf_bay_area is being extracted.
Post number 180 in sf_bay_area is being extracted.
Post number 190 in sf_bay_area is

## Pre-processing Craigslist Data

In [85]:
df_list = []
error_list_text = []
error_list_links = []

# Walk through each region that contains a list of soup objects corresponding to the # search of services for math tutors.
for search_region in soup_objects_dict:
    # Initialize several lists to store relevant information for analysis
    price_list = []
    city_list = []
    datetime_list = []
    body_text_list = []
    subregion_list = []
    region_list = []
    link_list = []
    search_region_price_list = []
    
    # Walk through each soup object in the list corresponding to the search region 
    # and get the link of the soup object to scrape from.
    for soup in soup_objects_dict[search_region]:
        try:
            link = soup.find("meta", property="og:url")['content']
        except:
            # In case a link can't be found, we add the soup object to a list
            # to inspect later and set link to 'None', which we'll use to filter
            # these results out later
            link = 'None'
            error_list_links.append(soup)
            print("Couldn't get link")

        # Extract region of post from Craigslist
        post_region = soup.find_all('li',class_='crumb area')[0].find('a').get_text()
        post_region = post_region.replace(' ', '_')
        post_region = post_region.lower()
        
        # Get text of postingbody of the post and remove unwanted text.
        try:
            text = soup.find('section', id='postingbody').get_text()
            #text = text.replace('\n', '')
            text = text.replace(';', ',') # We do this so that we can use ; as 
                                          # a delimiter when copying data from a 
                                          # CSV file into a SQL database later.
            text = text.replace('QR Code Link to This Post', '') # We do this 
                                                                 # because this
                                                                 # text from one
                                                                 # post in
                                                                 # particular was                                                                      # giving me 
                                                                 # trouble and
                                                                 # the best way I 
                                                                 # could find to 
                                                                 # handle it was 
                                                                 # to remove the 
                                                                 # text.
            text = text.replace(u'\xa0', u' ')

        except:
            error_list_text.append(soup)
            text = 'None'
            #body_text_list.append(text)
            print("Couldn't get text")

        # Only let posts through that have a link to scrape from and those posts 
        # where the region of the post matches the region of the search.  Some CL 
        # search results are for neighboring areas, ones that come up in a different
        # region than the region your search was from, which leads to duplicates in 
        # areas like Los Angeles and San Diego.  This will weed out duplicates.
        if post_region == search_region and link!= 'None':
            region_list.append(post_region)
            link_list.append(link)
            body_text_list.append(text)

            # Use regular expressions to find all instances of prices in the text
            #old_prices = re.findall('(?:[\$]{1}[,\d]+.?\d*)', text)
            old_prices = re.findall('(?:[\$]{1}[,\d]+\d*)', text)
            # Alternative, if trying to capture decimals 
            # ^(?:\${1}\d+(?:,\d{3})*(?:\.{1}\d{2}){0,1})?$



            # Intialize empty list to store the new prices after processing old
            # prices.
            new_prices = []
            #print(F"Initialized new_prices: {new_prices}")
            # Walk through each price in the post.
            for price in old_prices:
                # Clean unwanted characters.
                price = price.replace('$', '')
                price = price.replace('/', '')
                price = price.replace('!', '')
                price = price.replace('h', '')
                price = price.replace('.', '')
                price = price.replace(')', '')
                price = price.replace(',', '')
                price = price.replace('>', '')
                price = price.rstrip()   
                # Some tutors give prices as a range ie '$30-40'.  In order to
                # work with this data, I split based on the hyphen, then I can 
                # use each price individually.
                split_prices = price.split('-')
            #print(F"Here are the old_prices: {old_prices}")
            #print(F"Here are the split_prices: {split_prices}")

                # Walk through each price in the posting, after any necessary splits 
                # have been made.
                for p in split_prices:
                    # Only proceed if the post contained prices, ie if p is a non-
                    # empty string.
                    if len(p)!=0:

                        try:
                            # Convert string price to int.
                            new_int = int(p)
                            if new_int <= 200:
                                new_prices.append(new_int)

                        except:
                            # Show which prices aren't able to convert to an int and 
                            # the post they came from so we can isolate and fix the 
                            # issue.
                            print(F'Error converting this price: {p}')
                            print(split_prices)
                            print()
                            print('Here is the text of the post:')
                            print()
                            print(text)
                            print('-'*50)
                            print()
                            # Set prices that can't be covered to NaN so the process 
                            # can finish.
                            new_prices.append(np.nan) 
            #print(F"Here are the processed new_prices: {new_prices}")
                    #print(len(new_prices))


            # Append prices before they're processed to a separate list, in case we
            # need to isolate issues and fix them later.

            search_region_price_list.append(new_prices)

            # For posts that had no prices listed, we append new_prices with "None"
            if len(new_prices)==0:
                #price_list.append('None')
                price_list.append(np.nan)
            # For posts that had a single price, we use it.
            elif len(new_prices)==1:
                price_list.append(new_prices[0])
            # For posts that contained two prices, we average them.  This helps with 
            # posts that give a range of prices (ie $25-30).
            elif len(new_prices)==2:
                avg_price_2 = np.average(new_prices)
                price_list.append(avg_price_2)
            # If a post has more than 3 prices, we append them, but this means we 
            # have to inspect them manually and deal with them later.
            else:
                #price_list.append(new_prices)
                price_list.append(np.nan)
            #print(price_list)


            # Get city information for each posting.
            try:
                city = soup.find(class_='postingtitletext').small.get_text()

                # Because of the way CL operates, one has to choose a city from a
                # radio button list that CL provides when one creates a post to offer 
                # a service, however later, there's a field where they can type in 
                # any city they want.  Many people will randomly choose a city from 
                # the radio button list, but then  post their city as "online".  This 
                # makes sure we capture them. 
                re_pattern = re.compile('online')
                online_flag = re.search(re_pattern, city.lower())
                if online_flag:
                    city_list.append('Online')
                else:
                    # Strip out leading and trailing white spaces, replace
                    # parentheses, and capitalize each word in the str.
                    city = city.strip()
                    city = city.replace('(', '').replace(')', '')        
                    city = city.title()
                    city_list.append(city)
            except:
                # If a post has no city information, use None
                city_list.append('None')

            # Extract subregion of Craigslist that the post was made in.
            # This will allow for comparison of prices across different cities
            # within the same metropolitan sub_region.
            try:
                subregion = soup.find_all('li', class_='crumb subarea')[0].find('a').get_text()
                subregion = subregion.title()
                subregion_list.append(subregion)
            except:
                subregion_list.append('None')


            # Extract time the posting was made.
            try:
                dt_object = soup.find('time')['datetime']
                datetime_list.append(dt_object)
            except:
                datetime_list.append('None')
        else:
            pass
    #print(price_list)
    # Create temporary df to store results for each region
    temp_df = pd.DataFrame(data=zip(datetime_list,
                                    link_list, 
                                    price_list, 
                                    city_list, 
                                    subregion_list, 
                                    region_list, 
                                    body_text_list,
                                    search_region_price_list),
                        columns=['date_posted', 
                                 'link', 
                                 'price', 
                                 'city', 
                                 'subregion', 
                                 'region', 
                                 'post_text',
                                 'price_list']
                          )

# # Find indices of duplicate results, then drop them and reset indices.
# temp_duplicate_indices = temp_df[temp_df['post_text'].duplicated()==True].index
# temp_df_no_dups = temp_df.drop(index=temp_duplicate_indices)
# temp_df_no_dups = temp_df_no_dups.reset_index(drop=True)
# temp_df_no_dups['len_of_price_list']=temp_df_no_dups['price_list'].apply(lambda x: len(x))


# temp_text_for_comparison = temp_df_no_dups['post_text']
# vect = TfidfVectorizer(min_df=1, stop_words='english')
# temp_tfidf = vect.fit_transform(temp_text_for_comparison)
# temp_pairwise_similarity = temp_tfidf * temp_tfidf.T
# temp_pairwise_array = temp_pairwise_similarity.toarray()
# np.fill_diagonal(temp_pairwise_array, np.nan)
# temp_argwhere_array = np.argwhere(temp_pairwise_array > 0.9)


# df_row_idx = []
# dup_row_idx = []
# for row in temp_argwhere_array:
#     current_idx = row[0]
#     #print(F"Current row: {row}, Current idx: {current_idx}")
#     duplicate_list = []
#     if current_idx in df_row_idx:
#         continue
#     else:
#         df_row_idx.append(current_idx)
#     for other_row in temp_argwhere_array:
#         other_idx = other_row[1]
#         #print(F"Here's the other_row: {other_row}, Other idx: {other_idx}")
#         if current_idx == other_row[0]:
#             duplicate_list.append(other_idx)
#     #print(F"This is the current dup_list: {duplicate_list}")
#     #print()
#     dup_row_idx.append(duplicate_list)


# temp_df_no_dups['match'] = np.array(temp_df_no_dups.index.values, dtype='object')
# # temp_df_no_dups['match'] = temp_df_no_dups['match'].apply(lambda x: [x])


# match_col_idx = temp_df_no_dups.columns.get_loc('match')
# temp_df_no_dups.iloc[df_row_idx, match_col_idx] = dup_row_idx
# temp_df_no_dups['match'] = temp_df_no_dups['match'].apply(lambda x: [x])

# indices = []

# # for i, row in temp_df_no_dups.iterrows():
# #     indices.append(i)
# #     temp_df_no_dups = temp_df_no_dups.drop(
# #         index=[item for item in row["match"] if item not in indices], errors="ignore"
# #     )

# # if search_region=='phoenix':
# #     print()
# #     print(F'search region: {search_region}')
# #     for i, row in temp_df_no_dups.iterrows():
# #         indices.append(i)
# #         drop_idx = []
# #         print(i, row['match'])
# #         try:
# #             for item in row['match']:
# #                 if item not in indices:
# #                     drop_idx.append(item)
# #             temp_df_no_dups = temp_df_no_dups.drop(index=drop_idx, errors="ignore")
# #         except Exception as e:
# #             #print(i, item, row['match'])
# #             print(e, i, item, row['match'])

# print()
# print(F'search region: {search_region} starting')
# for i, row in temp_df_no_dups.iterrows():
#     indices.append(i)
#     drop_idx = []
#     #print(i, row['match'])
#     try:
#         for item in row['match']:
#             if item not in indices:
#                 drop_idx.append(item)
#         temp_df_no_dups = temp_df_no_dups.drop(index=drop_idx, errors="ignore")
#     except Exception as e:
#         #print(i, item, row['match'])
#         print(e, i, item, row['match'])
# print(F'search region: {search_region} complete')




    # Append each temporary df to a list, which we can concatenate into one larger 
    # df, later.



    df_list.append(temp_df)

Couldn't get link
Couldn't get text
Couldn't get link
Couldn't get text
Couldn't get link
Couldn't get text
Couldn't get link
Couldn't get text
Couldn't get link
Couldn't get text
Couldn't get link
Couldn't get text
Couldn't get link
Couldn't get text
Couldn't get link
Couldn't get text
Couldn't get link
Couldn't get text
Couldn't get link
Couldn't get text
Couldn't get link
Couldn't get text
Couldn't get link
Couldn't get text
Couldn't get link
Couldn't get text
Couldn't get link
Couldn't get text
Couldn't get link
Couldn't get text


In [86]:
# Check for errors in getting text from a post, or from getting the URL of a post.
len(error_list_text), len(error_list_links)

(15, 15)

In [87]:
# Concatenate the dfs for each region into one larger df and check its shape.
concat_df = pd.concat(df_list, ignore_index=True)
concat_df.shape

(1326, 8)

### Dropping Duplicate posts

In [88]:
# Get date of html request to label our output with.
date_of_html_request = str(dt.date.today())

# Include the date posts were scraped on to track tutoring prices over time.
concat_df['posts_scraped_on'] = date_of_html_request

# Count duplicates.
concat_df['post_text'].duplicated().value_counts()

True     668
False    658
Name: post_text, dtype: int64

In [89]:
# Find indices of duplicate results, then drop them and reset indices.
duplicate_indices = concat_df[concat_df['post_text'].duplicated()==True].index
df_exact_txt_dropped = concat_df.drop(index=duplicate_indices)
df_exact_txt_dropped = df_exact_txt_dropped.reset_index(drop=True)
df_exact_txt_dropped['len_of_price_list']=df_exact_txt_dropped['price_list'].apply(lambda x: len(x))
df_exact_txt_dropped.shape

(658, 10)

In [90]:
text_for_comparison = df_exact_txt_dropped['post_text']
vect = TfidfVectorizer(min_df=1, stop_words='english')
tfidf = vect.fit_transform(text_for_comparison)
pairwise_similarity = tfidf * tfidf.T
pairwise_array = pairwise_similarity.toarray()
np.fill_diagonal(pairwise_array, np.nan)
argwhere_array = np.argwhere(pairwise_array > 0.63)
argwhere_array

array([[  0, 141],
       [  1,  31],
       [  1,  37],
       ...,
       [646, 643],
       [655, 656],
       [656, 655]])

In [91]:
df_row_idx = []
dup_row_idx = []
for row in argwhere_array:
    current_idx = row[0]
    #print(F"Current row: {row}, Current idx: {current_idx}")
    duplicate_list = []
    if current_idx in df_row_idx:
        continue
    else:
        df_row_idx.append(current_idx)
    for other_row in argwhere_array:
        other_idx = other_row[1]
        #print(F"Here's the other_row: {other_row}, Other idx: {other_idx}")
        if current_idx == other_row[0]:
            duplicate_list.append(other_idx)
    #print(F"This is the current dup_list: {duplicate_list}")
    #print()
    dup_row_idx.append(duplicate_list)
#list(zip(df_row_idx, dup_row_idx))

In [92]:
df_exact_txt_dropped['match'] = np.array(df_exact_txt_dropped.index.values, dtype='object')
df_exact_txt_dropped['match'] = df_exact_txt_dropped['match'].apply(lambda x: [x])

match_col_idx = df_exact_txt_dropped.columns.get_loc('match')
df_exact_txt_dropped.iloc[df_row_idx, match_col_idx] = dup_row_idx
#df_exact_txt_dropped['match'] = df_exact_txt_dropped['match'].apply(lambda x: [x])

  return asarray(a).ndim
  arr_value = np.asarray(value)


In [93]:
df_exact_txt_dropped['match']

0                                  [141]
1      [31, 37, 217, 354, 456, 485, 615]
2                 [3, 68, 130, 342, 384]
3                 [2, 68, 130, 342, 384]
4                                    [4]
                     ...                
653                                [653]
654                                [654]
655                                [656]
656                                [655]
657                                [657]
Name: match, Length: 658, dtype: object

In [94]:
with pd.option_context('display.max_colwidth', None):
  x=0
  #display(df_with_prices.iloc[x]['post_text'])
  display(df_exact_txt_dropped.iloc[x]['link'])
  display(df_exact_txt_dropped.iloc[x]['post_text'])

'https://sfbay.craigslist.org/sby/lss/d/sunnyvale-expert-tutor-for-writing/7430472194.html'

"\n\n\n\n\nDon't wait to get a tutor! This applies to struggling students or brilliant ones, as I can help both achieve their best.\n\n\n\nWould your student perform better in his or her subjects with a weekly tutor? I am here to help. I am a master tutor with fifteen years of experience bringing out the best of my students. I can help your student improve their English and Math skills, and maximize both learning and grades.\n\n\nI earned a graduate degree from Harvard University, and I specialize adapting my teaching style to each unique student. Additionally, I have lived overseas for five years, and so I have a speciality helping students who have English as a second language (ESL). I also am an expert SAT and ACT tutor.\n\n\nI can help you with the following:ENGLISH: English classes,Writing, Grammar, StyleHISTORY & OTHER LIBERAL ARTS: History, Social Studies, Government, Political Science, etc.MATH: Arithmetic, Fractions, Algebra, Geometry, TrigonometrySCIENCE: Chemistry, Biology, 

In [95]:
with pd.option_context('display.max_colwidth', None):
  x=471
  #display(df_with_prices.iloc[x]['post_text'])
  display(df_exact_txt_dropped.iloc[x]['link'])
  display(df_exact_txt_dropped.iloc[x]['post_text'])

'https://sacramento.craigslist.org/lss/d/elk-grove-certified-teacher-stanford/7429909819.html'

"\n\n\n\n\n(916) 238-6790\n\nMy name is Samantha and I’m a former teacher.\n\nMy rates are very reasonable and affordable. I tutor in-person and online.\n\nI tutor students of all ages and for all subjects. I tutor 7 days a week and I have a ton of material I can share with students including over 15 years of previous exams as well as many unique study guides I helped create.\n\nIf you are looking for tutoring for tests such as the MCAT, SAT, ACT, GRE, GMAT, LSAT, or any subject, I would be the perfect tutor. I've taken all of the tests listed above (because I’ve been a tutor for several years) and I scored in the top 5% in each.\n\nI received:\n\n36 on the ACT\n1580 on the SAT\n178 on the LSAT\n525 on the MCAT\n780 on the GMAT\n334 on the GRE.\n\nI graduated from Stanford University as an undergrad and recently graduated with a Ph.D. from Harvard.\n\nAside from helping students with improving their test scores, I can also help students with their math (Algebra, Geometry, Trigonometry,

In [96]:
indices = []

# for i, row in temp_df_exact_txt_dropped.iterrows():
#     indices.append(i)
#     temp_df_exact_txt_dropped = temp_df_exact_txt_dropped.drop(
#         index=[item for item in row["match"] if item not in indices], errors="ignore"
#     )

# if search_region=='phoenix':
#     print()
#     print(F'search region: {search_region}')
#     for i, row in temp_df_exact_txt_dropped.iterrows():
#         indices.append(i)
#         drop_idx = []
#         print(i, row['match'])
#         try:
#             for item in row['match']:
#                 if item not in indices:
#                     drop_idx.append(item)
#             temp_df_exact_txt_dropped = temp_df_exact_txt_dropped.drop(index=drop_idx, errors="ignore")
#         except Exception as e:
#             #print(i, item, row['match'])
#             print(e, i, item, row['match'])

df_no_dups = df_exact_txt_dropped.copy()

for i, row in df_no_dups.iterrows():
    indices.append(i)
    drop_idx = []
    #print(i, row['match'])
    try:
        for item in row['match']:
            if item not in indices:
                drop_idx.append(item)
        df_no_dups = df_no_dups.drop(index=drop_idx, errors="ignore")
    except Exception as e:
        #print(i, item, row['match'])
        print(e, i, item, row['match'])


In [97]:
df_exact_txt_dropped.shape, df_no_dups.shape

((658, 11), (346, 11))

### Dropping posts that contained no prices, which aren't helpful for our analysis

In [98]:
df_no_dups['len_of_price_list'] = df_no_dups['price_list'].apply(lambda x: len(x))

In [99]:
# Filter out results that don't have a price and reset indices.
df_with_prices = df_no_dups[df_no_dups['len_of_price_list'] > 0]
df_with_prices = df_with_prices.reset_index(drop=True)

In [100]:
df_with_prices.shape

(142, 11)

In [101]:
unique_posts_count = len(df_no_dups)
post_with_prices_count = len(df_with_prices)
num_posts = len(concat_df)

percent_unique = unique_posts_count / num_posts * 100
percent_with_prices = post_with_prices_count / num_posts * 100

print(F"Out of {num_posts}, there were {unique_posts_count} posts that weren't duplicated, or {percent_unique:.2f}%.")
print(F"There were {post_with_prices_count} posts that had prices included and weren't duplicates.")

print(F"Only {percent_with_prices:.2f}% of the posts that we scraped remain.")

Out of 1326, there were 346 posts that weren't duplicated, or 26.09%.
There were 142 posts that had prices included and weren't duplicates.
Only 10.71% of the posts that we scraped remain.


### Extracting complete.

# *Transforming* Craigslist data: Post-processing

## Are there any posts that might need manual cleaning?  This would include:
* Posts that had 3 or more prices and were marked as null
* Posts where the price wasn't able to convert from `str` -> `int` and were marked as null during pre-processing

There are the entries that were marked as `Null`.  Let's investigate them manually:

In [102]:
df_null_prices = df_with_prices[df_with_prices['price'].isnull()==True]
df_null_prices[['price', 'price_list']]

Unnamed: 0,price,price_list
0,,"[90, 60, 40]"
3,,"[40, 40, 45, 45]"
9,,"[30, 35, 45]"
16,,"[30, 30, 60, 90]"
19,,"[60, 50, 100]"
25,,"[100, 115, 130, 65, 30, 60]"
29,,"[40, 40, 40, 40, 40]"
31,,"[50, 10, 50]"
32,,"[50, 100, 135]"
34,,"[30, 50, 60]"


In [103]:
posts_with_mult_prices = df_null_prices.shape[0]
print(F"There were {posts_with_mult_prices} posts with price marked null.")

There were 23 posts with price marked null.


In [104]:
df_null_prices = df_null_prices.drop(columns=['len_of_price_list', 'match'])
df_null_prices.to_csv('./posts_to_investigate/{}_posts_with_null_prices.csv'.format(date_of_html_request), index=False)

In [105]:
# Inspect links manually, one by one, to decide what to do about price information
with pd.option_context('display.max_colwidth', None):
  x=3
  #display(df_with_prices.iloc[x]['post_text'])
  display(df_with_prices.iloc[x]['link'])
  display(df_with_prices.iloc[x]['price'])

'https://sfbay.craigslist.org/pen/lss/d/san-mateo-math-emcsci-english-chem-bio/7429754585.html'

nan

### Cleaning posts with three or more prices manually - distilling down to one price

We distill posts that had more complicated text that involved three or more prices, such as :

* $40$/hr, $50$/1.5hr, $60$/2hr
  * Complicated pricing schedule
* $40$/hr but $10$ additional per person, if a group session is desired
  * Group rates
* $30$/hr Science, $40$/hr math, come and try a first session for the reduced price of $20$.
  * Special offers

into a single price.  Other posts repeated their prices multiple times, so we distill those down to a single price as well, then mark any of the entries we changed as being cleaned.

In [106]:
price_col_idx = df_with_prices.columns.get_loc('price')

In [107]:
# Says $40 for in person, or $45 for at home, so I took the average.
san_mateo_tutor_idx = df_with_prices[df_with_prices['post_text'].str.contains('I mainly tutor, in person, at the Downtown Redwood City, downtown San Mateo')].index

try:
    df_with_prices.iloc[san_mateo_tutor_idx,price_col_idx] = 42.5

except:
    print("Issue with san_mateo_tutor and iloc.")
    pass

In [108]:
# Because the ad says $90 in person, $60 for online, and Corona Virus pricing of
# $40 for online weekdays, I'm using the $40 per hour rate because it seems the
# most reasonable.
kenari_tutor_idx = df_with_prices[df_with_prices['post_text'].str.contains('kenaritutor.com')==True].index

try:
    df_with_prices.iloc[kenari_tutor_idx,price_col_idx] = 40
except:
    print('Issue with kenari_tutor_idx and iloc.')
    pass

In [109]:
# This ad mentions several prices for different subjects, but explicitly says $30 for math.
la_honda_idx = df_with_prices[df_with_prices['post_text'].str.contains('909-640-3570')].index

try:
    df_with_prices.iloc[la_honda_idx,price_col_idx] = 30
    
except:
    print("Issue with la_honda_idx and iloc.")
    pass

In [110]:
# Says #60 per hour.
glasses_lady_idx = df_with_prices[df_with_prices['post_text'].str.contains("offering virtual one-on-one Math tutoring via Zoom")==True].index

try:
    df_with_prices.iloc[glasses_lady_idx, price_col_idx] = 60
except:
    print("Issue with glasses_lady_idx and iloc.")
    pass  

In [111]:
# Says #60 per hour.
UC_Davis_data_scientist = df_with_prices[df_with_prices['post_text'].str.contains("PhD in Engineering from UC Davis")==True].index

try:
    df_with_prices.iloc[UC_Davis_data_scientist, price_col_idx] = 60
except:
    print("Issue with UC_Davis_data_scientist and iloc.")
    pass  

In [112]:
#This guy has weird price structuring, but I used his hourly rate for each time interval, $100 for 80 minutes, $115 for 100 minutes, $130 for 120 minutes, then averaged those hourly rates to estimate for what a single hour would cost.
oakland_exp_tutor_online_idx = df_with_prices[df_with_prices['post_text'].str.contains('I received a full scholarship to University of Cincinnati and held a 3.8 GPA through my master’s program in aerospace')==True].index

oakland_tutor_avg_rate = ((100/80) + (115/100) + (130/120)) * 60 / 3

try:
    df_with_prices.iloc[oakland_exp_tutor_online_idx, price_col_idx] = oakland_tutor_avg_rate

except:
    print("Issue with oakland_exp_tutor_online_idx and iloc.")
    pass

In [113]:
# The ad repeats the price of $40 over and over, so I'm replacing the price with 
# a single instance.
star_star_college_math_tutor_idx = df_with_prices[df_with_prices['post_text'].str.contains('https://www.youtube.com/channel/UCqhFZRmUqOAAPMQpo58TV7g'
                   ) == True].index

try:
    df_with_prices.iloc[star_star_college_math_tutor_idx, price_col_idx] = 40
    
except:
    print("Issue with star_star_college_math_tutor_idx and iloc.")
    pass

In [114]:
# Says $50/hr    
trevor_skelly_idx = df_with_prices[df_with_prices['post_text'].str.contains('trevorskelly')==True].index

try:
    df_with_prices.iloc[trevor_skelly_idx,price_col_idx] = 50
    
except:
    print("Issue with trevor_skelly_idx and iloc.")
    pass

In [115]:
# Charges $50 per hour for sessions under 3 hours
spss_tutor_idx = df_with_prices[df_with_prices['post_text'].str.contains('datameer', case=False)==True].index

try:
    df_with_prices.iloc[spss_tutor_idx, price_col_idx] = 50
    
except:
    print("Issue with spss_tutor_idx and iloc.")
    pass

In [116]:
# Charges $50 per hour
tutor_sam_idx = df_with_prices[df_with_prices['post_text'].str.contains('thetutorsam')==True].index

try:
    df_with_prices.iloc[tutor_sam_idx, price_col_idx] = 50
    
except:
    print("Issue with tutor_sam_idx and iloc.")
    pass

In [117]:
# Charges $40 per hour
peter_d_idx = df_with_prices[df_with_prices['post_text'].str.contains('Peter D.')==True].index

try:
    df_with_prices.iloc[peter_d_idx, price_col_idx] = 40
except:
    print("Issue with peter_d_idx and iloc.")
    pass    

In [118]:
# Charges $45 per hour for individual lessons
algebra_exclusively_idx = df_with_prices[df_with_prices['post_text'].str.contains('algebra EXCLUSIVELY')==True].index

try:
    df_with_prices.iloc[algebra_exclusively_idx, price_col_idx] = 45
except:
    print("Issue with algebra_exclusively_idx and iloc.")
    pass    

In [119]:
# Post includes many prices, but states $55/hr for Precalc and $80/hr for Calculus, so I took the average of those prices
aerospace_engineer_idx = df_with_prices[df_with_prices['post_text'].str.contains('in the aerospace industry looking', regex=False)==True].index

try:
    df_with_prices.iloc[aerospace_engineer_idx, price_col_idx] = (55 + 80)/2

except:
    print("Issue with aerospace_engineer_idx and iloc.")
    pass    

In [120]:
# This ad mentions $45 for lower division college courses, which are a large segment of the subjects I help with, so I'm using that price to compare myself against.
ucb_phd_student_and_ta_idx = df_with_prices[df_with_prices['post_text'].str.contains('Former UC-Berkeley economics Ph.D. student and TA')].index

try:
    df_with_prices.iloc[ucb_phd_student_and_ta_idx, price_col_idx] = 45

except:
    print("Issue with ucb_phd_student_and_ta_idx and iloc.")
    pass

In [121]:
# The add says $55/hr for K-12, then $65/hr for AP/Honors, as well as Pre-calc, 
# etc., I'm going to average the two prices.  Set needs cleaning column to False 
# b/c the prices have been cleaned.
park_academy_idx = df_with_prices[df_with_prices['post_text'].str.contains('(949) 490-0872', regex=False)==True].index

try:
    df_with_prices.iloc[park_academy_idx, price_col_idx] = 60

except:
    print("Issue with park_academy_idx and iloc.")
    pass

In [122]:
# Says $25/hr for high school, $30/hr for college, just went with $30/hr
sharp_mind_idx = df_with_prices[df_with_prices['post_text'].str.contains('(650) 398-9490', regex=False)==True].index

try:
    df_with_prices.iloc[sharp_mind_idx, price_col_idx] = 30
    
except:
    print("Issue with sharp_mind_idx and iloc.")
    pass

In [123]:
# Says $50/hr if travelling, $30-35/hr if virtual, so I took the average of 50 and 35
stock_tutor_idx = df_with_prices[df_with_prices['post_text'].str.contains('714.425.3828', regex=False)==True].index

try:
    df_with_prices.iloc[stock_tutor_idx, price_col_idx] = (35 + 50)/2
    
except:
    print("Issue with stock_tutor_idx and iloc.")
    pass

In [124]:
# Post says $30/hr for Precalc/Trig and $50/hr for Calculus, so I took the average
lonzo_tutoring_idx = df_with_prices[df_with_prices['post_text'].str.contains('951-795-5027', regex=False)==True].index

try:
    df_with_prices.iloc[lonzo_tutoring_idx, price_col_idx] = 40

except:
    print("Issue with lonzo_tutoring_idx and iloc.")
    pass    

In [125]:
# This ad says $30 for one hour.
poway_tutor_idx = df_with_prices[df_with_prices['post_text'].str.contains('(619)735-2579', regex=False)==True].index

try:
    df_with_prices.iloc[poway_tutor_idx, price_col_idx] = 30
    
except:
    print("Issue with poway_tutor_idx and iloc.")
    pass

In [126]:
# $20/hr online, $30/hr in person, split the difference at $25
austin_sabrina_idx = df_with_prices[df_with_prices['post_text'].str.contains('My girlfriend Sabrina')==True].index

try:
    df_with_prices.iloc[austin_sabrina_idx, price_col_idx] = 25
    
except:
    print("Issue with austin_sabrina_idx and iloc.")
    pass    

In [127]:
# Says $25/hr
alex_farrell_idx = df_with_prices[df_with_prices['post_text'].str.contains('Alexander Farrell')==True].index

try:
    df_with_prices.iloc[alex_farrell_idx, price_col_idx] = 25

except:
    print("Issue with alex_farrell_idx and iloc.")
    pass    

In [128]:
# $25/hr if meeting near CSU Sac, $35/hr if they drive to you, $20/hr for online.
# I chose $30/hr to split the difference between the in person prices.
best_math_idx = df_with_prices[df_with_prices['post_text'].str.contains('bestmathtutoring.com')==True].index

try:
    df_with_prices.iloc[best_math_idx, price_col_idx] = 30
    
except:
    print("Issue with best_math_idx and iloc.")
    pass  

In [129]:
ucla_grad_henry_idx = df_with_prices[df_with_prices['post_text'].str.contains("916 390-7923", regex=False)==True].index

try:
    df_with_prices.iloc[ucla_grad_henry_idx, price_col_idx] = 35

except:
    print("Issue with ucla_grad_henry_idx and iloc.")
    pass    

#### Checking results - Are there any posts that were marked as needing to be cleaned that we missed?

In [130]:
num_still_null = len(df_with_prices[df_with_prices['price'].isnull()==True])

if num_still_null==0:
    print("There are no posts with null prices still needing cleaning.")
else:
    print(F"There are {num_still_null} posts that need cleaning.")

There are 1 posts that need cleaning.


### Checking Posts that have two prices listed to see if averaging them is reasonable

In [131]:
df_with_prices[df_with_prices['len_of_price_list']==2][['price','price_list']]

Unnamed: 0,price,price_list
5,46.0,"[57, 35]"
7,40.0,"[35, 45]"
12,55.0,"[60, 50]"
27,57.5,"[45, 70]"
30,46.5,"[80, 13]"
35,100.0,"[80, 120]"
38,70.0,"[60, 80]"
40,55.0,"[50, 60]"
41,104.5,"[84, 125]"
45,50.0,"[40, 60]"


In [132]:
with pd.option_context('display.max_colwidth', None):
  x=136
  #display(df_with_prices.iloc[x]['post_text'])
  display(df_with_prices.iloc[x]['link'])
  display(df_with_prices.iloc[x]['post_text'])
  display(df_with_prices.iloc[x]['price'])

'https://dallas.craigslist.org/ndf/lss/d/plano-private-math-physics-tutor/7429037374.html'

'\n\n\n\n\nIn-person tutor for mathematics and physics. Range of topics include:\n\nMath: Pre-algebra through calculus\nPhysics: High school through college freshman/sophomore level\n\nCredentials:\nMS in Physics, MS in Electrical Engineering\nBS in Physics, BS in Mathematics (Honors)\nFour years private math tutoring experience\nFour years formal physics teaching experience\nSeven years industry work as an electrical engineer at US defense contractor\n\nFlexible schedule.  Willing to travel.  Rate is $45/hour.  Feel free to contact with any questions or to set up a session!\n'

45.0

#### Ads where averaging doesn't make sense

In [133]:
# This guy's ad says 35$/half hour, but explicitly says $57 per hour, so averaging doesn't make sense.  
blake_tutoring_idx = df_with_prices[df_with_prices['post_text'].str.contains('BlakeTutoring.com', case=False)==True].index

df_with_prices.iloc[blake_tutoring_idx, price_col_idx] = 57

In [139]:
# This ad says $84/hr but then mentions a $125 for 1.5 hours.  Since these are the only two prices in the post, our code averages them, so we set the correct price to $84
test_trainer_inc_idx = df_with_prices[df_with_prices['post_text'].str.contains("TestTrainerinc", regex=False)==True].index

try:
    df_with_prices.iloc[test_trainer_inc_idx, price_col_idx] = 84

except:
    print("Issue with test_trainer_inc_idx and iloc.")
    pass 

In [140]:
# This guy's ad says $60/45mins, but $80 per hour.  Either price comes out to the same hourly rate, so averaging doesn't make sense.
hiro_kobayashi_idx = df_with_prices[df_with_prices['post_text'].str.contains('415-250-4831', case=False)==True].index

df_with_prices.iloc[hiro_kobayashi_idx, price_col_idx] = 80

In [141]:
# This guy's ad says $40/1hr, $70/2hr, so averaging doesn't make sense
guy_with_suit_idx = df_with_prices[df_with_prices['post_text'].str.contains('trained mathematician with about 20 years experience')==True].index

df_with_prices.iloc[guy_with_suit_idx, price_col_idx] = 40

In [142]:
# This guy's ad says $25/1hr, $40/2hr, so averaging doesn't make sense
christian_cerritos_college_idx = df_with_prices[df_with_prices['post_text'].str.contains('trained mathematician with about 20 years experience')==True].index

df_with_prices.iloc[christian_cerritos_college_idx, price_col_idx] = 25

In [143]:
# This guy's ad says $30/half hr, $50/1hr, so averaging doesn't make sense
dustin_csu_long_beach_idx = df_with_prices[df_with_prices['post_text'].str.contains('International Society of Automation')==True].index

df_with_prices.iloc[dustin_csu_long_beach_idx, price_col_idx] = 50

In [144]:
# This guy's ad says $65/hr for subject tutoring, $100/hr for standardized tests.  I'm primarily competing against subject tutoring, so I'll use that price
smarter_than_you_think_idx = df_with_prices[df_with_prices['post_text'].str.contains('guarantee you are smarter than you think')==True].index

df_with_prices.iloc[smarter_than_you_think_idx, price_col_idx] = 65

In [145]:
# This guy's ad says $50/hr or $160/4hr, so it doesn't make sense to average.
dead_in_ditch_idx = df_with_prices[df_with_prices['post_text'].str.contains('dead in a ditch')==True].index

df_with_prices.iloc[dead_in_ditch_idx, price_col_idx] = 50

In [146]:
# This guy's ad says $45/hr +$10 more per student, so it doesn't make sense to average.
distinguished_teacher_idx = df_with_prices[df_with_prices['post_text'].str.contains('"Distinguished Teacher"')==True].index

df_with_prices.iloc[distinguished_teacher_idx, price_col_idx] = 45

In [147]:
# This guy's ad says $40/hr +$10 more for each additional person, so it doesn't make sense to average.
vahab_idx = df_with_prices[df_with_prices['post_text'].str.contains('vababtaghizade@gmail.com')==True].index

df_with_prices.iloc[vahab_idx, price_col_idx] = 40

In [148]:
# This guy's ad says $30/hr for trial session, then $60/hr afterwards, so it doesn't make sense to average.
myles_ahead_idx = df_with_prices[df_with_prices['post_text'].str.contains('mylesaheadtutoring')==True].index

df_with_prices.iloc[myles_ahead_idx, price_col_idx] = 60

In [149]:
# This guy's ad says $45/hr, then talks about selling a workbook for $30, so it doesn't make sense to average.
john_the_tutor_idx = df_with_prices[df_with_prices['post_text'].str.contains('480-343-2212')==True].index

df_with_prices.iloc[john_the_tutor_idx, price_col_idx] = 45

## Investigating posts with extreme prices.  Are there any price outliers that we need to clean?

Prices >= 100 or <= 20 are what I would consider to be extreme prices.  Let's investigate them.

In [150]:
df_with_prices[(df_with_prices['price']>=100) | (df_with_prices['price']<=20)][['price', 'post_text', 'price_list']]

Unnamed: 0,price,post_text,price_list
4,120.0,\n\n\n\n\n*****I am currently offering both Zo...,[120]
8,20.0,\n\n\n\n\nText 2133408660 or register at peerl...,[20]
35,100.0,"\n\n\n\n\nMy name is Sameer Tyagi, former Harv...","[80, 120]"
37,150.0,\n\n\n\n\nHello! My name is Connor and I've be...,[150]
55,200.0,\n\n\n\n\ncheck out my website!\nmd-maker.com\...,[200]
61,19.0,\n\n\n\n\nHi! \n\nI am a certified teacher wit...,[19]
73,15.0,\n\n\n\n\njargon free math tutor $15 all level...,[15]
74,20.0,\n\n\n\n\nLocated in NYC. I graduated with a b...,[20]
75,20.0,\n\n\n\n\nWhy I am an exceptional tutor: \n\nF...,[20]
76,18.0,\n\n\n\n\nSAT prep for as low as $18 per hour!...,[18]


In [151]:
with pd.option_context('display.max_colwidth', None):
  x=40
  #display(df_with_prices.iloc[x]['post_text'])
  display(df_with_prices.iloc[x]['link'])
  display(df_with_prices.iloc[x]['post_text'])
  display(df_with_prices.iloc[x]['price'])

'https://sfbay.craigslist.org/sby/lss/d/san-jose-math-enrichment-classes-for/7427917503.html'

"\n\n\n\n\nHi, I'm Mark, and I'm a tutor with six years of experience tutoring in math, English, biology, chemistry, SAT, and economics. I am currently teaching three math enrichment classes - Accelerated Algebra, Accelerated Precalculus/Calculus, and Calculus for Young Learners.\n\nAccelerated Algebra: In this course, the concepts behind Algebra 1 and Algebra 2 are combined with a focus on precalculus to challenge students and prepare them for the next level of math. My goal is to challenge the students to understand the next-level applications of the concepts they learn in algebra.\n\nAccelerated Precalculus/Calculus: This course is intended for students who have taken or are planning to take precalculus/calculus and want to learn at a faster, more challenging level. This is ideal for students planning to take the AP Calculus AB/BC Exams or College Math.\n\nCalculus for Young Learners:  is a special course intended for children who aspire to gain a deep understanding of the concepts 

55.0

### Dropping posts with extreme prices that aren't relevant

In [152]:
# This ad is for poker tutoring/coaching, not really what I'm competing against, so we drop all instances.  He also mentions he tutors math in this post, but he has a separate post up that we've captured which has his math tutoring pricing information.
australia_daniel_idx = df_with_prices[df_with_prices['post_text'].str.contains("I'm available as a dealer if you need one", regex=False)==True].index

df_with_prices.drop(labels=australia_daniel_idx, inplace=True)
df_with_prices = df_with_prices.reset_index(drop=True)

### Correcting pricing information for posts with extreme prices

In [153]:
# This ad says $50/hr but then mentions a prepay plan for $160 for 4 hours.  Since these are the only two prices in the post, our code averages them, so we set the correct price to $50
google_maps_idx = df_with_prices[df_with_prices['post_text'].str.contains("willing to travel if Google Maps", regex=False)==True].index

try:
    df_with_prices.iloc[google_maps_idx, price_col_idx] = 50

except:
    print("Issue with google_maps_idx and iloc.")
    pass 

In [154]:
# This ad says $45/hr for high school or college, but then mentions a $35 for middle school.  Since these are the only two prices in the post, our code averages them, so we set the correct price to $45, since I primarily tutor high school or college students.
rancho_penasquitos_idx = df_with_prices[df_with_prices['post_text'].str.contains("Rancho Penasquitos (Park Village Neighborhood)", regex=False)==True].index

try:
    df_with_prices.iloc[rancho_penasquitos_idx, price_col_idx] = 45

except:
    print("Issue with rancho_penasquitos_idx and iloc.")
    pass 

### Transforming Complete

# *Load* - Saving results

### Store results locally as CSV files

In [155]:
# Drop unnecessary columns.  CL links will expire after some number of days, the prices_need_cleaning and price_to_investigate columns have been manually inspected, and lastly we've distilled the multiple prices in the price_list down to a single value
df_for_sql = df_with_prices.drop(labels=['link', 'price_list', 'len_of_price_list', 'match'], axis=1)

# In order for psycopg2 to parse our CSV file correctly later, we need to escape all new line characters by adding an additional \ in front of \n.
df_for_sql['post_text'] = df_for_sql['post_text'].str.replace('\n', '\\n')

# Store cleaned data as CSV file in preparation for importing to SQL database
df_for_sql.to_csv("./csv_files/{}_all_regions_with_prices.csv".format(date_of_html_request), index=False, sep=';')

# Store original data, before we applied any cleaning to it, in case it's needed for something later on.
concat_df.to_csv("./csv_files/{}_all_regions_posts.csv".format(date_of_html_request), index=False)

### Importing into PostgreSQL database

In [156]:
# Establish connection to PSQL database
conn = psycopg2.connect("host=localhost dbname=rancher user=rancher")

# Instantiate a cursor object
cur = conn.cursor()

# Use cursor object to create a database for storing the information we scraped and cleaned, if one doesn't already exist.
cur.execute("""    
    CREATE TABLE IF NOT EXISTS cl_tutoring2(
    id SERIAL primary key,
    date_scraped date,
    price decimal,
    city text,
    subregion text,
    region text,
    post_text text,
    date_posted timestamp
);
""")

# Commit changes to database
conn.commit()

In [157]:
# Instantiate a new cursor object
cur = conn.cursor()

# Copy data from our CSV file into database.  
### Note, we can use the ; separator freely because we replaced all instances of semicolons in a post to commas during the preprocessing stage, ensuring that psycopg2 won't misinterpret a semicolon in the body of a post as a separator, splitting a row in the CSV file into too many columns as a result.
### Also, we must specify null="" because Python represents null values as an empty string when writing to a CSV file and psycopg2 needs to know how null values are represented in the CSV file in order to properly insert null values into the database
with open('./csv_files/' + str(date_of_html_request) + '_all_regions_with_prices.csv', 'r') as file:
    next(file) # Skip the header row
    cur.copy_from(file, 'cl_tutoring2', sep=';', null="", columns=('date_posted', 'price', 'city', 'subregion', 'region', 'post_text', 'date_scraped'))
    
# Commit changes to database
conn.commit()

### Done!!!

# Scratch work

In [158]:
df_with_prices[df_with_prices['price'].isnull()==True]

Unnamed: 0,date_posted,link,price,city,subregion,region,post_text,price_list,posts_scraped_on,len_of_price_list,match
98,2022-01-08T05:45:44-0800,https://losangeles.craigslist.org/wst/lss/d/lo...,,,Westside-Southbay,los_angeles,\n\n\n\n\nElementary teacher/Home school Teach...,"[20, 25, 30]",2022-01-09,3,[390]


In [160]:
with pd.option_context('display.max_colwidth', None):
  x=98
  #display(df_with_prices.iloc[x]['post_text'])
  display(df_with_prices.iloc[x]['link'])
  display(df_with_prices.iloc[x]['post_text'])
  display(df_with_prices.iloc[x]['price'])

'https://losangeles.craigslist.org/wst/lss/d/los-angeles-elementary-virtual-remote/7430151416.html'

'\n\n\n\n\nElementary teacher/Home school Teacher/Private tutor  with over 10 years of teaching experience .  \nServices provided: virtual/remote online support for elementary students (K-4th) using Zoom or other platform.    \n\nOnline support with teaching elementary subjects - Reading, Writing and Math, teaching English as a second language,  teaching social emotional learning tools and/or creating fun and engaging Science or Social Studies lessons. \n\nLessons will be engaging and interactive.      $20/hr  OR   $25 hr for 2 students OR $30/hr for 3-5 students. \n\n Payment via Paypal or Venmo.\n    '

nan

# Transforming Craigslist data REMOVING ENTRIES AND QUERYING WITH SQL LATER -- TO BE CONTINUED

### Are there any posts that might need manual cleaning?  This would include:
* Posts that had 3 or more prices and were marked as null
* Posts where the price wasn't able to convert from `str` -> `int` and were marked as null during pre-processing

I'll identify these posts, then remove them from our `DataFrame` to be analyzed later.  All remaining posts will have just a single price listed, which we can input to our SQL database.

In [None]:
# These are the entries with 3 or more prices listed, let's investigate why
df_null_prices = df_with_prices[df_with_prices['price'].isnull()==True]
df_null_prices[['price', 'price_list']]

In [None]:
posts_with_mult_prices = df_null_prices.shape[0]
print(F"There were {posts_with_mult_prices} posts with price marked null.")

In [None]:
null_price_idx = df_null_prices.index
df_with_single_price = df_with_prices.drop(index=null_price_idx)
df_with_single_price = df_with_single_price.reset_index(drop=True)
df_with_single_price.shape

In [None]:
df_null_prices = df_null_prices.drop(columns=['len_of_price_list', 'match'])
df_null_prices.to_csv('./posts_to_investigate/{}_posts_with_null_prices.csv'.format(date_of_html_request), index=False)