In [1]:
from requests import get
import requests
from requests.packages.urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
import random
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np
import datetime as dt
import csv 
import psycopg2
import time
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Create a Session and Retry object to manage the quota Craigslist imposes on HTTP get requests within a certain time period 
session = requests.Session()
retry = Retry(connect=5, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)
session.mount('https://', adapter)

# Get all state/region names

In [3]:
# Parse URL that contains all regions of Craigslist
all_sites_response = session.get('https://craigslist.org/about/sites')
all_sites_soup = BeautifulSoup(all_sites_response.text, 'html.parser')

# Extract part of webpage corresponding to regions in the US
us_sites = all_sites_soup.body.section.div.next_sibling.next_sibling.next_sibling.next_sibling.next_sibling.next_sibling

# Extract HTML tags corresponding to the state name and region
states_tags = us_sites.find_all('h4')
regions_tags = us_sites.find_all('ul')

states_and_regions = list(zip(states_tags, regions_tags))

# Get URL for each region of Craigslist

In [6]:
# For each of the HTML tags, we get the text of which state the region belonged to and the text of the region's name.  We now have a dictionary with keys as states that map to a list of regions in that state
state_dict = {}

for ele in states_and_regions:
    current_state = ele[0].text
    href_list = ele[1].find_all('li')
    temp_region_list = []
    for href in href_list:
        region = href.a['href'].replace('https://','').replace('.craigslist.org/','')
        temp_region_list.append(region)
        state_dict[current_state]=temp_region_list

# Crawl each state/region of Craigslist
Get the URL that corresponds to a search of the services section for "math tutor."  Craigslist is limited to showing 120 results per page, so if a region has more than 120 postings, we extract URLs corresponding to the next page of results, until there is no next button anymore and we've extracted all URLs for that region.

In [10]:
# Walk through each state in our state_Dict to get the HTML page corresponding to a search for "math tutor" in the services section
response_dict = {}

for state in state_dict.keys():

    for region in state_dict[state]:
        # This gets the first page of search results
        i=1
        
        current_response = session.get('https://' + region + '.craigslist.org/d/services/search/bbb?query=math%20tutor&sort=rel')
        
        sleep_timer = random.randint(2,4)
        time.sleep(sleep_timer)
        
        print(F"Response #{i} for {state}: {region} received.")
        #print(F"Waiting {sleep_timer} seconds...")
        print()
        
        region_response_list = []
        region_response_list.append(current_response)

        # This gets all subsequent pages, using the next button from the search page
        is_next_button = True
        while is_next_button:
            try:
                next_response = current_response
                next_soup = BeautifulSoup(next_response.text, 'html.parser')
                
# CL search pages have one of the following:
    # 1) A next button:
        # - when the region contains more than 120 posts for a given search
    # 2) A greyed out next button:
        # - when you've reached the last page of search results and there are no more
        # OR
        # - when a page has less than 120 results.
    # 3) No next button:
        # - when a page has less than 120 results
# html suffix is None type when a next button isn't shown
# html suffix is '' when the next button is greyed out.  This can happen in either case 2) or 3) from above
# The while loop only needs to be peformed in case 1) when there is a next button you can click
                html_suffix = next_soup.find(class_='button next')
                if html_suffix is not None:
                    html_suffix = html_suffix.get('href')
                    if html_suffix != '':
                        i += 1
                        #print(i, html_suffix)
                        #print('html_suffix is not blank')
                        new_button = 'https://' + current_region + '.craigslist.org' + html_suffix
                        current_response = session.get(new_button)
                        region_response_list.append(current_response)

                        sleep_timer = random.randint(2,4)
                        time.sleep(sleep_timer)
                        print(F"{region} {i} response received.")
                        print(F"Waiting {sleep_timer} seconds...")
                        print()
                    else:
                        is_next_button = False
                        #print('html_suffix is blank')
                        print(F"Last response for {region} received.  Process completed.")
                        print()
                else:
                    is_next_button = False
                    #print('next_button is None')
                    print(F"Last response for {region} received.  Process completed.")
                    print()
                    pass
            except:
                is_next_button = False
                pass

        # Store all search pages for math tutor
        response_dict[(state, region)] = region_response_list

auburn 1 response received.
Waiting 2 seconds...

Last response for auburn received.  Process completed.

bham 1 response received.
Waiting 2 seconds...

Last response for bham received.  Process completed.

dothan 1 response received.
Waiting 4 seconds...

Last response for dothan received.  Process completed.

shoals 1 response received.
Waiting 3 seconds...

Last response for shoals received.  Process completed.

gadsden 1 response received.
Waiting 4 seconds...

Last response for gadsden received.  Process completed.

huntsville 1 response received.
Waiting 4 seconds...

Last response for huntsville received.  Process completed.

mobile 1 response received.
Waiting 3 seconds...

Last response for mobile received.  Process completed.

montgomery 1 response received.
Waiting 4 seconds...

Last response for montgomery received.  Process completed.

tuscaloosa 1 response received.
Waiting 3 seconds...

Last response for tuscaloosa received.  Process completed.

anchorage 1 response rec

# Get URL for each individual posting in a state/region combo

In [11]:
# Walk through each state/region combo to get a list of all individual postings for math tutoring in the results pages we searched up earlier.
posts_dict = {}
for key, responses in response_dict.items():
    state = key[0]
    region = key[1]
    #current_region = region
    region_posts = []
    for response in responses:
        current_html_soup = BeautifulSoup(response.text, 'html.parser')
        current_posts = current_html_soup.find_all('li', class_='result-row')
        wanted_posts = []
        for post in current_posts:
# Many CL pages have "results from nearby areas", for instance some results for sandiego.craigslist.org show up in the losangeles.craigslist.org.  By comparing the region that we're currently scraping from against the URL of the posts, we can detect if it's from a nearby region or not.  To avoid duplicates and make the script finish more quickly, We only want to include posts where the URL of the post matches the region we're scraping from
            if post.a.get('href').replace('https://','').split('.')[0] == region:
                wanted_posts.append(post)
        region_posts.extend(wanted_posts)
    posts_dict[(state,region)] = region_posts

In [12]:
# Calculate how many posts in total are to be scraped for countdown timer

num_regions = len(posts_dict)

num_posts = 0
for region in posts_dict:
    num_posts += len(posts_dict[region])

### Getting soup object response for each individual post in a state/region combo

In [13]:
soup_objects_dict = {}

num_posts_remaining = num_posts
current_time = dt.datetime.now()
max_seconds_until_finish = num_posts * 4
max_finish_time = current_time + dt.timedelta(seconds=max_seconds_until_finish)

print(F"Current time is {current_time.strftime('%H:%M:%S')}")
print(F"Process estimated to finish before {max_finish_time.strftime('%H:%M:%S')}")
print()

for count, key in enumerate(posts_dict, start=1):
    # Walk through each region and create a list of soup_objects to scrape from by storing them into memory.  This way we only have to send these get requests once and Craigslist doesn't ban us for sending the same https requests over and over
    soup_objects_list = []
    for i, post in enumerate(posts_dict[key]):
        
        # Impose a timer to help prevent from getting banned for too many HTTP requests in too short a time period.
        random_int = random.randint(2,4)
        time.sleep(random_int)
        current_link = post.a.get('href')
        response_object = session.get(current_link)
        soup_object = BeautifulSoup(response_object.text, 'html.parser')
        soup_objects_list.append(soup_object) 
        
        # Impose condition that every 10th post will trigger something printed to the screen.  This part of the code is a long process and I wanted something to help keep track of how much progress has been made
        if (i !=0) and ((i-1) % 10 == 9):
            print(F"Post number {i} in {key} is being extracted.")
    
    soup_objects_dict[key] = soup_objects_list
    if count != len(posts_dict):
        num_posts_remaining -= len(posts_dict[key])
        current_time = dt.datetime.now()
        new_seconds_until_finish = num_posts_remaining * 5
        new_max_finish_time = current_time + dt.timedelta(seconds=new_seconds_until_finish)
        
        state = key[0]
        region = key[1]
        
        print()
        print(F"Soup objects for {state}:{region} acquired.  Waiting for next region...")
        print(F"Process will now finish by {new_max_finish_time.strftime('%H:%M:%S')}")
        print()
    else:
        print()
        print(F"Soup objects for {key} acquired.  Process complete.")

Current time is 01:02:45
Process will finish by 05:12:05


Soup objects for ('Alabama', 'auburn') acquired.  Waiting for next region...
Process will now finish by 06:14:25


Soup objects for ('Alabama', 'bham') acquired.  Waiting for next region...
Process will now finish by 06:14:19


Soup objects for ('Alabama', 'dothan') acquired.  Waiting for next region...
Process will now finish by 06:14:18


Soup objects for ('Alabama', 'shoals') acquired.  Waiting for next region...
Process will now finish by 06:14:17


Soup objects for ('Alabama', 'gadsden') acquired.  Waiting for next region...
Process will now finish by 06:14:17


Soup objects for ('Alabama', 'huntsville') acquired.  Waiting for next region...
Process will now finish by 06:14:15


Soup objects for ('Alabama', 'mobile') acquired.  Waiting for next region...
Process will now finish by 06:14:15


Soup objects for ('Alabama', 'montgomery') acquired.  Waiting for next region...
Process will now finish by 06:14:12


Soup objects f

# Extracting information from each post

In [14]:
df_list = []
error_list_text = []
error_list_links = []

# Walk through lists of soup objects corresponding to an individual posting for a math tutor in a given search_region.
for search_region in soup_objects_dict:
    # Initialize several lists to store relevant information for analysis
    price_list = []
    city_list = []
    datetime_list = []
    body_text_list = []
    subregion_list = []
    region_list = []
    link_list = []
    search_region_price_list = []
    state_list = []
    
    # Walk through each soup object in the list corresponding to the search region
    for soup in soup_objects_dict[search_region]:
        try:
            # Get link of post
            link = soup.find("meta", property="og:url")['content']
        except:
            # In case a link can't be found, we add the soup object to a list to inspect later and set link to 'None', which we'll use as a filter later so Python doesn't try to scrape from them
            link = 'None'
            error_list_links.append(soup)
            #print("Couldn't get link")

        # Extract region of post from Craigslist
        post_region = soup.find_all('li',class_='crumb area')[0].find('a').get_text()
        if post_region=='sf bay area':
            post_region = 'sfbay'
        else:
            post_region = post_region.replace(' ', '')
        post_region = post_region.lower()
        
        # Get text of postingbody of the post and remove unwanted text.
        try:
            text = soup.find('section', id='postingbody').get_text()
            text = text.replace(u'\xa0', u' ')
            # We do this so that we can use ; as a delimiter when copying data from a CSV file into a SQL database later.
            text = text.replace(';', ',') 
            # We do this because one post in particular had this text and was giving me trouble.  The best way I could find to handle it was to remove the text.
            text = text.replace('QR Code Link to This Post', '') 

        except:
            error_list_text.append(soup)
            text = 'None'
            #body_text_list.append(text)
            #print("Couldn't get text")

        # Only let posts through that have a link to scrape from and those posts where the region of the post matches the region of the search.  Some CL search results are for neighboring areas, ones that come up in a different region than the region your search was from, which leads to duplicates in nearby areas like Los Angeles and San Diego.  This will weed out duplicates.
        #if post_region == search_region[1] and link!= 'None':
        if link!= 'None':
            state = search_region[0]
            state_list.append(state)
            region_list.append(post_region)
            link_list.append(link)
            body_text_list.append(text)

            # Use regular expressions to find all instances of prices in the text
            #old_prices = re.findall('(?:[\$]{1}[,\d]+.?\d*)', text)
            old_prices = re.findall('(?:[\$]{1}[,\d]+\d*)', text)
            # Alternative, if trying to capture decimals 
            # ^(?:\${1}\d+(?:,\d{3})*(?:\.{1}\d{2}){0,1})?$



            # Intialize empty list to store the new prices after processing old prices.
            new_prices = []
            #print(F"Initialized new_prices: {new_prices}")
            # Walk through each price in the post.
            for price in old_prices:
                # Clean unwanted characters.
                price = price.replace('$', '')
                price = price.replace('/', '')
                price = price.replace('!', '')
                price = price.replace('h', '')
                price = price.replace('.', '')
                price = price.replace(')', '')
                price = price.replace(',', '')
                price = price.replace('>', '')
                price = price.rstrip()   
                # Some tutors give prices as a range ie '$30-40'.  In order to work with this data, I split based on the hyphen, then I can use each price individually.
                split_prices = price.split('-')
            #print(F"Here are the old_prices: {old_prices}")
            #print(F"Here are the split_prices: {split_prices}")

                # Walk through each price in the posting, after any necessary splits have been made.
                for p in split_prices:
                    # Only proceed if the post contained prices, ie if p is a non-empty string.
                    if len(p)!=0:
                        try:
                            # Convert string price to int.
                            new_int = int(p)
                            # Ignore prices which are too high to be reasonable.  Some posts included scholarship amounts as ways for a tutor to boast about their abilities, but this will only allow dollar amounts that are reasonable through.
                            if new_int <= 200:
                                new_prices.append(new_int)

                        except:
                            # Show which prices aren't able to convert to an int and the post they came from so we can isolate and fix the issue if need be.
                            print(F'Error converting this price: {p}')
                            print(split_prices)
                            print()
                            print('Here is the text of the post:')
                            print()
                            print(text)
                            print('-'*50)
                            print()
                            # Set prices that can't be covered to NaN so the process can finish.
                            new_prices.append(np.nan) 
            #print(F"Here are the processed new_prices: {new_prices}")
                    #print(len(new_prices))


            # Append all prices from the post to a separate list, in case we need to isolate issues and fix them later.

            search_region_price_list.append(new_prices)

            # For posts that had no prices listed, we use null
            if len(new_prices)==0:
                price_list.append(np.nan)
            # For posts that had a single price, we use it.
            elif len(new_prices)==1:
                price_list.append(new_prices[0])
            # For posts that contained two prices, we average them.  This helps with posts that give a range of prices (ie $25-30).
            elif len(new_prices)==2:
                avg_price_2 = np.average(new_prices)
                price_list.append(avg_price_2)
            # If a post has more than 3 prices, we append null.  We'll have to inspect these posts manually and deal with them later.
            else:
                price_list.append(np.nan)
            #print(price_list)


            # Get city information for each posting.
            try:
                city = soup.find(class_='postingtitletext').small.get_text()

                # Because of the way CL operates, one has to choose a city from a radio button list, that CL provides, when one creates a post to offer a service, however later, there's a field where they can type in any city they want.  Many people will randomly choose a city from the radio button list, but then  post their city as "online".  This makes sure we capture them. 
                re_pattern = re.compile('online')
                online_flag = re.search(re_pattern, city.lower())
                if online_flag:
                    city_list.append('Online')
                else:
                    # Strip out leading and trailing white spaces, replace parentheses, and capitalize each word in the str.
                    city = city.strip()
                    city = city.replace('(', '').replace(')', '')        
                    city = city.title()
                    city_list.append(city)
            except:
                # If a post has no city information, use None
                city_list.append('None')

            # Extract subregion of Craigslist that the post was made in. This will allow for comparison of prices across different cities within the same metropolitan sub_region.
            try:
                subregion = soup.find_all('li', class_='crumb subarea')[0].find('a').get_text()
                subregion = subregion.title()
                subregion_list.append(subregion)
            except:
                subregion_list.append('None')


            # Extract time the posting was made.
            try:
                dt_object = soup.find('time')['datetime']
                datetime_list.append(dt_object)
            except:
                datetime_list.append('None')
        else:
            pass
    #print(price_list)
    # Create temporary df to store results for each region
    temp_df = pd.DataFrame(data=zip(datetime_list,
                                    link_list, 
                                    price_list, 
                                    city_list, 
                                    subregion_list, 
                                    region_list,
                                    state_list,
                                    body_text_list,
                                    search_region_price_list),
                        columns=['date_posted', 
                                 'link', 
                                 'price', 
                                 'city', 
                                 'subregion', 
                                 'region',
                                 'state',
                                 'post_text',
                                 'price_list']
                          )

# # Find indices of duplicate results, then drop them and reset indices.
# temp_duplicate_indices = temp_df[temp_df['post_text'].duplicated()==True].index
# temp_df_no_dups = temp_df.drop(index=temp_duplicate_indices)
# temp_df_no_dups = temp_df_no_dups.reset_index(drop=True)
# temp_df_no_dups['len_of_price_list']=temp_df_no_dups['price_list'].apply(lambda x: len(x))


# temp_text_for_comparison = temp_df_no_dups['post_text']
# vect = TfidfVectorizer(min_df=1, stop_words='english')
# temp_tfidf = vect.fit_transform(temp_text_for_comparison)
# temp_pairwise_similarity = temp_tfidf * temp_tfidf.T
# temp_pairwise_array = temp_pairwise_similarity.toarray()
# np.fill_diagonal(temp_pairwise_array, np.nan)
# temp_argwhere_array = np.argwhere(temp_pairwise_array > 0.9)


# df_row_idx = []
# dup_row_idx = []
# for row in temp_argwhere_array:
#     current_idx = row[0]
#     #print(F"Current row: {row}, Current idx: {current_idx}")
#     duplicate_list = []
#     if current_idx in df_row_idx:
#         continue
#     else:
#         df_row_idx.append(current_idx)
#     for other_row in temp_argwhere_array:
#         other_idx = other_row[1]
#         #print(F"Here's the other_row: {other_row}, Other idx: {other_idx}")
#         if current_idx == other_row[0]:
#             duplicate_list.append(other_idx)
#     #print(F"This is the current dup_list: {duplicate_list}")
#     #print()
#     dup_row_idx.append(duplicate_list)


# temp_df_no_dups['match'] = np.array(temp_df_no_dups.index.values, dtype='object')
# # temp_df_no_dups['match'] = temp_df_no_dups['match'].apply(lambda x: [x])


# match_col_idx = temp_df_no_dups.columns.get_loc('match')
# temp_df_no_dups.iloc[df_row_idx, match_col_idx] = dup_row_idx
# temp_df_no_dups['match'] = temp_df_no_dups['match'].apply(lambda x: [x])

# indices = []

# # for i, row in temp_df_no_dups.iterrows():
# #     indices.append(i)
# #     temp_df_no_dups = temp_df_no_dups.drop(
# #         index=[item for item in row["match"] if item not in indices], errors="ignore"
# #     )

# # if search_region=='phoenix':
# #     print()
# #     print(F'search region: {search_region}')
# #     for i, row in temp_df_no_dups.iterrows():
# #         indices.append(i)
# #         drop_idx = []
# #         print(i, row['match'])
# #         try:
# #             for item in row['match']:
# #                 if item not in indices:
# #                     drop_idx.append(item)
# #             temp_df_no_dups = temp_df_no_dups.drop(index=drop_idx, errors="ignore")
# #         except Exception as e:
# #             #print(i, item, row['match'])
# #             print(e, i, item, row['match'])

# print()
# print(F'search region: {search_region} starting')
# for i, row in temp_df_no_dups.iterrows():
#     indices.append(i)
#     drop_idx = []
#     #print(i, row['match'])
#     try:
#         for item in row['match']:
#             if item not in indices:
#                 drop_idx.append(item)
#         temp_df_no_dups = temp_df_no_dups.drop(index=drop_idx, errors="ignore")
#     except Exception as e:
#         #print(i, item, row['match'])
#         print(e, i, item, row['match'])
# print(F'search region: {search_region} complete')




    # Append each temporary df to a list, which we can concatenate into one larger df, later.
    df_list.append(temp_df)

In [15]:
len(soup_objects_dict[('California', 'sfbay')])

363

In [16]:
soup_objects_dict[('California', 'sfbay')][0].find_all('li',class_='crumb area')[0].find('a').get_text()

'SF bay area'

In [17]:
df_list[218]

Unnamed: 0,date_posted,link,price,city,subregion,region,state,post_text,price_list


In [18]:
# Check for errors in getting text from a post, or from getting the URL of a post.
len(error_list_text), len(error_list_links)

(0, 0)

In [19]:
# Concatenate the dfs for each region into one larger df and check its shape.
concat_df = pd.concat(df_list, ignore_index=True)
concat_df.shape

(3740, 9)

### Dropping Duplicate posts

In [20]:
# Get date of html request to label our output with.
date_of_html_request = str(dt.date.today())

# Include the date posts were scraped on to track tutoring prices over time.
concat_df['posts_scraped_on'] = date_of_html_request

# Count duplicates.
concat_df['post_text'].duplicated().value_counts()

True     2197
False    1543
Name: post_text, dtype: int64

In [21]:
# Find indices of rows that have exactly the same post_text, then drop them and reset indices.
duplicate_indices = concat_df[concat_df['post_text'].duplicated()==True].index
df_exact_txt_dropped = concat_df.drop(index=duplicate_indices)
df_exact_txt_dropped = df_exact_txt_dropped.reset_index(drop=True)
df_exact_txt_dropped['len_of_price_list']=df_exact_txt_dropped['price_list'].apply(lambda x: len(x))
df_exact_txt_dropped.shape

(1543, 11)

In [22]:
# Vectorize each posts' text and calculate the cosine similarity of each post against all other posts to determine which are duplicates
## https://kanoki.org/2018/12/27/text-matching-cosine-similarity/
text_for_comparison = df_exact_txt_dropped['post_text']
vect = TfidfVectorizer(min_df=1, stop_words='english')
tfidf = vect.fit_transform(text_for_comparison)
pairwise_similarity = tfidf * tfidf.T

# Store results in a 2D NumPy array
pairwise_array = pairwise_similarity.toarray()

# The diagonal of our array is the similarity of a post to itself, which we fill will null so that these are essentially ignored
np.fill_diagonal(pairwise_array, np.nan)

# Many people on CL will change their posting in ways to avoid CL flagging them as duplicates for removal.  This finds all posts above a certain similarity threshold.
argwhere_array = np.argwhere(pairwise_array > 0.63)

In [23]:
# In order to remove the duplicates, we need to restructure our 2D NumPy array in such a way that the first column is the index of the post that has a duplicate and the second column contains a list of the indices of the duplicate post(s).
df_row_idx = []
dup_row_idx = []
for row in argwhere_array:
    current_idx = row[0]
    #print(F"Current row: {row}, Current idx: {current_idx}")
    duplicate_list = []
    if current_idx in df_row_idx:
        continue
    else:
        df_row_idx.append(current_idx)
    for other_row in argwhere_array:
        other_idx = other_row[1]
        #print(F"Here's the other_row: {other_row}, Other idx: {other_idx}")
        if current_idx == other_row[0]:
            duplicate_list.append(other_idx)
    #print(F"This is the current dup_list: {duplicate_list}")
    #print()
    dup_row_idx.append(duplicate_list)
#list(zip(df_row_idx, dup_row_idx))

In [24]:
# Create match column in our df, initialize it so that each row contains the index of that row and convert to a list, so we can iterate over it
df_exact_txt_dropped['match'] = np.array(df_exact_txt_dropped.index.values, dtype='object')
df_exact_txt_dropped['match'] = df_exact_txt_dropped['match'].apply(lambda x: [x])

# For rows that are duplicate postings, we overwrite the match column with the indices of all other rows that have duplicated text
match_col_idx = df_exact_txt_dropped.columns.get_loc('match')
df_exact_txt_dropped.iloc[df_row_idx, match_col_idx] = dup_row_idx
#df_exact_txt_dropped['match'] = df_exact_txt_dropped['match'].apply(lambda x: [x])

df_exact_txt_dropped['match']

  return asarray(a).ndim
  arr_value = np.asarray(value)


0                                             [4, 11, 25]
1                                              [675, 856]
2                                          [3, 8, 9, 912]
3                                          [2, 8, 9, 912]
4                                             [0, 11, 25]
                              ...                        
1538    [10, 12, 873, 917, 918, 1077, 1326, 1462, 1468...
1539                                               [1539]
1540                                               [1540]
1541                                               [1541]
1542                                               [1542]
Name: match, Length: 1543, dtype: object

In [25]:
indices = []

# for i, row in temp_df_exact_txt_dropped.iterrows():
#     indices.append(i)
#     temp_df_exact_txt_dropped = temp_df_exact_txt_dropped.drop(
#         index=[item for item in row["match"] if item not in indices], errors="ignore"
#     )

# if search_region=='phoenix':
#     print()
#     print(F'search region: {search_region}')
#     for i, row in temp_df_exact_txt_dropped.iterrows():
#         indices.append(i)
#         drop_idx = []
#         print(i, row['match'])
#         try:
#             for item in row['match']:
#                 if item not in indices:
#                     drop_idx.append(item)
#             temp_df_exact_txt_dropped = temp_df_exact_txt_dropped.drop(index=drop_idx, errors="ignore")
#         except Exception as e:
#             #print(i, item, row['match'])
#             print(e, i, item, row['match'])


df_no_dups = df_exact_txt_dropped.copy()

# Iterate over each row and remove all rows that have duplicated text
for i, row in df_no_dups.iterrows():
    indices.append(i)
    drop_idx = []
    #print(i, row['match'])
    try:
        for item in row['match']:
            if item not in indices:
                drop_idx.append(item)
        df_no_dups = df_no_dups.drop(index=drop_idx, errors="ignore")
    except Exception as e:
        #print(i, item, row['match'])
        print(e, i, item, row['match'])

In [26]:
# Check shape when we dropped posts with exactly the same post_text against the shape after we dropped text deemed similar by cosine similarity 
df_exact_txt_dropped.shape, df_no_dups.shape

((1543, 12), (765, 12))

### Dropping posts that contained no prices, which aren't helpful for our analysis

In [27]:
# Use the len of price_list to find posts that contained no prices
df_no_dups['len_of_price_list'] = df_no_dups['price_list'].apply(lambda x: len(x))

# Filter out results that don't have a price and reset indices.
df_with_prices = df_no_dups[df_no_dups['len_of_price_list'] > 0]
df_with_prices = df_with_prices.reset_index(drop=True)

In [28]:
df_with_prices.shape

(360, 12)

In [29]:
unique_posts_count = len(df_no_dups)
post_with_prices_count = len(df_with_prices)
num_posts = len(concat_df)

percent_unique = unique_posts_count / num_posts * 100
percent_with_prices = post_with_prices_count / num_posts * 100

print(F"Out of {num_posts} posts, there were {unique_posts_count} that were unique, or {percent_unique:.2f}%.")
print(F"Out of those, there were {post_with_prices_count} posts that had prices included.")

print(F"Only {percent_with_prices:.2f}% of the posts that we scraped remain.")

Out of 3740 posts, there were 765 that were unique, or 20.45%.
Out of those, there were 360 posts that had prices included.
Only 9.63% of the posts that we scraped remain.


### Extracting complete.

In [30]:
# Drop unnecessary columns.
df_for_sql = df_with_prices.drop(labels=['link', 'price_list', 'len_of_price_list', 'match'], axis=1)

# In order for psycopg2 to parse our CSV file correctly later, we need to escape all new line characters by adding an additional \ in front of \n.
df_for_sql['post_text'] = df_for_sql['post_text'].str.replace('\n', '\\n')

# Store cleaned data as CSV file in preparation for importing to SQL database
df_for_sql.to_csv("./csv_files/{}_all_regions_with_prices.csv".format(date_of_html_request), index=False, sep=';')

# Store original data, before we applied any cleaning to it, in case it's needed for something later on.
concat_df.to_csv("./csv_files/{}_all_regions_posts.csv".format(date_of_html_request), index=False)

In [31]:
df_no_dups.to_csv('./csv_files/{}_all_regions_no_dups.csv'.format(date_of_html_request), index=False, sep=';')

In [None]:
for key in posts_dict.keys():
    print(key)