In [7]:
import requests
import hashlib
from datetime import datetime
import time
import random
import os
import csv
from bs4 import BeautifulSoup
import re 
import pandas as pd

# Parsing functions

In [8]:
def parse_listing_html(text, file_name):
    """Parse one saved listing page into a flat dict of ``listing_*`` fields.

    Parameters
    ----------
    text : str
        Raw HTML of a single listing page.
    file_name : str
        Name of the file the HTML was read from; used only in log messages.

    Returns
    -------
    dict or None
        A dict with the keys ``listing_title``, ``listing_nh``, ``listing_city``,
        ``listing_date``, ``listing_price`` (int), ``listing_bedrooms``,
        ``listing_bathrooms``, ``listing_sqft`` (digit string or ``None``),
        ``listing_address`` (``None`` when the page has no map address),
        ``listing_info``, ``listing_body``, ``listing_id``, ``listing_url`` and
        ``listing_first_image``.
        ``None`` is returned when the page is a deleted posting, has no
        ``og:url`` meta tag, or has no price element.

    Raises
    ------
    Exception
        Any error raised while extracting a field is logged together with the
        listing url and file name, then re-raised unchanged.
    """
    page_html = BeautifulSoup(text, 'html.parser')

    if deleted_listing(page_html):
        print('listing {} has been deleted'.format(file_name))
        return None

    # Look the tag up once; pages without a canonical url are skipped silently.
    url_meta = page_html.find('meta', attrs={'property': 'og:url'})
    if url_meta is None:
        return None
    listing_url = url_meta.attrs['content']

    try:
        # [9:] drops a fixed-width prefix -- presumably "post id: "; TODO confirm.
        listing_id = page_html.find_all('p', class_='postinginfo')[1].text[9:]

        listing_title = page_html.find('span', attrs={'id': 'titletextonly'}).text

        # [2:-1] trims two leading and one trailing character -- presumably the
        # " (" and ")" wrapped around the neighbourhood name; TODO confirm.
        listing_nh = page_html.find('small').text[2:-1]

        listing_city = page_html.find('meta', attrs={'name': 'geo.placename'}).attrs['content']

        listing_date = page_html.find('time', class_='date timeago')['datetime']

        # Listings without a price are skipped rather than treated as errors.
        price_tag = page_html.find('span', class_='price')
        if price_tag is None:
            return None
        listing_price = int(price_tag.text.strip('$,').replace(",", ""))

        # The first bubble carries both counts; the slicing below assumes text
        # shaped like "<n>BR / <m>Ba" -- verify against the scraped pages.
        bed_bath_text = page_html.find('span', class_='shared-line-bubble').text
        listing_bedrooms = bed_bath_text.split('BR')[0]
        listing_bathrooms = bed_bath_text.split('/')[1][:-2]

        # Square footage, when present, is read from the second bubble. Take the
        # captured digits from the match itself: slicing the whole searched
        # string would keep any leading text and would cut the wrong characters
        # when the text ends with a newline (which `$` still matches before).
        bubbles = page_html.find_all('span', class_='shared-line-bubble')
        listing_sqft = None
        if len(bubbles) >= 2:
            sqft_match = re.search(r"([0-9]+)ft2$", bubbles[1].text)
            if sqft_match:
                listing_sqft = sqft_match.group(1)

        # The map address is optional.
        address_tag = page_html.find('div', class_='mapaddress')
        listing_address = address_tag.text if address_tag is not None else None

        listing_info = page_html.find_all('p', class_='attrgroup')[1].text

        # [30:] skips a fixed-length lead-in of the posting body -- presumably
        # the "QR Code Link to This Post" boilerplate; TODO confirm.
        listing_body = page_html.find('section', attrs={'id': 'postingbody'}).text[30:]

        listing_first_image = page_html.find('div', class_='slide first visible').img['src']
    except Exception:
        print('Error parsing url: {} file: {}'.format(listing_url, file_name))
        raise

    return {
        'listing_title': listing_title,
        'listing_nh': listing_nh,
        'listing_city': listing_city,
        'listing_date': listing_date,
        'listing_price': listing_price,
        'listing_bedrooms': listing_bedrooms,
        'listing_bathrooms': listing_bathrooms,
        'listing_sqft': listing_sqft,
        'listing_address': listing_address,
        'listing_info': listing_info,
        'listing_body': listing_body,
        'listing_id': listing_id,
        'listing_url': listing_url,
        'listing_first_image': listing_first_image,
    }


def deleted_listing(page_html):
    """Tell whether a parsed page is the "deleted by its author" placeholder.

    The first ``<h2>`` found on the page is compared, verbatim (surrounding
    spaces included), with the deletion notice.

    Returns the falsy lookup result itself (``None``) when the page has no
    ``<h2>``; otherwise ``True``/``False`` for the text comparison.
    """
    heading = page_html.find('h2')
    if not heading:
        return heading
    return heading.text == ' This posting has been deleted by its author. '

# Create the DataFrame from the list of parsed-listing dictionaries

In [9]:
# Walk every scrape folder under `path`, parse each saved listing page and
# collect the resulting dicts; the DataFrame is built once, after the walk.
list_of_dicts = []
path = "/Users/pandabear/springboard/CapstoneTwoProject/data/raw"
dir_list = os.listdir(path)

for folder in dir_list:
    path_html = os.path.join(path, folder)
    # Skip hidden entries and anything that is not a directory, so a stray
    # file in `path` cannot crash os.listdir() below.
    if folder.startswith('.') or not os.path.isdir(path_html):
        continue

    file_list = os.listdir(path_html)
    for file in file_list:
        # Only names longer than 15 characters are treated as listing pages --
        # presumably the long hash-like names seen in the log output; shorter
        # entries are ignored.
        if len(file) <= 15:
            continue
        # Explicit encoding so reading does not depend on the machine's locale.
        with open(os.path.join(path_html, file), 'r', encoding='utf-8') as f:
            text = f.read()
        parsed_data = parse_listing_html(text, file)
        # None means the page was skipped (deleted posting, no url, no price).
        if parsed_data:
            list_of_dicts.append(parsed_data)

# Build the frame a single time; doing it inside the folder loop re-created it
# for every folder and left `listing_df` undefined when no folder qualified.
listing_df = pd.DataFrame(list_of_dicts)


listing c3ef226a1bf4ef4080a4b69aa5f663808dd609cefef0feb5804501fa9be01a9c has been deleted
listing 3f4ce9c85d102d6da2ab87fd34c38d797a80832f93d1c13acd0bcf9f29d4e03b has been deleted
listing cd54be649ad10badbfcb63bd4c4a368b548dbd7da493464a726549658f9b5968 has been deleted
listing 00c0b60fca283730cb2b417f04a85b2ac0bf4c935d37f62166100dfec3e4afa5 has been deleted
listing 58b018f9e3542e661ebb75709ce2db0adcd675115af6cdba3bbedd2b50843bf2 has been deleted
listing 075eabcec6f68958678c15ca0effab9e78a03036ff3ddcba03df92056b6c4295 has been deleted
listing a9664a79bb9b68fdf7fc2955bc0ef0678ee35b2840e3c80c1e485071e72d07db has been deleted
listing 405f2f89c11fb5d37f6100405a50307385ec2ecd2e945555c9b43b614a92c5f7 has been deleted
listing 202559b57c80cbb3022b36004db10a4db58241e4df8461723e5099564a8e08f2 has been deleted
listing a489473984c40d69ad8153bc5be0e2f341d309802f2d49c417a66fa684529705 has been deleted
listing b3543edce1dc860b3ba00d1dc7ff41999bc88113b0732412594e18fda9524694 has been deleted
listing bb

listing 5e6f403a0b0fdb2b5ee0a693bc8c1e46f9a568931eee1e24a48b634a13ea7b82 has been deleted
listing 26ecd851f93baa5d9fcdfa5f4b0dafbdc52a1cf14475e7b7bcefc1329b83c607 has been deleted
listing 13f2e305008c6e88fced794b96fac17c22b635aeeaf16529f9ee61bbf54eb79e has been deleted
listing 1499b1cdb2aa351993605642204debd1724ede5279cd437f8a1361f491d4ff4c has been deleted
listing 1facd83bddda79176fa4d051bf485521e3ecce77eb2f9bf24c78fc3465eccdfb has been deleted
listing 784a0b608b43d4c7ef2ebd9dc0e47f7f13999e13908ce84a8735a6c5f06773de has been deleted
listing ea505dee6182f683619008937948f91836712f385e283946288f90b45031c24a has been deleted
listing 711c9b5d5fa7cbe8ef9f3d10d3565bafb7cb55135a463c9201697ee232253f0f has been deleted
listing ac352899392db9d2330387f83002ceebb8b68bab36e26c76fdcb2eb4d2c919a6 has been deleted
listing 5edce9bc5381a5bb11316e55ce39b7a96c726eca9fa16af4c1bc5d377e4bea08 has been deleted
listing aef08d71c0229f80bc32070ec0816cac957729f16c1e4d94ce0aaf14a677a62a has been deleted
listing 9e

listing 1c30dabe40539e80ce53f8a36d8981354c8e05141b1941c7b0c0fd80d3fb56c7 has been deleted
listing 028913c04b05822038b68230ce8150a8863f8dbf0b7cd4e66b3487fcfb6bfb79 has been deleted
listing 00da2383dbf744a0bb35bbdf229eaec0504e4330842fb801140581b42601d809 has been deleted
listing 77535a13ea894fe1b1b51db2e29c33e3eff13858c389d1bdd87ed491d9bef511 has been deleted
listing 57894ed1887ffa46bbd18ab2c634ce6ae093cea683f37a817144875eee40789c has been deleted
listing 90a6d2641686a88c62db993c317d63a83b08ffa82b9fd3c81628a29b79e16bc8 has been deleted
listing 76bd62bd473210d2f015d661af9d8fe2f5b6dfc60f47c846629dbe4f19468381 has been deleted
listing 4a45766344169a49d03a0b10de4f6cbaacd850c73e85413a6835a1e84b1a0530 has been deleted
listing d703ba676f8efe5e302012b4f7ab39f60b45ab3aaa5899de4b4ea6b9abd38e3d has been deleted
listing f3a4bd8d9a34422734d1da0e4702df93a334ba1d745fadb1563c1fd3e28a38a9 has been deleted
listing b0749b367c29819a65ecbe63c847e97c7dde7bf7013adfbc89b3502881632abc has been deleted
listing d7

listing 8eed6b17707dd43e898e6afbd7d279f9f3e845f26a4c3b679b6f8aa849e43dd5 has been deleted
listing e85010522f800de77e8fd82549ff44cc01d9ab5b78799e2ee429fc0fb18a4ab0 has been deleted
listing f4b208fc87aa63d1425261c99c7dbb22c38d849ed7f3a12f928403f9c55ae02d has been deleted
listing 1da2569bead691e42984de0953e63b74d55773d9654af42924ed248a928513b9 has been deleted
listing ebfd9113e3ceacc6ab51eca901a409a932b1ecaa3fbcd5dff73c7b06e00f795b has been deleted
listing 41d856cda9abf5df2ad14c49f615b5a18374e1a370b6fdd0a0fe3c32994e61b6 has been deleted
listing 5c9a78f9c20ae922e6008c0cc89bb2b8e85939c90fcbbd47517174dc02c775c0 has been deleted
listing 5064888afec96047ab8afe015fdd9047d0360b529bdf57cfb3b9b656f8725bae has been deleted
listing c92b9fbe52321e85a68d4ea79dacabdd7262878a1f8b0c363220709c42401b8a has been deleted
listing 2f3a963a366d01c0613718621add202ac3b7dda539f45edb1bbab0c299671e1b has been deleted
listing 806c21517cce9c4943fb77e9246636a13d569ddb1c4fa128d684689ba4790470 has been deleted
listing 82

listing 2f69bd2a04103d3946b676ed85c9b53d1240468a75140314ac9eb6733681c26a has been deleted
listing 077a538cd2af8c172ba50eb413bc08f40d5ccd0dde09682d3f0061ede1b7c386 has been deleted
listing 3e42749dc271730af1a602f14d6c8c01b0d3ee02cd30b2267a47696bc4092d08 has been deleted
listing d9b44eb54046a6f24bd6de5abe11c6aa147ca3f9f345b10214d5e10b2e40dd71 has been deleted
listing 4eeb4fd6b70f955346c7c97da3b562732e8606d7649b3502086110c40da1655e has been deleted
listing 587d5b6b54e5bc5effd5a25c8c3cc3bf3b0a3614255a7dd375ae0ceb500597f2 has been deleted
listing e11f49bf762f01aea6aa225535f9c074f0d800d44cca68ee18237e4e6560b9ac has been deleted
listing d70efae0db93bb387bfa57ae25d8f27ba426e3d3f29c87d3e07bce4ada163ba9 has been deleted
listing 041a310ecbaeeadd5c943b025ad6b47687759f8042136a54099329a7f11342a1 has been deleted
listing 7e3e4351324e2b33638b7e9299486fa0ab4f3f9eb37f1dbec246e8b2ce04531d has been deleted
listing d1707da473c587a3e264f63d23a7306e1eac1f216eaf86a144681e70a2810d0c has been deleted
listing d7

listing 7902b03c53f1079a78a6e0a8321d84c80d57bcb4289dc9b273a7a76bfd394c4c has been deleted
listing f7b6f910e10d7520c85d777d64309c93d4b1817eabf681b1004ff5a65169fca8 has been deleted
listing abb28934d4780a6a3601f57c27f56d0392c672792d116b66186b1128685635ff has been deleted
listing b8e1cf3a97b55c48baa9ef2b0d1e9b867a0172671d20873100f7d31b6a306b70 has been deleted
listing 76ced54e814d38a1dbc984c88422ecfb013340a897e99503615ad8d9c81ceeef has been deleted
listing f82b034a3b430ba79c91482c93a1829b1941a8912e51646e8ea79e5b5a62dece has been deleted
listing 41af378d0702e12309537c33ef6b8ebcebe82305a59501cbbdfa0be798d6af4f has been deleted
listing 54cd3b5233b968181e9b8dbc1577e3d80ee329e49ca379f181207949eac8a966 has been deleted
listing 12807eb0dcad3a540a7857fddfe002c95a8c231e8df66f34bca69c37dcfbe288 has been deleted
listing de3800c6f17aaeeec1d98be2494e85407a713bbbdeb7084ccaf8b9f87f66ee06 has been deleted
listing a18ab474f68b8b67ac17389b9d136a975b0c278b31876ee4e9e862c85c64b475 has been deleted
listing 87

In [10]:
# Index the listings by their post id. The guard plus non-inplace assignment
# makes the cell safe to re-run: once 'listing_id' is already the index it is
# no longer a column, and an unconditional set_index would raise KeyError.
if 'listing_id' in listing_df.columns:
    listing_df = listing_df.set_index('listing_id')
listing_df.head()

Unnamed: 0_level_0,listing_title,listing_nh,listing_city,listing_date,listing_price,listing_bedrooms,listing_bathrooms,listing_sqft,listing_address,listing_info,listing_body,listing_url,listing_first_image
listing_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
7518473082,Charming Update Duplex w/Large Yard,burlingame,Burlingame,2022-08-07T10:39:29-0700,2750,1,1,,Oak Grove near Acacia,cats are OK - purrr dogs are OK - wooof dup...,"Large front and back yard, along a quaint neig...",https://sfbay.craigslist.org/pen/apa/d/burling...,https://images.craigslist.org/00k0k_9yaPmYku1l...
7522326058,"Spacious Studio * Minutes to Dining, Shopping!",burlingame,Burlingame,2022-08-16T17:17:40-0700,2500,0,1,,1401 Floribunda Ave,apartment laundry in bldg no smoking attac...,Welcome Home to Classic Peninsula Living at Th...,https://sfbay.craigslist.org/pen/apa/d/burling...,https://images.craigslist.org/00D0D_aFhw5Xkeqe...
7510413512,Where City Living Reaches New Heights Come to ...,daly city,San Francisco,2022-07-18T13:00:26-0700,3450,2,2,1130.0,,application fee details: $39 cats are OK - p...,Highpoint Terrace San Francisco www.highpoints...,https://sfbay.craigslist.org/pen/apa/d/san-fra...,https://images.craigslist.org/00P0P_pHGfLJMzGf...
7526323338,"Wi-Fi in Community Areas, Spacious Open-Concep...",burlingame,Burlingame,2022-08-26T12:02:23-0700,3466,1,1,793.0,1008 Carolan Ave,EV charging cats are OK - purrr dogs are OK...,"Burlingame Spirit, Anson Prestige Brand-New Ap...",https://sfbay.craigslist.org/pen/apa/d/burling...,https://images.craigslist.org/00o0o_bypIwAvRZc...
7503964545,BEAUTIFUL BURLINGAME HOME FOR LEASE!,burlingame,Burlingame,2022-07-02T10:31:55-0700,6500,3,2,1960.0,1711 Marco Polo Way near Clarice,house w/d in unit no smoking attached gara...,"Beautifully remodeled 3 bedroom, 2 bath + sepa...",https://sfbay.craigslist.org/pen/apa/d/burling...,https://images.craigslist.org/00y0y_5q38PKEa4B...


In [11]:
# Quick size check: (number of rows, number of columns) of the parsed listings.
listing_df.shape

(173272, 13)

In [12]:
# Persist the parsed listings for later stages. The target directory is created
# first so the long parsing run above is not lost to a missing-folder error.
interim_csv_path = r'/Users/pandabear/springboard/CapstoneTwoProject/data/interim/listing_df.csv'
os.makedirs(os.path.dirname(interim_csv_path), exist_ok=True)
listing_df.to_csv(interim_csv_path, header=True)
print('Saved to csv file')

Saved to csv file
