In [1]:
import requests
import hashlib
from datetime import datetime
import time
import random
import os
import csv
from bs4 import BeautifulSoup
import re 
import pandas as pd

# Parsing functions

In [2]:
def parse_listing_html(text, file_name):
    """Parse one saved listing page into a flat dict of listing fields.

    Parameters
    ----------
    text : str
        Raw HTML of the listing page.
    file_name : str
        Name of the file the HTML came from; only used in log messages.

    Returns
    -------
    dict or None
        A dict with the keys ``listing_title``, ``listing_nh``,
        ``listing_city``, ``listing_date``, ``listing_price``,
        ``listing_bedrooms``, ``listing_bathrooms``, ``listing_sqft``,
        ``listing_address``, ``listing_info``, ``listing_body``,
        ``listing_id``, ``listing_url`` and ``listing_first_image``.
        ``None`` is returned when the listing was deleted, when the page has
        no ``og:url`` meta tag, or when it has no price element.

    Raises
    ------
    Exception
        Any error raised while extracting the individual fields is logged
        (url + file name) and then re-raised unchanged.
    """
    page_html = BeautifulSoup(text, 'html.parser')

    if deleted_listing(page_html):
        print('listing {} has been deleted'.format(file_name))
        return

    # Pages without a canonical url are skipped silently.
    url_meta = page_html.find('meta', attrs={'property': 'og:url'})
    if url_meta is None:
        return
    listing_url = url_meta.attrs['content']

    try:
        # NOTE(review): the [9:] slice presumably drops a "post id: " prefix
        # from the second postinginfo paragraph -- confirm against the markup.
        listing_id = page_html.find_all('p', class_='postinginfo')[1].text[9:]

        listing_title = page_html.find('span', attrs={'id': 'titletextonly'}).text

        # NOTE(review): [2:-1] presumably strips a leading " (" and trailing ")"
        # around the neighbourhood name -- confirm against the markup.
        listing_nh = page_html.find('small').text[2:-1]

        listing_city = page_html.find('meta', attrs={'name': 'geo.placename'}).attrs['content']

        listing_date = page_html.find('time', class_='date timeago')['datetime']

        # Listings without a price are skipped silently.
        price_tag = page_html.find('span', class_='price')
        if price_tag is None:
            return
        listing_price = int(price_tag.text.strip('$,').replace(",", ""))

        # The first bubble carries both counts; it is split once on 'BR' for
        # bedrooms and once on '/' for bathrooms.
        # NOTE(review): the bathroom slice keeps whatever whitespace follows
        # the '/' and drops the last two characters -- confirm this is intended.
        first_bubble_text = page_html.find('span', class_='shared-line-bubble').text
        listing_bedrooms = first_bubble_text.split('BR')[0]
        listing_bathrooms = first_bubble_text.split('/')[1][:-2]

        # Square footage lives in the second bubble when present. Take the
        # digits from the regex capture group rather than slicing the whole
        # input string, so surrounding whitespace/newlines cannot leak into
        # (or truncate) the value.
        bubbles_sqft = page_html.find_all('span', class_='shared-line-bubble')
        listing_sqft = None
        if len(bubbles_sqft) >= 2:
            sqft_match = re.search(r"([0-9]+)ft2$", bubbles_sqft[1].text)
            if sqft_match:
                listing_sqft = sqft_match.group(1)

        # The address block is optional; keep None when it is absent.
        listing_address_element = page_html.find('div', class_='mapaddress')
        listing_address = listing_address_element and listing_address_element.text

        listing_info = page_html.find_all('p', class_='attrgroup')[1].text

        # NOTE(review): [30:] presumably skips fixed boilerplate at the start of
        # the posting body -- confirm against the markup.
        listing_body = page_html.find('section', attrs={'id': 'postingbody'}).text[30:]

        listing_first_image = page_html.find('div', class_='slide first visible').img['src']
    except Exception:
        print('Error parsing url: {} file: {}'.format(listing_url, file_name))
        # Bare raise keeps the original traceback intact.
        raise

    return {
        'listing_title': listing_title,
        'listing_nh': listing_nh,
        'listing_city': listing_city,
        'listing_date': listing_date,
        'listing_price': listing_price,
        'listing_bedrooms': listing_bedrooms,
        'listing_bathrooms': listing_bathrooms,
        'listing_sqft': listing_sqft,
        'listing_address': listing_address,
        'listing_info': listing_info,
        'listing_body': listing_body,
        'listing_id': listing_id,
        'listing_url': listing_url,
        'listing_first_image': listing_first_image,
    }


def deleted_listing(page_html):
    """Tell whether the page is the "deleted by its author" placeholder.

    Looks up the first ``h2`` element of ``page_html``. When there is none,
    the (falsy) lookup result is returned as-is; otherwise the result is a
    bool saying whether the heading text equals the deletion notice exactly,
    including its surrounding spaces.
    """
    heading = page_html.find('h2')
    if not heading:
        # No heading found: hand back the falsy lookup result unchanged.
        return heading
    deleted_notice = ' This posting has been deleted by its author. '
    return heading.text == deleted_notice

# Create df from dictionary

In [3]:
# Parse every saved listing page under the raw-data directory and collect the
# resulting dicts; the DataFrame is built once, after all folders are read.
list_of_dicts = []
path = "/Users/pandabear/springboard/CapstoneTwoProject/data/raw"
dir_list = os.listdir(path)

for folder in dir_list:
    # Hidden entries (names starting with '.') are not scrape folders.
    if folder.startswith('.'):
        continue

    path_html = os.path.join(path, folder)
    # A stray regular file next to the scrape folders would make listdir fail.
    if not os.path.isdir(path_html):
        continue

    file_list = os.listdir(path_html)
    for file in file_list:
        # Only names longer than 15 characters are parsed.
        # NOTE(review): presumably this keeps the hash-named listing files and
        # skips short helper files -- confirm against the scraper's output.
        if len(file) <= 15:
            continue

        # NOTE(review): assumes the scraped pages were saved as UTF-8 -- confirm.
        with open(os.path.join(path_html, file), 'r', encoding='utf-8') as f:
            text = f.read()

        # parse_listing_html returns None for deleted/incomplete listings.
        parsed_data = parse_listing_html(text, file)
        if parsed_data:
            list_of_dicts.append(parsed_data)

# Build the frame once from all records instead of once per folder; this also
# guarantees the name exists when no folder qualified.
new_listing_df = pd.DataFrame(list_of_dicts)


listing bd2f42238071ca5af0f51ff5d03e5d735d241e51dc0fd7b48bfdb285ab34ae70 has been deleted
listing 16684e5f69795abf39ccb1636e4643f54c3ed25e9f630564874e35ab00c50643 has been deleted
listing bbdf06f159200b28adca3d0e1a10fd4804734f275fe81b22210294854705bb7b has been deleted
listing 090c7d6ec91186068b21b2cb64e1cf68529cbbe00788764da3f290c2ac5a4a89 has been deleted
listing edd31a8f6a013d2b4681cad06321a0954c2bc16af0768af73c677bc471381e1e has been deleted
listing 2a17a6e8e33a2deff19eccb9c5ab48f9efdc1c81c4665f47b864ac4e884c83e1 has been deleted
listing 4dff65c137f062e598d422ea8ab2ace593ee615c4ff5f1aea8c5a4bcbdd84765 has been deleted
listing efac3de9f6c47188415f2cc3a45e8f556a2f0c8775d9e247889028c67e16bdbd has been deleted
listing 4c126db3423111b873047b3ecb3428f86fdf20ba03039f492624a36b0f145f5c has been deleted
listing 0841adc649ce290e973586ac8c7c4c6b445545815a1dffca48f3c0e14e2bfe37 has been deleted
listing aa17c81011918fb206b5c9b67a36370e2e43e3c4471e6e5e5693596b8f247265 has been deleted
listing a5

listing a68c1039cb2dc802ba720d77b34fc526bcd4962e6df0e12565564ad18a025289 has been deleted
listing b3753398048d5d8726cbc47e53093b34b3ef982edd05429a91e90a34624da8d8 has been deleted
listing 369c4eb7a669053671ef9ef209707c5f516f6d60d32e6ca070426157893dbaa5 has been deleted
listing 1467836236ffa78ca1002ed823e078a651ebe6c80b19976ee76143a4c6f965f5 has been deleted
listing dd58812ccf3a9011d0e8788327a1116b5fc369b3cc913aa8e80e26cae53318fc has been deleted
listing a05111172bd8389d1662871b37152457d9af5c6fc053798f5cb6a5fd2cd193dc has been deleted
listing b28d5d548c7d92d87f6db49089824954285e559804c82f6d2cc65d78a224be0f has been deleted
listing 592ebbcf420b0ecabed960bf03c0715500749bf2c8b96caf105cfc4028abf7c3 has been deleted
listing da39ff7a32bcaad356bba1272d6e0abbf291d277ffc57f6e5403d462a50e78d7 has been deleted
listing 1ba5a25f1b37f8a9b2bf2d163205b86cec58ce15c5689405a4411f0c5520c50c has been deleted
listing 576be37c10847e4fba2a244c0c92c8d005135d82d8bf8c365b835664deb53037 has been deleted
listing 5e

listing e123cff1c76ddb5f06b402d9725455eb87d4cd9b146275bdbd322dcbbf8b1b99 has been deleted
listing 8984b518bd86f38ec4f06d5daf3437ee88f74953992a2e3cb6615f34d4b77996 has been deleted
listing 7f188bca4f43b81f02504fa29163f8107596588e402720635fd51b2ed41d4abe has been deleted
listing 21f4982d1d59b17f215a7c0fa3027fdb22bf6101afcb1841af22decd87a7440e has been deleted
listing 11927f6af6399c367fc23138bee71b3e3fd04f791c4a9b96f824ecd7e3b4687d has been deleted
listing aaf150ca7fa9feb3eb5a4750c0ba2514c4dd96fdf5483d2b700044561c2a0e81 has been deleted
listing 96a6a0e0a5be00def85f1704eb075fa7ca9f98b4ec4b9e9e0a403f93f737421b has been deleted
listing bcdc52223e7a6c4d38db1a8fb7b99f4a805c1101fdd056a6ad7453a8d46c3b48 has been deleted
listing 48da75b3444770546c257d9db9315f7abdd7986319b205928c8bf78251b60f50 has been deleted
listing 404fd8d872643e7f9e83e1fcde5350c002442fc4587607655c6a9bde81161571 has been deleted
listing 44790c132730e1849460b88808b00a7cd3c45288faf1d7796e280391f1d27ab7 has been deleted
listing e7

listing 377f625896f7ac1f2ec392d8d4f1946574d1c8c148b1aec628f3efe85cb83162 has been deleted
listing 2178d5fc2724fbe0a78a5d144694edbfdfb34c2e1237e7da430cfb57f091c9a8 has been deleted
listing 7612f1497b8bc232f2151e239dffcb67df54fc1dc1d21d25483963bc9a151122 has been deleted
listing 08607b36a249c453c579bfe37b4b7daed0b5d2279d0cd0e596872dae6eed3551 has been deleted
listing 6ec2a9dc508785d72faf3d6293eb7eac5d09a737d73290edab2e26e99e2c5122 has been deleted
listing 0e5b09b9f8ae88dcdd9680bdfaef428c31a9fc3ff723355a741c71b55206c079 has been deleted
listing 65dbc4185a7512d5c6e93b3a5ab2d3abb3b066bf44024fe8af01bc637dcd8d6d has been deleted
listing bfa5fbd49f3ffc223705f3a4f528511276bac1f20e72e43a91790944c7041839 has been deleted
listing 2e943ad3bb633428f774da0ace9e5925e2facb7681d01ff83ade8d19ea3b042d has been deleted
listing bfa0aa60b452fde438a08225315002da7d12dc190b94ac710fcf9b7b3399afbf has been deleted
listing fe331e6b1c6706af737690ca3ea5a38b54ebaef22a41fc57d9b4821cdc9e49c2 has been deleted
listing b2

listing cecc2a7098a964cc9e905397bc8e9b905ae92e8315f07a969e0fdea610762250 has been deleted
listing 08d3a1cd826dd50c9860bca263a5cfbd375e558af9708ab8edd07a06638de8c2 has been deleted
listing 46d5c7208f9916c09d5fa8eec50afe823a98b0c093c9849af9091d2d903304b3 has been deleted
listing 6f52cd11edfa7c5c2ca65323e08b4d04f568078702d2e0d2f1fca9764bbd8886 has been deleted
listing 3390516f9f948c10dc2d29554a7845ae90afc8b9ff834775244582930dd83548 has been deleted
listing 8851201d5dde84d121303a275294812bcd2ad635142507a8fcd232e256fddfb5 has been deleted
listing 7250089eceaa3ef1b053ca8142bb067ef9ca953905894dbedabfcfa6f2408df8 has been deleted
listing 88c222339b8210a6f004faad5ccb45bfda10dacb8b3061251f90edc8a9db2aae has been deleted
listing 4c1d0a667d44338234ab7e96232a87004edbe4bda742b0d0aea798c5c763a7ef has been deleted
listing 3da27d7fbe39f08bdaaf17e5c9adeaab7a69341eb22a030ead52c183deba3bfc has been deleted
listing 4244483fe447b378ec600927850540cd5cea99862c858ef2e1b788cfc16b4c75 has been deleted
listing b9

listing 569a6eb979ef0f0ce353908eb11dc0e4bedc2f7e454933f080f5fcee7cc83e9c has been deleted
listing 8ee5819ce7c6f069bd9e1737809ffc23aaf4cea36b703f927093796ccee4b58f has been deleted
listing 0030dd3cdef5e532ace30e994a73e75851241abb58e197b5d8bf6f88d600d3f7 has been deleted
listing 668f08269fab5c24197d9ae752396db192e3294d28793821c3fdf4dd6fb1586d has been deleted
listing d88f0a0a2056dcbd29b0b7ccc8c417a3a548b6cf75b7265129694d6abc17d320 has been deleted
listing 1c20464827a3be4e77ea7b32932f61686eb8a6f2d7b7a2dd1d837b6ed88b2a98 has been deleted
listing 51343ff5a8140869be40eb58e0cc4ac821734d3289cd92dcb762322d2e8273a4 has been deleted
listing 7405b637ae40491680fcc1fd7ee69a4e488baf2181683f208f29feba6fcbd8e8 has been deleted
listing 0fb0add52b8b8a0681f785cd99e0faf76ece17282a9e6b9f7483c92f46352cab has been deleted
listing 402aa5eb871f299630a86f2e38d205c7b981e469531b438cab4bf0002d77f2aa has been deleted
listing dcfb3d37066619dfa4e624d8a8f41d23d449fa275565894e41f4853625b8e01e has been deleted
listing 44

listing 0a8358ff043d13cd1eac18f31b67f95a9565da796a5131c8bae4ab652899779e has been deleted
listing 3430971ede4cc31569384b767c2ff23e9c6c556493b938e77d453d3d478fc313 has been deleted
listing d2731a5a38c4d46b00b7b91a9ae256fa55a58a81423a3e6971ea75029e09f3e2 has been deleted
listing 2eaf39608bfb94616907e5f69c69bf2184a5e3ee908dfefbcd83dbaa54a9d8c2 has been deleted
listing aee3ed3c70a0bbb370964b401351ce63976dbef740e710f87b8b66f46987fbbb has been deleted
listing fc969aaaf9a05b275efeb6d4322b4cefb2ae2689ae9fd7ceac3178ff1cb45a95 has been deleted
listing 36f31d9d6dfff745ea42eb8699b6e0524f6ff09f9de95ba01951a8a33190a49d has been deleted
listing 019ee369fe53f44b79ffb976aa9662e6f70f0447b58444921e1103bf63dde34c has been deleted
listing 0ce4dc845e4470c39b7fc7aa121b58f68b1804c994b0c73b1f36716735138068 has been deleted
listing 92e3ac76f241d8ce3375e0924371cb7871bdf7ef2a174ebe0d0d5acf8c296400 has been deleted
listing 8d7a29698b3b80786bb3101cc9bfde5ede646c09ca644e0cd9230d81c068335c has been deleted
listing c3

listing 0bfd4c325d8566fedd83040c0dfb84d86a4d6f3cb30eaf78c4b4bab6b2f47cc3 has been deleted
listing a039a06a3d892605aca1ffecbb76fd7b8bf3c4601c9e6f8318825a8cd910a2ea has been deleted
listing 6788b64e29696ddcbf8c1faadcd6958649cb9fe8afc25d1809af999ad5d157b8 has been deleted
listing aacec1b64b97886e04936194b458bfd29bbcd3646063839f3a3647bface27f0d has been deleted
listing e34a897826e567f99d841e5fe0308d57c37057e39f77665112bb32a1d4c9f141 has been deleted
listing 73e0233acfc5fcff99cdbd368a49c433394e3968a6dad30d47ba23f866d2ac71 has been deleted
listing 2d609d40fa00c51ec1d2d796927863be377adbabd76c5468f567c0fd4e0611de has been deleted
listing 1ac47d204731f40c7af28f2d53fc0260344761b86a627a70e4a45bf8311d88d0 has been deleted
listing 257a00528e3f059b8e877ddda258840969bb069abbf3fac962e0956b6853bd18 has been deleted
listing b0d35532733b46b42b0c827f8b3c823d08d832de921d53006a72fb1e41e6a03a has been deleted
listing 2625e56bc6d4c73ad2bed91fa86db3d42265a90762b4859910ddafed418159a5 has been deleted
listing 98

listing daace97bd104d16dd2b56889c607d6121bef7f8c42f49104d61b1efa629d3c7a has been deleted
listing a3f10bc112bafe554e7e6d235ac392157ea83040f41e12a219693bd06b5f0f0d has been deleted
listing 92327fcb2eb572e915134c55686e47269d3c06548f051d4347079048f854eeed has been deleted
listing 6826877c9ccc546af73f4bf2a75d875b224ca33e174821d350b363123902b36f has been deleted
listing 51bafe225abf2bba5649057f274f2cd4147cf6227f4528dca0f892e5fa72f50c has been deleted
listing b0b6c8853aa60a52a9cf3c3ebcef5e31657b0e64ddf14267a70bc49c8dbe828b has been deleted
listing a7115369418d76feb68a1e3a3a2c604d8e787265b3ae22a65db5d64bf48e288f has been deleted
listing ef2a69512f9c04f624cd1cc89c7fd47dec545bf89c8f5b17e3846b942faaac99 has been deleted
listing 0250bddbb3857a2b13bc43a179634e25770bce429c817e03e80fcc4cfe70194a has been deleted
listing c96c5135641d0ce1e03d77c1400a56e81f1d0e920445e05a698fc9733b501003 has been deleted
listing a1ab519915f40e59e450f8764f255b5b36623199f44cc39ead5490db75a9cbef has been deleted
listing d6

listing 473775230b472941dbd61e64eeb4397a353c438400a4f19b2f80d43be74b4b36 has been deleted
listing 4d601dd66c8f490d1744bcc6ca537612f37d049036d8b8c9cef38bcead9a4a41 has been deleted
listing 3a15bbb4680041555c7654fd23a2a938a789668bd1b1fdc21a98e39ad2d692ed has been deleted
listing bbddc80e757c9889ccd43eb98d51701f4a36be132dd6431393fa19b511becf5d has been deleted
listing ffe31c8155866ccd21cf7bf1d1b4ea8ce5b37491c516506ec101a5d1812828d4 has been deleted
listing 6faa7c80668cc1edc18e606c33aa0d34e8ef94cc1f191ee49769c49c2bcb2b45 has been deleted
listing bff6e0043051d1f76fa002ce417d759cb7e3f5418b8b8b07c2498a550180d080 has been deleted
listing 19cd47586f45da0b1a657a0e3daaeab96dee34951d4b5aacd444fe0a55543c65 has been deleted
listing 888918e1b312752f3e3e092f66e30a5e72befc018c189cd3feee7c87a15e50ab has been deleted
listing d57444389d0e5736e2d92d1641e4f27fe1f47d1ea5f87ba1d9285d05af11b455 has been deleted
listing ec6db3330716187820dc1a73314a61411a8a4bac5e1a9fc3b2537c8f7ff468f4 has been deleted
listing 72

listing 31a1f0b0afbad35048201475de3af2c8feccda28bdfeeedd395d8dff8632613b has been deleted
listing 0ff274886923300bbb33d7bdae24b4a0f65d192143995d3d22da1bf6490b24f8 has been deleted
listing 73584586c52d1b8d778f2cc4b56d8ac51c079c1acef947c46cea39942a885627 has been deleted
listing 2cfddc4d75ba853fcd54e8ef32c4502b6a5aeb7a89f3a9cae9f9ab0ae15190d0 has been deleted
listing 9b6720e33bcf8940cdfe34d5ab2524cbf2d93ba166f82281277ff539bf0c9ab0 has been deleted
listing 4446f5ee85f1995e3c5620e4e6dfe68c3036b18c2b7c4ae16c693e9e462e15fb has been deleted
listing b5fdf61c6f8f97372e8f85a11c673430967ebaf8c52a2b33693f7992d67201e3 has been deleted
listing be9c408c5e1f8cf57f5d157f301d6aa4e8b73e7561954859af5fcdce4174821b has been deleted
listing 76797e5df6ac8db35b1283280c65ad4b365d328bcf7d33f5853c16933f2181d7 has been deleted
listing b40a2a50b5e073b32684aab46f148ab8c66d25f3dfc54fd2f6e9c1d3005f0989 has been deleted
listing 8148b064e83d89cd524aed1c70d0bad864165115a2f8e12167f4ac0fc334cebc has been deleted
listing 29

listing 985139523c3dfaf66e75e0bf94457b8957d898f8a3ca4cde213d77416d22db6f has been deleted
listing f244f48be5137b1c13bf5ca3355f8872174bfb965fe865796744a7f67dec3160 has been deleted
listing b0948530e126da7ca0a03c5925b7075902b0afb44e9bfb2e15967cf79168e977 has been deleted
listing 17c43c7114400f3cc314bf10c10eadf203909b80de63aa202d89969697cbd764 has been deleted
listing 1de994658e0b861b5d24cc9181e06af26264b0585b74aed35c560f0fed63f58c has been deleted
listing 642933e57bdfca60759e0c46bab19947b279cc98f4980eb259fd77a4c4105a9d has been deleted
listing cfffc80b2ddcb7f419dd37467964813696ecdfb100f69842c6761a4b9f96696e has been deleted
listing 09d5b6608295566e158c35c2368042502328488cf4f508bed80c880c211a3470 has been deleted
listing 53129fbe62032a5a40a40078efc579cb4115ace77cc958f1161d993389b30061 has been deleted
listing b9c5e66cbff3a84241ce452afd923422e0c4759562255ff9107d4e770baf24b1 has been deleted
listing 2d17adc033b150f4dcdb42bd73baf3136169bc43d52d61b9eab3e657da650a8e has been deleted
listing 30

listing f1791febc94d5dfb190d2e204890fad7d706f41563fb14ce19332b171075e4a5 has been deleted
listing 92f52f3e1e8363a209b791d79cf14a87cceb9437e23897b2628351da28fc44d9 has been deleted
listing 8708ef63385978e05441ef9980e52a6d883349c6c06966d260e48b71361f2eb7 has been deleted
listing 48315025f1891fb878c58bab3231d7eed29b1a2e6c9930635aca70505cfcb0d6 has been deleted
listing 6eaacb36a9b00c59ccf506424ad79f66b7abfa238209ae6019699ec8c81ac01a has been deleted
listing 065dcac2ee9aed33a266edef1eb2d083974d1cb063727a18b6e496f0c1168ac4 has been deleted
listing 9ae83cd4f472e8773b2a6000b85ac8aa84a2695cf4c48be1873f19cf423d2aea has been deleted
listing 0a79c34ef4c1f20e864dd73bc8f198f80369a44ffbeee853b86a9e4a8203d886 has been deleted
listing 0b6b1a5b3b2d60cb6ce13c92b4c7494f529c833084a016f8faca244dd45baf52 has been deleted
listing 8581d54d778b1deb79ee21232a64c5f03265e72fd39a29072251ca286842b21b has been deleted
listing e038a4c497f59f6931f4032572db07b6e11cd0ade03bfbe1ff6a3bc5b7a39c77 has been deleted
listing 1f

listing 5c29b1cbbca5ac465a8f07c9d402a8b8b58420795108b26afaf554d9c31415c5 has been deleted
listing 6959006c8ae6fa6fba94ab879d833656c18200c20a1187c356b0ef48e9bf1f9c has been deleted
listing 7f04ce314cee448407e6e7fd07d6f7f64b44f58adc24b1499c33353c96ca633b has been deleted
listing 95eb97883127e5bcd98ba7059219c5835e69bb54fa506c9bcd99f267ccef909d has been deleted
listing 338ec271b95db1d89824dd27dd295ddd8958fd5018ae54c6a73726218b22e637 has been deleted
listing 49b225a42a25592c3ce3f1857b1d1256427281fb2f8d58924a1034103911247e has been deleted
listing 0d19459886430aeeae7c82d5aed16243ef1f2342cb3c3dc0df149b18c32dacb8 has been deleted
listing 029f8c137d798b9903f85c36904780f19c5d9b2b73f144d1eb4da0f0c2a25ce5 has been deleted
listing f9f277b1338607829a24058b3e31f1ee576fc78f095a7d397851059f6ffdb6f7 has been deleted
listing bb189adba40267cec02a90e026ccc61d7e504307e9065f7d418289f9dd66dddb has been deleted
listing 424d941bcbd007e00b5184bf69b2cb53cdc50e286f0df963ac77343d1f4cb43b has been deleted
listing b0

listing b2c0e07cb0f5183916d4727fc9e0c3db3bee6cc063acd7306e96d7f0fd8d2371 has been deleted
listing c949700d49c89e1778a2667520379f83445e433a548a09a7372f989c9096f8f7 has been deleted
listing 9a27a7207dc42b136575c9eb2ce5cc83f3d7bc149c5e624fd6a0ddaa0cb5e7e8 has been deleted
listing 36be80fb5323f16355dca6e001a908ce0c75b228563ed9250709d19915faf1aa has been deleted
listing 69bd32fd75660e7b99252800f1d324dcb027d33cf47adfcae9474dd44d929552 has been deleted
listing cc8afbb711371d45f0c93ab2829fa2def78806e6f6250fdea28f4b69f8e327ad has been deleted
listing 36d59857ae2506459b202853b5ea3d27e61a288e970c124c17d2f0ed1f8290c2 has been deleted
listing f2b823f0908a8f60b23ad80c2cf4721bf9c2a219e94ef4637173ad5f40ab9a3e has been deleted
listing 67b79b85abaa1451fc967c1f977c66d5baa36409f2880439de98c26dd3ad5c18 has been deleted
listing a04411d8e47b6784909dcdcf7a89dcceed719bfe1c3dfa7bde1addaaa1a4168d has been deleted
listing e3c40fa881bb51267ca388a4e553150dd75ce970b9ad14d35e565fc58f8756ef has been deleted
listing cc

listing f3783af314f1c46bab5c31ef0a82265e21b1ff693439d36da4ef7316eed33fc8 has been deleted
listing a31f717b1d3c5404f1cb8fda9f576137063c0b719f9793261a1a0505fdf7fa06 has been deleted
listing 84b50bca30de9b62b2ff94c48ed113308615edee963ea3119032bc5b75ada52c has been deleted
listing 1f8c44d76b23b95d4ff42c5fed779d6e9c677e1ee0fc350958e18162f45bd412 has been deleted
listing 987e75b32b3d751b070c83cc2855659807433793a77df63ea40a9292646f2ca4 has been deleted
listing cf42ff0aced97a550cc870b9e09f928062978301f7bd92af50999d39200454cc has been deleted
listing 8392e8c2cd3c15623552d03ec9e5beca3be48234199c5861761fcc403c6da879 has been deleted
listing bb9ab8c215e0a6e3e927f289748422e76c6afaf746da91540b94124db7d7ac9c has been deleted
listing cd048132eb53eb5d1270eac64160c69b2097c42d590a69859df161cec52f7c21 has been deleted
listing 4760f335b0615f6d7dc6919950c676b5b4feba92d0a5cec1d8eab038056a9dfb has been deleted
listing 68c26b3fb54b3e73c10b325fe76a6990a024bec3f47de741067714482b743707 has been deleted
listing 26

listing e9941aac8157ad64ad07b49a085b64e10fc03c5b51f0c4b09fdbae0967d9bf5e has been deleted
listing 91335e0df36144098b9ddd809000f38505695b3e45390878322c98b783bdcfe9 has been deleted
listing 6808af810a97f57b6874f06fb35935787059ad98fa6545ec92bc6b51358a9d63 has been deleted
listing 8c3286a728851cedfc6f061a6922156bcfa3856a0b07cc9232495b8c59bc7f5c has been deleted
listing 2d7ec4469466b3447d1a28eb60268fc4424a75eaa18ebebb2b5c9f86dabbf6b1 has been deleted
listing 777974375f18401f7d4cdb1ec3f0b00249e833aeb526ad95668e84346abe0dbb has been deleted
listing e8a65403bcc9b4cc535884664ef5013529fa64662a99beb60f943e486e75dedf has been deleted
listing e5925c074fe7f5f4e879eb7e1477dcfecee5146b7dd70eaca8ce995aeb813d33 has been deleted
listing e0699c7e4b09c7c5302b8438b2c43b9c31577184a9ac8d65e83fa89dd6c27128 has been deleted
listing 64a0bde5db88cffe6619552c858160df3aa37330e349663de094940d1e0cb1ee has been deleted
listing d849f40fa1a9007daa3c24275d25a4c0eb3149ee157e5c9f1afa579909d0b5fb has been deleted
listing 01

In [4]:
# Index the parsed listings by listing id. The guard keeps the cell
# re-runnable: once 'listing_id' is the index it is no longer a column, so a
# second unconditional set_index would raise KeyError.
if new_listing_df.index.name != 'listing_id':
    new_listing_df = new_listing_df.set_index('listing_id')
new_listing_df.head()

Unnamed: 0_level_0,listing_title,listing_nh,listing_city,listing_date,listing_price,listing_bedrooms,listing_bathrooms,listing_sqft,listing_address,listing_info,listing_body,listing_url,listing_first_image
listing_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
7539510462,1x1 w/ Spectacular West Facing Views Over Haye...,hayes valley,San Francisco,2022-09-28T15:06:37-0700,2759,1,1,641.0,,open house dates friday 2022-10-07 satur...,Reduced Rate on 1x1 at Brand New 100 Van Ness!...,https://sfbay.craigslist.org/sfc/apa/d/san-fra...,https://images.craigslist.org/01010_hbtjexHi2I...
7546522181,Beautiful 1BR / Fully Remodeled / WD in Unit /...,russian hill,San Francisco,2022-10-16T22:30:16-0700,4095,1,1,600.0,2200 Leavenworth,cats are OK - purrr dogs are OK - wooof apa...,"* GET IT WHILE IT'S HOTT, WON'T LAST LONG!! -*...",https://sfbay.craigslist.org/sfc/apa/d/san-fra...,https://images.craigslist.org/00l0l_6J3y3Qgxmb...
7547280923,Beautifully Renovated Pac Heights 2BR/1BA; Bri...,pacific heights,San Francisco,2022-10-18T16:28:49-0700,4695,2,1,1002.0,1801 California,cats are OK - purrr dogs are OK - wooof apa...,"1801 California #202 San Francisco, CA 94109 ...",https://sfbay.craigslist.org/sfc/apa/d/san-fra...,https://images.craigslist.org/00t0t_kQd5Dx0w5U...
7547558701,FREE OCTOBER RENT Renovated 3 BR,mission district,san francisco,2022-10-19T11:30:12-0700,3900,3,1,,shotwell near 20 th,apartment no laundry on site no smoking st...,"571 Shotwell Street San Francisco, CA 94110 - ...",https://sfbay.craigslist.org/sfc/apa/d/san-fra...,https://images.craigslist.org/00303_7ADJOwqRcJ...
7538170113,A very nice 1 bedroom with new kitchen and app...,castro / upper market,San Francisco,2022-09-25T09:32:12-0700,2780,1,1,,Duboce near Pearl,apartment w/d in unit street parking rent ...,"Now showing A large bedroom, living room All ...",https://sfbay.craigslist.org/sfc/apa/d/san-fra...,https://images.craigslist.org/00J0J_ablz3MqqGI...


In [5]:
# (rows, columns) of the freshly parsed listings frame.
new_listing_df.shape

(227247, 13)

In [6]:
# Load the previously saved parsed listings and stack the new ones below them
dir_folder = '/Users/pandabear/springboard/CapstoneTwoProject/data/interim/'
existing_csv_path = dir_folder + 'listing_df_parsed.csv'
existing_df = pd.read_csv(existing_csv_path, index_col='listing_id')

# Row-wise concatenation (pd.concat stacks along rows by default)
frames_to_stack = [existing_df, new_listing_df]
combined_df = pd.concat(frames_to_stack)

In [7]:
# (rows, columns) of the listings loaded from the saved CSV.
existing_df.shape

(910735, 13)

In [8]:
# (rows, columns) after stacking the saved and newly parsed listings.
# NOTE(review): pd.concat does not drop repeated listing ids -- confirm that
# keeping repeats is intended.
combined_df.shape

(1137982, 13)

In [9]:
# Persist the combined listings back to the parsed-listings CSV.
# The data is written to a temporary sibling file first and then swapped into
# place, so a failure part-way through the write cannot leave a truncated CSV
# where the previous dataset used to be.
output_csv_path = r'/Users/pandabear/springboard/CapstoneTwoProject/data/interim/listing_df_parsed.csv'
temp_csv_path = output_csv_path + '.tmp'
combined_df.to_csv(temp_csv_path, header=True)
os.replace(temp_csv_path, output_csv_path)
print('Saved to csv file')

Saved to csv file
