In [1]:
import pandas as pd
import numpy as np
import json

In [2]:
# Notebook-wide pandas display configuration.
# Setting each limit to None removes it, so frames are rendered in full:
# every row, every column, no line-width wrapping, no cell-text truncation.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.width = None
pd.options.display.max_colwidth = None

In [3]:
# Input/output locations for this run.
# Both filenames embed the same scrape id (it looks like a yymmddHHMMSS
# timestamp -- confirm against the scraper), so it is defined once and reused
# to keep the exported CSV paired with the scrape it was filtered from.
scrape_id = '211111144509'

# Forward slashes are used on purpose: they resolve on Windows as well as on
# macOS/Linux, whereas a hard-coded backslash path only works on Windows.
apt_scrape_json_filepath = f'../scrapes/craigslist_listings_{scrape_id}.json'
filtered_csv_export_filepath = f'filtered_listings_{scrape_id}.csv'

In [4]:
# Load the scraped Craigslist apartment listings from the JSON file.
# The encoding is pinned to UTF-8 so the file reads the same on every
# platform (Windows would otherwise fall back to the locale code page).
with open(apt_scrape_json_filepath, 'r', encoding='utf-8') as json_file:
    json_data = json.load(json_file)

# Flatten the records into a table; nested objects (up to 5 levels deep)
# become dotted column names.
apts_df = pd.json_normalize(json_data, max_level=5)


def _to_numeric_if_possible(column):
    """Return ``column`` as a numeric Series, or unchanged if it cannot be parsed.

    This is the explicit replacement for ``pd.to_numeric(..., errors='ignore')``,
    which is deprecated in recent pandas releases.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        # Non-numeric text (ValueError) or unparseable objects (TypeError):
        # keep the column exactly as it was loaded.
        return column


# Convert every column that is fully numeric; leave the rest untouched.
apts_df = apts_df.apply(_to_numeric_if_possible)

In [5]:
# Preview the first 10 rows of the loaded listings as a quick sanity check.
apts_df.head(10)

Unnamed: 0,price,listingTime,title,rooms,sqft,link,area,id
0,1700,2021-11-11 14:35,"Tour today and get November special! 1, 2, and 3 bedrooms available!",1,610,https://inlandempire.craigslist.org/apa/d/san-bernardino-tour-today-and-get/7404009613.html,"1422 E9th Street, Inland Empire, CA",7404009613
1,2428,2021-11-11 14:35,"Large Closet, Tennis Court, Swimming Pool",2,895,https://inlandempire.craigslist.org/apa/d/loma-linda-large-closet-tennis-court/7405767166.html,"10558 Mountain View Avenue, Redlands, CA",7405767166
2,1958,2021-11-11 14:33,"Electronic Entry Gates, Extra Storage, Large Walk In Closets",1,704,https://inlandempire.craigslist.org/apa/d/upland-electronic-entry-gates-extra/7404040519.html,"1420 Chaffee St, Upland, CA",7404040519
3,1850,2021-11-11 14:32,"Cable Ready, Fitness Center, Bay Window",1,700,https://inlandempire.craigslist.org/apa/d/redlands-cable-ready-fitness-center-bay/7406667741.html,"30598 Independence Avenue, Redlands, CA",7406667741
4,2350,2021-11-11 14:32,"Granite Countertops, Resort Inspired Pool & Spa, Community BBQs",1,842,https://inlandempire.craigslist.org/apa/d/corona-granite-countertops-resort/7406709569.html,Corona,7406709569
5,1450,2021-11-11 14:31,"Garages Available, Public Transportation Nearby, Gated Community",1,623,https://inlandempire.craigslist.org/apa/d/riverside-garages-available-public/7406709108.html,Riverside,7406709108
6,2909,2021-11-11 14:29,"Front-loading washer and dryer, Onsite playground, 9-foot ceilings",3,1319,https://inlandempire.craigslist.org/apa/d/menifee-front-loading-washer-and-dryer/7406700177.html,"30414 Town Center Dr, Inland Empire, CA",7406700177
7,2854,2021-11-11 14:28,This unit comes with a GARAGE!!,2,1073,https://inlandempire.craigslist.org/apa/d/montclair-this-unit-comes-with-garage/7396002096.html,"4868 Cypress Street, Montclair, CA",7396002096
8,1930,2021-11-11 14:28,"Patio/Balcony, Natural Lighting, Balcony",2,920,https://inlandempire.craigslist.org/apa/d/patton-patio-balcony-natural-lighting/7406702111.html,"2011 Arden Ave, Highland, CA",7406702111
9,2900,2021-11-11 14:27,"No current availability, we have waitlist , call for waitlist",2,1110,https://inlandempire.craigslist.org/apa/d/ontario-no-current-availability-we-have/7406707255.html,Ontario,7406707255


In [6]:
# Build boolean row masks for the apartments of interest, then combine them.

# Price filter: keep listings whose price lies within the budget.
# Series.between is inclusive on both ends, matching `>= min` and `<= max`.
min_price, max_price = 1000, 1800
price_range_mask_df = apts_df['price'].between(min_price, max_price)

# Area filter: keep listings whose area text mentions any of the target
# places (case-insensitive substring match).
cols_to_filter = ['area']
search_values = ['riverside', 'canyon crest', 'corona', 'redland', 'norco']
# The terms are joined into a single regex alternation. None of them contain
# regex metacharacters; escape new terms if that ever changes.
patt = '|'.join(search_values)
# na=False: a listing with a missing area counts as "no match" instead of
# putting NaN into the mask.
# axis=1 must be passed by keyword; the positional form `any(1)` is rejected
# by pandas 2.0 and later.
area_mask_df = apts_df[cols_to_filter].apply(
    lambda col: col.str.contains(patt, case=False, na=False)
).any(axis=1)

# A listing must satisfy both the price and the area condition.
filtered_apts_df = apts_df[price_range_mask_df & area_mask_df]

In [7]:
# Summarize the filtered listings: row count, column dtypes and non-null counts.
filtered_apts_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38 entries, 5 to 6429
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   price        38 non-null     int64 
 1   listingTime  38 non-null     object
 2   title        38 non-null     object
 3   rooms        38 non-null     int64 
 4   sqft         38 non-null     int64 
 5   link         38 non-null     object
 6   area         38 non-null     object
 7   id           38 non-null     int64 
dtypes: int64(4), object(4)
memory usage: 2.7+ KB


In [8]:
# Columns to show when reviewing the filtered listings, in reading order.
display_cols = [
    'price',
    'rooms',
    'sqft',
    'area',
    'title',
    'link',
]
# All rows, restricted to the review columns.
filtered_apts_df.loc[:, display_cols]

Unnamed: 0,price,rooms,sqft,area,title,link
5,1450,1,623,Riverside,"Garages Available, Public Transportation Nearby, Gated Community",https://inlandempire.craigslist.org/apa/d/riverside-garages-available-public/7406709108.html
23,1775,2,975,"3429 Rustin Avenue, Riverside, CA",Reserve your new home for 2022,https://inlandempire.craigslist.org/apa/d/riverside-reserve-your-new-home-for-2022/7405771050.html
180,1500,1,642,Corona,"Cozy 1bed, 1bath home",https://inlandempire.craigslist.org/apa/d/corona-cozy-1bed-1bath-home/7406536207.html
252,1790,1,616,Redlands,Waiting List Open for 1 Bedroom 1 Bath!,https://inlandempire.craigslist.org/apa/d/redlands-waiting-list-open-for-bedroom/7406551036.html
253,1800,1,630,Riverside,"Gated Community, Visit Your New Home, Incredible Amenities",https://inlandempire.craigslist.org/apa/d/riverside-gated-community-visit-your/7406550429.html
306,1500,1,650,Corona,"Charming 1bed, 1bath home",https://inlandempire.craigslist.org/apa/d/corona-charming-1bed-1bath-home/7406527954.html
321,1450,1,623,Riverside,Welcome home to Cranford Court,https://inlandempire.craigslist.org/apa/d/riverside-welcome-home-to-cranford-court/7406517652.html
351,1200,3,1180,"4692 Jackson St Ramona, Riverside, CA",Hurry Home Won't Last,https://inlandempire.craigslist.org/apa/d/riverside-hurry-home-wont-last/7406411766.html
373,1800,2,975,"3429 Rustin Avenue, Riverside, CA",Reserve your home for 2022,https://inlandempire.craigslist.org/apa/d/riverside-reserve-your-home-for-2022/7394180596.html
417,1300,1,462,Redlands,Redlands 1 bedroom apartment - walk to UR or Downtown,https://inlandempire.craigslist.org/apa/d/redlands-redlands-bedroom-apartment/7406377160.html


In [9]:
# Write the review columns of the filtered listings to CSV.
# index=False leaves the DataFrame's row labels out of the file.
export_df = filtered_apts_df.loc[:, display_cols]
export_df.to_csv(path_or_buf=filtered_csv_export_filepath, index=False)