## Imports

In [53]:
#Import pandas, matplotlib.pyplot, and seaborn in the correct lines below
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import re

In [51]:
# Neighborhoods in SF
SFC_NEIGHBORHOODS = ['alamo square / nopa', 'bayview', 'bernal heights', 'castro / upper market', 'cole valley / ashbury hts', 'downtown / civic / van ness',
 'excelsior / outer mission', 'financial district', 'glen park', 'haight ashbury', 'hayes valley', 'ingleside / SFSU / CCSF', 'inner richmond',
 'inner sunset / UCSF', 'laurel hts / presidio', 'lower haight', 'lower nob hill', 'lower pac hts', 'marina / cow hollow', 'mission district',
 'nob hill', 'noe valley', 'north beach / telegraph hill', 'pacific heights', 'portola district', 'potrero hill', 'richmond / seacliff', 'russian hill', 
'SOMA / south beach', 'sunset / parkside', 'tenderloin','treasure island', 'twin peaks / diamond hts', 'USF / panhandle', 'visitacion valley', 
'west portal / forest hill', 'western addition']

# Neighborhoods in Peninsula
PEN_NEIGHBORHOODS = ['atherton','belmont','brisbane','burlingame','coastside/pescadero','daly city','east palo alto','foster city','half moon bay','los altos','menlo park','millbrae',
                     'mountain view','pacifica','palo alto','portola valley','redwood city','redwood shores','san bruno','san carlos','san mateo','south san francisco','woodside']

# Neighborhoods in the East Bay
EBY_NEIGHBORHOODS = ['alameda','albany / el cerrito','berkeley','berkeley north / hills','brentwood / oakley','concord / pleasant hill / martinez','danville / san ramon',
                     'dublin / pleasanton / livermore','emeryville','fairfield / vacaville','fremont / union city / newark','hayward / castro valley','hercules, pinole, san pablo, el sob',
                     'lafayette / orinda / moraga','oakland downtown','oakland east','oakland hills / mills','oakland lake merritt / grand','oakland north / temescal',
                     'oakland piedmont / montclair','oakland rockridge / claremont','oakland west','pittsburg / antioch','richmond / point / annex','san leandro','vallejo / benicia','walnut creek']
# List of cities
SFC_CITIES = ['San Francisco']
PEN_CITIES = ['Palo Alto','San Mateo','Mountain View','Redwood City','Menlo Park','Daly City','San Bruno','South San Francisco','Foster City','Burlingame']
EBY_CITIES = ['Oakland','Berkeley','Fremont','Walnut Creek','Concord','Hayward','Alameda','Pleasanton','San Leandro','Vallejo','San Ramon','Emeryville',
        'Dublin','Richmond','Fairfield','Livermore','Antioch','Pittsburg','Union City','El Cerrito']

ALL_CITIES = SFC_CITIES + PEN_CITIES + EBY_CITIES

# Dictionaries for neighborhood and city relationship
METRO_TO_NEIGHBORHOODS = {'sfc': SFC_NEIGHBORHOODS, 'pen': PEN_NEIGHBORHOODS, 'eby': EBY_NEIGHBORHOODS}
METRO_TO_CITIES = {'sfc': SFC_CITIES, 'pen': PEN_CITIES, 'eby': EBY_CITIES}

NEIGHBORHOOD_TO_METRO = {neighborhood: metro for metro, neighborhoods in METRO_TO_NEIGHBORHOODS.items() for neighborhood in neighborhoods}
CITY_TO_METRO = {city: metro for metro,cities in METRO_TO_CITIES.items() for city in cities}

# Create a dictionary mapping listing_info values to the categorical column name
LISTING_INFO_TO_COLUMN = {'cats are OK - purrr':'Animals_cats','dogs are OK - wooof':'Animals_dogs','furnished':'Furnished',
                    'no smoking':'Smoking','wheelchair accessible':'Wheelchair accessible','air conditioning':'Has_AC',
                     'EV charging':'HasEVCharging','laundry in bldg':'Laundry_in_bldg','w/d in unit':'Laundry_in_unit',
                     'w/d hookups':'Laundry_has_hookup','laundry on site':'Laundry_onsite','no laundry on site':'Laundry_not_onsite',
                     'carport':'Parking_carport','attached garage':'Parking_attached_garage','detached garage':'Parking_detached_garage',
                     'off-street parking':'Parking_offstreet','street parking':'Parking_street','valet parking':'Parking_valet','no parking':'Parking_none',
                     'rent period: daily':'Rent_period_daily','rent period: weekly':'Rent_period_weekly','rent period: monthly':'Rent_period_monthly',
                     'condo':'Housing_condo','apartment':'Housing_apt','cottage/cabin':'Housing_cottage','duplex':'Housing_duplex',
                     'flat':'Housing_flat','house':'Housing_house','in-law':'Housing_inlaw','loft':'Housing_inlaw','townhouse':'Housing_townhouse'}

# Create a dictionary mapping listing_body features to the categorical column name
LISTING_BODY_TO_COLUMN = {'rent controlled':'is_rent_controlled','no pets':'pets allowed','remodelled':'is_remodelled',
                         'amenities':'has_amenities'}

In [45]:
def correct_metro(neighborhood, city):
    if city in CITY_TO_METRO:
        return NEIGHBORHOOD_TO_METRO[neighborhood] == CITY_TO_METRO[city]
    else:
        return False
    
def clean_city(city):
    city = city.strip()
    city = city.strip('.')
    if ',' in city:
        return city.split(',')[0]
    if '/' in city:
        return city.split('/')[0]
    return city

## Load the data

In [4]:
# The parsed html listing data is the interim directory
dir_folder = '/Users/pandabear/springboard/CapstoneTwoProject/data/interim/'
listing_df = pd.read_csv(dir_folder + 'listing_df.csv',index_col='listing_id')

# Select the first 1000 rows only TEMP
listing_df = listing_df.iloc[:1000,:]
listing_df.shape

(1000, 13)

In [5]:
listing_df.head()

Unnamed: 0_level_0,listing_address,listing_bathrooms,listing_bedrooms,listing_body,listing_city,listing_date,listing_first_image,listing_info,listing_nh,listing_price,listing_sqft,listing_title,listing_url
listing_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
7506354686,24 Union Square,2,2,Furnished Available -Parking -Pool -Controlle...,Union City,2022-07-08T11:18:15-0700,https://images.craigslist.org/00P0P_hZ2AtgxDfz...,air conditioning cats are OK - purrr dogs a...,fremont / union city / newark,3250,1226.0,"At Last, This Is What You’ve Been Searching For",https://sfbay.craigslist.org/eby/apa/d/union-c...
7508776873,1625 15th Street,1,2,"Upgraded Single Family Home in San Pablo, CA...",San Pablo,2022-07-14T11:52:10-0700,https://images.craigslist.org/00101_70g2UwYKVU...,application fee details: $45 Online Applicati...,"hercules, pinole, san pablo, el sob",2600,,"Upgraded Single Family Home in San Pablo, CA",https://sfbay.craigslist.org/eby/apa/d/richmon...
7520158858,3900 Business Center Dr near Business Center D...,1,1,Verdant at Green Valley 3900 Business Center D...,Fairfield,2022-08-11T11:38:28-0700,https://images.craigslist.org/00707_7hc2WHQqbx...,cats are OK - purrr dogs are OK - wooof apa...,fairfield / vacaville,2102,712.0,"Business Center, Parcel Lockers, Disposal, Com...",https://sfbay.craigslist.org/eby/apa/d/fairfie...
7509730537,36000 Fremont Blvd,1,1,To schedule a tour We now book our tour appoin...,Fremont,2022-07-16T16:43:33-0700,https://images.craigslist.org/00N0N_5CCDB2ckEM...,cats are OK - purrr dogs are OK - wooof apa...,fremont / union city / newark,2325,610.0,Unwind at Casa Serena!,https://sfbay.craigslist.org/eby/apa/d/fremont...
7516143553,,2,3,3 Beds 2 Baths single family house Excellent ...,San Leandro,2022-08-01T14:25:13-0700,https://images.craigslist.org/00z0z_cJsvz9nR7M...,air conditioning cats are OK - purrr dogs a...,san leandro,3500,1300.0,House for rent - 5 mins walk to Bart 3bed room...,https://sfbay.craigslist.org/eby/apa/d/san-lea...


In [6]:
listing_df.describe()

Unnamed: 0,listing_bedrooms,listing_price,listing_sqft
count,1000.0,1000.0,887.0
mean,1.552,2744.193,869.900789
std,0.827031,910.908143,344.632338
min,0.0,925.0,180.0
25%,1.0,2234.25,660.0
50%,2.0,2593.0,805.0
75%,2.0,3050.0,986.0
max,5.0,12710.0,3777.0


In [7]:
listing_df.shape

(1000, 13)

In [8]:
# Drop rows that are missing price data
#listing_df = listing_df.dropna(subset=['listing_price'])

# Drop rows where price is below a chosen min and max
price_min = 500
price_max = 20000
listing_df = listing_df.loc[(listing_df.listing_price > price_min) & (listing_df.listing_price < price_max)]

In [9]:
listing_df.describe()

Unnamed: 0,listing_bedrooms,listing_price,listing_sqft
count,1000.0,1000.0,887.0
mean,1.552,2744.193,869.900789
std,0.827031,910.908143,344.632338
min,0.0,925.0,180.0
25%,1.0,2234.25,660.0
50%,2.0,2593.0,805.0
75%,2.0,3050.0,986.0
max,5.0,12710.0,3777.0


## Exploring the data

In [10]:
# Look at the number of exact duplicates
duplicates = listing_df['listing_title'].value_counts()
duplicates.head()

Amazing 2 bed 2 bath home                                                4
Renovated 1x1 Apartment with In-Wall USB Charging Ports                  4
Unwind at Casa Serena!                                                   4
Convenient Location & Comfortable Community: Spacious 1 Bedroom w/ De    3
**FITNESS CENTER, SPORTS COURT, POOLS & WELCOME OUR NEW PET PARK!***     3
Name: listing_title, dtype: int64

In [11]:
print('Total number of duplicates by listing title: {}'.format(duplicates[duplicates != 1].sum()))

Total number of duplicates by listing title: 103


In [12]:
# Drop duplicates using 3 filters: (Note: optional fields in craigslist are address, neighborhood)
# 1. Drop duplicates by subset of title, bedroom, bathroom, neighborhood
df_drop_by_filter1 = listing_df.drop_duplicates(subset = ['listing_title','listing_bedrooms','listing_bathrooms','listing_nh'], keep = 'last', inplace = False)
df_drop_by_filter1.shape

(950, 13)

In [13]:
# 2. Drop duplicates by subset of image link, bedroom, bathroom, neighborhood
df_drop_by_filter2 = df_drop_by_filter1.drop_duplicates(subset = ['listing_first_image','listing_bedrooms','listing_bathrooms','listing_nh'], keep = 'last', inplace = False)
df_drop_by_filter2.shape

(858, 13)

In [14]:
# 3. Drop duplicates by subset of listing_address, listing_city, bedroom, bathroom
df_drop_by_filter3 = df_drop_by_filter2.drop_duplicates(subset = ['listing_address','listing_city','listing_bedrooms','listing_bathrooms'], keep = 'last', inplace = False)
df_drop_by_filter3.shape

(595, 13)

In [15]:
# Compare with dropping duplicates based on all columns
df_drop_by_all_cols = listing_df.drop_duplicates(keep = 'last', inplace = False)
df_drop_by_all_cols.shape

(1000, 13)

In [16]:
# Use all 3 filters
df = df_drop_by_filter3
df.head()

Unnamed: 0_level_0,listing_address,listing_bathrooms,listing_bedrooms,listing_body,listing_city,listing_date,listing_first_image,listing_info,listing_nh,listing_price,listing_sqft,listing_title,listing_url
listing_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
7508776873,1625 15th Street,1,2,"Upgraded Single Family Home in San Pablo, CA...",San Pablo,2022-07-14T11:52:10-0700,https://images.craigslist.org/00101_70g2UwYKVU...,application fee details: $45 Online Applicati...,"hercules, pinole, san pablo, el sob",2600,,"Upgraded Single Family Home in San Pablo, CA",https://sfbay.craigslist.org/eby/apa/d/richmon...
7520158858,3900 Business Center Dr near Business Center D...,1,1,Verdant at Green Valley 3900 Business Center D...,Fairfield,2022-08-11T11:38:28-0700,https://images.craigslist.org/00707_7hc2WHQqbx...,cats are OK - purrr dogs are OK - wooof apa...,fairfield / vacaville,2102,712.0,"Business Center, Parcel Lockers, Disposal, Com...",https://sfbay.craigslist.org/eby/apa/d/fairfie...
7516143553,,2,3,3 Beds 2 Baths single family house Excellent ...,San Leandro,2022-08-01T14:25:13-0700,https://images.craigslist.org/00z0z_cJsvz9nR7M...,air conditioning cats are OK - purrr dogs a...,san leandro,3500,1300.0,House for rent - 5 mins walk to Bart 3bed room...,https://sfbay.craigslist.org/eby/apa/d/san-lea...
7497033772,6410 Schmidt Lane,1,1,Bedrooms: 1Bathrooms: 1Square Feet: 610Date Av...,El Cerrito,2022-06-15T15:02:41-0700,https://images.craigslist.org/00L0L_lSulgXVG9d...,apartment w/d in unit carport rent period:...,emeryville,2280,610.0,"Garage, Hardwood Floors, Disability Access",https://sfbay.craigslist.org/eby/apa/d/el-cerr...
7510455016,2121 Dwight Way,1,2,Contact us for details on our specials! Welco...,Berkeley,2022-07-18T14:21:02-0700,https://images.craigslist.org/00808_bfYjJavlFV...,air conditioning cats are OK - purrr dogs a...,berkeley,5078,827.0,"Outdoor Heaters, USB Outlets & Fiber Internet,...",https://sfbay.craigslist.org/eby/apa/d/berkele...


In [17]:
df.dtypes

listing_address         object
listing_bathrooms       object
listing_bedrooms         int64
listing_body            object
listing_city            object
listing_date            object
listing_first_image     object
listing_info            object
listing_nh              object
listing_price            int64
listing_sqft           float64
listing_title           object
listing_url             object
dtype: object

In [18]:
# Convert the columns into the correct dtype
df['listing_date'] = pd.to_datetime(df['listing_date'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [19]:
# Trim trailing spaces on string values in listing_bathrooms
df['listing_bathrooms'] = df['listing_bathrooms'].map(lambda x: x.strip())
df['listing_bathrooms'].unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


array(['1', '2', '3', '2.5', '1.5', '4', 'shared', '3.5'], dtype=object)

In [20]:
df['listing_bathrooms'].value_counts()

1         397
2         147
1.5        22
2.5        16
3           9
4           2
3.5         1
shared      1
Name: listing_bathrooms, dtype: int64

In [21]:
# 1 split bathroom is the same as 1 bathroom, so convert 'split' to type '1'
df['listing_bathrooms'][df['listing_bathrooms'] == 'split'] = '1'

# Since these are the only inputs for bathroom in the craigslist UI, lump all bathrooms > 3 into a single type '3+'
extra_bath = ['3.5','4','4.5','5','5.5','6','6.5','7','7.5','8','8.5','9+']
df['listing_bathrooms'][df['listing_bathrooms'].isin(extra_bath)] = '3+'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [22]:
df['listing_bathrooms'].value_counts()

1         397
2         147
1.5        22
2.5        16
3           9
3+          3
shared      1
Name: listing_bathrooms, dtype: int64

In [23]:
df['listing_sqft'].describe()
# Drop sqft values that are too small or too big (the 94103 is clearly a zip code)
sqft_min = 10
sqft_max = 30000
df = df.loc[(df.listing_sqft > sqft_min) & (df.listing_sqft < sqft_max)]

In [24]:
# df['listing_sqft'].describe()

In [25]:
# Convert listing_info from string of values to a list of values
df['listing_info_split'] = df['listing_info'].apply(lambda x: x.strip().split("  "))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [27]:
df.head()

Unnamed: 0_level_0,listing_address,listing_bathrooms,listing_bedrooms,listing_body,listing_city,listing_date,listing_first_image,listing_info,listing_nh,listing_price,listing_sqft,listing_title,listing_url,listing_info_split
listing_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
7520158858,3900 Business Center Dr near Business Center D...,1,1,Verdant at Green Valley 3900 Business Center D...,Fairfield,2022-08-11 18:38:28,https://images.craigslist.org/00707_7hc2WHQqbx...,cats are OK - purrr dogs are OK - wooof apa...,fairfield / vacaville,2102,712.0,"Business Center, Parcel Lockers, Disposal, Com...",https://sfbay.craigslist.org/eby/apa/d/fairfie...,"[cats are OK - purrr, dogs are OK - wooof, apa..."
7516143553,,2,3,3 Beds 2 Baths single family house Excellent ...,San Leandro,2022-08-01 21:25:13,https://images.craigslist.org/00z0z_cJsvz9nR7M...,air conditioning cats are OK - purrr dogs a...,san leandro,3500,1300.0,House for rent - 5 mins walk to Bart 3bed room...,https://sfbay.craigslist.org/eby/apa/d/san-lea...,"[air conditioning, cats are OK - purrr, dogs a..."
7497033772,6410 Schmidt Lane,1,1,Bedrooms: 1Bathrooms: 1Square Feet: 610Date Av...,El Cerrito,2022-06-15 22:02:41,https://images.craigslist.org/00L0L_lSulgXVG9d...,apartment w/d in unit carport rent period:...,emeryville,2280,610.0,"Garage, Hardwood Floors, Disability Access",https://sfbay.craigslist.org/eby/apa/d/el-cerr...,"[apartment, w/d in unit, carport, rent period:..."
7510455016,2121 Dwight Way,1,2,Contact us for details on our specials! Welco...,Berkeley,2022-07-18 21:21:02,https://images.craigslist.org/00808_bfYjJavlFV...,air conditioning cats are OK - purrr dogs a...,berkeley,5078,827.0,"Outdoor Heaters, USB Outlets & Fiber Internet,...",https://sfbay.craigslist.org/eby/apa/d/berkele...,"[air conditioning, cats are OK - purrr, dogs a..."
7518133463,Decoto Rd near Perry Rd,1,2,I have 2 apartments that will be available Dow...,Union City,2022-08-06 18:11:25,https://images.craigslist.org/00202_X0MiqUqmHG...,cats are OK - purrr dogs are OK - wooof apa...,fremont / union city / newark,2450,803.0,2bedroom - Renovated- Spacious-Wonderful Commu...,https://sfbay.craigslist.org/eby/apa/d/union-c...,"[cats are OK - purrr, dogs are OK - wooof, apa..."


In [46]:
def get_listing_info_feature(listing_info_split, target_info):
    for info in listing_info_split:
        if info == target_info:
            return 1
    return 0


for target_info, column in LISTING_INFO_TO_COLUMN.items():
    df[column] = df['listing_info_split'].apply(lambda x: get_listing_info_feature(x, target_info))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [29]:
df.head()

Unnamed: 0_level_0,listing_address,listing_bathrooms,listing_bedrooms,listing_body,listing_city,listing_date,listing_first_image,listing_info,listing_nh,listing_price,...,Rent_period_weekly,Rent_period_monthly,Housing_condo,Housing_apt,Housing_cottage,Housing_duplex,Housing_flat,Housing_house,Housing_inlaw,Housing_townhouse
listing_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7520158858,3900 Business Center Dr near Business Center D...,1,1,Verdant at Green Valley 3900 Business Center D...,Fairfield,2022-08-11 18:38:28,https://images.craigslist.org/00707_7hc2WHQqbx...,cats are OK - purrr dogs are OK - wooof apa...,fairfield / vacaville,2102,...,0,1,0,1,0,0,0,0,0,0
7516143553,,2,3,3 Beds 2 Baths single family house Excellent ...,San Leandro,2022-08-01 21:25:13,https://images.craigslist.org/00z0z_cJsvz9nR7M...,air conditioning cats are OK - purrr dogs a...,san leandro,3500,...,0,1,0,0,0,0,0,1,0,0
7497033772,6410 Schmidt Lane,1,1,Bedrooms: 1Bathrooms: 1Square Feet: 610Date Av...,El Cerrito,2022-06-15 22:02:41,https://images.craigslist.org/00L0L_lSulgXVG9d...,apartment w/d in unit carport rent period:...,emeryville,2280,...,0,1,0,1,0,0,0,0,0,0
7510455016,2121 Dwight Way,1,2,Contact us for details on our specials! Welco...,Berkeley,2022-07-18 21:21:02,https://images.craigslist.org/00808_bfYjJavlFV...,air conditioning cats are OK - purrr dogs a...,berkeley,5078,...,0,1,0,1,0,0,0,0,0,0
7518133463,Decoto Rd near Perry Rd,1,2,I have 2 apartments that will be available Dow...,Union City,2022-08-06 18:11:25,https://images.craigslist.org/00202_X0MiqUqmHG...,cats are OK - purrr dogs are OK - wooof apa...,fremont / union city / newark,2450,...,0,1,0,1,0,0,0,0,0,0


In [30]:
df['listing_city'] = df['listing_city'].apply(lambda x: x.title())
df['listing_city'] = df['listing_city'].apply(lambda x: clean_city(x))
df['listing_city'].value_counts()

# Get index of cities that occur less than 5 times (these are typos or errors)
x = df['listing_city'].value_counts()
city_to_drop = list((x[x < 5]).index)

# Drop city values that are lower than a value count of 2
df['listing_city'] = df['listing_city'].apply(lambda x: None if x in city_to_drop else x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [31]:
df['listing_city'].value_counts()

Oakland          114
Fremont           39
Berkeley          31
San Leandro       29
Pleasanton        26
Hayward           25
Walnut Creek      21
Concord           19
Fairfield         16
San Ramon         15
Dublin            14
Livermore         13
Vallejo           13
Pittsburg         12
Union City        12
Alameda           11
Danville           8
Emeryville         8
Pleasant Hill      8
El Cerrito         7
Lafayette          7
Richmond           7
Brentwood          6
Antioch            6
San Pablo          5
El Sobrante        5
Name: listing_city, dtype: int64

In [61]:
# If a listing has a neighborhood tag, cross reference it with the listing_city to make sure it is in the correct city, otherwise remove from df
df['is_consistent_metro'] = df.apply(lambda x: correct_metro(x['listing_nh'], x['listing_city']), axis=1)
df['listing_nh'] = df.apply(lambda x: None if x['is_consistent_metro'] == False, axis=1)

SyntaxError: invalid syntax (<ipython-input-61-4f263b6a34b8>, line 3)

In [32]:
# Missing values in numeric columns
missing = pd.concat([df.isnull().sum(), 100 * df.isnull().mean()], axis=1)
missing.columns=['count', '%']
missing.sort_values(by='count',ascending=False)

Unnamed: 0,count,%
listing_address,32,6.299213
listing_city,31,6.102362
Rent_period_daily,0,0.0
Laundry_not_onsite,0,0.0
Parking_carport,0,0.0
Parking_attached_garage,0,0.0
Parking_detached_garage,0,0.0
Parking_offstreet,0,0.0
Parking_street,0,0.0
Parking_valet,0,0.0


In [54]:
# Extract key features from listing_body
def parse_listing_body(text,target_word):
    for key in LISTING_BODY_TO_COLUMN.keys():
        if re.search(target_word, text) is not None:
            return 1
    return 0
    
for target_word, column_name in LISTING_BODY_TO_COLUMN.items():
    df[column_name] = df['listing_body'].apply(lambda x: parse_listing_body(x, target_word))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [55]:
df.head()

Unnamed: 0_level_0,listing_address,listing_bathrooms,listing_bedrooms,listing_body,listing_city,listing_date,listing_first_image,listing_info,listing_nh,listing_price,...,Housing_duplex,Housing_flat,Housing_house,Housing_inlaw,Housing_townhouse,is_consistent_metro,is_rent_controlled,pets allowed,is_remodelled,has_amenities
listing_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7520158858,3900 Business Center Dr near Business Center D...,1,1,Verdant at Green Valley 3900 Business Center D...,Fairfield,2022-08-11 18:38:28,https://images.craigslist.org/00707_7hc2WHQqbx...,cats are OK - purrr dogs are OK - wooof apa...,fairfield / vacaville,2102,...,0,0,0,0,0,True,0,0,0,1
7516143553,,2,3,3 Beds 2 Baths single family house Excellent ...,San Leandro,2022-08-01 21:25:13,https://images.craigslist.org/00z0z_cJsvz9nR7M...,air conditioning cats are OK - purrr dogs a...,san leandro,3500,...,0,0,1,0,0,True,0,0,0,0
7497033772,6410 Schmidt Lane,1,1,Bedrooms: 1Bathrooms: 1Square Feet: 610Date Av...,El Cerrito,2022-06-15 22:02:41,https://images.craigslist.org/00L0L_lSulgXVG9d...,apartment w/d in unit carport rent period:...,emeryville,2280,...,0,0,0,0,0,True,0,0,0,0
7510455016,2121 Dwight Way,1,2,Contact us for details on our specials! Welco...,Berkeley,2022-07-18 21:21:02,https://images.craigslist.org/00808_bfYjJavlFV...,air conditioning cats are OK - purrr dogs a...,berkeley,5078,...,0,0,0,0,0,True,0,0,0,0
7518133463,Decoto Rd near Perry Rd,1,2,I have 2 apartments that will be available Dow...,Union City,2022-08-06 18:11:25,https://images.craigslist.org/00202_X0MiqUqmHG...,cats are OK - purrr dogs are OK - wooof apa...,fremont / union city / newark,2450,...,0,0,0,0,0,True,0,0,0,0


In [None]:
# Look at distributions of numeric data
# df['listing_bedrooms'].hist(bins=10)
# plt.xlabel('Number of bedrooms')
# plt.ylabel('Count')
# plt.title('Distribution of bedrooms across craigslist listings')