In [16]:
# pandas to work with dataframes
import pandas as pd
# import re to use regular expressions
import re
# import numpy
import numpy as np
# import glob to bring together multiple csv files
import glob
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import bokeh.plotting as bkp
from mpl_toolkits.axes_grid1 import make_axes_locatable
import geopandas as gpd
from shapely.geometry import Point
from collections import Counter
import pickle

In [17]:
# search for csvs containing web-scraped data with the name scheme
# ideally there will only be one geocoded file but for this project
# there happene dto be several 
csvs = glob.glob(r'*geocode_craigslist*.csv')
# set dfs list to store each dataframe that gets read in
dfs = []
# iterate through csv files gathered in glob
for file in csvs:
    # read each csv as a dataframe with the same column names
    df1 = pd.read_csv(file)
    # append dataframe to dfs list
    dfs.append(df1)

In [18]:
# concatenate dataframes in dfs together vertically
df = pd.concat(dfs, axis=0)
# set an index of integers equal to the length of dataframe rows
index_list = list(range(len(df.index)))
# make index series a column in dataframe
df['index'] = index_list
# set index column as index of dataframe
df = df.set_index(['index'])
# set entire df type as string, as most operations will require data type to be string
df = df.astype(str)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  


In [19]:
# get only rows from ramsey and hennepin county, and only rows where geocode_lat and geocode_lon are not "none
df_clean = df.loc[(df['geocode_lat'] != "none") & (df['geocode_lon'] != "none")]
df_clean = df_clean.loc[(df_clean['county'] == "ram") | (df_clean['county'] == "hnp")]
# remove "geocode_lat" and "geocode_lon" columns with "error" value
df_clean = df_clean.loc[(df_clean['geocode_lat'] != "error") & (df_clean['geocode_lon'] != "error")]
# drop columns that will not be necessary for analysis 
df_clean = df_clean.drop(["county_state", "number_of_beds_1", "latitude", "longitude", "map accuracy", "square_feet_three", "square_feet_two", "square_feet_one", "Unnamed: 0", "address_check", "address_county", "address_lat", "address_lon", "check_superbowl", "county_lat", "county_lon", "square feet 2", "value_square_feet_two", "value_square_feet_three", "number_of_beds_2", "first_beds_boolean", "second_beds_boolean", "first_square_feet_boolean", "second_square_feet_boolean", "third_square_feet_boolean"], axis = 1)    
df_clean = df_clean.astype(str)

In [20]:
# Import shapefile of hennepin county municipalities
hennepin_county = "Hennepin_County_Municipalities.shp"
# import shapefile of ramsey county municipalities
ramsey_county = "Cities.shp"
# set coordinate projection
crs = {'init': 'epsg:4326'}
# read hennepin_county cities shapefile as geopandas dataframe
hennepin = gpd.read_file(hennepin_county)
# set projection of dataframe to crs variable
hennepin = hennepin.to_crs({'init': 'epsg:4326'})
# read ramsey county cities shapefile as geopandas dataframe
ramsey = gpd.read_file(ramsey_county)
# set projection of dataframe to crs variable
ramsey = ramsey.to_crs({'init': 'epsg:4326'})

In [21]:
# rename column NAME_TXT to "City" so 
# ramsey and hennepin geodataframes have common in column
# to perform join
hennepin = hennepin.rename(columns = {"NAME_TXT" : "City"})
# Set County column in both dataframes with respective counties
hennepin['County'] = "Hennepin"
ramsey['County'] = "Ramsey"
# subset geodataframes to have the same types of columns
hennepin = hennepin[['County', 'City', 'ShapeSTAre', 'ShapeSTLen', 'geometry']]
ramsey = ramsey[['County', 'City', 'ShapeSTAre', 'ShapeSTLen', 'geometry']]
# concatenate dataframes since both ramsey and hennepin county municipalities 
# will be included in analysis 
ramsey_hennepin = pd.concat([ramsey, hennepin], axis=0)

In [22]:
# set latitude and longitude columsn as float
df_clean['geocode_lon'] = df_clean['geocode_lon'].astype(float)
df_clean['geocode_lat'] = df_clean['geocode_lat'].astype(float)
# convert latitude and longitude points from df_clean into geospatial data poitns
geometry = [Point(xy) for xy in zip(df_clean['geocode_lon'], df_clean['geocode_lat'])]
locations = gpd.GeoDataFrame(df_clean, crs=crs, geometry=geometry)
locations = locations.to_crs({'init': 'epsg:4326'})

In [23]:
# perform a spatial join, such that points within ramsey and hennepin county
# will be combined with columns containing municipal data 
counties_subset = gpd.sjoin(locations, ramsey_hennepin, how="right", op='within')
# subset columns of dataframe
counties_subset = counties_subset.iloc[:, 1:18]

In [24]:
# set an index of integers equal to the length of dataframe rows
index_list = list(range(len(counties_subset.index)))
# make index series a column in dataframe
counties_subset['index'] = index_list
# set index column as index of dataframe
counties_subset = counties_subset.set_index(['index'])
counties_subset = pd.DataFrame(counties_subset)
# drop nan values in geocode lat, geocode lon, 
# which will correspond to cities in Hennepin or Ramsey where
# there are not craigslist ads
counties_subset = counties_subset.dropna(subset = ['geocode_lat', 'geocode_lon'])

In [25]:
# search for titles with "studio" in the title
pattern = re.compile('studio', re.IGNORECASE)
# subset beds that have a beds_baths value of 1 and see if they contain studio in the title
counties_subset['one to studio'] = [bool(re.search(pattern, str(cell))) for cell in counties_subset['title']]
# set one bedrooms that suggest that they are studio in the title as studios
counties_subset.loc[counties_subset['one to studio'] == True, "beds_baths"] = "studio"

In [26]:
#convert counties_subset beds_baths to string
counties_subset['beds_baths'] = counties_subset['beds_baths'].astype(str)
# create column in clean dataframe to verify if bed value is assigned in code to follow
counties_subset['bed assigned'] = False
# assign 'bed assigned' column to false for values that need to be inspected
counties_subset.loc[(counties_subset['beds_baths'] == "0") | (counties_subset['beds_baths'] == "0.0") | (counties_subset['beds boolean'] == "False"), 'bed assigned'] = False
# take a subset of listings without a beds value, with 0 as a bed value, or 0.0
# as the titles of this subset will be insepcted to see if they can be assigned a beds value
beds_none = counties_subset.loc[(counties_subset['beds_baths'] == "0") | (counties_subset['beds_baths'] == "0.0") | (counties_subset['beds boolean'] == "False")] 
# drop any listing that has beds, price and square feet values as null
beds_none = beds_none.drop(beds_none[(beds_none['beds boolean'] == "False") & (beds_none['price boolean'] == "False") & (beds_none['square feet boolean'] == "False")].index)

In [27]:
# create a list of bed values and their index
beds_none_list = beds_none.index.values.tolist()
# create a list of titles
titles_list = beds_none['title'].values.tolist()
# zip the two lists so that the two values remain together
beds_none_list = list(zip((beds_none_list), titles_list))

In [28]:
# search for titles with "studio" in the title
pattern = re.compile('studio', re.IGNORECASE)
studios = [value for value in beds_none_list if bool(re.search(pattern, str(value[1])))]
# search for titles that suggest they are 1 bedroom
pattern = re.compile('one bedroom|1 bedroom|1 br|1br', re.IGNORECASE)
ones = [value for value in beds_none_list if bool(re.search(pattern, str(value[1])))]
# search for titles that suggest they are 2 bedroom
pattern = re.compile('two bedroom|2 bedroom|2 br|2br', re.IGNORECASE)
twos = [value for value in beds_none_list if bool(re.search(pattern, str(value[1])))]
# search for titles that suggest they are 3 bedroom
pattern = re.compile('three bedroom|3 bedroom|3 br|3br', re.IGNORECASE)
three = [value for value in beds_none_list if bool(re.search(pattern, str(value[1])))]
# search for titles that suggest they are 4 bedroom
pattern = re.compile('four bedroom|4 bedroom|4 br|4br', re.IGNORECASE)
# search for titles that suggest they are 4 bedroom
four = [value for value in beds_none_list if bool(re.search(pattern, str(value[1])))]

In [29]:
print (len(studios))
print (len(ones))


88
92


In [30]:
# use a Counter to determine which titles occur more than once in main list of 
# beds_number_assigned
count_beds_number = Counter(beds_number_assigned)
count_beds_number = {k:count_beds_number[k] for k in count_beds_number if count_beds_number[k] > 1}
print (count_beds_number)

NameError: name 'beds_number_assigned' is not defined

In [31]:
# see if there are any titles that were duplicated across the lists
# to check integrity of regular expression pattern matching operations
beds_number_assigned = ones + studios + twos + three + four
# get the difference of the two lists, the beds not assigned
beds_not_assigned = list(set(beds_none_list) - set(beds_number_assigned))
# create list of lists of bedroom types that were or were not discovered
# and in a certain order for iterative operations below
bed_assignments_list = [studios, ones, twos, three, four]
# loop through subset lists of beds values
increment = 0
for item in bed_assignments_list:
    # filter nonetypes from list
    indices = [x for x in item if x != None]
    # get indices of bed values that were assigned using 'titles' column
    # so they can be compared with indices of counties_subset
    indices = [x for x, y in indices]
     # compare indices assigned using 'titles' column
    # to indices assigned 
    idx = counties_subset.index.isin(indices)
    # add 'index match' column to counties_subset dataframe
    counties_subset['index match'] = idx
    # where 'index match' is true, set 'bed assigned' to yes, if false assign bed to 
    counties_subset.loc[(counties_subset['index match'] == True), 'bed assigned'] = True
    # for studios group, assign value to studio
    if increment == 0:
        counties_subset.loc[counties_subset['index match'] == True, 'beds_baths'] = "studio"
    # for other groups, assign to number of beds
    else:
        counties_subset.loc[counties_subset['index match'] == True, 'beds_baths'] = str(increment)
    # add 1 to increment
    increment += 1
# get just indices from beds_not_assigned
no_match = [x for x, y in beds_not_assigned]
# get boolean comparing indices of dataframe with the no match column
no_match_idx = counties_subset.index.isin(no_match)
# assign boolean True false of no match column in dataframe
counties_subset['no match'] = no_match_idx

In [32]:
# save data to pickle so as to examine unclassified beds and baths in inferential statistics 
unassigned = counties_subset.loc[counties_subset['no match'] == True]
unassigned.to_pickle('unassigned_beds.pickle')
# drop unassigned points from dataframe
counties_subset = counties_subset.loc[counties_subset['no match'] == False]

In [33]:
# for all data, make sure that beds_baths column is consistent 
counties_subset.loc[(counties_subset['beds_baths'] == "0.0") | (counties_subset['beds_baths'] == "0"), "beds_baths"] = "studio"
counties_subset.loc[(counties_subset['beds_baths'] == "1.0"), "beds_baths"] = "1"
counties_subset.loc[(counties_subset['beds_baths'] == "2.0"), "beds_baths"] = "2"
counties_subset.loc[(counties_subset['beds_baths'] == "3.0"), "beds_baths"] = "3"
counties_subset.loc[(counties_subset['beds_baths'] == "4.0"), "beds_baths"] = "4"

In [34]:
# drop all non-valid values from dataframe
# in order to run EDA on beds_baths column
df = counties_subset
# drop bed_bath values that were not matched using the title column from analysis
mask = df['no match'] == False
df = df[mask]
# subset data so as to only include studio - 4 bedrooms
df = df.loc[(df['beds_baths'] == "studio") | (df['beds_baths'] == "1") | (df['beds_baths'] == "2") | (df['beds_baths'] == "3") | (df['beds_baths'] == "4")]

In [35]:
# define function that can deal with subset of values of 
# beds_baths columns and perform various operations such as standard deviation
# and mean and store them in a dictionary
def beds_statistics(df, column_name, column_boolean, beds_number = False):
    # if beds_number parameter is empty, function runs through all beds values
    if beds_number is False:
        # variable to increment and set as dictionary key
        count_beds = 0
        # create dictionary to store values 
        values_dict = {}
        # bed values studio -4 exist in dataset
        for value in range(5):
            # account for "studio" value in beds_baths which is a non-int
            if value == 0:
                count_beds = "studio"
                subset = df.loc[(df[column_boolean] == "True") & (df["beds_baths"] == "studio")]
            else:
            # subset data by beds value through each run in loop
                subset = df.loc[(df[column_boolean] == "True") & (df["beds_baths"] == str(count_beds))]
            # make sure subset only includes non-null values of whatever attribute column is input into 
            # the function
            subset = subset.loc[subset[column_boolean] == "True"]
            # convert column of input attribute to numeric
            subset[column_name] = pd.to_numeric(subset[column_name], errors='coerce')
            # get mean of subset column
            mean_value = (subset[column_name].mean())
            mean_value = round(mean_value, 4)
            # get standard deviation of subset column
            std_value = np.std(subset[column_name])
            std_value = round(std_value, 5)
            #convert to array so as to use iqr function from statsmodel
            new_array = subset[column_name].dropna()
            new_array = new_array.as_matrix()
            # call iqr function
            iqr_value = stats.iqr(new_array, axis=None, rng=(25, 75), scale='raw', nan_policy='propagate', interpolation='linear', keepdims=False)
            new_array = subset[column_name].dropna()
            new_array = new_array.as_matrix()
            # get 25th and 75th percentiles of subsetted column 
            q1, q3 = np.percentile(new_array, [25, 75])
            # set lower boundary as 1.5 standard deviations below the 25th percentile as 
            # than that will produce negative values
            lower = q1 - 1.5*(iqr_value)
            # set upper boundary as 3 standard deviations above the 75th percentile so as to
            # get rid of impossible rent values
            upper = q3 + 3.0*(iqr_value)
            # range of acceptable values will be those within the lower and upper bounds
            outlier_range = (lower, upper)
            # add percentiles to output dictionary
            percentiles = (q1, q3)
            # create dictionary key and nested dictionary as value
            values_dict[count_beds] = {'iqr': iqr_value, 'mean': mean_value, 'std': std_value, 'outlier range': outlier_range, "percentiles": percentiles}
            # account for "studio" value in beds_baths column
            if count_beds == "studio":
                count_beds = 0
             # increment count of beds values so as to perform operations on next value from beds column
            count_beds += 1
        return values_dict
    else:
        # create dictionary to store values 
        values_dict = {}
        # subset data by bed value entered in as beds_number parameter
        mask = df['beds_baths'] == float(beds_number)
        subset = df[mask]
        # make sure subset only includes non-null values of whatever attribute column is input into 
        # the function
        subset = subset.loc[subset[column_boolean] == "True"]
        # convert column of input attribute to numeric
        subset[column_name] = pd.to_numeric(subset[column_name], errors='coerce')
        # get mean of subset column
        mean_value = (subset[column_name].mean())
        mean_value = round(mean_value, 2)
        # get standard deviation of subset column
        std_value = np.std(subset[column_name])
        std_value = round(std_value, 5)
        subset = subset.dropna()
        #conver to array so as to use iqr function from statsmodel
        new_array = subset[column_name].dropna()
        # drop na from new array
        #new_array.dropna()
        # call iqr function
        iqr_value = stats.iqr(new_array, axis=None, rng=(25, 75), scale='raw', nan_policy='propagate', interpolation='linear', keepdims=False)
        # get 25th and 75th percentiles of subsetted column 
        q1, q3 = np.percentile(new_array, [25, 75])
        # set lower boundary as 1.5 standard deviations below the 25th percentile as 
        # than that will produce negative values
        lower = q1 - 1.5*(iqr_value)
        # set upper boundary as 3 standard deviations above the 75th percentile so as to
        # get rid of impossible rent values
        upper = q3 + 3.0*(iqr_value)
        # range of acceptable values will be those within the lower and upper bounds
        outlier_range = (lower, upper)
        # add percentiles to output dictionary
        percentiles = (q1, q3)
        # create dictionary key and nested dictionary as value
        values_dict[count_beds] = {'iqr': iqr_value, 'mean': mean_value, 'std': std_value, 'outlier range': outlier_range, "percentiles": percentiles}
        return values_dict

In [36]:
# create dictionaries of descriptive statistics values for price column 
# subsetted by bed value
price_dict = beds_statistics(df, "price", "price boolean", beds_number = False)
# create dictionaries of descriptive statistics values for square feet column 
# subsetted by bed value
sqft_dict = beds_statistics(df, "square feet", "square feet boolean", beds_number = False)



In [37]:
# function to identify outliers in the data using previously defined outlier range
def find_outliers(df, column_boolean, column_name, dictionary):
    # subset only non-null values of whatever attribute column is input into the 
    # function and conver to numeric
    df = df.loc[df[column_boolean] == "True"]
    df[column_name] = pd.to_numeric(df[column_name], errors='coerce')
    # Set column to identify outliers
    df['outlier'] = False
    # loop through bed values keys in dictionaries and determine whether values are outliers
    for key in dictionary.items():
        df.loc[(df["beds_baths"] == str(key[0])) & (df[column_name] < (key[1]['outlier range'][0])), "outlier"] = True
        df.loc[(df["beds_baths"] == str(key[0])) & (df[column_name] > (key[1]['outlier range'][1])), "outlier"] = True
    return df

In [38]:
# create dataframe that identifies outliers using function
prices_outliers = find_outliers(df, "price boolean", "price", price_dict)
# set an index of integers equal to the length of dataframe rows
index_list = list(range(len(prices_outliers.index)))
# make index series a column in dataframe
prices_outliers['index'] = index_list
# set index column as index of dataframe
prices_outliers = prices_outliers.set_index(['index'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [39]:
# create groupby object of outliers to see where they tend to be distributed among beds groupings
outliers_grouped = prices_outliers.groupby(['beds_baths']).agg({'outlier' : 'sum' })
print ("outliers grouped", outliers_grouped)

outliers grouped             outlier
beds_baths         
1              39.0
2             228.0
3               9.0
4               1.0
studio         19.0


In [40]:
# create groupby objects to see where the proportion of outliers are distributed
outliers_props = prices_outliers.groupby('beds_baths')["outlier"].value_counts()
outliers_percents = outliers_props.groupby(level=0).apply(lambda x: 100 * x / float(x.sum()))
print ("outlier proportions", outliers_props)
print ("outlier percents", outliers_percents)

outlier proportions beds_baths  outlier
1           False      7498
            True         39
2           False      6741
            True        228
3           False      1457
            True          9
4           False       436
            True          1
studio      False       979
            True         19
Name: outlier, dtype: int64
outlier percents beds_baths  outlier
1           False      99.482553
            True        0.517447
2           False      96.728368
            True        3.271632
3           False      99.386085
            True        0.613915
4           False      99.771167
            True        0.228833
studio      False      98.096192
            True        1.903808
Name: outlier, dtype: float64


In [41]:
# subset dataframe to only contain non-outliers
prices_no_outliers = prices_outliers.loc[prices_outliers['outlier'] == False]
# get the statistics for the price column of dataframe without outliers
prices_no_outliers_statistics = beds_statistics(prices_no_outliers, "price", "price boolean", beds_number = False)
# do the same for square feet, but using prices_no_outliers dataframe
sqft_no_outliers_statistics = beds_statistics(prices_no_outliers, "square feet", "square feet boolean", beds_number = False)



In [42]:
# using prices_no_outliers_statistics dictionary
# create dictionary of standard deviation and mean of 
# prices and square feet using the per beds grouping
# for future use in data analysis with fair market rents
price_mean_dict = {}
price_std_dict = {}
sqft_mean_dict = {}
sqft_std_dict = {}
for value in prices_no_outliers_statistics.items():
    price_mean_dict[str(value[0])] = value[1]['mean']
    price_std_dict[str(value[0])] = value[1]['std']
for value in sqft_no_outliers_statistics.items():
    sqft_mean_dict[str(value[0])] = value[1]['mean']
    sqft_std_dict[str(value[0])] = value[1]['std']

In [43]:
# function to save dictionaries for use in a later script
def dict_to_pickle(dictionary, filename):
    with open((str(filename) + ".pickle"), 'wb') as handle:
        pickle.dump(dictionary, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [44]:
# run function to save dictionaries for use in a later script
dict_to_pickle(price_mean_dict, "price_mean_dict")
dict_to_pickle(price_std_dict, "price_std_dict")
dict_to_pickle(sqft_mean_dict, "sqft_mean_dict")
dict_to_pickle(sqft_std_dict, "sqft_std_dict")

In [45]:
# fill null values with mean from prices_no_outliers_statistics dictionary
# declare final dataframe
final_df = prices_no_outliers
# make sure that price and square feeet values are null where 'price boolean'
# indicates False to be able to fill them with mean values
#final_df.loc[final_df['price boolean'] == "False", "price"] = np.nan
#final_df.loc[final_df['square feet boolean'] == "False", "square feet"] = np.nan
# fill null values with the mean values of each bed type
# from square feet mean and price mean dictionaries 
#final_df['price'] = final_df['price'].fillna(df['beds_baths'].apply(lambda x: price_mean_dict.get(x)))
beds_types = ["studio", "1", "2", "3", "4"]
for bed in beds_types:
    print (sqft_mean_dict[bed])
    final_df.loc[(final_df['square feet boolean'] == "False") & (final_df['beds_baths'] == str(bed)), 'square feet'] =  sqft_mean_dict[bed]
    final_df.loc[(final_df['price boolean'] == "False") & (final_df['beds_baths'] == str(bed)), 'price'] =  price_mean_dict[bed]

    # drop the index_left column to avoid future problems
#final_df = final_df.drop(columns = ['index_left'])
final_df.to_csv('cleaned_data_final.csv')

525.1914
743.4594
1077.3147


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


1448.6534
1675.3222




2. Zipcode tabulation areas of minnesota zip_code_tabulation_areas.shp

3. Fair market rents data for Twin Cities 2018 https://www.huduser.gov/portal/datasets/fmr/fmrs/FY2018_code/2018summary.odn

4. US Zipcodes Search Engine which uses census data
