In [35]:
import pandas as pd
from uszipcode import ZipcodeSearchEngine
from shapely.geometry import Point
import geopandas as gpd
from IPython.core.display import clear_output
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
# read in matchine learning dataset
df = pd.read_csv(r"C:\Users\murra667\Documents\Springboard\Capstone _1\Data_Collection\Craigslist Data\geocode_2\Data Collection and Cleaning\Data Sources\machine_learning_df.csv")

In [15]:
# convert latitude and longitude points into GIS data points
geometry = [Point(xy) for xy in zip(df['geocode_lon'], df['geocode_lat'])]
# set projections for geometry 
crs = {'init': 'epsg:4326'}
# create dataframe with spatial attributes
locations = gpd.GeoDataFrame(df, crs=crs, geometry=geometry)
locations = locations.to_crs({'init': 'epsg:4326'})

In [23]:
# create geodataframe of zipcodes of Minnesota
zipcodes = gpd.GeoDataFrame.from_file(r'C:\Users\murra667\Documents\Springboard\Capstone _1\Data_Collection\Craigslist Data\geocode_2\Data Storytelling\zip_code_tabulation_areas.shp')
# set projection of zipcodes
zipcodes = zipcodes.to_crs({'init': 'epsg:4326'})
# rename column of minnesota zipcodes file
zipcodes = zipcodes.rename(columns={'GEOID10': 'zipcodes'})
# perform spatial join between craigslist 
loc_zip_join = gpd.sjoin(locations, zipcodes, how="left", op='within')

In [27]:
# subset dataframe to include desired columns
merged = loc_zip_join[['baths', 'url', 'datetime', 'address', 'beds boolean', 'beds', 'county', 'geocode_lat', 'geocode_lon', 'post id', 'price', 'price boolean', 'square feet', 'square feet boolean', 'title', 'City', 'one to studio', 'bed assigned', 'zipcodes']]

In [30]:
# track progress of iterration
count = 0
for index, row in merged.iterrows():
    # call zipcode search engine
    search = ZipcodeSearchEngine()
    # input each zipcode from dataframe into search engine
    zipcode=search.by_zipcode(row['zipcodes'])
    # get total number of units per zipcode according to census
    units_tot = zipcode.HouseOfUnits
    # get wealth rating of zipcode
    wealth = zipcode.Wealthy
    # get water area of zipcode
    water_area = zipcode.WaterArea
    # get density area of zipcode
    density = zipcode.Density
    # get population of zipcode
    population = zipcode.Population
    # get total wages of zipcode
    total_wages = zipcode.TotalWages
    # get land area of zipcode
    land_area = zipcode.LandArea
    # fill values at matching zipcode values in dataframe
    # with census data at zipcode level 
    merged.loc[index, 'total units'] = units_tot
    merged.loc[index, 'average annual income'] = wealth
    merged.loc[index, 'water area'] = water_area
    merged.loc[index, 'density'] = density
    merged.loc[index, 'population'] = population
    merged.loc[index, 'aggregate wages'] = total_wages
    merged.loc[index, 'land area'] = land_area
    # aggregate count of iteration
    # and print
    count += 1
    print ('Request: {}'.format(count))
    clear_output(wait = True)

Request: 17111


In [32]:
# save updated dataframe as new machine_learning_df
merged.to_csv("machine_learning_df.csv")