In [29]:
# pandas to work with dataframes
import pandas as pd
# import re to use regular expressions
import re
# import numpy
import numpy as np
# import glob to bring together multiple csv files
import glob

# Column names given to every scraped CSV, in file order (the files are read
# with these names, exactly as before).
RAW_COLUMNS = ['address', 'square feet', 'beds_baths', 'datetime', 'square feet 2',
               'latitude', 'title', 'longitude', 'map accuracy', 'price', 'url', 'post id']

# Regular expressions are raw strings: in a plain string '\d' is an invalid
# escape and '\b' silently becomes a backspace character.
SQFT_PATTERN = r'\d+ft'
BEDS_PATTERN = r'\d+[Bb]'
BEDS_BR_PATTERN = r'\d+br'
SUPERBOWL_PATTERN = "SUPERBOWL|Superbowl|SUPER BOWL|Super bowl|superbowl|super bowl|Super Bowl"
# Placeholder stored in the helper columns when nothing was found.
NO_MATCH = '[]'

# Three-letter code taken from the listing url -> (latitude, longitude) used
# when a listing has no address of its own.
COUNTY_CENTROIDS = {
    "hnp": ("45.0209", "-93.5095"),       # hennepin
    "wsh": ("44.8897", "-92.8577"),       # washington
    "ram": ("44.9964", "-93.0616"),       # ramsey
    "dak": ("44.7316", "-93.1780"),       # dakota
    "csw": ("44.8254", "-93.7842"),       # scott, carver, wright
    "ank": ("45.1977428", "-93.3871758"), # anoka
}

# Text appended to an address before it is sent to the geocoder.
COUNTY_STATE = {
    "hnp": " ,Hennepin County, Minnesota",
    "ram": " ,Ramsey County, Minnesota",
    "dak": " ,Dakota County, Minnesota",
    "ank": " ,Anoka County, Minnesota",
    "wsh": " ,Washington County, Minnesota",
    "csw": " ,Carver County, Minnesota",
}


def _first_match(pattern, cell, group=0):
    """Return `group` of the first match of `pattern` in str(cell), or None."""
    found = re.search(pattern, str(cell))
    return found.group(group) if found else None


def _matches(pattern, column):
    """Return a boolean Series (same index as `column`): does each cell contain `pattern`?"""
    return pd.Series([_first_match(pattern, cell) is not None for cell in column],
                     index=column.index, dtype=bool)


def load_raw_listings(paths):
    """Read every scraped CSV in `paths` and stack them under one 0..n-1 index named 'index'."""
    frames = [pd.read_csv(path, names=RAW_COLUMNS) for path in paths]
    raw = pd.concat(frames, axis=0, ignore_index=True)
    raw.index.name = 'index'
    return raw


def clean_listings(raw):
    """Clean the stacked scraped listings and prepare them for geocoding.

    Parameters
    ----------
    raw : pandas.DataFrame
        Frame with the columns listed in RAW_COLUMNS. It is not modified.

    Returns
    -------
    pandas.DataFrame
        De-duplicated listings without Super Bowl posts. 'square feet',
        'beds_baths' and 'price' are reduced to their digits where a value was
        found, and the helper/flag, county and geocode placeholder columns are
        added. All original values are text.
    """
    # De-duplicate while missing values are still real NaN. Once the frame is
    # cast to text a missing post id is the string 'nan', which would make
    # every listing without an id look like a duplicate of the others.
    has_post_id = raw['post id'].notna()
    without_id = raw[~has_post_id].drop_duplicates(subset=['title'], keep='first')
    with_id = raw[has_post_id].drop_duplicates(subset=['post id'], keep='first')
    df = pd.concat([without_id, with_id])
    # drop latitude, longitude, map accuracy columns
    df = df.drop(columns=['latitude', 'longitude', 'map accuracy'])

    # An address exists when the cell is not missing and holds at least one
    # letter or digit; this also has to be decided before the cast to text.
    has_address = (df['address'].notna() & _matches(r'\w', df['address'])).to_numpy()

    # most operations below work on text
    df = df.astype(str)

    # first square feet / beds value found in each column where it might be
    df['value_square_feet_two'] = [_first_match(SQFT_PATTERN, cell) or NO_MATCH
                                   for cell in df['square feet 2']]
    df['value_square_feet_three'] = [_first_match(SQFT_PATTERN, cell) or NO_MATCH
                                     for cell in df['beds_baths']]
    df['number_of_beds_2'] = [_first_match(BEDS_BR_PATTERN, cell) or NO_MATCH
                              for cell in df['square feet 2']]

    # beds: fall back to the value found in "square feet 2"
    df['first_beds_boolean'] = _matches(BEDS_PATTERN, df['beds_baths'])
    df['second_beds_boolean'] = _matches(BEDS_BR_PATTERN, df['square feet 2'])
    fill_beds = df['second_beds_boolean'] & ~df['first_beds_boolean']
    df.loc[fill_beds, 'beds_baths'] = df.loc[fill_beds, 'number_of_beds_2']

    # square feet: fall back to "square feet 2", then to "beds_baths"
    df['first_square_feet_boolean'] = _matches(SQFT_PATTERN, df['square feet'])
    df['second_square_feet_boolean'] = _matches(SQFT_PATTERN, df['square feet 2'])
    fill_from_two = df['second_square_feet_boolean'] & ~df['first_square_feet_boolean']
    df.loc[fill_from_two, 'square feet'] = df.loc[fill_from_two, 'value_square_feet_two']
    df['third_square_feet_boolean'] = _matches(SQFT_PATTERN, df['beds_baths'])
    fill_from_three = (df['third_square_feet_boolean']
                       & ~df['first_square_feet_boolean']
                       & ~df['second_square_feet_boolean'])
    df.loc[fill_from_three, 'square feet'] = df.loc[fill_from_three, 'value_square_feet_three']

    # flags telling later steps which rows hold a usable value
    df['square feet boolean'] = _matches(SQFT_PATTERN, df['square feet'])
    df['beds boolean'] = _matches(BEDS_PATTERN, df['beds_baths'])
    df['price boolean'] = _matches(r'\d+', df['price'])

    # Keep just the digits. Capturing them with the pattern works for both
    # "688ft2" and the fallback form "688ft" (a fixed slice cut a digit off
    # the latter) and for multi-digit or lower-case bed counts.
    df['square feet'] = [_first_match(r'(\d+)ft', cell, group=1) or cell
                         for cell in df['square feet']]
    df['beds_baths'] = [_first_match(r'(\d+)[Bb]', cell, group=1) or cell
                        for cell in df['beds_baths']]
    # get just price in "price" column
    df['price'] = df['price'].str.replace('$', '', regex=False)

    # identify and drop posts associated with the superbowl
    df['check_superbowl'] = _matches(SUPERBOWL_PATTERN, df['title'])
    keep = (~df['check_superbowl']).to_numpy()
    df = df[keep].copy()
    df['address_check'] = has_address[keep]

    # three-letter county code from the url ('' when the url has none)
    df['county'] = [_first_match(r'org/(...)/', url, group=1) or '' for url in df['url']]
    # kept for compatibility with the existing output layout
    df['address_lat'] = df['county']
    df['address_lon'] = df['county']

    # county centroid, only for rows that have no address to geocode
    missing = (np.nan, np.nan)
    centroids = [missing if address_known else COUNTY_CENTROIDS.get(code, missing)
                 for code, address_known in zip(df['county'], df['address_check'])]
    df['county_lat'] = [lat for lat, _ in centroids]
    df['county_lon'] = [lon for _, lon in centroids]

    # county and state text, appended to every existing address
    df['county_state'] = [COUNTY_STATE.get(code, code) for code in df['county']]
    df['address'] = [address + suffix if address_known else address
                     for address, suffix, address_known
                     in zip(df['address'], df['county_state'], df['address_check'])]

    # placeholders that the geocoding step fills in
    df['geocode_lat'] = "none"
    df['geocode_lon'] = "none"
    return df


# search for csvs containing web-scraped data with the name scheme
# (sorted so that "keep the first duplicate" does not depend on directory order)
csvs = sorted(glob.glob(r'*Craigslist_ApartmentData_Minneapolis*.csv'))

if csvs:
    df = clean_listings(load_raw_listings(csvs))
    # save dataframe to be geocoded, then resume after values have been geocoded
    # NOTE(review): this name has no ".csv" extension while the geocoding cell
    # reads "geocode_craigslist_rentals.csv" - presumably the file is renamed
    # or merged by hand; confirm before changing either name.
    df.to_csv('geocode_craigslist_rentals_2')
else:
    print('No files matching *Craigslist_ApartmentData_Minneapolis*.csv were found; nothing was cleaned.')




Unnamed: 0_level_0,address,square feet,beds_baths,datetime,square feet 2,title,price,url,post id,value_square_feet_two,...,check_superbowl,address_check,county,address_lat,address_lon,county_lat,county_lon,county_state,geocode_lat,geocode_lon
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,1046.67,2,2018-01-13T20:02:24-0600,/ 2br -,Be Part Of Uptown - The Lake/Calhoun Square/Th...,1975,https://minneapolis.craigslist.org/hnp/apa/d/b...,post id: 6458930089,[],...,False,False,hnp,hnp,hnp,45.0209,-93.5095,",Hennepin County, Minnesota",none,none
1,"4th Street NE at 23rd Ave ,Hennepin County, Mi...",1046.67,1,2018-01-11T17:04:47-0600,/ 1br -,"Upstairs 1 BR duplex unit, highly walkable in ...",1150,https://minneapolis.craigslist.org/hnp/apa/d/u...,post id: 6456393172,[],...,False,True,hnp,hnp,hnp,,,",Hennepin County, Minnesota",none,none
2,,1046.67,1,2018-01-13T20:00:35-0600,/ 1br -,North Loop Luxury One Bed - 1 Block From Skywa...,1600,https://minneapolis.craigslist.org/hnp/apa/d/n...,post id: 6458928789,[],...,False,False,hnp,hnp,hnp,45.0209,-93.5095,",Hennepin County, Minnesota",none,none
3,"6426 City West Parkway ,Hennepin County, Minn...",688,1,2017-12-23T11:08:11-0600,/ 1br - 688ft2 -,Very cozy apartment with beautiful woods view,1223,https://minneapolis.craigslist.org/hnp/apa/d/v...,post id: 6435935112,688ft,...,False,True,hnp,hnp,hnp,,,",Hennepin County, Minnesota",none,none
4,,1046.67,1,2018-01-13T19:59:36-0600,/ 1br -,Live On The Park! Must See Unit w/Heated Parki...,2045,https://minneapolis.craigslist.org/hnp/apa/d/l...,post id: 6458928087,[],...,False,False,hnp,hnp,hnp,45.0209,-93.5095,",Hennepin County, Minnesota",none,none
5,,1046.67,1,2018-01-13T19:58:35-0600,/ 1br -,"Loring Park Studio w/Amazing Amenties - Gym, G...",1680,https://minneapolis.craigslist.org/hnp/apa/d/l...,post id: 6458927342,[],...,False,False,hnp,hnp,hnp,45.0209,-93.5095,",Hennepin County, Minnesota",none,none
6,,1046.67,1,2018-01-13T19:54:57-0600,/ 1br -,Insane Poolside Amenities in this Uptown Pad!!,1385,https://minneapolis.craigslist.org/hnp/apa/d/i...,post id: 6458924686,[],...,False,False,hnp,hnp,hnp,45.0209,-93.5095,",Hennepin County, Minnesota",none,none
7,,1046.67,1,2018-01-13T19:53:37-0600,/ 1br -,Great 1 Bed Value Just Steps Away From Britt's...,1405,https://minneapolis.craigslist.org/hnp/apa/d/g...,post id: 6458923708,[],...,False,False,hnp,hnp,hnp,45.0209,-93.5095,",Hennepin County, Minnesota",none,none
8,"Aldrich Ave S at W 26th St ,Hennepin County, M...",950,2,2018-01-15T11:45:55-0600,/ 2br - 950ft2 -,"Uptown 2bdr/1bth, Wedge Neighborhood",1500,https://minneapolis.craigslist.org/hnp/apa/d/u...,post id: 6460540386,950ft,...,False,True,hnp,hnp,hnp,,,",Hennepin County, Minnesota",none,none
9,,1046.67,1.7,2018-01-18T18:31:08-0600,,Frunished 1 bedroom 1 bathroom condo,1800,https://minneapolis.craigslist.org/wsh/apa/d/f...,post id: 6464729986,[],...,False,False,wsh,wsh,wsh,44.8897,-92.8577,",Washington County, Minnesota",none,none


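MERGE DATA AFTER GEOCODING (run this cell only after the GEOCODING SCRIPT cell further down has written geocode_craigslist_rentals.csv)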


In [None]:
# pandas to work with dataframes
import pandas as pd
# import re to use regular expressions
import re
# import numpy
import numpy as np
# import glob to bring together multiple csv files
import glob

# The three True/False columns written by the cleaning step.
SQFT_FLAG = 'square feet boolean'
BEDS_FLAG = 'beds boolean'
PRICE_FLAG = 'price boolean'
# Listings priced at or outside these bounds are discarded.
PRICE_FLOOR = 50
PRICE_CEILING = 10000


def _as_flag(column):
    """Read a True/False column as booleans, whether it holds bools, 0/1 or their text form."""
    return column.astype(str).str.strip().str.lower().isin(['true', '1', '1.0'])


def impute_listing_values(listings):
    """Filter listings by price and replace unusable values with column means.

    Parameters
    ----------
    listings : pandas.DataFrame
        Geocoded listings with at least the columns 'price', 'square feet',
        'beds_baths', 'square feet boolean', 'beds boolean' and
        'price boolean'. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Rows whose price lies strictly between PRICE_FLOOR and PRICE_CEILING.
        'price' is float; every other column is text. Where a flag column is
        False, the matching value is replaced by the mean (rounded to two
        decimals) computed over the rows in which all three flags are True.
    """
    # most columns are handled as text, as before
    df = listings.astype(str)

    # keep only prices greater than 50 and lower than 10000; a price that is
    # not a number becomes NaN and is dropped by the same filter
    df['price'] = pd.to_numeric(df['price'], errors='coerce').astype(float)
    df = df[(df['price'] < PRICE_CEILING) & (df['price'] > PRICE_FLOOR)].copy()

    # The flags are text ('True'/'False') at this point, so comparing them
    # with 0 never matches; parse them into real booleans instead.
    sqft_ok = _as_flag(df[SQFT_FLAG])
    beds_ok = _as_flag(df[BEDS_FLAG])
    price_ok = _as_flag(df[PRICE_FLAG])
    complete = sqft_ok & beds_ok & price_ok

    # mean of each column over the rows where all three values are usable
    price_mean = float(df.loc[complete, 'price'].mean())
    sqrft_mean = float(pd.to_numeric(df.loc[complete, 'square feet'], errors='coerce').mean())
    beds_mean = float(pd.to_numeric(df.loc[complete, 'beds_baths'], errors='coerce').mean())

    # set all non-matching values to the mean of their column; 'price' is a
    # float column and gets a number, the two text columns get text
    df.loc[~price_ok, 'price'] = round(price_mean, 2)
    df.loc[~beds_ok, 'beds_baths'] = str(round(beds_mean, 2))
    df.loc[~sqft_ok, 'square feet'] = str(round(sqrft_mean, 2))
    return df


# search for csvs containing geocoded data with the name scheme
csvs = sorted(glob.glob(r'*geocode_craigslist*.csv'))

if csvs:
    # read every csv and concatenate the dataframes vertically
    df = pd.concat([pd.read_csv(file) for file in csvs], axis=0)
    # replace the index with integers 0..n-1, stored under the name "index"
    df['index'] = list(range(len(df.index)))
    df = df.set_index(['index'])

    df = impute_listing_values(df)
    df.to_csv('craigslist_final_data')
else:
    print('No files matching *geocode_craigslist*.csv were found; nothing was merged.')

GEOCODING SCRIPT (run this before the merge cell above; it reads and rewrites geocode_craigslist_rentals.csv)


In [None]:
import googlemaps
from datetime import datetime
import pandas as pd
from IPython.core.display import clear_output
from random import randint
from time import time 
import requests
from requests import get
from warnings import warn 
from bs4 import BeautifulSoup
from IPython.core.display import clear_output
from time import sleep

from itertools import islice
import os

csv = "geocode_craigslist_rentals.csv"

# Row positions (not index labels) handled in this run: rows BATCH_START up to
# but excluding BATCH_STOP. Change them to work through the file in batches.
BATCH_START = 2302
BATCH_STOP = 3802
# seconds to wait before each request
PAUSE_SECONDS = 2
# print the latest result every this many requests
REPORT_EVERY = 50

df = pd.read_csv(csv)

# replace the index with integers 0..n-1, stored under the name "index"
index_list = list(range(len(df.index)))
df['index'] = index_list
df = df.set_index(['index'])

# the result columns hold text ("none", "error" or a coordinate)
df = df.astype({'geocode_lat': object, 'geocode_lon': object})

# TODO: sort df by address_check (True/False)

# The API key is read from the environment instead of being stored in the
# notebook. NOTE(review): a key was previously committed in this cell and
# should be treated as exposed - revoke it.
api_key = os.environ.get("GOOGLE_MAPS_API_KEY")
if not api_key:
    raise RuntimeError("Set the GOOGLE_MAPS_API_KEY environment variable before running the geocoder.")
# one client for the whole run instead of a new one per row
gmaps = googlemaps.Client(key=api_key)

# number of geocoding requests attempted in this run
count = 0

try:
    for index, row in islice(df.iterrows(), BATCH_START, BATCH_STOP):
        # only rows that have an address and have not been attempted yet
        pending = (row['geocode_lat'] == "none") and (row['geocode_lon'] == "none") and (row['address_check'] == 1)
        if not pending:
            continue

        sleep(PAUSE_SECONDS)
        address = str(row['address'])
        try:
            geocode_result = gmaps.geocode(address)
            # an empty result list raises IndexError and is recorded as an error
            location = geocode_result[0]['geometry']['location']
            latitude = str(location['lat'])
            longitude = str(location['lng'])
        except Exception as error:
            # keep going, but leave a visible trace of what failed
            print('Geocoding failed for {!r}: {}'.format(address, error))
            latitude = "error"
            longitude = "error"

        df.loc[index, "geocode_lon"] = longitude
        df.loc[index, "geocode_lat"] = latitude

        count += 1
        if count % REPORT_EVERY == 0:
            print(longitude, address)
        print('Request: {}'.format(count))
        # cleared only when the next output arrives, so the cell shows the latest lines
        clear_output(wait=True)
finally:
    # written even when the loop is interrupted, so finished requests are not lost
    df.to_csv('geocode_craigslist_rentals.csv', header=True)