# Data Cleaning: Enrich Location Data

In the webscraping in part 1, we were able to scrape the latitude, longitude, and city of the listing. However, we anticipate wanting access to richer geographic labels to join against data that describes the US at a different geographic levels.

This notebook will use that scraped data, in combination with the package `geopy`, to get the postcode, county, state, and country to add to the city, latitude, and longitude already scraped.

_(Note: State and country will be predominantly be helpful for filtering out any listings we don't want to analyze. The scraper pulled in some listings from Canada and Mexico on the US border, as well as a few states that are right on the edge of the geographic area I want to analyze.)_

# Import packages

In [3]:
# For standard data manipulation.
import numpy as np
import pandas as pd
from datetime import datetime

# For progress tracking.
from tqdm import tqdm
from time import sleep

# For handling Geographic estimates.
import geopy
from geopy.exc import GeocoderTimedOut, GeocoderUnavailable
import traceback
from scipy.spatial import distance

# Makes it easier to see all the columns in wide dataframes!
pd.set_option('display.max_colwidth', None)

# Read in webscraped data

In [2]:
df = pd.read_csv('scraped_listings BACKUP.csv')
df.drop(columns=['Unnamed: 0','Unnamed: 0.8','Unnamed: 0.9'],inplace=True)

df.head()

Unnamed: 0,listing_id,listing_url,is_superhost,rating,n_reviews,listing_city,listing_title,n_pictures,room_type,latitude,...,image_3,image_4,image_5,Unnamed: 0.7,Unnamed: 0.6,Unnamed: 0.5,Unnamed: 0.4,Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1
0,47924385,https://www.airbnb.com/rooms/47924385,Superhost,4.86,207.0,Flagler,Home in Flagler,9.0,entire_home,39.29294,...,https://a0.muscache.com/im/pictures/a7339783-cf14-4ab0-b9e3-8c17ef699d20.jpg?im_w=720,https://a0.muscache.com/im/pictures/f8ea448c-d293-4895-b49c-07a2636ebaf2.jpg?im_w=720,https://a0.muscache.com/im/pictures/1a2a5d6b-f1ee-44af-b82d-ab42d2717b28.jpg?im_w=720,,,,,,,
1,12964075,https://www.airbnb.com/rooms/12964075,Superhost,4.87,419.0,Flagler,Private room in Flagler,11.0,private_room,39.29492,...,https://a0.muscache.com/im/pictures/478c7fd6-7cd1-4b10-8971-633a698ef522.jpg?im_w=720,https://a0.muscache.com/im/pictures/1b6947f8-2239-4382-9230-5f5430927dc5.jpg?im_w=720,https://a0.muscache.com/im/pictures/ce671a50-b9f8-446d-b91c-e26163effed2.jpg?im_w=720,,,,,,,
2,50379619,https://www.airbnb.com/rooms/50379619,Superhost,4.93,42.0,Bridgeport,Home in Bridgeport,15.0,entire_home,41.665535,...,https://a0.muscache.com/im/pictures/ecc78c28-b302-4436-a12b-bd8a6d8e4352.jpg?im_w=720,https://a0.muscache.com/im/pictures/05172838-6bca-4f99-aad1-114012fd597b.jpg?im_w=720,https://a0.muscache.com/im/pictures/f30c7acb-21e8-4cc6-a58c-2a85ecc463e2.jpg?im_w=720,,,,,,,
3,710231964358460529,https://www.airbnb.com/rooms/710231964358460529,,,,Bridgeport,Private room in Bridgeport,5.0,private_room,41.66877,...,https://a0.muscache.com/im/pictures/52bd72d4-d504-497e-b2e3-ea5184a95743.jpg?im_w=720,https://a0.muscache.com/im/pictures/35c9f58a-d951-4d4c-9eec-0d0271883166.jpg?im_w=720,https://a0.muscache.com/im/pictures/d39ad521-365a-4bd3-9919-6a742aaaab35.jpg?im_w=720,,,,,,,
4,723852070242986749,https://www.airbnb.com/rooms/723852070242986749,,,,Bridgeport,Private room in Bridgeport,7.0,private_room,41.67005,...,https://a0.muscache.com/im/pictures/miso/Hosting-723852070242986749/original/3a4a9f2b-230f-4f0a-9825-151d60d0e5e7.jpeg?im_w=720,https://a0.muscache.com/im/pictures/58f6e95a-6714-4062-9a56-edbf2b227924.jpg?im_w=720,https://a0.muscache.com/im/pictures/miso/Hosting-723852070242986749/original/e08fc6c7-f9d6-48d0-bbc4-b50f903961eb.jpeg?im_w=720,,,,,,,


# Set up geolocator

In [3]:
geolocator = geopy.Nominatim(user_agent='check_1')

# Function for get geolocation data

### def `get_geolocation`:
**Args:** 
* geolocator: The `geopy` geolocator used to extract data.
* latitude: The latitdue of the listing.
* longitude: The longitude of the listing.
* listing_id: The ID of the listing.

**Returns:**
* listing_id: The ID of the listing.
* country: The listing's country.
* state: The listing's state.
* county: The listing's county.
* postcode: The listing's postcode
* timeout: Error tracking for timeouts Errors.
* unavailable: Error tracking for unavailable Errors.

In [4]:
def get_geolocation(geolocator, latitude, longitude, listing_id):
    
    # Sometimes the geocoder isn't available! use a while true to make the function wait a minute,
    # and then proceed with the scraper. While it still takes a while to get through the function,
    # this helps make the function a little more robust to errors that cause the process to stumble.
    
    timeout     = 0
    unavailable = 0
    
    while True:
        try:
            location = geolocator.reverse((latitude, longitude))
        except GeocoderTimedOut:
            timeout += 1
            sleep(60)
            continue
        except GeocoderUnavailable:
            unavailable += 1
            sleep(60)
            continue
        break
    try:
        country =  location.raw['address']['country']
        state =    location.raw['address']['state']
        county =   location.raw['address']['county']
        postcode = location.raw['address']['postcode']
    except:
        country =  None
        state =    None
        county =   None
        postcode = None
    
    return listing_id, country, state, county, postcode, timeout, unavailable

# Get geolocation data.

* Loop through each listing.
* Get its geographic data.
* If it errors, skip it.
* Every 100 listings, save the data.
* Once done, save the results.

In [None]:
# Loop count.
count = 0

# Lists to capture data.
listing_id_list = []
country_list =    []
state_list =      []
county_list =     []
postcode_list =   []

extract1 = pd.read_csv('extract geolocation_data 1.csv')
extract1.drop(columns=['Unnamed: 0'], inplace=True)

# For monitoring and logs.
geoTimeout   = 0
geoUnavailable = 0

# For each row in the df, get geolocation data.
for i, row in tqdm(df[51391:].iterrows()):
    listing_id, country, state, county, postcode, timeout, unavailable = get_geolocation(geolocator, 
                                                                                         row['latitude'],
                                                                                         row['longitude'], 
                                                                                         row['listing_id'])
    # Append data to lists.
    listing_id_list.append(listing_id)
    country_list.append(country)
    state_list.append(state)
    county_list.append(county)
    postcode_list.append(postcode)
    
    count += 1
    
    # For every 100 records, save the data and export to a csv.
    if count % 100 == 0:
        results_df = pd.DataFrame(
                {
                    'listing_id': listing_id_list,
                    'country': country_list,
                    'state': state_list,
                    'county': county_list,
                    'postcode': postcode_list
                })
        
        results_df = pd.concat([extract1, results_df], ignore_index=True)
        
        results_df.to_csv('geolocation_data.csv')
        
        
# Once the loop is complete, save the final data, and export.        
results_df = pd.DataFrame(
                {
                    'listing_id': listing_id_list,
                    'country': country_list,
                    'state': state_list,
                    'county': county_list,
                    'postcode': postcode_list
                })

results_df = pd.concat([extract1, results_df], ignore_index=True)

results_df.to_csv('geolocation_data.csv')

# Print number of geolocation timeouts.
print('Complete.')
print('geoTimeouts:', geoTimeout)
print('geoUnavailable:', geoUnavailable)


# Some listings didn't get any geographic data. Let's fix that.

Using Euclidean distance, we should be able to impute data for the majority of these listings.

In [9]:
# Check number of listings that didn't get geographic data.
len(results_df[results_df['county'].isna()])

17672

In [11]:
# Add the latitude and longitude to the results DataFrame.
results_w_lat_long = results_df.copy()
results_w_lat_long = results_w_lat_long.merge(df[['listing_id','latitude','longitude']], on='listing_id')

# Take a look.
results_w_lat_long

Unnamed: 0,listing_id,country,state,county,postcode,latitude,longitude
0,47924385,United States,Colorado,Kit Carson County,80815.0,39.292940,-103.064290
1,12964075,United States,Colorado,Kit Carson County,80815.0,39.294920,-103.063720
2,50379619,United States,Nebraska,Morrill County,69336.0,41.665535,-103.095772
3,710231964358460529,United States,Nebraska,Morrill County,69336.0,41.668770,-103.101660
4,723852070242986749,United States,Nebraska,Morrill County,69336.0,41.670050,-103.101620
...,...,...,...,...,...,...,...
186682,33413622,United States,California,Mendocino County,95445,38.769516,-123.533096
186683,52635490,United States,California,Mendocino County,95445,38.771200,-123.532330
186684,701981560073949455,,,,,38.734390,-123.484280
186685,731212246556870884,United States,California,Sonoma County,95445,38.751671,-123.517987


### def `closest_point_fillna`:

_Looks for closest listing by Euclidean distance, and inherits its enriched geolocation data. Only works for listings that are +/-0.5 degree in latitude and longitude.

(TODO: will this error if a listing is isolated?)

**Args:** 
* null_latlng: a (lat, lng) tuple of the record with no enriched geographic data.
* listing_id: the ID of the listing.
* df: the DataFrame `results_w_lat_long`.

**Returns:**
* country: The imputed country for the listing. 
* state: The imputed state for the listing.
* county: The imputed county for the listing.
* postcode: The imputed postcode for the listing.

In [31]:
def closest_point_fillna(null_latlng, listing_id, df):
    # Capture a list of candidate distances, and the lat/lngs.
    distances = []
    lats = []
    lngs = []
    
    # For each row of the DataFrame that does not have a null value for zipcode:
    for i, row in df[df.postcode.notnull()].iterrows():
        
        if (row['latitude'] +.5 > null_latlng[0] and row['latitude']  < null_latlng[0] -.5
            and row['longitude'] +.5  > null_latlng[1] and row['longitude'] < null_latlng[1] -.5):
            # Get the latitude and longitude values.
            candidate_latlng=(row['latitude'],row['longitude'])

            # Calculate the Euclidean distance between the row and the null row fed into the function.
            distances.append(distance.euclidean(null_latlng, candidate_latlng))
            lats.append(row['latitude'])
            lngs.append(row['longitude'])
        else:
            distances.append(1e7)
            lats.append(row['latitude'])
            lngs.append(row['longitude'])
            
    
    # Get the closest, or 'minimum', distance.
    min_value = min(distances)
    # Get the index of the minimum value in the result list.
    min_index = distances.index(min_value)
    
    # Get lat key and long key.
    lat, lng = lats[min_index], lngs[min_index]
    
    # Save the zipcode for the specific lat/lng row. This should work because we
    # are using the dataframe with only the unique lat/lng combinations, so we should 
    # only return one value!
    postcode = df[(df['latitude'] == lat) & (df['longitude'] == lng)]['postcode'].values[0]
    county = df[(df['latitude'] == lat) & (df['longitude'] == lng)]['county'].values[0]
    state = df[(df['latitude'] == lat) & (df['longitude'] == lng)]['state'].values[0]
    country = df[(df['latitude'] == lat) & (df['longitude'] == lng)]['country'].values[0]
    
    return country, state, county, postcode

# Impute missing data.

* Loop through each listing ID missing data.
* Find closest listing by Euclidean distance and inherit geographic data.
* Every 100 records, save the data.
* Once done, save the final data.

In [35]:
listing_id_list = []
country_list = []
state_list = []
county_list = []
postcode_list = []
latitude_list = []
longitude_list = []

count = 0

for i, row in tqdm(results_w_lat_long[results_w_lat_long['postcode'].isnull()].iterrows()):
    null_latlng = (row['latitude'], row['longitude'])
    results = closest_point_fillna(null_latlng, listing_id=row['listing_id'], df=results_w_lat_long)
    listing_id_list.append(row['listing_id'])
    country_list.append(results[0])
    state_list.append(results[1])
    county_list.append(results[2])
    postcode_list.append(results[3])
    latitude_list.append(row['latitude'])
    longitude_list.append(row['longitude'])
    
    count += 1
    
    if count % 100 == 0:
        backfilled_df = pd.DataFrame(data={
            'listing_id':listing_id_list,
            'country':country_list,
            'state':state_list,
            'county':county_list,
            'postcode':postcode_list,
            'latitude':latitude_list,
            'longitude':longitude_list
        })
        
        backfilled_df.to_csv('geolocation_data_backfilled.csv')

        
backfilled_df = pd.DataFrame(data={
            'listing_id':listing_id_list,
            'country':country_list,
            'state':state_list,
            'county':county_list,
            'postcode':postcode_list,
            'latitude':latitude_list,
            'longitude':longitude_list
        })
        
backfilled_df.to_csv('geolocation_data_backfilled.csv')

17672it [20:00:27,  4.08s/it]


# Merge the final data.

In [6]:
results_w_lat_long = pd.read_csv('CLEANED geolocation_data.csv',index_col=False)
backfilled_df = pd.read_csv('geolocation_data_backfilled.csv',index_col=False)

In [7]:

results_w_lat_long = results_w_lat_long[results_w_lat_long['county'].notnull()]
geographic_data = pd.concat([results_w_lat_long, backfilled_df], ignore_index=True)
geographic_data.to_csv('CLEANED geolocation_data.csv')
