# Scraping Analysis

In [58]:
# Imports
import math
import numpy as np
import pandas as pd
import geopy.distance

### Read Data

In [59]:
# Load the scraped zip-level and city-level frames (named after their files)
# from local pickles.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling;
# only load files produced by a trusted pipeline.
scraped_zip_data = pd.read_pickle(
    '../housing_data/data/final_database_frames/zip_v1.pkl')
scraped_city_data = pd.read_pickle(
    '../housing_data/data/final_database_frames/city_v3.pkl')


def prep_city_data(scraped_city_data):
    """Append ``_city`` to every column name except ``state_city``.

    The frame's column index is replaced in place, and the same frame
    object is returned for convenience.
    """
    renamed = []
    for column in scraped_city_data.columns.values:
        if column == 'state_city':
            # left unsuffixed so it keeps its original name
            renamed.append(column)
        else:
            renamed.append(column + '_city')
    scraped_city_data.columns = renamed
    return scraped_city_data


# Suffix the city frame's columns with ``_city`` (``state_city`` excepted).
scraped_city_data = prep_city_data(scraped_city_data)

# Geographic lookup table. ``zip`` is routed through ``str`` so the field is
# kept as text instead of being parsed as a number.
location_data = pd.read_csv(
    '../housing_data/data/geographic_data/zip_code_database_cleaned.csv',
    converters={'zip': str},
)


def get_location_data(location_data):
    """Return *location_data* restricted to the geographic columns.

    The result holds, in this order: ``state_city_zip``, ``latitude``,
    ``longitude``, ``state``, ``zip`` and ``primary_city``. The input frame
    is not modified. A ``KeyError`` is raised by pandas if a column is
    missing.
    """
    wanted_columns = [
        'state_city_zip',
        'latitude',
        'longitude',
        'state',
        'zip',
        'primary_city',
    ]
    return location_data.loc[:, wanted_columns]


# Keep only the geographic columns needed further down.
location_data = get_location_data(location_data)

# Census accommodation table. ``GEO.id2`` is routed through ``str`` so the
# field is kept as text instead of being parsed as a number.
hotel_data = pd.read_csv(
    '../housing_data/2012_census_hotel_accomodation/ECN_2012_US_72Z1_with_ann.csv',
    header=0,
    converters={'GEO.id2': str},
)


def prep_hotel_data(hotel_data):
    """Pivot the census table to one row per ``GEO.id2``.

    Steps, in order:
      1. skip the first data row (presumably the descriptive second header
         line of the census export -- TODO confirm);
      2. keep rows whose ``RCPSZFE.display-label`` is 'All establishments';
      3. drop repeated (``GEO.id2``, ``NAICS.display-label``, ``ESTAB``)
         triples, keeping the last occurrence;
      4. pivot so each ``NAICS.display-label`` becomes a column holding
         ``ESTAB``;
      5. return only the six accommodation-related columns listed below.

    pandas raises if a (``GEO.id2``, label) pair still has several distinct
    ``ESTAB`` values, or if one of the six columns is absent.
    """
    establishment_columns = [
        'Accommodation',
        'Bed-and-breakfast inns',
        'Casino hotels',
        'Hotels (except casino hotels) and motels',
        'Traveler accommodation',
        'Recreational and vacation camps (except campgrounds)',
    ]

    body = hotel_data.iloc[1:, :]
    is_all_establishments = (
        body['RCPSZFE.display-label'] == 'All establishments')
    all_sizes = body[is_all_establishments]

    deduplicated = all_sizes.drop_duplicates(
        ['GEO.id2', 'NAICS.display-label', 'ESTAB'], keep='last')

    wide = deduplicated.pivot(index='GEO.id2',
                              columns='NAICS.display-label',
                              values='ESTAB')
    return wide[establishment_columns]


# Reshape the census table into one row per GEO.id2 (see prep_hotel_data).
hotel_data = prep_hotel_data(hotel_data)

# read zillow data
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling;
# only load files produced by a trusted pipeline.
zillow = pd.read_pickle(
    '../housing_data/data/zillow/prepped_zillow_data/zillow_19-06.pkl')


  interactivity=interactivity, compiler=compiler, result=result)


### Merging locations and zillow

In [60]:
# All three merges are left joins onto the scraped zip rows, so every scraped
# row is kept even when a lookup table has no match (unmatched columns are NaN).

# 1) geographic columns, keyed on state_city_zip
scraped_data_locations = scraped_zip_data.merge(location_data,
                                                on='state_city_zip',
                                                how='left')

# 2) zillow columns, keyed on state_city_zip
scraped_data_locations_zillow = scraped_data_locations.merge(
    zillow, on='state_city_zip', how='left')

# 3) hotel counts: the frame's zip is matched against GEO.id2 of hotel_data.
# NOTE(review): prep_hotel_data pivots with index='GEO.id2', so GEO.id2 is an
# index level rather than a column here -- confirm the installed pandas
# accepts index level names in right_on.
df = scraped_data_locations_zillow.merge(hotel_data,
                                         left_on='zip',
                                         right_on='GEO.id2',
                                         how='left')


### Extrapolating columns

In [61]:
def get_history_extrapolations(my_history):
    """Summarise each history by its mean and coefficient of variation.

    Args:
        my_history: iterable (e.g. a DataFrame column) whose items are
            lists or tuples of numbers. Any other item (NaN, None, ...)
            and any empty sequence is treated as missing.

    Returns:
        A pair ``(avg, cv)`` of lists aligned with *my_history*:
        ``avg`` holds the mean of each history and ``cv`` its coefficient
        of variation (population standard deviation divided by the mean).
        Missing histories yield ``nan`` in both lists; a history whose mean
        is zero yields ``nan`` for ``cv`` because the ratio is undefined.
    """
    avg = []
    cv = []

    for history in my_history:
        # isinstance (rather than an exact type check) also accepts list
        # subclasses and tuples, which numpy averages the same way.
        if isinstance(history, (list, tuple)) and len(history) > 0:
            mean = np.average(history)
            avg.append(mean)
            # Guard the division: a zero mean would otherwise produce
            # inf/nan together with a numpy RuntimeWarning.
            cv.append(np.std(history) / mean if mean != 0 else np.nan)
        else:
            avg.append(np.nan)
            cv.append(np.nan)
    return avg, cv

def get_distance_from_nyc(long_lat_row):
    """Return the distance in miles from a point to the fixed New York point.

    Args:
        long_lat_row: a pandas row or plain sequence whose first two items
            are ordered (longitude, latitude).

    Returns:
        The geodesic distance in miles as a float, or ``nan`` when either
        coordinate is missing (e.g. a row left unmatched by a left merge).
    """
    new_york_coords = (40.812522, -73.951924)
    # Unpack by iteration instead of row[0]/row[1]: integer keys on a
    # label-indexed Series are deprecated positional lookups in recent pandas.
    longitude, latitude = tuple(long_lat_row)[:2]
    if pd.isna(latitude) or pd.isna(longitude):
        return np.nan
    # geopy expects (latitude, longitude). geopy.distance.vincenty was
    # removed in geopy 2.0; geodesic is its documented replacement.
    place = (latitude, longitude)
    return geopy.distance.geodesic(place, new_york_coords).mi


def add_extrapolated_revenue(df, for_city=''):
    """Add averaged, seasonality and per-room/per-guest revenue columns.

    Args:
        df: frame holding the history columns ``monthly_revenue``,
            ``nightly_revenue`` and ``monthly_occupancy`` plus ``rooms`` and
            ``guests``, each followed by the *for_city* suffix.
        for_city: suffix appended to every column name read or written
            ('' for the zip frame, '_city' for the city frame).

    Returns:
        The same *df* object, modified in place with the new columns.
    """
    history_columns = [
        base + for_city
        for base in ('monthly_revenue', 'nightly_revenue', 'monthly_occupancy')
    ]
    for history_column in history_columns:
        means, variation = get_history_extrapolations(df[history_column])
        # mean of each row's history list
        df['avg_' + history_column] = means
        # coefficient of variation, used as the seasonality measure
        df['seasonality_' + history_column] = variation

    expected_key = 'expected_avg_monthly_revenue' + for_city
    # 0.3 = 30 days / 100: occupancy is a percentage, so dividing by 100
    # turns it into a 0-1 fraction before scaling nightly revenue to a month.
    nightly_times_occupancy = (df['avg_nightly_revenue' + for_city]
                               * df['avg_monthly_occupancy' + for_city])
    df[expected_key] = nightly_times_occupancy * .3

    # per-room and per-guest averages
    rooms = df['rooms' + for_city]
    df['revenue_per_room' + for_city] = (
        df['avg_monthly_revenue' + for_city] / rooms)
    df['expected_revenue_per_room' + for_city] = df[expected_key] / rooms
    df['expected_revenue_per_guest' + for_city] = (
        df[expected_key] / df['guests' + for_city])

    return df


# Derive the averaged / expected revenue columns on the zip-level frame.
df = add_extrapolated_revenue(df)
# add distance to zip data; the columns are passed as (longitude, latitude),
# the order get_distance_from_nyc reads them in
df['distance_from_Eli'] = df[['longitude','latitude']].apply(get_distance_from_nyc, axis=1)

# Same derivation for the city frame, whose columns carry the '_city' suffix.
scraped_city_data = add_extrapolated_revenue(scraped_city_data, '_city')



### Add city data to zip data

In [62]:
# Build the ``state_city`` merge key: everything in ``state_city_zip`` before
# the final ``/<digits>`` segment. The pattern is a raw string because ``\/``
# and ``\d`` are invalid escapes in a normal literal (a warning on current
# Python); the regex text itself is unchanged. ``expand=False`` makes the
# single capture group come back as a Series rather than a one-column frame.
df['state_city'] = df.state_city_zip.str.extract(r'(.*)\/\d+', expand=False)
# Left join so zip rows without city-level data are kept.
df = df.merge(scraped_city_data, on='state_city', how='left')

### Add real estate extrapolations to zip data

In [63]:
def add_real_estate_extrapolations(df):
    """Add ``rent_to_rent`` and ``rent_to_rent_expected`` columns to *df*.

    Each is the corresponding per-room revenue column divided by
    ``ZriPerSqft_AllHomes2019-06`` multiplied by 1000. *df* is modified in
    place and returned.
    """
    scaled_rent_index = df['ZriPerSqft_AllHomes2019-06'] * 1000
    df['rent_to_rent'] = df.revenue_per_room / scaled_rent_index
    df['rent_to_rent_expected'] = (
        df.expected_revenue_per_room / scaled_rent_index)
    return df

# Attach the rent_to_rent ratio columns to the merged frame.
df = add_real_estate_extrapolations(df)
    

### Write for widgets and downloads

In [64]:
# simple function to determine whether a cell is NaN
def isNaN(num):
    """Return ``num != num``, which is True for a float NaN.

    NaN is the only float that compares unequal to itself; ordinary
    numbers, strings and ``None`` give False.
    """
    differs_from_itself = num != num
    return differs_from_itself


# frame for qgrid: keep only the rows whose ``city`` value is not NaN
filtered_df = df[[not isNaN(city) for city in df.city]]

# choose to filter with zillow or not
#filtered_df = df[[i < 15772 for i in df.SizeRank.values]]

# columns exposed in the qgrid frame, in display order
my_qgrid = filtered_df[[
    'zip',
    'primary_city',
    'state',
    'current_active_listings',
    'rooms',
    'current_active_listings_city',
    'expected_avg_monthly_revenue',
    'avg_monthly_revenue',
    'seasonality_monthly_revenue',
    'rent_to_rent',
    'rent_to_rent_expected',
    'distance_from_Eli',
    'MedianRentalPrice_AllHomes2019-06',
    'ZriPerSqft_AllHomes2019-06',
    'SizeRank',
    'Hotels (except casino hotels) and motels',
]]

# write the same frame once as a spreadsheet and once as a pickle
my_qgrid.to_excel('../process/data/df.xlsx', index=True)
my_qgrid.to_pickle('../process/data/df.pkl')
