# Scraping Analysis

In [2]:
# Imports
import gmaps
import math
import numpy as np
import pandas as pd
import qgrid
import ipywidgets as widgets
from IPython.display import display

### Read Data

In [23]:
# Load the two scraped frames from their pickle files.
# NOTE(review): unpickling can execute arbitrary code; only load files from a
# trusted source.
scraped_zip_data = pd.read_pickle(
    '../housing_data/data/final_database_frames/zip_a_v1.pkl'
)
scraped_city_data = pd.read_pickle(
    '../housing_data/data/final_database_frames/city_v4.pkl'
)


def get_location_data(
        path='../housing_data/data/geographic_data/zip_code_database_cleaned.csv'):
    """Load the cleaned zip-code table and keep only the location columns.

    Args:
        path: CSV path or file-like object handed to ``pd.read_csv``.
            Defaults to the project's cleaned zip-code database.

    Returns:
        DataFrame with the columns ``state_city_zip``, ``latitude``,
        ``longitude``, ``state``, ``zip`` and ``primary_city``, in that order.

    Raises:
        KeyError: if any of those columns is missing from the file.
    """
    keep_columns = [
        'state_city_zip', 'latitude', 'longitude', 'state', 'zip',
        'primary_city'
    ]

    # Read ``zip`` through a str converter so it stays text; numeric parsing
    # would drop leading zeros.
    location_data = pd.read_csv(path, converters={'zip': str})

    return location_data.loc[:, keep_columns]


# Zip-code location lookup table.
location_data = get_location_data()

# Hotel/accommodation table (2012 census, per the file path).
hotel_data = pd.read_csv(
    '../housing_data/2012_census_hotel_accomodation/ECN_2012_US_72Z1_with_ann.csv'
)

# read zillow data
# NOTE(review): unpickling can execute arbitrary code; only load files from a
# trusted source.
zillow = pd.read_pickle(
    '../housing_data/data/final_database_frames/zillow_19-06.pkl'
)

  interactivity=interactivity, compiler=compiler, result=result)


### Merging locations and zillow

In [15]:
# Attach the location columns to the scraped rows. A left join keeps every
# scraped row even when no location matches its state_city_zip key.
# NOTE(review): `scraped_data` is not assigned in the cells visible here (only
# `scraped_zip_data` and `scraped_city_data` are) - confirm which frame is
# intended before re-running this cell on a fresh kernel.
scraped_data_locations = pd.merge(
    scraped_data,
    location_data,
    how='left',
    on='state_city_zip',
)

# Same left join on the same key, this time against the zillow frame.
df = pd.merge(scraped_data_locations, zillow, how='left', on='state_city_zip')

### Extrapolating columns

In [16]:
def get_history_extrapolations(my_history):
    """Summarise each history list by its mean and coefficient of variation.

    Args:
        my_history: iterable whose items are expected to be lists of numbers.
            An item that is not a list (e.g. NaN for a missing history) or
            that is an empty list yields NaN for both outputs.

    Returns:
        Tuple ``(avg, cv)`` of two lists aligned with ``my_history``.
        ``avg[k]`` is the mean of item ``k``; ``cv[k]`` is its population
        standard deviation divided by that mean. ``cv[k]`` is NaN when the
        mean is zero, because the ratio is undefined there.
    """
    avg = []
    cv = []
    for history in my_history:
        mean = np.nan
        variation = np.nan
        if isinstance(history, list) and history:
            mean = np.average(history)
            # Skip the division for a zero mean: it would be 0/0 (or x/0) and
            # only produce NaN/inf together with a NumPy RuntimeWarning.
            if mean != 0:
                variation = np.std(history) / mean
        avg.append(mean)
        cv.append(variation)
    return avg, cv


def add_extrapolated_revenue(df):
    """Add averaged, seasonality and per-room/per-guest revenue columns.

    The frame is modified in place and the same object is returned.

    Args:
        df: DataFrame holding the history columns ``monthly_revenue``,
            ``nightly_revenue`` and ``monthly_occupancy`` (per-row lists, as
            consumed by ``get_history_extrapolations``) together with
            ``rooms``, ``guests`` and ``ZriPerSqft_AllHomes2019-06``.

    Returns:
        ``df`` with these columns added: ``avg_<col>`` and
        ``seasonality_<col>`` for each history column,
        ``expected_avg_monthly_revenue``, ``revenue_per_room``,
        ``expected_revenue_per_room``, ``expected_revenue_per_guest``,
        ``rent_to_rent`` and ``rent_to_rent_expected``.
    """
    # now that info is finalized add the extrapolated revenue data
    history_columns = ['monthly_revenue', 'nightly_revenue', 'monthly_occupancy']
    for col_name in history_columns:
        avg, cv = get_history_extrapolations(df[col_name])
        # average of each row's history list
        df['avg_' + col_name] = avg
        # seasonality measure (std / mean of the history)
        df['seasonality_' + col_name] = cv

    # the multiply by 0.3 comes from 30 days / 100 for occupancy to become a
    # percent between 0-1
    df['expected_avg_monthly_revenue'] = (
        df['avg_nightly_revenue'] * df['avg_monthly_occupancy'] * 0.3)

    # divide revenue by rooms and guests to get average per room and per guest
    df['revenue_per_room'] = df['avg_monthly_revenue'] / df['rooms']
    df['expected_revenue_per_room'] = (
        df['expected_avg_monthly_revenue'] / df['rooms'])
    df['expected_revenue_per_guest'] = (
        df['expected_avg_monthly_revenue'] / df['guests'])

    # Both rent-to-rent ratios share one denominator: the per-sqft index
    # scaled by 1000. Compute it once.
    scaled_rent_index = df['ZriPerSqft_AllHomes2019-06'] * 1000
    df['rent_to_rent'] = df['revenue_per_room'] / scaled_rent_index
    df['rent_to_rent_expected'] = (
        df['expected_revenue_per_room'] / scaled_rent_index)

    return df


df = add_extrapolated_revenue(df)

### Write for widgets and downloads

In [36]:
# simple function to determine whether a cell is NaN
def isNaN(num):
    """Return the result of ``num != num``, which is True for a float NaN."""
    differs_from_itself = num != num
    return differs_from_itself


# filtered df for qgrid: keep rows that have a city, then keep rows whose
# SizeRank is below the cutoff.
# The rank filter is applied to the city-filtered frame; indexing `df` again
# here would silently discard the city filter.
# notna() treats None as missing as well as NaN.
filtered_df = df[df['city'].notna()]

# Rows with a missing SizeRank compare False and are dropped.
filtered_df = filtered_df[filtered_df['SizeRank'] < 15772]

# subset from df to use as qgrid frame
my_qgrid = filtered_df.loc[:, [
    # location
    'zip',
    'primary_city',
    'state',
    # listing and revenue figures
    'current_active_listings',
    'expected_avg_monthly_revenue',
    'avg_monthly_revenue',
    'avg_nightly_revenue',
    'avg_monthly_occupancy',
    'revenue_per_room',
    'expected_revenue_per_room',
    'seasonality_monthly_revenue',
    'rooms',
    'rent_to_rent',
    'rent_to_rent_expected',
    # market rent columns (presumably from the zillow merge - confirm)
    'MedianRentalPricePerSqft_3Bedroom2019-06',
    'MedianRentalPricePerSqft_2Bedroom2019-06',
    'MedianRentalPricePerSqft_1Bedroom2019-06',
    'MedianRentalPricePerSqft_Studio2019-06',
    'MedianRentalPricePerSqft_Sfr2019-06',
    'MedianRentalPrice_AllHomes2019-06',
    'ZriPerSqft_AllHomes2019-06',
    'SizeRank',
    'YoY',
]]


# Write the selection out twice: as an Excel sheet (index column included)
# and as a pickle.
my_qgrid.to_excel(
    '../process/data/df.xlsx',
    index=True,
)
my_qgrid.to_pickle(
    '../process/data/df.pkl',
)