In [1]:
import pandas as pd
import requests 
import json
import seaborn as sns
import numpy as np
from time import sleep

In [2]:
def Cohen_d(group1, group2):
    """Compute Cohen's d effect size between two groups.

    Parameters
    ----------
    group1 : Series, NumPy array or other sequence of numbers
    group2 : Series, NumPy array or other sequence of numbers

    Returns
    -------
    float
        ``(mean(group1) - mean(group2)) / sqrt(pooled variance)``.

    Notes
    -----
    The pooled variance is the size-weighted average of the two ``np.var``
    results (NumPy's default, i.e. ``ddof=0``) and is divided by ``n1 + n2``,
    not by ``n1 + n2 - 2``.
    """
    # Difference between the two group means (numerator of d).
    mean_gap = np.mean(group1) - np.mean(group2)

    size1 = len(group1)
    size2 = len(group2)

    # Pooled variance: each group's variance weighted by its sample size.
    weighted_var_total = size1 * np.var(group1) + size2 * np.var(group2)
    pooled_std = np.sqrt(weighted_var_total / (size1 + size2))

    return mean_gap / pooled_std

In [3]:
import censusgeocode as cg
import re

In [4]:
# Load the hospital cost table from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if its origin is trusted.
hosp_cost_data = pd.read_pickle('./data/hosp_cost_data')

In [5]:
# Store ZIP codes as text so they can be compared with the string ZCTA5 codes
# read from the relationship file. Left-pad to five characters: if the column
# was stored as integers, ZIPs with leading zeros would otherwise become e.g.
# "2134" and never match "02134". Strings that are already five or more
# characters long are left unchanged by zfill.
hosp_cost_data['provider_zip_code'] = (
    hosp_cost_data['provider_zip_code'].astype(str).str.zfill(5)
)

In [6]:
address_columns = ['provider_city', 'provider_state', 'provider_street_address', 'provider_zip_code']

# Name each column instead of indexing the list by position.
city_col, state_col, street_col, zip_col = address_columns

# "<street>, <city>, <state>" -- the ZIP code column is not part of the string.
hosp_cost_data['full_address'] = (
    hosp_cost_data[street_col]
    + ', ' + hosp_cost_data[city_col]
    + ', ' + hosp_cost_data[state_col]
)

In [11]:
# Preview the first five assembled address strings as a plain list.
hosp_cost_data['full_address'].head(5).tolist()

['1108 ROSS CLARK CIRCLE, DOTHAN, AL',
 '2505 U S HIGHWAY 431 NORTH, BOAZ, AL',
 '205 MARENGO STREET, FLORENCE, AL',
 '50 MEDICAL PARK EAST DRIVE, BIRMINGHAM, AL',
 '1000 FIRST STREET NORTH, ALABASTER, AL']

In [10]:
# Read the census-tract workbook; dtype=str keeps every column as text.
census_tract = pd.read_excel('./data/censustract-00-10.xlsx', dtype=str)

In [None]:
# Make sure ZIP codes are five-character strings before they are matched
# against the ZCTA5 codes (which are read as text). Plain astype(str) turns an
# integer ZIP such as 2134 into "2134", which can never equal "02134"; zfill
# restores the leading zeros and leaves longer strings untouched.
hosp_cost_data['provider_zip_code'] = (
    hosp_cost_data['provider_zip_code'].astype(str).str.zfill(5)
)

In [None]:
# Read the ZCTA-to-CBSA relationship file; dtype=str keeps every column as
# text, so codes such as ZCTA5 are not parsed as numbers.
zip_relationship = pd.read_csv('./data/zcta_cbsa_rel_10.txt', dtype=str)

In [None]:
# ZPOPPCT was read as text (the file is loaded with dtype=str); make it
# numeric so it can be compared against thresholds.
zip_relationship['ZPOPPCT'] = zip_relationship['ZPOPPCT'].astype(float)

In [None]:
# Number of relationship rows whose ZPOPPCT exceeds 90.
int((zip_relationship['ZPOPPCT'] > 90).sum())

In [None]:
# Distinct ZCTA5 codes in the relationship file.
# A ZPOPPCT > 50 filter was sketched here but is disabled, so despite the
# name no size/percentage threshold is applied.
large = zip_relationship.ZCTA5.unique()

In [None]:
# Every distinct ZCTA5 present in the ZCTA-to-CBSA relationship file.
# NOTE(review): the MEMI filter below is disabled, so any ZIP listed in the
# file counts as "metro" regardless of its MEMI value -- confirm against the
# file's record layout which MEMI code (if any) should be applied.
# metro_zips = zip_relationship.loc[(zip_relationship['MEMI'] == '2')]['ZCTA5'].unique()
metro_zips = zip_relationship['ZCTA5'].unique()

# `x in metro_zips` on a NumPy array scans the whole array for every lookup;
# a set gives constant-time membership tests with the same result for strings.
_metro_zip_lookup = set(metro_zips)

# Hospital ZIP codes that also appear in the relationship file, in the order
# they first occur in the hospital data.
metro_hosp_zips = [
    zip_code
    for zip_code in hosp_cost_data.provider_zip_code.unique()
    if zip_code in _metro_zip_lookup
]

In [None]:
# Inspect the array of ZCTA5 codes treated as "metro".
metro_zips

In [None]:
# Inspect the distinct hospital ZIP codes for comparison with metro_zips.
hosp_cost_data['provider_zip_code'].unique()

In [None]:
# Inspect the hospital ZIP codes that were found in metro_zips.
metro_hosp_zips

In [None]:
# How many distinct hospital ZIP codes there are in total.
len(hosp_cost_data['provider_zip_code'].unique())

In [None]:
# Flag each row: 1 if its ZIP code appears in metro_zips, otherwise 0.
# isin() performs the membership test for the whole column at once instead of
# scanning the metro_zips array once per row inside a Python lambda.
hosp_cost_data['metro'] = (
    hosp_cost_data['provider_zip_code'].isin(metro_zips).astype('int64')
)

In [None]:
# coverage = average_medicare_payments_2 / average_covered_charges, per row.
# NOTE(review): presumably the share of billed charges paid by Medicare --
# confirm what the `_2` column holds in the source data.
hosp_cost_data['coverage'] = hosp_cost_data['average_medicare_payments_2'].div(
    hosp_cost_data['average_covered_charges']
)

In [None]:
# out_of_pocket = average_medicare_payments - average_medicare_payments_2, per row.
# NOTE(review): the two similarly named columns are assumed to be total
# payments and the Medicare-paid portion respectively -- verify.
hosp_cost_data['out_of_pocket'] = hosp_cost_data['average_medicare_payments'].sub(
    hosp_cost_data['average_medicare_payments_2']
)

In [None]:
# Show the rows that were not flagged as metro.
hosp_cost_data[hosp_cost_data['metro'].eq(0)]

In [None]:
# Working table for the comparison below. The per-procedure (DRG) filter in
# the trailing comment is disabled, so `procs` is simply another name for the
# full hosp_cost_data frame (not a copy).
procs = hosp_cost_data#.loc[hosp_cost_data['drg_definition'] == '039 - EXTRACRANIAL PROCEDURES W/O CC/MCC']

In [None]:
# `coverage` values for rows flagged metro == 0 (despite the "charges" name).
non_metro_charges = procs.loc[procs['metro'] == 0, 'coverage']

In [None]:
# `coverage` values for rows flagged metro == 1 (despite the "charges" name).
metro_charges = procs.loc[procs['metro'] == 1, 'coverage']

In [None]:
# Number of metro observations.
metro_charges.shape[0]

In [None]:
# Plot the distribution of `coverage` for the non-metro and the metro group;
# no axes are passed, so both calls draw on the current axes.
# NOTE(review): seaborn deprecated `distplot` in v0.11 in favour of
# `histplot` / `displot` -- consider migrating.
sns.distplot(non_metro_charges)
sns.distplot(metro_charges)

In [None]:
# Summary statistics of `coverage` for the metro group.
metro_charges.describe()

In [None]:
# Summary statistics of `coverage` for the non-metro group.
non_metro_charges.describe()

In [None]:
import scipy.stats as st
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Two-sample t-test on the raw coverage values; equal_var=False selects
# Welch's test, which does not assume equal variances in the two groups.
st.ttest_ind(metro_charges, non_metro_charges, equal_var=False)

In [None]:
# Difference between the group means, expressed as a fraction of the
# non-metro mean.
metro_avg = metro_charges.mean()
non_metro_avg = non_metro_charges.mean()
(metro_avg - non_metro_avg) / non_metro_avg

In [None]:
# Sampling distribution of the mean for the metro group: 100,000 resamples of
# 30 values each, drawn with replacement, then averaged per resample.
# All resamples are drawn in one vectorised call (one row per resample)
# instead of 100,000 separate Python-level calls.
# NOTE(review): no random seed is set, so the values differ between runs.
metro_means = list(
    np.random.choice(metro_charges, size=(100000, 30)).mean(axis=1)
)

In [None]:
# Sampling distribution of the mean for the non-metro group: 100,000 resamples
# of 30 values each, drawn with replacement, then averaged per resample.
# All resamples are drawn in one vectorised call (one row per resample)
# instead of 100,000 separate Python-level calls.
# NOTE(review): no random seed is set, so the values differ between runs.
non_metro_means = list(
    np.random.choice(non_metro_charges, size=(100000, 30)).mean(axis=1)
)

In [None]:
# Overlay the two distributions of resampled means; the `label` values feed
# the legend drawn by plt.legend().
# NOTE(review): seaborn deprecated `distplot` in v0.11 in favour of
# `histplot` / `displot` -- consider migrating.
sns.distplot(metro_means, label='Metro')
sns.distplot(non_metro_means, label='Non-Metro')
plt.legend()


In [None]:
# Welch's t-test (equal_var=False) on the resampled means.
# NOTE(review): the number of observations here is the number of resamples
# chosen in the cells above, not the number of hospitals/rows, so the p-value
# depends on that choice -- interpret with care.
st.ttest_ind(metro_means, non_metro_means, equal_var=False)

In [None]:
# Effect size between the two distributions of resampled means.
# NOTE(review): means of 30-value resamples vary less than the raw values, so
# this d is presumably larger than Cohen's d on the raw coverage data --
# confirm which one is meant to be reported.
Cohen_d(metro_means, non_metro_means)