In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt

In [None]:
def Cohen_d(group1, group2):
    """Compute Cohen's d effect size for the difference between two groups.

    The difference of the group means is divided by a pooled standard
    deviation. The pooled variance is the size-weighted average of each
    group's variance as returned by ``np.var`` (called with its default
    arguments, i.e. ``ddof=0``).

    Parameters
    ----------
    group1 : Series or NumPy array
    group2 : Series or NumPy array

    Returns
    -------
    float
        Positive when ``group1`` has the larger mean, negative otherwise.
    """
    size1 = len(group1)
    size2 = len(group2)

    mean_gap = np.mean(group1) - np.mean(group2)

    # Size-weighted sum of the two group variances, then normalised by the
    # combined number of observations to get the pooled variance.
    weighted_var_sum = size1 * np.var(group1) + size2 * np.var(group2)
    pooled_std = np.sqrt(weighted_var_sum / (size1 + size2))

    return mean_gap / pooled_std

## Import Data

In [2]:
import data_cleaning

In [3]:
# Build the analysis DataFrame using the project-local data_cleaning module.
clean_data = data_cleaning.full_clean()

In [4]:
# Inspect the schema: column names, dtypes and non-null counts.
clean_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 197164 entries, 0 to 197282
Data columns (total 11 columns):
average_covered_charges      197164 non-null float64
average_medicare_payments    197164 non-null float64
average_total_payments       197164 non-null float64
drg_definition               197164 non-null object
provider_name                197164 non-null object
total_discharges             197164 non-null int64
full_address                 197164 non-null object
geoid                        197164 non-null object
csi                          197164 non-null object
out_of_pocket                197164 non-null float64
perc_covered                 197164 non-null float64
dtypes: float64(5), int64(1), object(5)
memory usage: 18.1+ MB


In [5]:
# Number of rows with a missing value in the 'csi' column.
int(clean_data['csi'].isna().sum())

0

## Preliminary Analysis 

### Total Hospital Charges

In [None]:
# Covered charges for the rows whose csi code is '1' (the group this
# notebook treats as "metro"); row mask and column are selected in one .loc.
metro_charges = clean_data.loc[clean_data['csi'].eq('1'), 'average_covered_charges']

In [None]:
# Covered charges for every row whose csi code is anything other than '1'
# (the group this notebook treats as "non-metro").
non_metro_charges = clean_data.loc[clean_data['csi'].ne('1'), 'average_covered_charges']

In [None]:
# Overlay the raw charge distributions of both groups on one set of axes.
fig, ax = plt.subplots()
for charges, group_label in ((metro_charges, 'metro'), (non_metro_charges, 'non_metro')):
    sns.distplot(charges, label=group_label, ax=ax)
ax.legend()
# Restrict the visible x-range to 0-500,000.
ax.set_xlim(left=0, right=500000)

In [None]:
# Two-sample t-test on the raw charges; equal_var=False selects Welch's
# variant, which does not assume the two groups share a variance.
st.ttest_ind(metro_charges, non_metro_charges, equal_var=False)

In [None]:
# Bootstrap the sampling distribution of the mean metro charge:
# 100,000 resamples of 100 charges each, drawn with replacement.
# A dedicated RandomState with a fixed seed makes the cell reproducible on
# re-run, and a single vectorised draw replaces the 100,000-iteration Python
# loop (the intermediate 100000 x 100 array is about 80 MB for float64 data).
metro_means = (
    np.random.RandomState(0)
    .choice(np.asarray(metro_charges), size=(100000, 100))
    .mean(axis=1)
    .tolist()  # keep the original list type for downstream cells
)

In [None]:
# Bootstrap the sampling distribution of the mean non-metro charge:
# 100,000 resamples of 100 charges each, drawn with replacement.
# A dedicated RandomState with a fixed seed makes the cell reproducible on
# re-run, and a single vectorised draw replaces the 100,000-iteration Python
# loop (the intermediate 100000 x 100 array is about 80 MB for float64 data).
non_metro_means = (
    np.random.RandomState(1)
    .choice(np.asarray(non_metro_charges), size=(100000, 100))
    .mean(axis=1)
    .tolist()  # keep the original list type for downstream cells
)

In [None]:
# Compare the bootstrapped sampling distributions of the mean charge.
sns.distplot(metro_means, label='metro')
sns.distplot(non_metro_means, label='non_metro')
plt.legend()
# plt.xlabel() is the call that labels the axis; assigning to `plt.x_label`
# only sets an unused attribute on the pyplot module and draws nothing.
plt.xlabel('Average Hospital Charges');

In [None]:
# Effect size (Cohen's d) of the metro vs. non-metro difference in raw charges.
Cohen_d(metro_charges, non_metro_charges)

### Patient Deductible/Charges

In [None]:
# Patient-paid amount for the rows whose csi code is '1' ("metro").
# The cleaned frame has no 'CSI' or 'deductible' column (both raised
# KeyError); its columns are lower-case and include 'csi' and 'out_of_pocket'.
# NOTE(review): 'out_of_pocket' is assumed to be the column formerly called
# 'deductible' - confirm against data_cleaning.full_clean().
metro_deductible = clean_data.loc[clean_data['csi'] == '1', 'out_of_pocket']

In [None]:
# Patient-paid amount for the rows whose csi code is not '1' ("non-metro").
# The cleaned frame has no 'CSI' or 'deductible' column (both raised
# KeyError); its columns are lower-case and include 'csi' and 'out_of_pocket'.
# NOTE(review): 'out_of_pocket' is assumed to be the column formerly called
# 'deductible' - confirm against data_cleaning.full_clean().
non_metro_deductible = clean_data.loc[clean_data['csi'] != '1', 'out_of_pocket']

In [None]:
# Overlay the raw deductible distributions of both groups on one set of axes.
fig, ax = plt.subplots()
for amounts, group_label in ((metro_deductible, 'metro'), (non_metro_deductible, 'non_metro')):
    sns.distplot(amounts, label=group_label, ax=ax)
ax.legend()

In [None]:
# Bootstrap the sampling distribution of the mean metro deductible:
# 100,000 resamples of 100 values each, drawn with replacement.
# A dedicated RandomState with a fixed seed makes the cell reproducible on
# re-run, and a single vectorised draw replaces the 100,000-iteration Python
# loop (at the cost of one intermediate 100000 x 100 array).
metro_deductible_means = (
    np.random.RandomState(2)
    .choice(np.asarray(metro_deductible), size=(100000, 100))
    .mean(axis=1)
    .tolist()  # keep the original list type for downstream cells
)

In [None]:
# Bootstrap the sampling distribution of the mean non-metro deductible:
# 100,000 resamples of 100 values each, drawn with replacement.
# A dedicated RandomState with a fixed seed makes the cell reproducible on
# re-run, and a single vectorised draw replaces the 100,000-iteration Python
# loop (at the cost of one intermediate 100000 x 100 array).
non_metro_deductible_means = (
    np.random.RandomState(3)
    .choice(np.asarray(non_metro_deductible), size=(100000, 100))
    .mean(axis=1)
    .tolist()  # keep the original list type for downstream cells
)

In [None]:
# Welch's t-test (equal_var=False) on the observed deductibles, matching the
# test run on raw charges in the "Total Hospital Charges" section.
# Testing the bootstrap means instead would make the sample size equal to
# the arbitrary number of resamples, so the p-value would say nothing about
# the data.
st.ttest_ind(metro_deductible, non_metro_deductible, equal_var=False)

In [None]:
# Compare the bootstrapped sampling distributions of the mean deductible.
sns.distplot(metro_deductible_means, label='metro')
sns.distplot(non_metro_deductible_means, label='non_metro')
plt.legend()
# plt.xlabel() is the call that labels the axis; assigning to `plt.x_label`
# only sets an unused attribute on the pyplot module and draws nothing.
plt.xlabel('Deductible');

In [None]:
# Effect size (Cohen's d) on the observed deductibles, matching the raw-data
# call in the "Total Hospital Charges" section. Means of 100-value resamples
# vary far less than individual observations, so computing d on the
# bootstrap means would overstate the effect.
Cohen_d(metro_deductible, non_metro_deductible)