In [1]:
import pandas as pd
import numpy as np
import plotly.plotly as py
import plotly.graph_objs as go
import statsmodels.api as sm

In [2]:
df = pd.concat([pd.read_csv('dataset/gfm/memorial_df.csv'),pd.read_csv('dataset/gfm/animal_df.csv'),pd.read_csv('dataset/gfm/business_df.csv'),pd.read_csv('dataset/gfm/family_df.csv'),pd.read_csv('dataset/gfm/medical_df.csv'),pd.read_csv('dataset/gfm/newlywed_df.csv')],ignore_index=True)

In [3]:
def get_state(location):
    try:
        return location[-2:]
    except:
        return np.nan

In [4]:
df['State'] = df.apply(lambda row: get_state(row.Location), axis=1)

In [5]:
def get_mean(location):
    if type(location)==str:
        location = location.upper()
        if location in city_list:
            result = city_df.loc[location,'Mean']
            if type(result)==np.int64:
                return result
            else:
                candidate = result.iloc[0]
                for i in result:
                    if i>0:
                        if i<candidate:
                            candidate = i
                return candidate
        else:
            return np.nan

In [6]:
def get_location(location):
    if type(location)==str:
        location = location.upper()
        if location in city_list:
            latitude = city_df.loc[location,"Lat"]
            if type(latitude)!=np.float64:
                latitude = latitude.iloc[0]
            longitude = city_df.loc[location,"Lon"]
            if type(longitude)!=np.float64:
                longitude = longitude.iloc[0]
            return (latitude,longitude)
        else:
            return np.nan
    else:
        return np.nan

In [7]:
def get_latitude(coords):
    try:
        return coords[0]
    except:
        return None

In [8]:
def get_longitude(coords):
    try:
        return coords[1]
    except:
        return None

In [8]:
def save_obj(obj, name):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

In [9]:
def load_obj(name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [9]:
geo_df = pd.read_csv('dataset/kaggle_income.csv',encoding='latin-1')

In [10]:
geo_df['Location'] = geo_df.apply(lambda row: (row.City+', '+row.State_ab).upper(),axis=1)

In [11]:
city_df = geo_df.sort_values(by=['Location'])

In [12]:
city_list = list(geo_df['Location'])

In [13]:
city_df = city_df.set_index(['Location'])

In [14]:
df['Mean'] = df.apply(lambda row: get_mean(row.Location),axis=1)

In [15]:
def get_class(income):
    if income<46000:
        return 'lower'
    if income<135000:
        return 'middle'
    else:
        return 'upper'

In [24]:
df['Class'] = df.apply(lambda row: get_class(row.Mean),axis=1)

In [25]:
df["Coords"] = df.apply(lambda row: get_location(row.Location), axis=1)

In [26]:
def pass_fail(funds,goal):
    try:
        goal = float(goal.replace(',',''))
        funds = float(funds)
    except:
        return np.nan
    if funds>goal:
        return True
    return False

In [27]:
df["Pass"] = df.apply(lambda row: pass_fail(row.Amount_Raised,row.Goal),axis=1)

In [21]:
map_file = open('mapbox.txt','r').readlines()

In [23]:
mapbox_access_token = map_file[0] #read from a file if posting on github

In [28]:
def goal_percent(funds,goal):
    try:
        goal = float(goal.replace(',',''))
        funds = float(funds)
    except:
        return np.nan
    return funds/goal

In [29]:
df['Goal_Percent'] = df.apply(lambda row: goal_percent(row.Amount_Raised,row.Goal),axis=1)

In [30]:
df['Latitude'] = df.apply(lambda row: get_latitude(row.Coords),axis=1)

In [31]:
df['Longitude'] = df.apply(lambda row: get_longitude(row.Coords),axis=1)

In [34]:
df['Goal_Scale'] = df.apply(lambda row: min(row.Goal_Percent*100,300),axis=1)

In [35]:
df_drop = df[~df['Coords'].isna()]

In [36]:
df_drop = df_drop[~df_drop['Goal_Percent'].isna()]

In [38]:
scl = [0, 'rgb(139,0,0)'],[0.33, 'rgb(255,255,0)'],[0.5,'rgb(0,255,0)'],[1,'rgb(0,100,0)']

In [39]:
data = [go.Scattergeo(lat=df_drop['Latitude'],lon=df_drop['Longitude'],text=df_drop['Title'],
                     marker = dict(color = df_drop['Goal_Scale'],colorscale=scl,cauto=False,cmin=0,cmax=300,opacity=0.7,size=10,colorbar=dict(
                     thickness = 10,titleside = "right",outlinecolor = "rgba(68, 68, 68, 0)",ticks = "outside",ticklen = 3,showticksuffix = "last",ticksuffix = " Percent",dtick = 20)))]
layout = dict(
    geo = dict(
        scope = 'usa',
        showland = True,
        landcolor = "rgb(212, 212, 212)",
        subunitcolor = "rgb(255, 255, 255)",
        countrycolor = "rgb(255, 255, 255)",
        showlakes = True,
        lakecolor = "rgb(255, 255, 255)",
        showsubunits = True,
        showcountries = True,
        resolution = 50,
        projection = dict(
            type = 'albers usa',
            rotation = dict(
                lon = -100
            )
        ),
        lonaxis = dict(
            showgrid = True,
            gridwidth = 0.5,
            range= [ -140.0, -55.0 ],
            dtick = 5
        ),
        lataxis = dict (
            showgrid = True,
            gridwidth = 0.5,
            range= [ 20.0, 60.0 ],
            dtick = 5
        )
    ),
    title = 'Heat_test',
)

In [40]:
fig = go.Figure(data=data, layout=layout )
py.iplot(fig, filename='Heat_test')


Consider using IPython.display.IFrame instead



In [41]:
def draw_map(df):
    data = [go.Scattergeo(lat=df['Latitude'],lon=df['Longitude'],text=df['Title'],
                     marker = dict(color = df['Goal_Scale'],colorscale=scl,cauto=False,cmin=0,cmax=300,opacity=0.7,size=10,colorbar=dict(
                     thickness = 10,titleside = "right",outlinecolor = "rgba(68, 68, 68, 0)",ticks = "outside",ticklen = 3,showticksuffix = "last",ticksuffix = " Percent",dtick = 20)))]
    layout = dict(
        geo = dict(
            scope = 'usa',
            showland = True,
            landcolor = "rgb(212, 212, 212)",
            subunitcolor = "rgb(255, 255, 255)",
            countrycolor = "rgb(255, 255, 255)",
            showlakes = True,
            lakecolor = "rgb(255, 255, 255)",
            showsubunits = True,
            showcountries = True,
            resolution = 50,
            projection = dict(
                type = 'albers usa',
                rotation = dict(
                    lon = -100
                )
            ),
            lonaxis = dict(
                showgrid = True,
                gridwidth = 0.5,
                range= [ -140.0, -55.0 ],
                dtick = 5
            ),
            lataxis = dict (
                showgrid = True,
                gridwidth = 0.5,
                range= [ 20.0, 60.0 ],
                dtick = 5
            )
        ),
        title = 'Heat_test',
    
    )
    fig = go.Figure(data=data, layout=layout )
    return fig

In [48]:
medical_df = df[df["Category"]=='Medical'].copy()

In [49]:
def to_string(amount):
    try:
        amount = float(amount.replace(',',''))
    except:
        if 'M' in amount:
            amount = amount[0:-1]
            amount = float(amount)*1000000
    return amount

In [50]:
medical_df['Goal'] = medical_df.apply(lambda row: to_string(row.Goal),axis=1)

In [51]:
medical_df['Goal_Percent'] = medical_df.apply(lambda row: row.Amount_Raised/row.Goal,axis=1)

In [52]:
medical_df['Pass'] = medical_df.apply(lambda row: True if row.Amount_Raised>row.Goal else False,axis=1)

In [53]:
medical_df['Pass80'] = medical_df.apply(lambda row: True if row.Goal_Percent>0.8 else False,axis=1)

In [54]:
medical_df_drop = medical_df[~medical_df['Mean'].isna()]

In [55]:
medical_df_drop = medical_df_drop[medical_df_drop['Goal_Percent']<10]

In [57]:
X = medical_df_drop['Mean']

In [58]:
Y = medical_df_drop['Goal_Percent']

In [59]:
X_s = sm.add_constant(X)

In [60]:
Y_binary = medical_df_drop['Pass']

In [61]:
Y_binary2 = medical_df_drop['Pass80']

In [62]:
model = sm.OLS(Y,X_s)

In [63]:
results = model.fit()

In [64]:
results.params

const    8.597607e-01
Mean     7.623895e-07
dtype: float64

In [65]:
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:           Goal_Percent   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     1.110
Date:                Fri, 31 May 2019   Prob (F-statistic):              0.292
Time:                        15:35:35   Log-Likelihood:                -1013.6
No. Observations:                 904   AIC:                             2031.
Df Residuals:                     902   BIC:                             2041.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.8598      0.045     19.057      0.0

In [66]:
model2 = sm.Logit(Y_binary,X_s)

In [67]:
results2 = model2.fit()

Optimization terminated successfully.
         Current function value: 0.609819
         Iterations 5


In [68]:
results2.summary()

0,1,2,3
Dep. Variable:,Pass,No. Observations:,904.0
Model:,Logit,Df Residuals:,902.0
Method:,MLE,Df Model:,1.0
Date:,"Fri, 31 May 2019",Pseudo R-squ.:,0.001405
Time:,15:35:41,Log-Likelihood:,-551.28
converged:,True,LL-Null:,-552.05
,,LLR p-value:,0.213

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.9866,0.133,-7.397,0.000,-1.248,-0.725
Mean,2.621e-06,2.1e-06,1.250,0.211,-1.49e-06,6.73e-06


In [69]:
model3 = sm.Logit(Y_binary2,X_s)

In [70]:
results3 = model3.fit()

Optimization terminated successfully.
         Current function value: 0.691653
         Iterations 4


In [71]:
results3.summary()

0,1,2,3
Dep. Variable:,Pass80,No. Observations:,904.0
Model:,Logit,Df Residuals:,902.0
Method:,MLE,Df Model:,1.0
Date:,"Fri, 31 May 2019",Pseudo R-squ.:,0.002028
Time:,15:35:50,Log-Likelihood:,-625.25
converged:,True,LL-Null:,-626.53
,,LLR p-value:,0.1109

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.1356,0.122,-1.115,0.265,-0.374,0.103
Mean,3.112e-06,1.96e-06,1.590,0.112,-7.23e-07,6.95e-06


In [72]:
lower_df = medical_df_drop[medical_df_drop['Class']=='lower']

In [73]:
middle_df = medical_df_drop[medical_df_drop['Class']=='middle']

In [74]:
tresult = sm.stats.ttest_ind(middle_df['Mean'], lower_df['Mean'],usevar='unequal')

In [75]:
tresult

(45.04055543251784, 1.5621597027671235e-186, 549.1306447278355)

In [76]:
dummy_region_df = medical_df_drop[medical_df_drop['State']!='NY']

In [77]:
dummy_region_df = dummy_region_df[dummy_region_df['State']!='CA']

In [78]:
dummy_region_df = dummy_region_df[dummy_region_df['State']!='VA']

In [79]:
dummy_region_df = dummy_region_df[dummy_region_df['State']!='DC']

In [80]:
dummy_region_df = dummy_region_df[dummy_region_df['State']!='FL']

In [81]:
dummy_region_df = dummy_region_df[dummy_region_df['State']!='WA']

In [82]:
dummy_region_df = dummy_region_df[dummy_region_df['State']!='TX']

In [83]:
city_region_df = medical_df_drop[~medical_df_drop.index.isin(dummy_region_df.index)]

In [84]:
tresult2 = sm.stats.ttest_ind(dummy_region_df['Mean'], city_region_df['Mean'],usevar='unequal')

In [85]:
tresult2

(4.77781135422645, 2.067858327491998e-06, 901.998137511994)

In [86]:
chi_test = sm.stats.proportions_chisquare([city_region_df['Pass'].sum(),dummy_region_df['Pass'].sum()],[len(city_region_df),len(dummy_region_df)])

In [87]:
chi_test

(11.885084558873519, 0.0005658525653910414, (array([[104, 322],
         [167, 311]], dtype=int64), array([[127.70575221, 298.29424779],
         [143.29424779, 334.70575221]])))

In [88]:
chi_test2 = sm.stats.proportions_chisquare([lower_df['Pass'].sum(),middle_df['Pass'].sum()],[len(lower_df),len(middle_df)])

In [89]:
chi_test2

(0.017751971928947254, 0.8940063245755203, (array([[139, 330],
         [125, 291]], dtype=int64), array([[139.90508475, 329.09491525],
         [124.09491525, 291.90508475]])))

In [90]:
chi_test3 = sm.stats.proportions_chisquare([city_region_df['Pass80'].sum(),dummy_region_df['Pass80'].sum()],[len(city_region_df),len(dummy_region_df)])

In [91]:
chi_test3

(13.753517049508789, 0.00020843020097267186, (array([[188, 238],
         [270, 208]], dtype=int64), array([[215.82743363, 210.17256637],
         [242.17256637, 235.82743363]])))

In [92]:
chi_test4 = sm.stats.proportions_chisquare([lower_df['Pass80'].sum(),middle_df['Pass80'].sum()],[len(lower_df),len(middle_df)])

In [93]:
chi_test4

(0.4223963531228118, 0.5157437262228715, (array([[231, 238],
         [214, 202]], dtype=int64), array([[235.82485876, 233.17514124],
         [209.17514124, 206.82485876]])))

In [131]:
medical_df_drop.to_csv('medical_df_final.csv')

In [103]:
ny_df = medical_df_drop[medical_df_drop['State']=='NY']

data = [go.Scattermapbox(lat=ny_df['Latitude'],lon=ny_df['Longitude'],text=ny_df['Title'],mode='markers',marker=dict(color = ny_df['Goal_Scale'],colorscale=scl,cauto=False,cmin=0,cmax=300,size=10,opacity=.8))]

layout = go.Layout(autosize=False,mapbox=dict(accesstoken=mapbox_access_token,bearing=10,pitch=0,zoom=5,center=dict(lat=ny_df['Latitude'].mean(),lon=ny_df['Longitude'].mean())),width=900,height=600,title="Test NY")

fig = dict(data=data,layout=layout)
py.iplot(fig)


Consider using IPython.display.IFrame instead



In [114]:
cal_df = medical_df_drop[medical_df_drop['State']=='CA']

data = [go.Scattermapbox(lat=cal_df['Latitude'],lon=cal_df['Longitude'],text=cal_df['Title'],mode='markers',marker=dict(color = cal_df['Goal_Scale'],colorscale=scl,cauto=False,cmin=0,cmax=300,size=10,opacity=.8))]

layout = go.Layout(autosize=False,mapbox=dict(accesstoken=mapbox_access_token,bearing=10,pitch=0,zoom=5,center=dict(lat=cal_df['Latitude'].mean(),lon=cal_df['Longitude'].mean())),width=900,height=600,title="Test California")

fig = dict(data=data,layout=layout)
py.iplot(fig)


Consider using IPython.display.IFrame instead



In [107]:
texas_df = medical_df_drop[medical_df_drop['State']=='TX']

data = [go.Scattermapbox(lat=texas_df['Latitude'],lon=texas_df['Longitude'],text=texas_df['Title'],mode='markers',marker=dict(color = texas_df['Goal_Scale'],colorscale=scl,cauto=False,cmin=0,cmax=300,size=10,opacity=.8))]

layout = go.Layout(autosize=False,mapbox=dict(accesstoken=mapbox_access_token,bearing=10,pitch=0,zoom=5,center=dict(lat=texas_df['Latitude'].mean(),lon=texas_df['Longitude'].mean())),width=900,height=600,title="Test Texas")

fig = dict(data=data,layout=layout)
py.iplot(fig)


Consider using IPython.display.IFrame instead



In [109]:
dmv_df = medical_df_drop[(medical_df_drop["State"]=='VA')|(medical_df_drop["State"]=='DC')|(medical_df_drop['State']=='MD')]

data = [go.Scattermapbox(lat=dmv_df['Latitude'],lon=dmv_df['Longitude'],text=dmv_df['Title'],mode='markers',marker=dict(color = dmv_df['Goal_Scale'],colorscale=scl,cauto=False,cmin=0,cmax=300,size=10,opacity=.8))]

layout = go.Layout(autosize=False,mapbox=dict(accesstoken=mapbox_access_token,bearing=10,pitch=0,zoom=5,center=dict(lat=dmv_df['Latitude'].mean(),lon=dmv_df['Longitude'].mean())),width=900,height=600,title="Test DMV")

fig = dict(data=data,layout=layout)
py.iplot(fig)


Consider using IPython.display.IFrame instead



In [115]:
fl_df = medical_df_drop[medical_df_drop["State"]=='FL']
data = [go.Scattermapbox(lat=fl_df['Latitude'],lon=fl_df['Longitude'],text=fl_df['Title'],mode='markers',marker=dict(color = fl_df['Goal_Scale'],colorscale=scl,cauto=False,cmin=0,cmax=300,size=10,opacity=.8))]

layout = go.Layout(autosize=False,mapbox=dict(accesstoken=mapbox_access_token,bearing=10,pitch=0,zoom=5,center=dict(lat=fl_df['Latitude'].mean(),lon=fl_df['Longitude'].mean())),width=900,height=600,title="Test Florida")

fig = dict(data=data,layout=layout)
py.iplot(fig)


Consider using IPython.display.IFrame instead

