In [191]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split

from pgmpy.estimators import HillClimbSearch, ExhaustiveSearch, BayesianEstimator
from pgmpy.estimators import ConstraintBasedEstimator, K2Score, BicScore, BDeuScore
from pgmpy.estimators import MaximumLikelihoodEstimator

from pgmpy.models import BayesianModel

np.random.seed(359)

In [192]:
def LL(x,model,verbose=False):
    loglike = 0
    for cpd in model.get_cpds():
        temp_cpd = cpd.copy()
        thevariable = temp_cpd.variable
        theparents = model.predecessors(thevariable)
        for parent in theparents:
            temp_cpd.reduce([(parent, x[parent])])
        if x[thevariable] < len(temp_cpd.get_values()): # I added this to stop it from failing
#             print("HERE", x, thevariable, temp_cpd.get_values())
            try:
                theprob = temp_cpd.get_values()[x[thevariable],0]
                if verbose:
                    print (thevariable,theparents,theprob)
                loglike += np.log(theprob)
            except:
                pass #print('Error',  x, thevariable)
    return loglike

In [193]:
def get_anomaly_ranks(data):

    for i in data.iloc[:,:]:
        data[i] = pd.cut(data[i], bins=10, labels=False)

    hc = HillClimbSearch(data, scoring_method = BicScore(data))
    bic_best_model = hc.estimate()
    best_edges = bic_best_model.edges()
    print("Edges: ")
    for edge in best_edges: 
        print(edge)

    model = BayesianModel( bic_best_model.edges() )
    model.fit(data, estimator=MaximumLikelihoodEstimator)
    exmp = data.apply(lambda x: LL(x, model), axis=1)
    exmp2=pd.Series(exmp)
    exmp2.index = data.index
    return exmp2

In [194]:
age_sex_data = pd.read_csv('../data/Demographics/Age_Sex/tract_age_sex_acs2018.csv')

age_sex_data['00_19'] = age_sex_data['00-05']+age_sex_data['05-09']+age_sex_data['10-14']+age_sex_data['15-19']
age_sex_data['20_34'] = age_sex_data['20-24']+age_sex_data['25-29']+age_sex_data['30-34']
age_sex_data['35_49'] = age_sex_data['35-39']+age_sex_data['40-44']+age_sex_data['45-49']
age_sex_data['50_64'] = age_sex_data['50-54']+age_sex_data['55-59']+age_sex_data['60-64']
age_sex_data['65_UP'] = age_sex_data['65-69']+age_sex_data['70-74']+age_sex_data['75-79']+age_sex_data['80-84']+age_sex_data['85-UP']

age_sex_data['Male'] = age_sex_data['Male'] / age_sex_data['Total']
age_sex_data['Female'] = age_sex_data['Female'] / age_sex_data['Total']
age_sex_data['00_19'] = age_sex_data['00_19'] / age_sex_data['Total']
age_sex_data['20_34'] = age_sex_data['20_34'] / age_sex_data['Total']
age_sex_data['35_49'] = age_sex_data['35_49'] / age_sex_data['Total']
age_sex_data['50_64'] = age_sex_data['50_64'] / age_sex_data['Total']
age_sex_data['65_UP'] = age_sex_data['65_UP'] / age_sex_data['Total']

age_sex_data = age_sex_data[['city', 'tract', 'county', 'BoroCTLbl', 'Total', # 'Male', 'Female',
                            '00_19', '20_34', '35_49', '50_64', '65_UP']]
age_sex_data = age_sex_data.replace(np.nan, 0.0)

age_sex_data.head()

Unnamed: 0,city,tract,county,BoroCTLbl,Total,00_19,20_34,35_49,50_64,65_UP
0,New York,1.0,Bronx,Bronx 1,7080,0.076412,0.449859,0.318503,0.148023,0.007203
1,New York,2.0,Bronx,Bronx 2,4542,0.247908,0.161823,0.199031,0.182078,0.209159
2,New York,4.0,Bronx,Bronx 4,5634,0.216365,0.22595,0.215122,0.216542,0.126021
3,New York,16.0,Bronx,Bronx 16,5917,0.275477,0.206693,0.168498,0.182187,0.167146
4,New York,19.0,Bronx,Bronx 19,2765,0.292586,0.318987,0.209403,0.151537,0.027486


In [195]:
income_data = pd.read_csv('../data/Demographics/income/household_income_acs2018.csv')

income_data['00-50k'] = income_data.loc[:,'HH 0-10k':'HH 35k-50k'].sum(axis=1)
income_data['50-100k'] = income_data.loc[:,'HH 50k-75k':'HH 75k-100k'].sum(axis=1)
income_data['100-150k'] = income_data['HH 100k-150k']
income_data['150-UP'] = income_data.loc[:,'HH 150k-200k':'HH 200k-UP'].sum(axis=1)

income_data.head()

Unnamed: 0,city,tract,county,BoroCTLbl,Households (HH) Count,HH 0-10k,HH 10k-15k,HH 15k-25k,HH 25k-35k,HH 35k-50k,...,HH 75k-100k,HH 100k-150k,HH 150k-200k,HH 200k-UP,Household Median Income,Household Mean Income,00-50k,50-100k,100-150k,150-UP
0,New York,1.0,Bronx,Bronx 1,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0
1,New York,2.0,Bronx,Bronx 2,1328,6.9,2.4,12.3,9.9,11.5,...,9.5,15.4,5.9,5.9,59914,72979,43.0,29.9,15.4,11.8
2,New York,4.0,Bronx,Bronx 4,1963,7.2,1.9,7.6,3.5,8.5,...,17.2,17.3,12.6,6.3,82073,94723,28.7,35.2,17.3,18.9
3,New York,16.0,Bronx,Bronx 16,1982,8.4,12.5,11.8,16.5,15.1,...,8.1,7.7,1.2,1.1,35802,59663,64.3,25.9,7.7,2.3
4,New York,19.0,Bronx,Bronx 19,929,13.7,10.3,6.7,7.2,18.2,...,12.4,10.7,0.0,1.6,42075,54415,56.1,31.7,10.7,1.6


In [196]:
race_data = pd.read_csv('../data/Demographics/Basic Count/tract_race_acs2018.csv')
race_data['White'] = race_data['White'] / race_data['Total']
race_data['Black'] = race_data['Black'] / race_data['Total']
race_data['Native'] = race_data['Native'] / race_data['Total']
race_data['Asian'] = race_data['Asian'] / race_data['Total']
race_data['Pacific Islander'] = race_data['Pacific Islander'] / race_data['Total']
race_data['Other'] = race_data['Other'] / race_data['Total']
race_data['Two or More'] = race_data['Two or More'] / race_data['Total']
race_data = race_data.replace(np.nan, 0.0)
race_data.head()

Unnamed: 0,city,tract,county,BoroCTLbl,Total,White,Black,Native,Asian,Pacific Islander,Other,Two or More
0,New York,429.02,Bronx,Bronx 429.02,4205,0.115339,0.329845,0.042568,0.047087,0.0,0.437337,0.027824
1,New York,330.0,Bronx,Bronx 330,5885,0.499745,0.152421,0.0,0.0,0.0,0.347833,0.0
2,New York,358.0,Bronx,Bronx 358,8054,0.096722,0.739757,0.0,0.022349,0.0,0.109635,0.031537
3,New York,371.0,Bronx,Bronx 371,4322,0.259602,0.341046,0.003008,0.004165,0.0,0.338501,0.053679
4,New York,385.0,Bronx,Bronx 385,4757,0.072314,0.319739,0.007358,0.0,0.0,0.561068,0.039521


In [197]:
internet_data = pd.read_csv("../data/ACS_Internet_Subscription/subscription_acs_2018.csv")
internet_data['Mobile_Dependent'] = internet_data['Mobile_Dependent'] / internet_data['Total']
internet_data['Wired_Broadband'] = internet_data['Wired_Broadband'] / internet_data['Total']
internet_data['No_Internet'] = internet_data['No_Internet'] / internet_data['Total']
internet_data = internet_data.replace(np.nan, 0.0)
internet_data.head()

Unnamed: 0,city,tract,county,BoroCTLbl,Total,Mobile_Dependent,Wired_Broadband,No_Internet
0,New York,429.02,Bronx,Bronx 429.02,1599,0.090056,0.626642,0.262039
1,New York,330.0,Bronx,Bronx 330,2129,0.042273,0.693753,0.19117
2,New York,358.0,Bronx,Bronx 358,2443,0.081867,0.714286,0.100287
3,New York,371.0,Bronx,Bronx 371,1739,0.054054,0.533065,0.361127
4,New York,385.0,Bronx,Bronx 385,1674,0.093787,0.525687,0.280765


In [198]:
fcc_data = pd.read_csv("../data/Fixed_Broadband_Deployment_Data__Jun__2019_Status_V1.csv")
fcc_data.head()

Unnamed: 0,Logical Record Number,Provider ID,FRN,Provider Name,DBA Name,Holding Company Name,Holding Company Number,Holding Company Final,State,Census Block FIPS Code,...,Consumer,Max Advertised Downstream Speed (mbps),Max Advertised Upstream Speed (mbps),Business,Max CIR Downstream Speed (mbps),Max CIR Upstream Speed (mbps),county_code,tract,block,boro
0,19376907,50820,4963088,"ViaSat, Inc.",Viasat Inc,"ViaSat, Inc.",290111,"ViaSat, Inc.",NY,360050001000001,...,1,35.0,3.0,1,0.0,0.0,36005,100,1,bronx
1,19376908,50820,4963088,"ViaSat, Inc.",Viasat Inc,"ViaSat, Inc.",290111,"ViaSat, Inc.",NY,360050001000002,...,1,35.0,3.0,1,0.0,0.0,36005,100,2,bronx
2,19376909,50820,4963088,"ViaSat, Inc.",Viasat Inc,"ViaSat, Inc.",290111,"ViaSat, Inc.",NY,360050001001000,...,1,35.0,3.0,1,0.0,0.0,36005,100,1000,bronx
3,59544342,52979,1568880,GCI Communication Corp.,GCI Communication Corp.,GCI Holdings LLC,130534,GCI Holdings LLC,NY,360050001001000,...,0,0.0,0.0,1,0.0,0.0,36005,100,1000,bronx
4,59881129,53153,12369286,"HNS License Sub, LLC",HughesNet,"Hughes Network Systems, LLC",130627,"Hughes Network Systems, LLC",NY,360050001001000,...,1,25.0,3.0,1,0.0,0.0,36005,100,1000,bronx


In [199]:
data = pd.merge(internet_data, 
                income_data[['BoroCTLbl','00-50k','50-100k','100-150k','150-UP']], 
                how='left', 
                on=['BoroCTLbl'])[[ 'Mobile_Dependent','Wired_Broadband','No_Internet',
                                    '00-50k', '50-100k', '100-150k','150-UP']] #.drop(columns=['Households (HH) Count'])
data.head()

Unnamed: 0,Mobile_Dependent,Wired_Broadband,No_Internet,00-50k,50-100k,100-150k,150-UP
0,0.090056,0.626642,0.262039,67.7,21.9,8.1,2.3
1,0.042273,0.693753,0.19117,67.4,23.1,6.5,2.9
2,0.081867,0.714286,0.100287,31.6,32.5,17.9,18.0
3,0.054054,0.533065,0.361127,66.0,24.1,5.7,4.3
4,0.093787,0.525687,0.280765,85.1,10.7,3.3,0.8


In [200]:
data.corr()

Unnamed: 0,Mobile_Dependent,Wired_Broadband,No_Internet,00-50k,50-100k,100-150k,150-UP
Mobile_Dependent,1.0,-0.205614,0.160272,0.250205,0.170289,-0.032095,-0.185596
Wired_Broadband,-0.205614,1.0,-0.459799,-0.281935,0.407483,0.56072,0.581876
No_Internet,0.160272,-0.459799,1.0,0.746643,-0.05413,-0.3822,-0.506771
00-50k,0.250205,-0.281935,0.746643,1.0,-0.103426,-0.523271,-0.644598
50-100k,0.170289,0.407483,-0.05413,-0.103426,1.0,0.229199,-0.125045
100-150k,-0.032095,0.56072,-0.3822,-0.523271,0.229199,1.0,0.398131
150-UP,-0.185596,0.581876,-0.506771,-0.644598,-0.125045,0.398131,1.0


In [201]:
anomaly_ranks = get_anomaly_ranks(data)

Edges: 
('Wired_Broadband', 'No_Internet')
('Wired_Broadband', 'Mobile_Dependent')
('00-50k', '150-UP')
('00-50k', 'Wired_Broadband')
('00-50k', '100-150k')
('00-50k', '50-100k')


  from ipykernel import kernelapp as app


In [202]:
data = pd.merge(internet_data, 
                race_data[['BoroCTLbl', 'White', 'Black', 'Native', 'Asian', 'Pacific Islander','Other','Two or More']], 
                how='left', 
                on=['BoroCTLbl'])[[ 'Mobile_Dependent','Wired_Broadband','No_Internet',
                                   'White','Black','Native','Asian','Pacific Islander','Other','Two or More']] #.drop(columns=['Households (HH) Count'])
data.head()

Unnamed: 0,Mobile_Dependent,Wired_Broadband,No_Internet,White,Black,Native,Asian,Pacific Islander,Other,Two or More
0,0.090056,0.626642,0.262039,0.115339,0.329845,0.042568,0.047087,0.0,0.437337,0.027824
1,0.042273,0.693753,0.19117,0.499745,0.152421,0.0,0.0,0.0,0.347833,0.0
2,0.081867,0.714286,0.100287,0.096722,0.739757,0.0,0.022349,0.0,0.109635,0.031537
3,0.054054,0.533065,0.361127,0.259602,0.341046,0.003008,0.004165,0.0,0.338501,0.053679
4,0.093787,0.525687,0.280765,0.072314,0.319739,0.007358,0.0,0.0,0.561068,0.039521


In [203]:
anomaly_ranks = get_anomaly_ranks(data)

Edges: 
('Wired_Broadband', 'No_Internet')
('Wired_Broadband', 'Mobile_Dependent')
('White', 'Asian')
('White', 'Wired_Broadband')
('Black', 'White')
('Black', 'Other')


In [204]:
data.corr()

Unnamed: 0,Mobile_Dependent,Wired_Broadband,No_Internet,White,Black,Native,Asian,Pacific Islander,Other,Two or More
Mobile_Dependent,1.0,-0.256809,0.123645,-0.305347,0.243367,0.011346,-0.007568,-0.004982,0.245944,-0.01139
Wired_Broadband,-0.256809,1.0,-0.507957,0.392862,-0.150972,-0.002635,0.162138,-0.004188,-0.217718,0.026967
No_Internet,0.123645,-0.507957,1.0,-0.212891,0.196678,0.015855,-0.092782,0.023945,0.269589,-0.030127
White,-0.305347,0.392862,-0.212891,1.0,-0.671236,-0.039098,-0.061904,-0.028648,-0.388218,-0.053255
Black,0.243367,-0.150972,0.196678,-0.671236,1.0,0.015489,-0.395649,0.021191,-0.04481,-0.065176
Native,0.011346,-0.002635,0.015855,-0.039098,0.015489,1.0,-0.032301,0.014203,0.025813,0.016634
Asian,-0.007568,0.162138,-0.092782,-0.061904,-0.395649,-0.032301,1.0,-0.020601,-0.128772,-0.024713
Pacific Islander,-0.004982,-0.004188,0.023945,-0.028648,0.021191,0.014203,-0.020601,1.0,0.023008,0.048534
Other,0.245944,-0.217718,0.269589,-0.388218,-0.04481,0.025813,-0.128772,0.023008,1.0,0.099428
Two or More,-0.01139,0.026967,-0.030127,-0.053255,-0.065176,0.016634,-0.024713,0.048534,0.099428,1.0


In [205]:
data = pd.merge(internet_data, 
                age_sex_data[['BoroCTLbl','00_19','20_34','35_49','50_64','65_UP']], 
                how='left', 
                on=['BoroCTLbl'])[['Mobile_Dependent','Wired_Broadband', 'No_Internet', #'Male', 'Female', 
                                   '00_19', '20_34','35_49', '50_64', '65_UP']]
data.head()

Unnamed: 0,Mobile_Dependent,Wired_Broadband,No_Internet,00_19,20_34,35_49,50_64,65_UP
0,0.090056,0.626642,0.262039,0.252081,0.251367,0.226159,0.160999,0.109394
1,0.042273,0.693753,0.19117,0.229227,0.214613,0.251317,0.207816,0.097026
2,0.081867,0.714286,0.100287,0.210206,0.268314,0.148373,0.237398,0.135709
3,0.054054,0.533065,0.361127,0.245488,0.21888,0.179778,0.184868,0.170986
4,0.093787,0.525687,0.280765,0.354635,0.25268,0.167543,0.127181,0.097961


In [206]:
anomaly_ranks = get_anomaly_ranks(data)

Edges: 
('Wired_Broadband', 'No_Internet')
('Wired_Broadband', 'Mobile_Dependent')
('Wired_Broadband', '00_19')
('Wired_Broadband', '35_49')
('00_19', '20_34')
('20_34', '50_64')
('20_34', '65_UP')


  from ipykernel import kernelapp as app


In [207]:
data.corr()

Unnamed: 0,Mobile_Dependent,Wired_Broadband,No_Internet,00_19,20_34,35_49,50_64,65_UP
Mobile_Dependent,1.0,-0.256809,0.123645,0.190664,0.031261,0.021475,0.061025,-0.084447
Wired_Broadband,-0.256809,1.0,-0.507957,-0.093553,0.24351,0.448132,0.293083,0.137944
No_Internet,0.123645,-0.507957,1.0,0.398644,-0.074675,-0.168767,-0.043007,0.036017
00_19,0.190664,-0.093553,0.398644,1.0,-0.15768,-0.025721,-0.122145,-0.276908
20_34,0.031261,0.24351,-0.074675,-0.15768,1.0,0.179417,-0.238034,-0.372243
35_49,0.021475,0.448132,-0.168767,-0.025721,0.179417,1.0,-0.022105,-0.145301
50_64,0.061025,0.293083,-0.043007,-0.122145,-0.238034,-0.022105,1.0,0.304733
65_UP,-0.084447,0.137944,0.036017,-0.276908,-0.372243,-0.145301,0.304733,1.0


Mobile_Dependent    1.280228
Wired_Broadband     6.513988
No_Internet         1.232812
00_19               2.871977
20_34               2.578473
35_49               3.055002
50_64               1.869606
65_UP               0.917971
dtype: float64