In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
from time import time

In [3]:
gaz_zcta_years = [2000, 2010, 2012, 2013, 2014, 2015, 2016]
gaz_zcta = []
for y in gaz_zcta_years:
    fname = 'gaz_zcta' + str(y) + '.csv'
    gaz = pd.read_csv(fname, sep=',', header=(0), dtype={'GEOID':str, 'ALAND':float})
    gaz_zcta.append(gaz)

def which_gaz(year):
    test = [abs(year-zear) for zear in gaz_zcta_years]
    return gaz_zcta[test.index(min(test))]

In [107]:
class Year:
    def __init__(self, year):
        HOST = "http://api.census.gov/data"
        dataset = "zbp"
        self.year = year
        base_url = "/".join([HOST, str(year), dataset])

        predicates = {}
        get_vars = ["GEO_TTL", "GEO_ID", "EMP"]
        predicates["get"] = ",".join(get_vars)
        predicates["for"] = "zipcode:*"

        start = time()
        self.r = requests.get(base_url, params=predicates)
        print('***', self.year, '년 ZBP request 업데이트 완료! *** ...', time()-start, '[sec]')
    
    def process(self):
        col_names = ["GEO_TTL", "GEOID", "EMP", "ZIP"]
        self.zbp = pd.DataFrame(columns=col_names, data=self.r.json()[1:])
        self.zbp['EMP'] = self.zbp['EMP'].apply(pd.to_numeric)
        self.gaz = which_gaz(self.year)

        self.rho = pd.DataFrame(columns=['index_gaz', 'rho'])
        gaz_temp = self.gaz.copy()
        row_num = len(self.zbp)

        start = time()
        for row in self.zbp.iloc:
            j, rho_i = None, None
            match = gaz_temp[gaz_temp.GEOID == row.ZIP]
            if len(match) > 0:
                j = match.index[0]
                aland = match.ALAND.iloc[0]
                rho_i = 1e+6 * row.EMP / aland    # 1m²/ 1e-6 km²= 1 ... aland가 0일리는 없겠지?
                gaz_temp = gaz_temp.drop(j)
            self.rho = self.rho.append({'index_gaz':j, 'rho':rho_i}, ignore_index=True)
            if row.name % 5000 == 0: print(self.year, '년 ...', row.name, '/', row_num)
        print(len(gaz_temp))
        print('***', self.year, '년 처리완료! *** ...', time()-start, '[sec]')

### PROCESSING FILES

In [127]:
years = np.arange(2012, 2017)
dat = []

for y in years:
    dat.append(Year(y))
    dat[-1].process()

*** 2012 년 ZBP request 업데이트 완료! *** ... 20.717986822128296 [sec]
2012 년 ... 0 / 38818
2012 년 ... 5000 / 38818
2012 년 ... 10000 / 38818
2012 년 ... 15000 / 38818
2012 년 ... 20000 / 38818
2012 년 ... 25000 / 38818
2012 년 ... 30000 / 38818
2012 년 ... 35000 / 38818
711
*** 2012 년 처리완료! *** ... 147.06978964805603 [sec]
*** 2013 년 ZBP request 업데이트 완료! *** ... 14.686737775802612 [sec]
2013 년 ... 0 / 38804
2013 년 ... 5000 / 38804
2013 년 ... 10000 / 38804
2013 년 ... 15000 / 38804
2013 년 ... 20000 / 38804
2013 년 ... 25000 / 38804
2013 년 ... 30000 / 38804
2013 년 ... 35000 / 38804
729
*** 2013 년 처리완료! *** ... 155.41990756988525 [sec]
*** 2014 년 ZBP request 업데이트 완료! *** ... 42.464208364486694 [sec]
2014 년 ... 0 / 38792
2014 년 ... 5000 / 38792
2014 년 ... 10000 / 38792
2014 년 ... 15000 / 38792
2014 년 ... 20000 / 38792
2014 년 ... 25000 / 38792
2014 년 ... 30000 / 38792
2014 년 ... 35000 / 38792
716
*** 2014 년 처리완료! *** ... 149.77016949653625 [sec]
*** 2015 년 ZBP request 업데이트 완료! *** ... 13.789844989776611

### WRITING FILES

In [136]:
for d in dat:
    fname = 'dat' + str(d.year) + '.csv'
    if d.year >= 2012:
        dd = d.zbp.copy()
        dd['GEO_TTL'] = dd['GEO_TTL'].apply(lambda s: s[11:-1])
        pd.concat([dd, d.rho], axis=1).to_csv(path_or_buf=fname, index=None)
    else:
        pd.concat([d.zbp, d.rho], axis=1).to_csv(path_or_buf=fname, index=None)
        
# from google.colab import files
# for d in dat:
#     fname = 'A' + str(d.year) + '.csv'
#     path = '/content/' + fname
#     pd.concat([d.zbp, d.rho],axis=1).to_csv(path_or_buf=path, index=None)
#     files.download(fname)

### MODIFYING GAZETTEERS

In [168]:
# REWRITE GAZ_ZCTA00
with open('gaz_zcta00_raw.txt') as f:
    lines = f.readlines()

arr = []
for line in lines:
    vals = [v for v in line.split(' ') if len(v) > 0]
    try: vals.remove('(part)')
    except: pass
    
    z = vals[0][2:7]
    p = vals[3]
    h = vals[4]
    a = vals[5]
    s = vals[0][0:2]
    
    new_row = np.array([z,p,h,a,s])
    if np.char.isnumeric(new_row[:4]).all():
        arr.append(new_row)
    
df = pd.DataFrame(np.array(arr), columns=['GEOID','POP','HU','ALAND','STATE'])
print('ROW LOSS :', len(lines) - len(df))
print(df.head(5))

df.to_csv('gaz_zcta00_rewritten.csv', sep='\b', index=False)

ROW LOSS : 1152
   GEOID    POP     HU      ALAND STATE
0  35004   6998   2815   49387881    AL
1  35005   8985   3690   92158183    AL
2  35006   3109   1488  339241043    AL
3  35007  20157   7762  128235102    AL
4  35010  21732  10033  616923491    AL


In [214]:
gaz_zcta_years = [2000, 2010, 2012, 2013, 2014, 2015, 2016]

gaz_zcta, gaz_fname = [], []
for y in gaz_zcta_years:
    if y == 2000 or y == 2010:
        fname = 'gaz_zcta' + str(y) + '.csv'
        gaz = pd.read_csv(fname, sep=',', header=(0), dtype={'GEOID':str, 'ALAND':float})
    else:
        fname = 'gaz_zcta' + str(y)[2:] + '.txt' # ← csv
        gaz = pd.read_csv(fname, sep='\t', header=(0), dtype={'GEOID':str, 'ALAND':float})
    gaz_zcta.append(gaz)
    gaz_fname.append(fname)

print(len(gaz_zcta), '/', len(gaz_zcta_years))

det_pgaz = lambda y: gaz_zcta[0] if y < 2005 else gaz_zcta[1]
for y, g, f in zip(gaz_zcta_years, gaz_zcta, gaz_fname):
    tflag = time()
    if y != 2000 and y != 2010:
        pgaz = det_pgaz(y)
        for row in g.iloc:
            prow = pgaz[pgaz['GEOID'] == row.GEOID]
            if len(prow) == 1:
                g.loc[row.name, 'POP'] = prow['POP'].iloc[0]
    elif y == 2000: pass
    
    f = 'gaz_zcta' + str(y) + '.csv'
    g.to_csv(f, sep=',', index=False)
    print(y, '년 처리완료 ...', time()-tflag, '[sec]')

7 / 7
2000 년 처리완료 ... 0.0807492733001709 [sec]
2010 년 처리완료 ... 0.18778133392333984 [sec]
2012 년 처리완료 ... 93.14305925369263 [sec]
2013 년 처리완료 ... 86.5849039554596 [sec]
2014 년 처리완료 ... 86.7545394897461 [sec]
2015 년 처리완료 ... 90.84136748313904 [sec]
2016 년 처리완료 ... 90.05310297012329 [sec]


# YKP

In [4]:
class Year_read:
    def __init__(self, prefix, year):
        self.year = year
        self.gaz = which_gaz(year)

        fname = prefix + str(year) + '.csv'
        self.zbp = pd.read_csv(fname, header=(0), dtype={'EMP':int, 'ZIP':str, 'rho':float})

years = np.arange(1999, 2017)
dat = []
for y in years:
    dat.append(Year_read('dat', y))

tflag = time()
for i,d in enumerate(dat):
    for row in d.zbp.iloc:
        idx = row.index_gaz
        if not np.isnan(idx):
            idx = int(idx)
            pop = d.gaz.iloc[idx].POP
            d.zbp.loc[row.name, 'POP'] = pop
    print(i+1, '/', len(dat), '(', d.year, ')', time()-tflag, 'sec'); tflag = time()

1 / 18 ( 1999 ) 10.552040576934814 sec
2 / 18 ( 2000 ) 11.15696668624878 sec
3 / 18 ( 2001 ) 11.38993215560913 sec
4 / 18 ( 2002 ) 11.727988719940186 sec
5 / 18 ( 2003 ) 11.687982082366943 sec
6 / 18 ( 2004 ) 12.028985738754272 sec
7 / 18 ( 2005 ) 12.145947456359863 sec
8 / 18 ( 2006 ) 12.334025382995605 sec
9 / 18 ( 2007 ) 12.281139135360718 sec
10 / 18 ( 2008 ) 12.440815687179565 sec
11 / 18 ( 2009 ) 12.161474227905273 sec
12 / 18 ( 2010 ) 12.193055391311646 sec
13 / 18 ( 2011 ) 11.700193405151367 sec
14 / 18 ( 2012 ) 12.489984273910522 sec
15 / 18 ( 2013 ) 12.316980838775635 sec
16 / 18 ( 2014 ) 12.636054515838623 sec
17 / 18 ( 2015 ) 12.869965314865112 sec
18 / 18 ( 2016 ) 12.312079191207886 sec


In [16]:
ykp = []
alpha = 5
pcut = 1000
count_k = lambda v: len([w for w in v if w > max(v)/alpha])

for d in dat:
    tflag = time()
    g = d.zbp.dropna().groupby(['GEO_TTL'])
    r_count = g['rho'].apply(count_k)
    r_max = g['rho'].max()
    g_idx = g.groups

    new_block = []
    ykp.append(pd.DataFrame(columns=['Y','GEO_TTL', 'k', 'P']))
    for geo, p in zip(g_idx, g['POP'].sum()):
        if p >= pcut and r_max[geo] > 0:
            k = 1
            k_test = len(g_idx[geo])
            if k_test > 1:
                k = r_count[geo]
            new_row = {'Y':d.year, 'GEO_TTL':geo, 'k':k, 'P':p}
            ykp[-1] = ykp[-1].append(new_row, ignore_index=True)
    print(d.year, '/', time()-tflag, 'sec')

ykp = pd.concat(tuple(ykp), ignore_index=True)
ykp.head(3)

In [15]:
ykp_5.to_csv('ykp_0824_alpha_5.csv', sep=',', index=False)
ykp_10.to_csv('ykp_0824_alpha_10.csv', sep=',', index=False)
ykp_20.to_csv('ykp_0824_alpha_20.csv', sep=',', index=False)

# META_KS
### (Required : ykp, years) (Mind the $\alpha$ value)

In [5]:
from scipy.stats import ks_2samp

In [17]:
ykp = pd.read_csv('ykp_0824_alpha_10.csv', sep=',', header=(0), dtype={'GEOID':str, 'ALAND':float})

In [18]:
pKS_set = 0.1 * np.arange(1,10)
k_parts, P_parts, k_KS, P_KS = [], [], [], []
for i in pKS_set:
    k_parts.append([]); P_parts.append([])
    k_KS.append([]); P_KS.append([])

years_list = list(years)
year2zbp = lambda y: dat[years_list.index(y)].zbp

groupby_count = lambda v: len(v)
geo_len = ykp.groupby(['GEO_TTL'])['k'].apply(groupby_count) 

# Time Counter
tflag = time(); N = len(geo_len)
tdiv = 50; tarr = [round(T*N/tdiv) for T in range(1,tdiv+1)]

# Process
hook = None
for t, (thisgeo, thislen) in enumerate(geo_len.to_dict().items()):
    ykp_thisgeo = ykp[ykp['GEO_TTL'] == thisgeo]
    if thislen == 1:
        k_parts.append(ykp_thisgeo['k'])
        P_parts.append(ykp_thisgeo['P'])
        continue
    samples = []
    for row in ykp_thisgeo.iloc:
        zbp = year2zbp(row['Y'])
        df = zbp[zbp['GEO_TTL'] == thisgeo]
        sample = df['rho'].sort_values(ascending=False, ignore_index=True)
        samples.append(sample)
    surv = np.ones((len(pKS_set), thislen), dtype=bool) # survived
    for n in range(1,thislen):
        for m in range(n):
            for u,pKS in enumerate(pKS_set):
                if not surv[u,m]: continue
                D, pval = ks_2samp(samples[n], samples[m])
                if pval > 1-pKS: # two curves are the same
                    surv[u,m] = False
                    continue
    indices = np.arange(thislen)
    for i in range(len(pKS_set)):
        k_parts[i].append(ykp_thisgeo['k'].iloc[indices[surv[i]]])
        P_parts[i].append(ykp_thisgeo['P'].iloc[indices[surv[i]]])
    if t == tarr[0]:
        print('%d / %d (%.2f%%) ... %.5f sec' % (t, N, 100*t/N, time()-tflag))
        tarr.pop(0); tflag=time()

for i in range(len(pKS_set)):
    k_KS[i] = np.concatenate(k_parts[i])
    P_KS[i] = np.concatenate(P_parts[i])

378 / 18901 (2.00%) ... 25.31725 sec
756 / 18901 (4.00%) ... 26.82178 sec
1134 / 18901 (6.00%) ... 28.25195 sec
1512 / 18901 (8.00%) ... 25.83189 sec
1890 / 18901 (10.00%) ... 24.49373 sec
2268 / 18901 (12.00%) ... 24.93904 sec
2646 / 18901 (14.00%) ... 24.56729 sec
3024 / 18901 (16.00%) ... 24.97497 sec
3402 / 18901 (18.00%) ... 24.91854 sec
3780 / 18901 (20.00%) ... 25.46996 sec
4158 / 18901 (22.00%) ... 25.03242 sec
4536 / 18901 (24.00%) ... 26.02962 sec
4914 / 18901 (26.00%) ... 25.00946 sec
5292 / 18901 (28.00%) ... 26.21763 sec
5670 / 18901 (30.00%) ... 28.94428 sec
6048 / 18901 (32.00%) ... 27.74782 sec
6426 / 18901 (34.00%) ... 28.14722 sec
6804 / 18901 (36.00%) ... 28.65045 sec
7182 / 18901 (38.00%) ... 26.92910 sec
7560 / 18901 (40.00%) ... 27.03110 sec
7938 / 18901 (42.00%) ... 28.04884 sec
8316 / 18901 (44.00%) ... 27.29907 sec
8694 / 18901 (46.00%) ... 29.90892 sec
9072 / 18901 (48.00%) ... 27.66629 sec
9450 / 18901 (50.00%) ... 28.14746 sec
9829 / 18901 (52.00%) ... 26.20

In [19]:
for i in range(len(pKS_set)):
    fname = 'kP_KS_%02d_alpha_10.csv' % (i+1)
    df = pd.DataFrame({'k':k_KS[i], 'P':P_KS[i]})
    df.to_csv(fname, sep=',', index=False)