## In this notebook we extract individual-level Censo 2010 data from the raw census files.


In [1]:
# Load modules
import warnings
warnings.filterwarnings('ignore')
import pandas as pd

import numpy as np

### Create the reference dataset for radios. It holds all the IDs and labels of the districts in one place.

In [7]:
# Geographic lookup tables, chained from the finest level (radio) up to the province.
# The separator is passed by keyword: pandas 2.x only accepts the file path positionally.
# The merges rely on the columns the files share (implicit join keys).
df = (
    pd.read_csv('./../../censo_geo/RADIO.csv', sep=';')
    .merge(pd.read_csv('./../../censo_geo/FRAC.csv', sep=';'))
    .merge(pd.read_csv('./../../censo_geo/DPTO.csv', sep=';'))
    .merge(pd.read_csv('./../../censo_geo/PROV.csv', sep=';'))
)

# 9-character radio code: province (2) + department (3) + fraction (2) + radio (2), zero-padded.
df['radio'] = df['IDPROV'].astype(str).str.zfill(2) + df['IDDPTO'].astype(str).str.zfill(3) \
+ df['IDFRAC'].astype(str).str.zfill(2) + df['IDRADIO'].astype(str).str.zfill(2)

df['NOMDPTO'] = df['NOMDPTO'].str.strip()

# Radio -> AGLOMERADO correspondence; LINK is padded to the same 9-character radio code.
ref = pd.read_csv('./../../../Documents/EPH/radios_aglos_ref.csv')
# Radios without an AGLOMERADO value get code 0.
ref['AGLOMERADO'] = ref['AGLOMERADO'].fillna(0).astype(int)
ref['radio'] = ref['LINK'].astype(str).str.zfill(9)

df = df.merge(ref[['radio', 'AGLOMERADO']], on = 'radio')

# radio_ref is indexed by RADIO_REF_ID and also keeps it as a regular column.
# NOTE(review): index name and column name coincide; merges keyed on RADIO_REF_ID may
# need the index dropped from the right-hand frame in recent pandas.
radio_ref = df#.set_index('RADIO_REF_ID', drop = True)
radio_ref.index = radio_ref.RADIO_REF_ID

# Example
# radio_ref.to_csv('./../../censo_geo/radio_ref.csv', index = False)

# AGLOMERADO codes kept for this extraction (presumably EPH agglomeration codes - confirm).
AGLOMERADOS_SEL = [2, 32, 33]
radio_ref = radio_ref.loc[radio_ref.AGLOMERADO.isin(AGLOMERADOS_SEL)]
radio_ref.sample(3)

Unnamed: 0_level_0,RADIO_REF_ID,FRAC_REF_ID,IDRADIO,DPTO_REF_ID,IDFRAC,PROV_REF_ID,IDDPTO,DPTO,NOMDPTO,CPV2010_REF_ID,IDPROV,PROV,NOMPROV,radio,AGLOMERADO
RADIO_REF_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2385,2385,238,3,11,12,1,11,2011,Comuna 11,1,2,2,Ciudad Autónoma de Buenos Aires,20111203,32
11885,11885,1005,9,37,18,2,805,6805,Tigre,1,6,6,Buenos Aires,68051809,33
4002,4002,388,6,16,37,2,28,6028,Almirante Brown,1,6,6,Buenos Aires,60283706,33


### Check how long the data files are...

In [3]:
import subprocess

def file_len(fname):
    """Print the number of rows of a text file without loading it into memory.

    The file is streamed in binary chunks and its newline bytes are counted,
    so no external ``wc`` executable is required.

    The reported figure is the newline count plus one, i.e. it is exact when
    the last line is not terminated by a newline and one too high otherwise.

    Args:
        fname: path of the file to inspect.

    Returns:
        None. The count is only printed.

    Raises:
        IOError: if the file cannot be opened or read.
    """
    newline_count = 0
    with open(fname, 'rb') as handle:
        # 1 MiB chunks keep memory flat even for multi-gigabyte CSV files.
        for chunk in iter(lambda: handle.read(1 << 20), b''):
            newline_count += chunk.count(b'\n')
    n_rows = newline_count + 1
    print(f'{fname} Exact number of rows: {n_rows}')

# Report the size of each raw census table: households, dwellings and persons.
for csv_path in ('./HOGAR.csv', './VIVIENDA.csv', './PERSONA.csv'):
    file_len(csv_path)

./HOGAR.csv Exact number of rows: 12197649
./VIVIENDA.csv Exact number of rows: 13838099
./PERSONA.csv Exact number of rows: 40117098


### That is why we have to choose which DEPTOS we'll work with

In [9]:
# Pick a few districts that concentrate most of the population.

personas = (
    pd.read_csv('./../../../Documents/Datos_censo/Preguntas/PERSONA-P02.csv', encoding='latin-1')
    # Zero-pad the radio code to 9 characters so it matches radio_ref['radio'].
    .assign(radio=lambda frame: frame['radio'].astype(str).str.zfill(9))
    # Implicit join on the columns shared with the reference table.
    .merge(radio_ref[['radio', 'PROV', 'DPTO', 'NOMDPTO']])
)

# E.g. choose the largest 25 in PBA and CABA, etc...
# n = 25
in_selected_provinces = personas.PROV.isin([2, 6])
personas_prov = personas.loc[in_selected_provinces]
# seleccion_DPTOS = personas['DPTO'].values
# seleccion_DPTOS = personas_prov['DPTO'].values

# Current choice: every DPTO present in the (already filtered) radio reference table.
seleccion_DPTOS = radio_ref['DPTO'].unique()

# most_populous = personas_prov.groupby(['PROV', 'DPTO', 'NOMDPTO'])[['TOTAL']].sum().sort_values(
#     by = 'TOTAL', ascending = True)
# seleccion_DPTOS = most_populous.reset_index().sample(n)['DPTO'].values

# OR Choose directly by DPTO code
# seleccion_DPTOS = np.array([6105]) # Bolivar 6105 
# seleccion_DPTOS = np.array([6861, 6364]) # V Lopez 6861  # G. Rodriguez 6364

#check cumsum
# most_populous.cumsum()/most_populous.sum()

# This means we will need some of the AGLOS and not others. (e.g. don't use AGLO Cordoba if data is for BA)
# radio_ref_sel_aglos = radio_ref.loc[radio_ref.DPTO.isin(seleccion_DPTOS)]
# radio_ref_sel_aglos['AGLOMERADO'].value_counts()
# radio_ref_sel_aglos

## Load indiv level census data

## Start with VIVIENDA and HOGARES

We use dask to load the large datasets. It works much like pandas but manages memory to avoid crashing.
Still, it is better to pass `usecols` so that only the necessary variables are read.

We load the VIVIENDA dataset first, then merge HOGARES.
PERSONA is merged later, after the partial data has been saved, because crashes are not infrequent.

In [117]:
import dask.dataframe as dd
from dask.diagnostics import ProgressBar

# Extract the dwellings (VIVIENDA) and households (HOGAR) of the selected districts (DPTOs).
# seleccion_DPTOS and usecols keep us from loading unnecessary data.
SAMPLE_FRAC = .2  # share of the selected dwellings that is kept
SAMPLE_SEED = 0   # fixed seed: every evaluation of the lazy graph and every re-run draws the same dwellings

# radio_ref carries RADIO_REF_ID both as index name and as column; dropping the index
# from the lookup keeps the merge key unambiguous.
radio_dpto = radio_ref[['RADIO_REF_ID', 'DPTO']].reset_index(drop=True)

VIVIENDA = dd.read_csv('./../../../Desktop/extracted_/VIVIENDA.csv', sep = ';', usecols = ['VIVIENDA_REF_ID', 'RADIO_REF_ID', 'TIPVV', 'V01'])
VIVIENDA = VIVIENDA.merge(radio_dpto, on = 'RADIO_REF_ID')
# dask can only sample by fraction, so the value must be passed as `frac`
# (positionally it would bind to the unsupported `n` parameter).
VIVIENDA_ = VIVIENDA.loc[VIVIENDA.DPTO.isin(seleccion_DPTOS)].sample(frac = SAMPLE_FRAC, random_state = SAMPLE_SEED)
with ProgressBar():
    VIVIENDA_REF_ID_sel = VIVIENDA_['VIVIENDA_REF_ID'].values.compute()

# The csv is too big for pandas, so it is dask-loaded as well.
# Note that even computing len(HOGAR.VIVIENDA_REF_ID) takes very long.
HOGAR = dd.read_csv('./../../../Desktop/extracted_/HOGAR.csv', sep = ';', usecols = ['HOGAR_REF_ID', 'VIVIENDA_REF_ID', 'H05', 'H06', 'H07', 'H08',
       'H09', 'H10', 'H11', 'H12', 'H13', 'H14', 'H15', 'H16', 'PROP', 'TOTPERS'])
# Keep only the households that live in the sampled dwellings.
HOGAR_ = HOGAR.loc[HOGAR.VIVIENDA_REF_ID.isin(VIVIENDA_REF_ID_sel)]
with ProgressBar():
    HOGAR_REF_ID_sel = HOGAR_['HOGAR_REF_ID'].values.compute()

tabla_censo = VIVIENDA_.merge(HOGAR_, on = 'VIVIENDA_REF_ID')#.merge(PERSONA_)

# IX_TOT = number of rows per household in the merged table.
# NOTE(review): PERSONA is not merged at this point, so this presumably yields 1 for every
# household instead of the household size - confirm whether TOTPERS should be used instead.
IX_TOT = tabla_censo.groupby('HOGAR_REF_ID').count().iloc[:, 0].reset_index()
IX_TOT.columns = ['HOGAR_REF_ID', 'IX_TOT']

tabla_censo = tabla_censo.merge(IX_TOT, on = 'HOGAR_REF_ID')

# Materialize the dwelling + household table as a pandas DataFrame.
with ProgressBar():
    table = tabla_censo.compute()


[########################################] | 100% Completed | 12.3s
[########################################] | 100% Completed | 19.4s
[########################################] | 100% Completed | 35.9s


## Armonizar para adecuar dataset Censo a las opciones rta de EPH.
No correr 2 veces

In [118]:
# Approach: recode the Census answers so that they use the EPH answer codes.
#
# The recoding is not idempotent (mapping an already-mapped column scrambles its codes),
# so it is skipped when this cell has already run on `table`. AGLOMERADO is only added by
# the merge at the end of this cell, hence its presence marks an already harmonized table.
if 'AGLOMERADO' not in table.columns:
    census_to_eph = {
        'V01': {1:1, 2:6, 3:6, 4:2, 5:3, 6:4, 7:5, 8:6},
        'H06': {1:1, 2:2, 3:3, 4:4, 5:5, 6:6, 7:7, 8:9},
        'H09': {1:1, 2:2, 3:3, 4:4, 5:4, 6:4},
        'H14': {1:1, 2:4, 3:2, 4:2, 5:4, 6:3, 7:4, 8:9},
        'H13': {1:1, 2:2, 4:0},
    }
    for column, recode in census_to_eph.items():
        # Series.map yields NaN for every code that is not a key of the dictionary
        # (e.g. a 3 in H13, if it occurs).
        table[column] = table[column].map(recode)
    table['H16'] = table['H16'].clip(0, 9)
    # table['P07'] = table['P07'].map({1:1, 2:2, 0:2})

    # Tag every row with its agglomeration; results are later used per agglomeration.
    # The index is dropped from the lookup because radio_ref carries RADIO_REF_ID both
    # as index name and as column, which would make the merge key ambiguous.
    table = table.merge(radio_ref[['RADIO_REF_ID','AGLOMERADO']].reset_index(drop=True),
                        on = 'RADIO_REF_ID')



#### We want to save in case merging PERSONA crashes.

In [119]:
# Only once to save time in the future.
# name =  'bolivar'#''
# name =  'vlopez_rodriguez'#''
# name =  'rand'#''
# table.to_csv('./sample_censo_table'+str(n).zfill(3)+name+'.csv', index = False)
# table.to_csv('./sample_censo_table_f.1BA.csv', index = False)

# Checkpoint of the dwelling + household table, written before PERSONA is merged.
intermediate_csv = './intermediate_save.csv'
table.to_csv(intermediate_csv)


## Merge PERSONA

In [120]:
persona_columns = ['PERSONA_REF_ID', 'HOGAR_REF_ID', 'P01', 'P02', 'P03', 'P05', 'P06',
                   'P07', 'P12', 'P08', 'P09', 'P10', 'CONDACT']
PERSONA = dd.read_csv('./../../../Desktop/extracted_/PERSONA.csv', sep = ';', usecols = persona_columns)

# Keep persons of the sampled households whose (P09, P10) answers fall in one of two groups.
in_sampled_household = PERSONA.HOGAR_REF_ID.isin(HOGAR_REF_ID_sel)
# Per the original author's note: completed secondary school.
p09_45_p10_1 = PERSONA.P09.isin([4, 5]) & (PERSONA.P10 == 1)
# Per the original author's note: incomplete tertiary / university.
p09_67_p10_2 = PERSONA.P09.isin([6, 7]) & (PERSONA.P10 == 2)
PERSONA_ = PERSONA.loc[in_sampled_household & (p09_45_p10_1 | p09_67_p10_2)]

# Alternative without the (P09, P10) filter:
# PERSONA_ = PERSONA.loc[PERSONA.HOGAR_REF_ID.isin(HOGAR_REF_ID_sel)]

# Materialize the filtered persons as a pandas DataFrame.
with ProgressBar():
    PERSONA_ = PERSONA_.compute()

# Implicit join on the columns shared by both frames.
table = pd.merge(table, PERSONA_)

# Recode P07: codes 0 and 2 both become 2, code 1 stays 1; anything else becomes NaN.
table = table.assign(P07 = table['P07'].map({1:1, 2:2, 0:2}))


[########################################] | 100% Completed | 40.6s


In [134]:
# Persist the person-level table (dwelling + household + person columns).
students_csv = './estuds_indiv.csv'
table.to_csv(students_csv)

In [133]:
# Expansion factor applied to the sampled counts
# (presumably the inverse of the 0.2 dwelling sampling fraction - confirm).
EXPANSION_FACTOR = 5
# Distinct persons with P09 == 7 per district. The column is selected before aggregating
# so that nunique is not computed for every column of the table.
EXPANSION_FACTOR * table.loc[table.P09 == 7].groupby('DPTO')['PERSONA_REF_ID'].nunique()

DPTO
2001    24950
2002    31345
2003    23685
2004    15910
2005    25050
2006    25630
2007    22210
2008     8225
2009    13895
2010    17935
2011    22435
2012    25660
2013    34455
2014    38985
2015    23280
6028    23175
6035    21915
6091    13775
6098     6130
6245     3250
6252     7665
6260    12605
6270     4310
6274     9635
6364     2695
6371    23315
6408    10240
6410    12455
6412     6610
6427    58385
6434    28535
6441    83290
6490    32735
6515    11255
6525     1385
6539    16070
6560    13160
6568    27900
6638    10830
6648     1700
6658    29775
6749     8440
6756    28365
6760    15600
6778     1750
6805    15280
6840    23200
6861    26115
Name: PERSONA_REF_ID, dtype: int64

In [124]:
# Distinct persons per district, ordered by the district's number of distinct dwellings.
# Only the two columns involved are aggregated instead of every column of the table.
(
    table.groupby('DPTO')[['VIVIENDA_REF_ID', 'PERSONA_REF_ID']]
    .nunique()
    .sort_values(by = 'VIVIENDA_REF_ID')['PERSONA_REF_ID']
)

DPTO
6525     1650
6778     2167
6245     2633
6648     2573
6364     2858
6098     4416
6270     5715
6749     7760
6252     8337
2008     9068
6408     8949
6410     9368
6412     8829
2009    10781
6638    10622
2010    12028
6260    12134
2004    12723
2002    11688
6760    12589
6515    12566
2015    13249
2006    12747
2011    13648
2005    13188
2003    13901
2001    14183
2007    14486
6274    13184
6091    14443
2012    14309
6805    15241
6861    15896
6560    15938
2013    15593
6756    17236
2014    15929
6035    18813
6840    19092
6568    19537
6539    18983
6371    20903
6028    22318
6434    23874
6658    26591
6490    28182
6441    37469
6427    67889
Name: PERSONA_REF_ID, dtype: int64

In [122]:
# Number of distinct values of every column within each (P03, P09, P10) group.
persona_groups = PERSONA_.groupby(by = ['P03', 'P09', 'P10'])
persona_groups.nunique()#.sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,PERSONA_REF_ID,HOGAR_REF_ID,P01,P02,P03,P05,P06,P07,P12,P08,P09,P10,CONDACT
P03,P09,P10,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
16,4,1,5,3,1,2,1,2,2,1,1,1,1,1,1
16,6,2,77,77,7,2,1,2,5,1,3,2,1,1,4
16,7,2,127,127,8,2,1,2,9,1,3,2,1,1,4
17,4,1,531,527,8,2,1,2,14,1,3,1,1,1,4
17,5,1,331,326,7,2,1,2,6,1,2,1,1,1,3
17,6,2,125,124,8,2,1,2,6,1,3,2,1,1,4
17,7,2,245,243,7,2,1,2,12,1,3,2,1,1,4
18,4,1,2710,2681,9,2,1,2,22,1,3,1,1,1,4
18,5,1,2068,2046,8,2,1,2,8,1,3,1,1,1,4
18,6,2,1392,1369,8,2,1,2,16,1,3,2,1,1,4


## Save Full Censo 2010 data

In [11]:
# Reference rows of the radios that actually appear in the extracted table
# (radio_ref is indexed by RADIO_REF_ID), and the AGLOMERADO codes they cover.
radios_in_table = table['RADIO_REF_ID'].drop_duplicates().to_numpy()
radio_ref_sel = radio_ref.loc[radios_in_table]
aglos_sel = radio_ref_sel['AGLOMERADO'].unique()

## Census entries of the selected DPTOs

print(table.shape)  # how many (people, variables)?
table.sample(5)

(4476465, 34)


Unnamed: 0,VIVIENDA_REF_ID,RADIO_REF_ID,TIPVV,V01,DPTO,HOGAR_REF_ID,H05,H06,H07,H08,...,P02,P03,P05,P06,P07,P12,P08,P09,P10,CONDACT
4025913,5522547,17540,1,1.0,6413,4955999,1,4.0,1,1,...,2,61,1,0,1,2,2,2,1,1
1937464,2537276,6920,1,1.0,6427,2233811,1,2.0,1,1,...,2,83,1,0,1,2,2,2,1,3
1266747,2199796,5938,1,6.0,6408,1902924,1,2.0,1,1,...,2,23,1,0,1,1,2,5,2,1
2108944,2581371,7040,1,1.0,6427,2281593,1,3.0,1,1,...,2,33,1,0,1,1,2,6,1,1
2863906,2743779,7550,1,1.0,6427,2474703,1,4.0,1,1,...,1,4,1,0,1,1,1,1,0,0


# Old code (don't run)

### EPH va a adoptar nombres de columna del censo.

In [89]:
# # Misma info, distinto nombre. 
# # Censo INDEC 
# md_1 = table[['IX_TOT', 'P02', 'P03', 'CONDACT', 'AGLOMERADO', #las que no se erran, cant pers, sexo, edad, act, aglo
#     'V01', 'H05', 'H06', 'H07', 'H08', 'H09', 'H10', 'H11', 'H12', 'H16', 'H15', 'PROP', 'H14', 'H13',
#       'P07', 'P08', 'P09', 'P10', 'P05']] #las x que buscan matches un poquito mas laxamente


# #Mismas cosas, distinto nombre de columna para
# # EPH INDEC
# md_2 = EPH[['IX_TOT','CH04','CH06','CONDACT', 'AGLOMERADO',
#     'IV1', 'IV3', 'IV4','IV5','IV6','IV7','IV8','IV10','IV11','II1','II2','II7','II8','II9',
#     'CH09','CH10','CH12','CH13','CH15']]

# md_2.columns = md_1.columns


In [None]:
# # DEBUGGER
# # IF we want to see in each column what are the percentages, as a clue to where there can be issues
# # OK control check. Control there is less likely confusion. 

# for i in range(len(md_1.columns))[:2]: 
#     print('\n')
#     for md in [md_1, md_2]:
#         col = md.columns[i]
#         print(col)
#         print(md[col].value_counts().sort_index()/len(md))


In [90]:
# import numpy as np

In [91]:
# ### The 'y' variables will be predicted. K nearest neighbors is used.
# ## Variables in the EPH survey but not in the Censo.
# ## Preguntas de EncuestaPH que no estan en Censo.

# y_cols = ['P21','P47T','CAT_INAC','CAT_OCUP','CH07','CH08','CH16','TOT_P12','T_VI','V10_M','V11_M','V12_M','V18_M','V19_AM','V21_M','V2_M','V3_M','V4_M','V5_M','V8_M','V9_M',
#          'PP07G1', 'PP07G2', 'PP07G3', 'PP07G4', 'PP07G_59','PP07H','PP07I','PP07J','PP07K','PP08D1','PP08D4','PP08F1','PP08F2','PP08J1','PP08J2','PP08J3','PP10A','PP10C','PP10D','PP10E']

# #Remove ill predicted ones after trying them out
# y_cols = list(set(y_cols) - set(['PP10A', 'V11_M', 'PP08D4', 'PP08J3', 'PP08F1', 'V18_M', 'V10_M',
#        'V8_M', 'V4_M', 'PP08F2', 'V21_M', 'V9_M', 'PP08J2', 'PP08J1',
#        'V19_AM']))



# train = md_2.join(EPH[y_cols]).fillna(0)
# test = md_1.fillna(0).astype(int)

# # Grupo edad (deciles)

# # Numbers 0 to 9 for age deciles. 
# # Estas en el decil de edad. x% de la gente menor que uno.
# train['P03'] = np.floor(10*train[['P03']].rank(pct = 1)-0.01).astype(int)
# test['P03'] = np.floor(10*test[['P03']].rank(pct = 1)-0.01).astype(int)

# # train['CH07'] = train['CONDACT']

# # Superweight CONDACT
# train['CONDACT'] = 100*train['CONDACT']
# test['CONDACT'] = 100*test['CONDACT']
# # Superweight CONDACT
# train['AGLOMERADO'] = 10*train['AGLOMERADO']
# test['AGLOMERADO'] = 10*test['AGLOMERADO']

In [92]:
# md_1.columns

Index(['IX_TOT', 'P02', 'P03', 'CONDACT', 'AGLOMERADO', 'V01', 'H05', 'H06',
       'H07', 'H08', 'H09', 'H10', 'H11', 'H12', 'H16', 'H15', 'PROP', 'H14',
       'H13', 'P07', 'P08', 'P09', 'P10', 'P05'],
      dtype='object')

In [25]:
# res_1 = pd.read_csv('./test_result_'+str(n).zfill(3)+'dpto_1.csv')

In [26]:
# res_0 = pd.read_csv('./test_result_'+str(n).zfill(3)+'dpto.csv')

In [27]:
# # porcentaje error por reshuffle: 
# perr_ = np.round(100*abs(res_0.sum() - res_1.sum())/res_0.sum(), 1).sort_values()

# # perr_.tail(15)#.index
# # Bad ones: ['PP10A', 'V11_M', 'PP08D4', 'PP08J3', 'PP08F1', 'V18_M', 'V10_M',
# #        'V8_M', 'V4_M', 'PP08F2', 'V21_M', 'V9_M', 'PP08J2', 'PP08J1',
# #        'V19_AM']


NameError: name 'res_0' is not defined

In [None]:
# import numpy as np
# # Agregar $$$. En millones de usd
# # En millones de usd (USD = 30 ARS)
# _USD = 30.5 #ARS
# np.round(res_1.sum()/_USD/1e6, 1).sort_values().tail(14)

# #PPALES
# # negocio que no trabajo no laborable (V9_M)
# # alquiler no laborable (V8_M)
# # indemnizacion despido no laborable (V3_M)
# # comision Ocupacion ppal (PP08F1)
# # cuota alimentos no laborable (V12_M)
# # subsidio ayuda social no laborable (V5_M)
# # TOTAL otras ocupacions(TOT_P12)
# # jubilacion no laborable (V2_M)
# # TOTAL no laborables (T_VI)
# # sueldo Ocupacion ppal(PP08D1)
# # TOTAL Ocupacion ppal (P21)
# # TOTAL TOTAL (P47T)

In [None]:
# np.round(res_1.sum()/1e6/_USD, 1).sort_values().tail(14).index

In [None]:
# PERS_DPTO = table[['PERSONA_REF_ID', 'RADIO_REF_ID']].merge(radio_ref_sel[['RADIO_REF_ID', 'DPTO' #, 'NOMDPTO', 'NOMPROV'
#                                                                       ]]).drop(['RADIO_REF_ID'], axis = 1)

# res = res_1
# res_DPTO = res.merge(PERS_DPTO, on = 'PERSONA_REF_ID')

# #en miles de USD
# res_byDPTO = res_DPTO.groupby(['DPTO'])[[  'V3_M', 'V12_M', 'V5_M', 'TOT_P12',
#        'V2_M', 'T_VI', 'PP08D1', 'P21', 'P47T']].sum()

# np.round(100*res_byDPTO.div(res_byDPTO.P47T, axis = 0), 1).sort_values(by = 'P21').head() #percentage
# # np.round(res_byDPTO/1e3/_USD, 1) #in 1000 USD

In [None]:
# PERS_DPTO = table[['PERSONA_REF_ID', 'RADIO_REF_ID']].merge(radio_ref_sel[['RADIO_REF_ID', 'DPTO', 'NOMDPTO', 'NOMPROV']]
#                                                            )#.drop(['RADIO_REF_ID'], axis = 1)

# res = res_1
# res_DPTO = res.merge(PERS_DPTO, on = 'PERSONA_REF_ID')

# # variables = ['V9_M', 'V8_M', 'PP08F1', 'V3_M', 'V12_M', 'V5_M', 'TOT_P12',
# #        'V2_M', 'T_VI', 'PP08D1', 'P21', 'P47T']
# variables = [  'V3_M', 'V12_M', 'V5_M', 'TOT_P12',
#        'V2_M', 'T_VI', 'PP08D1', 'P21', 'P47T']
# #en miles de USD
# # res_byDPTO = res_DPTO.groupby(['DPTO', 'NOMDPTO', 'NOMPROV'])[variables].sum()
# res_byDPTO = res_DPTO.groupby(['RADIO_REF_ID'])[variables].sum()

# # np.round(100*res_byDPTO.div(res_byDPTO.P47T, axis = 0), 1).sort_values(by = 'P21').head() #percentage
# np.round(res_byDPTO/1e3/_USD, 1) #in 1000 USD

In [None]:
# # Save info at 'radio' level
# res_byDPTO.to_csv('res_byradio_sample_'+str(n).zfill(3)+'.csv')

In [None]:
# # Desoc, NA = 0. Not good.
# variables = ['PP07J', #turno habitual
#  'PP10D', #Desoc. Ha trabajado alguna vez?
#  'PP10C', #Desoc. Hizo changa mientras buscaba?
#  'PP07K', # Oc. ppal. Inc. serv. dom. Cobra con recibo
#  'PP07G2', # Oc. ppal. Inc. serv. dom. aguinaldo
#  'PP07G_59', # Oc. ppal. Inc. serv. dom. ninguno
#  'PP07G3', # Oc. ppal. Inc. serv. dom. dias enfermedad
#  'PP10E', # Desoc. Tiempo de que termino su ultimo trabajo/changa
#  'PP07H', # Oc. ppal. Inc. serv. dom. descuento jubilatorio
#  'PP07G4', # Oc. ppal. Inc. serv. dom. obra social
#  'PP07I', # Oc. ppal. Inc. serv. dom. Aporta jub por sí mismo 
#  'PP07G1', # Oc. ppal. Inc. serv. dom. vacaciones pagas
#  'CH07'] #Est civil

# #en miles de USD
# # res_byDPTO = res_DPTO.groupby(['DPTO', 'NOMDPTO', 'NOMPROV'])[variables].mean()
# res_byDPTO = res_DPTO.groupby(['RADIO_REF_ID'])[variables].mean()


# s = np.round(res_byDPTO, 2).sort_values(by = 'PP07K')#.head() 
# s.style.bar(color='#d65f5f')
# #  'CAT_OCUP', #CAT_INAC
# #  'CAT_INAC', #CAT_INAC

# #  'CH08', obra social/salud. nums altos como para mean
# # 'CH16', # Donde vivia hace 5. Desconfiable


In [None]:
# from sklearn.neighbors import NearestNeighbors
# neigh = NearestNeighbors(n_neighbors=1)
# neigh.fit(train[x_cols], train.sample(frac = 1)[y_cols])
# i = 60
# print(neigh.kneighbors([test.iloc[i].values], return_distance=True))
# train.iloc[7022][x_cols] - test.iloc[i]

In [None]:
# from IPython.core.display import display, HTML
# display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
# for col in result.columns:
#     print('\n')
#     print(col)
#     df_ = result.loc[result.P03 > 2]
#     print(df_[col].value_counts().sort_index())