# Normalizing Data

This notebook will show separate ways to normalize data for our current project -- developing an index that measures the presence of organized crime in Mexican states. 

It will show the following normalizing methods:
- Ranking -- each state is ranked in a variable comparison to other states. This method deals well with outliers and allows you to compare states over time relative to each other.
- Categorical Scale -- each state is assigned a score based on the percentiles of distribution for each variable. Again, this accounts for outliers but it gets rid of a lot information in placing everything in scales. 
- Z-Score -- a z-score measures the number of standard deviations an observation is from the mean. This method might not deal with outliers as well, but keeps all the relevant variance information that's lost in the other two methods.

But first we have to import all the relevant files and read in our data once again

### 1) Reading in Data 

In [31]:
import missing_data_code as mdc
import exploration_pca_ca as epc
import normalization as norm
import jellyfish
import importlib
importlib.reload(mdc)
importlib.reload(epc)
importlib.reload(norm)

<module 'normalization' from '/Users/mariomoreno/Desktop/ML Projects/Mexico OC/normalization.py'>

In [32]:
# Reading in data
df = mdc.reading_in('data/Mexico_Final.xlsx')

In [33]:
mdc.impute_zero(df, {'costa_km':0}).head()

Unnamed: 0,entidad,year,alumnos_inscritos_hs,hombres_inscritos_hs,mujeres_inscritas_hs,maestros_hs,escuelas_hs,convenios_trabajo_no_juicio,conflictos_trabajo,emplazamientos_huelga,...,prod_zinc_t,prod_coque_t,prod_fierropellets_t,prod_azufre_t,prod_barita_t,prod_fluorita_t,altitud,costa_km,municipios,pop
0,Aguascalientes,2010,40129,19250,20879,2727,144,2698,3239,325,...,0.0,0.0,3883423.0,0.0,0.0,0.0,1870,0.0,11,1195787
1,Baja California,2010,107624,53692,53932,6192,271,2342,9353,3241,...,0.0,0.0,3883423.0,0.0,0.0,0.0,3,1493.0,5,3224843
2,Baja California Sur,2010,23247,11780,11467,1588,76,1255,1660,296,...,0.0,0.0,3883423.0,0.0,0.0,0.0,10,2131.0,5,649616
3,Campeche,2010,28350,14349,14001,1975,106,3373,1179,144,...,0.0,0.0,3883423.0,0.0,0.0,0.0,10,425.0,11,836747
4,Coahuila de Zaragoza,2010,82553,41397,41156,6219,330,23331,7867,545,...,0.0,1648709.0,2567865.0,0.0,22161.0,121833.0,1700,0.0,38,2782012


In [34]:
cols = mdc.find_missing_cols(df)
# loop to pull out the missing columns from the list above 
cols_impute = []
for c in cols:
    cols_impute.append(c[0])
working_df = mdc.single_imputation(df, 'median', cols_impute)

In [35]:
# per 100,000 calculation
working_df.iloc[:,2:24] = (working_df.iloc[:,2:24].div(working_df.iloc[:,-1], axis=0)) *100000
del working_df['pop']
working_df.head()

Unnamed: 0,entidad,year,alumnos_inscritos_hs,hombres_inscritos_hs,mujeres_inscritas_hs,maestros_hs,escuelas_hs,convenios_trabajo_no_juicio,conflictos_trabajo,emplazamientos_huelga,...,prod_cobre_t,prod_zinc_t,prod_coque_t,prod_fierropellets_t,prod_azufre_t,prod_barita_t,prod_fluorita_t,altitud,costa_km,municipios
0,Aguascalientes,2010,3355.865217,1609.818471,1746.046746,228.050648,12.042278,225.625467,270.867638,27.178753,...,0.0,0.0,0.0,3883423.0,0.0,0.0,0.0,1870,0.0,11
1,Baja California,2010,3337.340764,1664.94927,1672.391493,192.009347,8.40351,72.623691,290.029623,100.501017,...,0.0,0.0,0.0,3883423.0,0.0,0.0,0.0,3,1493.0,5
2,Baja California Sur,2010,3578.575651,1813.378981,1765.19667,244.452107,11.699219,193.191054,255.535578,45.56538,...,0.0,0.0,0.0,3883423.0,0.0,0.0,0.0,10,2131.0,5
3,Campeche,2010,3388.120902,1714.855267,1673.265635,236.033114,12.668106,403.108705,140.902806,17.209503,...,0.0,0.0,0.0,3883423.0,0.0,0.0,0.0,10,425.0,11
4,Coahuila de Zaragoza,2010,2967.384756,1488.023776,1479.36098,223.543249,11.861919,838.637648,282.780951,19.590138,...,0.0,0.0,1648709.0,2567865.0,0.0,22161.0,121833.0,1700,0.0,38


In [36]:
replacing = {'Aguascalientes':1, 'Baja California':2, 'Baja California Sur':3, 'Campeche':4, 'Coahuila de Zaragoza':5, 'Colima':6, 'Chiapas':7, 'Chihuahua':8, 'Ciudad de México /b':9, 'Durango':10, 'Guanajuato':11, 'Guerrero':12, 'Hidalgo':13, 'Jalisco':14, 'México':15, 'Michoacán de Ocampo':16, 'Morelos':17, 'Nayarit':18, 'Nuevo León':19, 'Oaxaca':20, 'Puebla':21, 'Querétaro':22, 'Quintana Roo':23, 'San Luis Potosí':24, 'Sinaloa':25, 'Sonora':26, 'Tabasco':27, 'Tamaulipas':28, 'Tlaxcala':29, 'Veracruz de Ignaxio de la Llave':30, 'Yucatán':31, 'Zacatecas':32}

new_col = []
for c in working_df['entidad']:
    for state, rep in replacing.items():
        if jellyfish.jaro_distance(c.strip(), state) > 0.95:
            new_col.append(state)

In [37]:
import pandas as pd
working_df['entidad'] = pd.Series(new_col)
working_df.head()

Unnamed: 0,entidad,year,alumnos_inscritos_hs,hombres_inscritos_hs,mujeres_inscritas_hs,maestros_hs,escuelas_hs,convenios_trabajo_no_juicio,conflictos_trabajo,emplazamientos_huelga,...,prod_cobre_t,prod_zinc_t,prod_coque_t,prod_fierropellets_t,prod_azufre_t,prod_barita_t,prod_fluorita_t,altitud,costa_km,municipios
0,Aguascalientes,2010,3355.865217,1609.818471,1746.046746,228.050648,12.042278,225.625467,270.867638,27.178753,...,0.0,0.0,0.0,3883423.0,0.0,0.0,0.0,1870,0.0,11
1,Baja California,2010,3337.340764,1664.94927,1672.391493,192.009347,8.40351,72.623691,290.029623,100.501017,...,0.0,0.0,0.0,3883423.0,0.0,0.0,0.0,3,1493.0,5
2,Baja California Sur,2010,3578.575651,1813.378981,1765.19667,244.452107,11.699219,193.191054,255.535578,45.56538,...,0.0,0.0,0.0,3883423.0,0.0,0.0,0.0,10,2131.0,5
3,Campeche,2010,3388.120902,1714.855267,1673.265635,236.033114,12.668106,403.108705,140.902806,17.209503,...,0.0,0.0,0.0,3883423.0,0.0,0.0,0.0,10,425.0,11
4,Coahuila de Zaragoza,2010,2967.384756,1488.023776,1479.36098,223.543249,11.861919,838.637648,282.780951,19.590138,...,0.0,0.0,1648709.0,2567865.0,0.0,22161.0,121833.0,1700,0.0,38


In [38]:
to_delete = []
for c in working_df:
    if 'hs' in c or 'mun' in c:
        to_delete.append(c)

In [39]:
pre_features = epc.clean_data(working_df, [], 0, [], to_delete)

In [40]:
years = [2010, 2011, 2012, 2013, 2014, 2015, 2016]
dfs = []
for year in years:
    df = pre_features[pre_features['year']==year].reset_index(drop=True).set_index(['entidad','year'])
    dfs.append(df)

### 2) Rank

Rank is a pretty robust measure in that it controls for outliers, though it doesn't accout for variance as well as it should. Regardless, if our aim is to compare states, then rank is a pretty communicable way to do it.

In [41]:
ranks = []
for df in dfs:
    rank = norm.ranking(df)
    ranks.append(rank)

In [42]:
ranks[0]

Unnamed: 0_level_0,Unnamed: 1_level_0,convenios_trabajo_no_juicio,conflictos_trabajo,emplazamientos_huelga,emplazamientos_huelga_solucionados,huelgas_estalladas,huelgas_locales_solucionadas,denuncias_total,denuncias_salud,denuncias_arma,denuncias_bancaria,...,prod_plomo_t,prod_cobre_t,prod_zinc_t,prod_coque_t,prod_fierropellets_t,prod_azufre_t,prod_barita_t,prod_fluorita_t,altitud,costa_km
entidad,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Aguascalientes,2010,17.0,10.0,15.0,12.0,15.0,10.0,18.0,13.0,26.0,4.0,...,10.0,12.0,10.0,3.0,1.0,9.0,3.0,3.0,9.0,18.0
Baja California,2010,28.0,5.0,2.0,5.0,1.0,7.0,1.0,1.0,1.0,6.0,...,10.0,12.0,10.0,3.0,1.0,9.0,3.0,3.0,32.0,2.0
Baja California Sur,2010,19.0,11.0,5.0,9.0,3.0,10.0,6.0,9.0,22.0,22.0,...,10.0,12.0,10.0,3.0,1.0,9.0,3.0,3.0,28.0,1.0
Campeche,2010,7.0,21.0,23.0,28.0,15.0,10.0,15.0,17.0,21.0,17.0,...,10.0,12.0,10.0,3.0,1.0,9.0,3.0,3.0,28.0,10.0
Coahuila de Zaragoza,2010,2.0,6.0,19.0,25.0,15.0,10.0,27.0,25.0,28.0,7.0,...,10.0,12.0,10.0,1.0,11.0,9.0,2.0,2.0,13.0,18.0
Colima,2010,14.0,20.0,8.0,18.0,4.0,3.0,7.0,5.0,4.0,18.0,...,10.0,12.0,10.0,3.0,1.0,9.0,3.0,3.0,23.0,17.0
Chiapas,2010,25.0,32.0,32.0,31.0,15.0,10.0,26.0,22.0,23.0,32.0,...,10.0,12.0,10.0,3.0,13.0,2.0,3.0,3.0,21.0,14.0
Chihuahua,2010,11.0,7.0,31.0,29.0,15.0,10.0,10.0,12.0,8.0,16.0,...,2.0,4.0,2.0,3.0,13.0,9.0,3.0,3.0,14.0,18.0
Ciudad de México /b,2010,18.0,2.0,4.0,8.0,2.0,2.0,2.0,2.0,12.0,1.0,...,4.0,6.0,5.0,3.0,13.0,9.0,3.0,3.0,3.0,18.0
Durango,2010,9.0,23.0,7.0,15.0,9.0,10.0,13.0,11.0,15.0,29.0,...,3.0,9.0,7.0,3.0,13.0,9.0,3.0,3.0,10.0,18.0


### 3) Categorical (Quantiles)

This is essentially binning the variables into one of five variables (1-5) in order for comparison. Much like ranking, this is robust in regards to outliers but it misses a lot of relevant information that explains variance within the quantiles.

In [43]:
import pandas as pd
norms = []
for df in dfs:
    norm.categorical(df)
    norms.append(df)

In [44]:
dfs[0]

Unnamed: 0_level_0,Unnamed: 1_level_0,convenios_trabajo_no_juicio,conflictos_trabajo,emplazamientos_huelga,emplazamientos_huelga_solucionados,huelgas_estalladas,huelgas_locales_solucionadas,denuncias_total,denuncias_salud,denuncias_arma,denuncias_bancaria,...,prod_plomo_t,prod_cobre_t,prod_zinc_t,prod_coque_t,prod_fierropellets_t,prod_azufre_t,prod_barita_t,prod_fluorita_t,altitud,costa_km
entidad,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Aguascalientes,2010,2,4,2,1,1,1,1,1,1,1,...,1,1,1,1,5,1,1,1,4,1
Baja California,2010,1,4,5,2,5,1,5,5,5,1,...,1,1,1,1,5,1,1,1,1,4
Baja California Sur,2010,2,4,3,2,1,1,2,1,1,1,...,1,1,1,1,5,1,1,1,1,5
Campeche,2010,3,2,1,1,1,1,1,1,1,1,...,1,1,1,1,5,1,1,1,1,1
Coahuila de Zaragoza,2010,5,4,1,1,1,1,1,1,1,1,...,1,1,1,5,4,1,1,1,4,1
Colima,2010,2,2,2,1,1,1,2,2,4,1,...,1,1,1,1,5,1,1,1,1,1
Chiapas,2010,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,3,1,1,2,1
Chihuahua,2010,2,4,1,1,1,1,1,1,3,1,...,3,1,3,1,1,1,1,1,3,1
Ciudad de México /b,2010,2,5,3,2,2,1,4,3,3,5,...,1,1,1,1,1,1,1,1,5,1
Durango,2010,2,2,2,1,1,1,1,1,2,1,...,1,1,1,1,1,1,1,1,4,1


### 4) Z-Score 

Z-Score has some benefits in that it respects variance within the variable, but it's highly sensitive to outliers of which we have several. 

In [45]:
z_scores = []
for df in dfs:
    z = norm.z_score(df)
    z_scores.append(z)

  return (a - mns) / sstd


In [46]:
z_scores[1]

Unnamed: 0_level_0,Unnamed: 1_level_0,convenios_trabajo_no_juicio,conflictos_trabajo,emplazamientos_huelga,emplazamientos_huelga_solucionados,huelgas_estalladas,huelgas_locales_solucionadas,denuncias_total,denuncias_salud,denuncias_arma,denuncias_bancaria,...,prod_plomo_t,prod_cobre_t,prod_zinc_t,prod_coque_t,prod_fierropellets_t,prod_azufre_t,prod_barita_t,prod_fluorita_t,altitud,costa_km
entidad,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Aguascalientes,2011,0.092715,0.769665,-0.578419,-0.406894,-0.296068,-0.179605,-0.563602,-0.370211,-0.954411,-0.179605,...,-0.219476,-0.179605,-0.244137,-0.219476,1.419048,-0.316228,-0.219476,-0.179605,0.82717,-0.516398
Baja California,2011,-0.896241,1.435321,3.318301,0.895167,5.117755,-0.179605,3.444234,4.895006,2.001184,-0.179605,...,-0.219476,-0.179605,-0.244137,-0.219476,1.419048,-0.316228,-0.219476,-0.179605,-1.208941,2.581989
Baja California Sur,2011,0.092715,0.769665,-0.578419,-0.406894,-0.296068,-0.179605,1.440316,-0.370211,-0.954411,-0.179605,...,-0.219476,-0.179605,-0.244137,-0.219476,1.419048,-0.316228,-0.219476,-0.179605,-1.208941,3.614784
Campeche,2011,1.08167,-0.561647,-0.578419,-0.406894,-0.296068,-0.179605,-0.563602,-0.370211,-0.954411,-0.179605,...,-0.219476,-0.179605,-0.244137,-0.219476,1.419048,-0.316228,-0.219476,-0.179605,-1.208941,-0.516398
Coahuila de Zaragoza,2011,2.070625,0.769665,-0.578419,-0.406894,-0.296068,-0.179605,-0.563602,-0.370211,0.030787,-0.179605,...,-0.219476,-0.179605,-0.244137,5.399101,0.878459,-0.316228,1.185168,-0.179605,0.82717,-0.516398
Colima,2011,0.092715,-0.561647,0.395761,-0.406894,1.057387,-0.179605,-0.563602,-0.370211,1.015986,-0.179605,...,-0.219476,-0.179605,-0.244137,-0.219476,1.419048,-0.316228,-0.219476,-0.179605,-1.208941,-0.516398
Chiapas,2011,-0.896241,-1.227304,-0.578419,-0.406894,-0.296068,-0.179605,-0.563602,-0.370211,-0.954411,-0.179605,...,-0.219476,-0.179605,-0.244137,-0.219476,-0.743311,2.213594,-0.219476,-0.179605,-0.530237,-0.516398
Chihuahua,2011,0.092715,0.769665,-0.578419,-0.406894,-0.296068,-0.179605,0.438357,-0.370211,0.030787,-0.179605,...,1.185168,-0.179605,2.359987,-0.219476,-0.743311,-0.316228,-0.219476,-0.179605,0.148466,-0.516398
Ciudad de México /b,2011,-0.896241,1.435321,0.395761,-0.406894,1.057387,-0.179605,2.442275,0.946094,0.030787,5.567764,...,-0.219476,-0.179605,-0.244137,-0.219476,-0.743311,-0.316228,-0.219476,-0.179605,1.505874,-0.516398
Durango,2011,0.092715,-0.561647,0.395761,-0.406894,1.057387,-0.179605,-0.563602,-0.370211,0.030787,-0.179605,...,-0.219476,-0.179605,-0.244137,-0.219476,-0.743311,-0.316228,-0.219476,-0.179605,0.82717,-0.516398
