# DATA TRANSFORMATION - 15_Habitos
## 0. Previos

Se cargan las bases y las librerías a utilizar.

In [1]:
# Librerías
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
import numpy as np

import generic_funcions as gf

# Data directory (relative path) handed to the gf loader helpers used in this notebook.
ruta_archivos = 'Data/'

## 1. Load

In [11]:
# Module name passed to gf.carga_datos below; presumably selects which dataset is loaded.
modulo = 'Habitos'

In [12]:
# Key dictionary from the project helpers; passed to gf.carga_datos as `diccionario`
# (its contents are not visible in this notebook).
dcc = gf.diccionario_llaves()

In [13]:
# Load the raw table for the selected module through the project loader and preview it.
base = gf.carga_datos(
    ruta=ruta_archivos,
    diccionario=dcc,
    modulo=modulo,
)
base.head()

Unnamed: 0,id,Tipo,Habito2,Habito3,FE_INICIO,FE_FIN,DS_OBSERVACION,fecha,year,month,year_month
0,547351,ALCOHOL,BEBEDOR MODERADO,,NaT,NaT,DIA POR MEDIO,2012-06-29,2012,6,201206
1,1123102,ALCOHOL,ABSTEMIO,,NaT,NaT,,2010-12-02,2010,12,201012
2,788432,ALCOHOL,ABSTEMIO,,NaT,NaT,,2014-02-20,2014,2,201402
3,885255,ALCOHOL,ABSTEMIO,,NaT,NaT,,2016-02-02,2016,2,201602
4,1261121,ALCOHOL,BEBEDOR MODERADO,,NaT,NaT,,2019-07-09,2019,7,201907


## 2. Transform

In [14]:
# Tidy the raw habits table:
#   1. give the two habit columns short lowercase names,
#   2. discard the columns this notebook does not use,
#   3. lowercase text through the project helper,
#   4. remove exact duplicate rows.
columnas_sobrantes = ['Habito3', 'FE_INICIO', 'FE_FIN', 'DS_OBSERVACION']
base = (
    base
    .rename(columns={'Tipo': 'tipo', 'Habito2': 'hab'})
    .drop(columns=columnas_sobrantes)
)

# NOTE(review): only 'tipo' is named here, yet the label lists used later expect
# lowercase 'hab' values as well - confirm gf.letra_lower also lowercases 'hab'.
base = gf.letra_lower(base, 'tipo')

base = base.drop_duplicates()


In [15]:
# Inspect the table after renaming, dropping unused columns and de-duplicating.
base

Unnamed: 0,id,tipo,hab,fecha,year,month,year_month
0,547351,alcohol,bebedor moderado,2012-06-29,2012,6,201206
1,1123102,alcohol,abstemio,2010-12-02,2010,12,201012
2,788432,alcohol,abstemio,2014-02-20,2014,2,201402
3,885255,alcohol,abstemio,2016-02-02,2016,2,201602
4,1261121,alcohol,bebedor moderado,2019-07-09,2019,7,201907
...,...,...,...,...,...,...,...
1926,1132214,sustancias psicoactivas,abstemio,2014-09-09,2014,9,201409
1927,1128432,sustancias psicoactivas,abstemio,2014-06-20,2014,6,201406
1928,1181123,sustancias psicoactivas,abstemio,2014-01-03,2014,1,201401
1929,1728166,sustancias psicoactivas,abstemio,2015-09-01,2015,9,201509


In [16]:
# Keep only the three habit types of interest (substring match on 'tipo'),
# then renumber the rows from zero.
tipos_de_interes = ('alcohol', 'ejercicio', 'cigarrillo')

mascara = base['tipo'].str.contains(tipos_de_interes[0])
for patron in tipos_de_interes[1:]:
    mascara = mascara | base['tipo'].str.contains(patron)

base = base[mascara].reset_index(drop=True)
base

Unnamed: 0,id,tipo,hab,fecha,year,month,year_month
0,547351,alcohol,bebedor moderado,2012-06-29,2012,6,201206
1,1123102,alcohol,abstemio,2010-12-02,2010,12,201012
2,788432,alcohol,abstemio,2014-02-20,2014,2,201402
3,885255,alcohol,abstemio,2016-02-02,2016,2,201602
4,1261121,alcohol,bebedor moderado,2019-07-09,2019,7,201907
...,...,...,...,...,...,...,...
875,847022,ejercicio,1 vez por semana,2012-07-17,2012,7,201207
876,1331954,ejercicio,no realiza ejercicio,2020-03-19,2020,3,202003
877,1728166,ejercicio,no realiza ejercicio,2015-09-01,2015,9,201509
878,1255124,ejercicio,no realiza ejercicio,2015-05-27,2015,5,201505


In [17]:
# Numeric and categorical scales for the habits.
# Each list is ordered so that its first label receives score 1 and its last label
# receives score 5; the scores in between are evenly spaced and rounded to 2 decimals.

ejercicio = [
    'no realiza ejercicio',
    '1 vez por semana',
    '2 veces por semana',
    '3 veces por semana',
    '4 veces por semana',
    'gimnasio varias veces a la semana',
]

cigarrillo = [
    'diario',
    'si',
    'fumador pasivo',
    'no',
]

alcohol = [
    'bebedor abusivo sin dependencia',
    'bebedor moderado',
    'bebedor social',
    'bebedor excepcional',
    'abstemio',
]

# Label lists and their score vectors, kept in the same order so they can be paired by position.
listalag = [ejercicio, cigarrillo, alcohol]
listacor = [np.round(np.linspace(1, 5, len(etiquetas)), 2) for etiquetas in listalag]

# Individual score vectors, one per habit type (6, 4 and 5 levels respectively).
cortaeje, cortacig, cortaalc = listacor

In [18]:
# Convert each habit label in 'hab' to its numeric score.
# The lookup is applied to the 'hab' column only: a DataFrame-wide replace would also
# rewrite any other column that happened to contain one of the labels (short labels
# such as 'si' / 'no' are the obvious risk).
df1 = base.copy()

# label -> score, built pairwise so it follows listalag / listacor whatever their length.
hab_a_valor = {
    etiqueta: float(valor)
    for etiquetas, valores in zip(listalag, listacor)
    for etiqueta, valor in zip(etiquetas, valores)
}

# .map yields a float column; a label missing from the lookup becomes NaN instead of
# staying behind as text in a column that is averaged in the next step.
df1['hab'] = df1['hab'].map(hab_a_valor)
df1

Unnamed: 0,id,tipo,hab,fecha,year,month,year_month
0,547351,alcohol,2.0,2012-06-29,2012,6,201206
1,1123102,alcohol,5.0,2010-12-02,2010,12,201012
2,788432,alcohol,5.0,2014-02-20,2014,2,201402
3,885255,alcohol,5.0,2016-02-02,2016,2,201602
4,1261121,alcohol,2.0,2019-07-09,2019,7,201907
...,...,...,...,...,...,...,...
875,847022,ejercicio,1.8,2012-07-17,2012,7,201207
876,1331954,ejercicio,1.0,2020-03-19,2020,3,202003
877,1728166,ejercicio,1.0,2015-09-01,2015,9,201509
878,1255124,ejercicio,1.0,2015-05-27,2015,5,201505


In [19]:
# Reshape to one row per (id, year, month) with one score column per habit type.
# Repeated records of the same habit in the same month are averaged (aggfunc='mean' is
# pivot_table's default, spelled out here); combinations without a record stay NaN -
# nothing is filled with zero at this step.
base_p = df1.pivot_table(
    index=['id', 'year', 'month'],
    columns='tipo',
    values=['hab'],
    aggfunc='mean',
).reset_index()

# Flatten the two-level header: ('hab', 'alcohol') -> 'hab_alcohol'.
# Key columns arrive as ('id', ''), which joins to 'id_'; strip('_') removes that
# dangling underscore (a bare strip() only trims whitespace and left it in place).
base_p.columns = ['_'.join(col).strip('_') for col in base_p.columns.values]
base_p.head()

Unnamed: 0,id_,year_,month_,hab_alcohol,hab_cigarrillo,hab_ejercicio
0,506808,2014,12,,5.0,
1,514102,2013,4,,5.0,
2,524006,2015,7,5.0,5.0,1.0
3,524006,2015,9,,5.0,
4,526049,2014,8,5.0,5.0,1.0


In [20]:
# Give the key columns their plain names (id, year, month).
nombres_llave = {'id_': 'id', 'year_': 'year', 'month_': 'month'}
base_p = base_p.rename(columns=nombres_llave)

## 3. Merge with ids table

In [21]:
# Monthly id grid from the project helper (the original note on this cell reads "2016 - 2020").
ids_mensual = gf.base_ids_mensual(ruta_archivos)

# Left join: every row of the id grid is kept and habit scores are attached where a
# match exists. No `on=` is given, so the join keys are the columns both frames share.
base_final_hab = pd.merge(ids_mensual, base_p, how='left')
base_final_hab.head()

Unnamed: 0,id,year,month,hab_alcohol,hab_cigarrillo,hab_ejercicio
0,500547,2016,1,,,
1,500547,2016,2,,,
2,500547,2016,3,,,
3,500547,2016,4,,,
4,500547,2016,5,,,


In [22]:
# Summarise the merged table: row count, dtypes and non-null counts per habit column.
base_final_hab.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53280 entries, 0 to 53279
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              53280 non-null  int64  
 1   year            53280 non-null  int64  
 2   month           53280 non-null  int64  
 3   hab_alcohol     49 non-null     float64
 4   hab_cigarrillo  93 non-null     float64
 5   hab_ejercicio   29 non-null     float64
dtypes: float64(3), int64(3)
memory usage: 2.8 MB
