## Filtrado de datos


In [1]:
# nuestras librerias a utilizar
import pandas as pd
import numpy as np


# Display configuration
# -----------------------------------------------------------------------
# Lift the limit on displayed columns so every DataFrame column is visible.
pd.options.display.max_columns = None

In [2]:
# Load the data that was saved in the previous topic.
ruta_datos = "df_final_limpieza.csv"
df_final = pd.read_csv(ruta_datos)

In [3]:
# Quick reminder of the DataFrame: show its first two rows.
df_final.head(n=2)

Unnamed: 0,income,kidhome,teenhome,dt_customer,numwebvisitsmonth,id,age,job,marital,education,default,housing,loan,contact,duration,campaign,pdays,previous,poutcome,empvarrate,conspriceidx,consconfidx,euribor3m,nremployed,y,date,latitude,longitude,contact_month,contact_year
0,161770.0,1.0,0.0,2012-04-04,29.0,089b39d8-e4d0-461b-87d4-814d71e0e079,,housemaid,married,basic 4y,0.0,0.0,0.0,telephone,261,1,999,0,nonexistent,1.1,93994,-364,4857.0,5191,no,2-agosto-2019,41.495,-71.233,agosto,2019.0
1,85477.0,1.0,1.0,2012-12-30,7.0,e9d37224-cb6f-4942-98d7-46672963d097,57.0,services,married,high school,,0.0,0.0,telephone,149,1,999,0,nonexistent,1.1,93994,-364,,5191,no,14-septiembre-2016,34.601,-83.923,septiembre,2016.0


In [4]:
# Goal: keep only the customers who are married.
# Step 1 - build the filter condition: True where the 'marital' column equals 'married'.
condicion_casados = df_final["marital"].eq("married")

# Step 2 - apply that condition to the whole DataFrame (boolean indexing)
# and store the result in a variable called 'df_casados'.
df_casados = df_final.loc[condicion_casados]

# Sanity check: after filtering, 'marital' must hold a single unique value ('married').
df_casados["marital"].unique()

array(['married'], dtype=object)

In [5]:
# Show the first five rows of the filtered DataFrame.
df_casados.head(n=5)

Unnamed: 0,income,kidhome,teenhome,dt_customer,numwebvisitsmonth,id,age,job,marital,education,default,housing,loan,contact,duration,campaign,pdays,previous,poutcome,empvarrate,conspriceidx,consconfidx,euribor3m,nremployed,y,date,latitude,longitude,contact_month,contact_year
0,161770.0,1.0,0.0,2012-04-04,29.0,089b39d8-e4d0-461b-87d4-814d71e0e079,,housemaid,married,basic 4y,0.0,0.0,0.0,telephone,261,1,999,0,nonexistent,1.1,93994,-364,4857.0,5191,no,2-agosto-2019,41.495,-71.233,agosto,2019.0
1,85477.0,1.0,1.0,2012-12-30,7.0,e9d37224-cb6f-4942-98d7-46672963d097,57.0,services,married,high school,,0.0,0.0,telephone,149,1,999,0,nonexistent,1.1,93994,-364,,5191,no,14-septiembre-2016,34.601,-83.923,septiembre,2016.0
2,147233.0,1.0,1.0,2012-02-02,5.0,3f9f49b5-e410-4948-bf6e-f9244f04918b,37.0,services,married,high school,0.0,1.0,0.0,telephone,226,1,999,0,nonexistent,1.1,93994,-364,4857.0,5191,no,15-febrero-2019,34.939,-94.847,febrero,2019.0
3,121393.0,1.0,2.0,2012-12-21,29.0,9991fafb-4447-451a-8be2-b0df6098d13e,40.0,admin.,married,basic 6y,0.0,0.0,0.0,telephone,151,1,999,0,nonexistent,1.1,93994,-364,,5191,no,29-noviembre-2015,49.041,-70.308,noviembre,2015.0
4,63164.0,1.0,2.0,2012-06-20,20.0,eca60b76-70b6-4077-80ba-bc52e8ebb0eb,56.0,services,married,high school,0.0,0.0,1.0,telephone,307,1,999,0,nonexistent,1.1,93994,-364,,5191,no,29-enero-2017,38.033,-104.463,enero,2017.0


In [6]:
# Comparison operators work as filters too: here we keep only the customers
# whose 'income' is greater than 150000.
condicion_income = df_final["income"].gt(150000)

# Apply the filter to the DataFrame.
df_income_150 = df_final.loc[condicion_income]

# Sanity check: the minimum of 'income' in the filtered data must be above 150000;
# pandas' '.min()' gives us that value.
df_income_150["income"].min()

150008.0

In [7]:
# Show the first five rows of the income-filtered DataFrame.
df_income_150.head(n=5)

Unnamed: 0,income,kidhome,teenhome,dt_customer,numwebvisitsmonth,id,age,job,marital,education,default,housing,loan,contact,duration,campaign,pdays,previous,poutcome,empvarrate,conspriceidx,consconfidx,euribor3m,nremployed,y,date,latitude,longitude,contact_month,contact_year
0,161770.0,1.0,0.0,2012-04-04,29.0,089b39d8-e4d0-461b-87d4-814d71e0e079,,housemaid,married,basic 4y,0.0,0.0,0.0,telephone,261,1,999,0,nonexistent,1.1,93994,-364,4857.0,5191,no,2-agosto-2019,41.495,-71.233,agosto,2019.0
7,159686.0,1.0,1.0,2012-12-10,21.0,87fdc08b-30ae-4dab-803f-561ecdf27ff0,,blue-collar,married,,,0.0,0.0,telephone,217,1,999,0,nonexistent,1.1,93994,-364,4857.0,5191,no,25-septiembre-2017,46.871,-122.235,septiembre,2017.0
8,179933.0,0.0,0.0,2012-08-22,18.0,87b79988-2be5-419d-88f4-56655852c565,24.0,technician,single,professional course,0.0,1.0,0.0,telephone,380,1,999,0,nonexistent,1.1,93994,-364,4857.0,5191,no,18-enero-2018,44.632,-85.811,enero,2018.0
19,173275.0,2.0,0.0,2012-04-17,26.0,74bf1569-4bdb-49e4-98ba-c1c607d90596,39.0,management,single,basic 9y,,0.0,0.0,telephone,195,1,999,0,nonexistent,1.1,93994,-364,,5191,no,1-julio-2017,27.284,-69.955,julio,2017.0
21,158116.0,0.0,2.0,2012-10-08,21.0,47591c83-e88b-40c2-bb8e-de40b94a44cb,55.0,blue-collar,married,basic 4y,,1.0,0.0,telephone,262,1,999,0,nonexistent,1.1,93994,-364,4857.0,5191,no,11-agosto-2016,25.892,-92.643,agosto,2016.0


In [8]:
# Two conditions can be combined with "&" (logical AND) or "|" (logical OR).
# Here we look for customers who are married AND have an 'income' above 150000.
condicion_casados_e_income = condicion_casados & condicion_income

df_income_casados = df_final.loc[condicion_casados_e_income]
df_income_casados.head(n=2)

Unnamed: 0,income,kidhome,teenhome,dt_customer,numwebvisitsmonth,id,age,job,marital,education,default,housing,loan,contact,duration,campaign,pdays,previous,poutcome,empvarrate,conspriceidx,consconfidx,euribor3m,nremployed,y,date,latitude,longitude,contact_month,contact_year
0,161770.0,1.0,0.0,2012-04-04,29.0,089b39d8-e4d0-461b-87d4-814d71e0e079,,housemaid,married,basic 4y,0.0,0.0,0.0,telephone,261,1,999,0,nonexistent,1.1,93994,-364,4857,5191,no,2-agosto-2019,41.495,-71.233,agosto,2019.0
7,159686.0,1.0,1.0,2012-12-10,21.0,87fdc08b-30ae-4dab-803f-561ecdf27ff0,,blue-collar,married,,,0.0,0.0,telephone,217,1,999,0,nonexistent,1.1,93994,-364,4857,5191,no,25-septiembre-2017,46.871,-122.235,septiembre,2017.0


### Método `.isin()`

In [9]:
# Show the first five rows of the full DataFrame.
df_final.head(n=5)

Unnamed: 0,income,kidhome,teenhome,dt_customer,numwebvisitsmonth,id,age,job,marital,education,default,housing,loan,contact,duration,campaign,pdays,previous,poutcome,empvarrate,conspriceidx,consconfidx,euribor3m,nremployed,y,date,latitude,longitude,contact_month,contact_year
0,161770.0,1.0,0.0,2012-04-04,29.0,089b39d8-e4d0-461b-87d4-814d71e0e079,,housemaid,married,basic 4y,0.0,0.0,0.0,telephone,261,1,999,0,nonexistent,1.1,93994,-364,4857.0,5191,no,2-agosto-2019,41.495,-71.233,agosto,2019.0
1,85477.0,1.0,1.0,2012-12-30,7.0,e9d37224-cb6f-4942-98d7-46672963d097,57.0,services,married,high school,,0.0,0.0,telephone,149,1,999,0,nonexistent,1.1,93994,-364,,5191,no,14-septiembre-2016,34.601,-83.923,septiembre,2016.0
2,147233.0,1.0,1.0,2012-02-02,5.0,3f9f49b5-e410-4948-bf6e-f9244f04918b,37.0,services,married,high school,0.0,1.0,0.0,telephone,226,1,999,0,nonexistent,1.1,93994,-364,4857.0,5191,no,15-febrero-2019,34.939,-94.847,febrero,2019.0
3,121393.0,1.0,2.0,2012-12-21,29.0,9991fafb-4447-451a-8be2-b0df6098d13e,40.0,admin.,married,basic 6y,0.0,0.0,0.0,telephone,151,1,999,0,nonexistent,1.1,93994,-364,,5191,no,29-noviembre-2015,49.041,-70.308,noviembre,2015.0
4,63164.0,1.0,2.0,2012-06-20,20.0,eca60b76-70b6-4077-80ba-bc52e8ebb0eb,56.0,services,married,high school,0.0,0.0,1.0,telephone,307,1,999,0,nonexistent,1.1,93994,-364,,5191,no,29-enero-2017,38.033,-104.463,enero,2017.0


In [10]:
# List the distinct values present in the 'education' column.
columna_educacion = df_final["education"]
columna_educacion.unique()

array(['basic 4y', 'high school', 'basic 6y', 'basic 9y',
       'professional course', nan, 'university degree', 'illiterate'],
      dtype=object)

In [11]:
# Suppose we only care about customers whose education level is 'basic 6y' or 'basic 9y'.
# The techniques seen so far would work, but the more conditions we chain, the easier it is to make a mistake.
# The 'isin()' method covers this case with a single call.

# Values we want to keep
filtro_educacion = ["basic 6y", "basic 9y"]

# Boolean mask: True where 'education' is one of the values in the list
mascara_educacion = df_final["education"].isin(filtro_educacion)
df_educacion = df_final.loc[mascara_educacion]

# Sanity check: 'education' should now contain only those two unique values
df_educacion["education"].unique()

array(['basic 6y', 'basic 9y'], dtype=object)

### Método `.between()`



In [12]:
# Show just the first row of the DataFrame.
df_final.head(n=1)

Unnamed: 0,income,kidhome,teenhome,dt_customer,numwebvisitsmonth,id,age,job,marital,education,default,housing,loan,contact,duration,campaign,pdays,previous,poutcome,empvarrate,conspriceidx,consconfidx,euribor3m,nremployed,y,date,latitude,longitude,contact_month,contact_year
0,161770.0,1.0,0.0,2012-04-04,29.0,089b39d8-e4d0-461b-87d4-814d71e0e079,,housemaid,married,basic 4y,0.0,0.0,0.0,telephone,261,1,999,0,nonexistent,1.1,93994,-364,4857,5191,no,2-agosto-2019,41.495,-71.233,agosto,2019.0


In [13]:
# Suppose we want the customers who visited our website between 15 and 25 times (both ends included).
# Any of the previous techniques would do, but it would be tedious; '.between()' expresses it directly.
rango_visitas = df_final["numwebvisitsmonth"].between(left=15, right=25, inclusive="both")

df_web = df_final.loc[rango_visitas]
df_web.head(n=5)

Unnamed: 0,income,kidhome,teenhome,dt_customer,numwebvisitsmonth,id,age,job,marital,education,default,housing,loan,contact,duration,campaign,pdays,previous,poutcome,empvarrate,conspriceidx,consconfidx,euribor3m,nremployed,y,date,latitude,longitude,contact_month,contact_year
4,63164.0,1.0,2.0,2012-06-20,20.0,eca60b76-70b6-4077-80ba-bc52e8ebb0eb,56.0,services,married,high school,0.0,0.0,1.0,telephone,307,1,999,0,nonexistent,1.1,93994,-364,,5191,no,29-enero-2017,38.033,-104.463,enero,2017.0
7,159686.0,1.0,1.0,2012-12-10,21.0,87fdc08b-30ae-4dab-803f-561ecdf27ff0,,blue-collar,married,,,0.0,0.0,telephone,217,1,999,0,nonexistent,1.1,93994,-364,4857.0,5191,no,25-septiembre-2017,46.871,-122.235,septiembre,2017.0
8,179933.0,0.0,0.0,2012-08-22,18.0,87b79988-2be5-419d-88f4-56655852c565,24.0,technician,single,professional course,0.0,1.0,0.0,telephone,380,1,999,0,nonexistent,1.1,93994,-364,4857.0,5191,no,18-enero-2018,44.632,-85.811,enero,2018.0
9,77504.0,1.0,0.0,2012-02-09,18.0,ea6b7d04-9271-4c0a-a01f-07795d164aba,25.0,services,single,high school,0.0,1.0,0.0,telephone,50,1,999,0,nonexistent,1.1,93994,-364,,5191,no,2-noviembre-2016,30.297,-117.382,noviembre,2016.0
12,102006.0,0.0,1.0,2012-03-24,18.0,66a752e2-b2f1-440c-9a8f-cc3b10d74dd0,29.0,blue-collar,single,high school,0.0,0.0,1.0,telephone,137,1,999,0,nonexistent,1.1,93994,-364,4857.0,5191,no,14-junio-2016,30.047,-112.472,junio,2016.0


In [14]:
# Inspect how the columns are made up, i.e. the dtype of the values each one stores.
df_final.dtypes

income               float64
kidhome              float64
teenhome             float64
dt_customer           object
numwebvisitsmonth    float64
id                    object
age                  float64
job                   object
marital               object
education             object
default              float64
housing              float64
loan                 float64
contact               object
duration               int64
campaign               int64
pdays                  int64
previous               int64
poutcome              object
empvarrate           float64
conspriceidx          object
consconfidx           object
euribor3m             object
nremployed            object
y                     object
date                  object
latitude             float64
longitude            float64
contact_month         object
contact_year         float64
dtype: object

In [15]:
# The 'dt_customer' column has dtype 'object' and we would like it to be a datetime column,
# so we parse it with 'pd.to_datetime' and write the result back into the same column.
fechas_alta = pd.to_datetime(df_final["dt_customer"])
df_final["dt_customer"] = fechas_alta

# Check the dtypes of the DataFrame again to confirm the type of each column.
df_final.dtypes

income                      float64
kidhome                     float64
teenhome                    float64
dt_customer          datetime64[ns]
numwebvisitsmonth           float64
id                           object
age                         float64
job                          object
marital                      object
education                    object
default                     float64
housing                     float64
loan                        float64
contact                      object
duration                      int64
campaign                      int64
pdays                         int64
previous                      int64
poutcome                     object
empvarrate                  float64
conspriceidx                 object
consconfidx                  object
euribor3m                    object
nremployed                   object
y                            object
date                         object
latitude                    float64
longitude                   float64
contact_month                object
contact_year                float64
dtype: object

In [16]:
# Suppose we want to know which customers joined the bank in January 2013.
# First we need two datetime objects holding the start and end dates of the filter.
# Each one is built from the date written as a string.
inicio = pd.Timestamp("2013-01-01")
fin = pd.Timestamp("2013-01-31")

# Then we hand both bounds to '.between()'; inclusive="both" keeps rows equal to either bound.
en_rango_fechas = df_final["dt_customer"].between(left=inicio, right=fin, inclusive="both")
df_fechas = df_final.loc[en_rango_fechas]
df_fechas.head(n=2)

Unnamed: 0,income,kidhome,teenhome,dt_customer,numwebvisitsmonth,id,age,job,marital,education,default,housing,loan,contact,duration,campaign,pdays,previous,poutcome,empvarrate,conspriceidx,consconfidx,euribor3m,nremployed,y,date,latitude,longitude,contact_month,contact_year
20021,113859.0,1.0,1.0,2013-01-06,12.0,f8aaf334-949d-4943-a214-b64fe9c088b4,52.0,services,married,high school,,1.0,0.0,cellular,99,1,999,0,nonexistent,1.4,93444,-361,,52281,no,15-septiembre-2016,32.499,-115.779,septiembre,2016.0
20028,169825.0,2.0,2.0,2013-01-01,10.0,9df0599f-1bc9-4772-9289-b6f9a8178714,51.0,technician,married,professional course,,1.0,1.0,cellular,273,1,999,0,nonexistent,1.4,93444,-361,4965.0,52281,no,6-noviembre-2015,48.691,-73.914,noviembre,2015.0


### Método `.str.contains()`




In [17]:
# Show the first two rows of the DataFrame.
df_final.head(n=2)

Unnamed: 0,income,kidhome,teenhome,dt_customer,numwebvisitsmonth,id,age,job,marital,education,default,housing,loan,contact,duration,campaign,pdays,previous,poutcome,empvarrate,conspriceidx,consconfidx,euribor3m,nremployed,y,date,latitude,longitude,contact_month,contact_year
0,161770.0,1.0,0.0,2012-04-04,29.0,089b39d8-e4d0-461b-87d4-814d71e0e079,,housemaid,married,basic 4y,0.0,0.0,0.0,telephone,261,1,999,0,nonexistent,1.1,93994,-364,4857.0,5191,no,2-agosto-2019,41.495,-71.233,agosto,2019.0
1,85477.0,1.0,1.0,2012-12-30,7.0,e9d37224-cb6f-4942-98d7-46672963d097,57.0,services,married,high school,,0.0,0.0,telephone,149,1,999,0,nonexistent,1.1,93994,-364,,5191,no,14-septiembre-2016,34.601,-83.923,septiembre,2016.0


In [18]:
# Count the missing values in each column.
df_final.isna().sum()

income                  0
kidhome                 0
teenhome                0
dt_customer             0
numwebvisitsmonth       0
id                      0
age                  5120
job                   345
marital                85
education            1807
default              8981
housing              1026
loan                 1026
contact                 0
duration                0
campaign                0
pdays                   0
previous                0
poutcome                0
empvarrate              0
conspriceidx          471
consconfidx             0
euribor3m            9256
nremployed              0
y                       0
date                  248
latitude                0
longitude               0
contact_month         248
contact_year          248
dtype: int64

In [19]:
# Suppose we want to work with the 'education' column and pull out every customer
# whose education value contains a digit.
# The first step is to define the regex pattern.
# It is written as a raw string because "\d" is not a valid Python string escape:
# a plain "..." literal produces an invalid-escape warning on recent Python versions.
patron_regex = r"\d+"

# Mask of rows whose 'education' matches the pattern; na=False makes missing values count as non-matches.
mascara_digitos = df_final["education"].str.contains(patron_regex, regex=True, na=False)
df_numeros = df_final.loc[mascara_digitos]
df_numeros.head(n=5)

Unnamed: 0,income,kidhome,teenhome,dt_customer,numwebvisitsmonth,id,age,job,marital,education,default,housing,loan,contact,duration,campaign,pdays,previous,poutcome,empvarrate,conspriceidx,consconfidx,euribor3m,nremployed,y,date,latitude,longitude,contact_month,contact_year
0,161770.0,1.0,0.0,2012-04-04,29.0,089b39d8-e4d0-461b-87d4-814d71e0e079,,housemaid,married,basic 4y,0.0,0.0,0.0,telephone,261,1,999,0,nonexistent,1.1,93994,-364,4857.0,5191,no,2-agosto-2019,41.495,-71.233,agosto,2019.0
3,121393.0,1.0,2.0,2012-12-21,29.0,9991fafb-4447-451a-8be2-b0df6098d13e,40.0,admin.,married,basic 6y,0.0,0.0,0.0,telephone,151,1,999,0,nonexistent,1.1,93994,-364,,5191,no,29-noviembre-2015,49.041,-70.308,noviembre,2015.0
5,143854.0,0.0,1.0,2012-01-24,26.0,d63ede72-0b6d-45b1-8872-385ac6897f65,45.0,services,married,basic 9y,,0.0,0.0,telephone,198,1,999,0,nonexistent,1.1,93994,-364,4857.0,5191,no,26-septiembre-2015,24.689,-101.643,septiembre,2015.0
13,140801.0,2.0,1.0,2012-02-11,32.0,54b1ea8a-a909-45d9-9562-775b64ac9c29,57.0,housemaid,divorced,basic 4y,0.0,1.0,0.0,telephone,293,1,999,0,nonexistent,1.1,93994,-364,4857.0,5191,no,25-agosto-2015,28.873,-70.852,agosto,2015.0
14,73262.0,2.0,2.0,2012-09-10,27.0,3a7b7570-aba9-40bc-9f80-3433d064b5b7,35.0,blue-collar,married,basic 6y,0.0,1.0,0.0,telephone,146,1,999,0,nonexistent,1.1,93994,-364,4857.0,5191,no,19-septiembre-2016,36.188,-109.835,septiembre,2016.0


In [20]:
# To see what the 'na' parameter does, run the same filter again with 'na = True'.
# This time a few extra rows show up, such as the one with index 7, which was absent from the previous result.
mascara_digitos_o_nulos = df_final["education"].str.contains(patron_regex, regex=True, na=True)
df_numeros = df_final.loc[mascara_digitos_o_nulos]
df_numeros.head(n=5)

Unnamed: 0,income,kidhome,teenhome,dt_customer,numwebvisitsmonth,id,age,job,marital,education,default,housing,loan,contact,duration,campaign,pdays,previous,poutcome,empvarrate,conspriceidx,consconfidx,euribor3m,nremployed,y,date,latitude,longitude,contact_month,contact_year
0,161770.0,1.0,0.0,2012-04-04,29.0,089b39d8-e4d0-461b-87d4-814d71e0e079,,housemaid,married,basic 4y,0.0,0.0,0.0,telephone,261,1,999,0,nonexistent,1.1,93994,-364,4857.0,5191,no,2-agosto-2019,41.495,-71.233,agosto,2019.0
3,121393.0,1.0,2.0,2012-12-21,29.0,9991fafb-4447-451a-8be2-b0df6098d13e,40.0,admin.,married,basic 6y,0.0,0.0,0.0,telephone,151,1,999,0,nonexistent,1.1,93994,-364,,5191,no,29-noviembre-2015,49.041,-70.308,noviembre,2015.0
5,143854.0,0.0,1.0,2012-01-24,26.0,d63ede72-0b6d-45b1-8872-385ac6897f65,45.0,services,married,basic 9y,,0.0,0.0,telephone,198,1,999,0,nonexistent,1.1,93994,-364,4857.0,5191,no,26-septiembre-2015,24.689,-101.643,septiembre,2015.0
7,159686.0,1.0,1.0,2012-12-10,21.0,87fdc08b-30ae-4dab-803f-561ecdf27ff0,,blue-collar,married,,,0.0,0.0,telephone,217,1,999,0,nonexistent,1.1,93994,-364,4857.0,5191,no,25-septiembre-2017,46.871,-122.235,septiembre,2017.0
10,92882.0,1.0,1.0,2012-01-05,9.0,8d700df5-0c33-4517-8cf8-92e1c92c9c12,41.0,blue-collar,married,,,0.0,0.0,telephone,55,1,999,0,nonexistent,1.1,93994,-364,4857.0,5191,no,31-noviembre-2015,43.616,-90.902,noviembre,2015.0


In [21]:
# Before finishing the lesson, save the DataFrame built along the way so it can be reused in tomorrow's lesson.
# For that we call the DataFrame's own '.to_csv()' method; index=False leaves the row index out of the file.
df_final.to_csv(path_or_buf="bank-additional_full.csv", index=False)