# Análisis Exploratorio de Datos - Clientes Bancarios
## Este notebook realiza el análisis exploratorio de datos de un dataset de clientes bancarios con Python

## 1. Importación de librerías y carga de datos

In [118]:
# Importar librerías necesarias
import pandas as pd
import numpy as np
from IPython.display import display

# Importar funciones de carga de datos
import sys
sys.path.append('../src')
from data.data_loader import load_customer_data, load_bank_data, get_basic_info

In [119]:
# Cargar datos de clientes
df_customers = load_customer_data()

# Cargar datos bancarios
df_bank = load_bank_data()

In [120]:
# Información básica de los datos de clientes
print("=== Información de Datos de Clientes ===")
get_basic_info(df_customers)

# Información básica de los datos bancarios
print("\n=== Información de Datos Bancarios ===")
get_basic_info(df_bank)

=== Información de Datos de Clientes ===

Información del DataFrame:
<class 'pandas.core.frame.DataFrame'>
Index: 20115 entries, 0 to 20114
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Income             20115 non-null  int64         
 1   Kidhome            20115 non-null  int64         
 2   Teenhome           20115 non-null  int64         
 3   Dt_Customer        20115 non-null  datetime64[ns]
 4   NumWebVisitsMonth  20115 non-null  int64         
 5   ID                 20115 non-null  object        
dtypes: datetime64[ns](1), int64(4), object(1)
memory usage: 1.1+ MB
None

Primeras 5 filas:
   Income  Kidhome  Teenhome Dt_Customer  NumWebVisitsMonth  \
0  161770        1         0  2012-04-04                 29   
1   85477        1         1  2012-12-30                  7   
2  147233        1         1  2012-02-02                  5   
3  121393        1         2  2012-12-21   

## 2. Transformación y limpieza de los datos

### 2.1 Estandarización de nombres de columnas

In [121]:
# Mostrar columnas de cada dataset
print("Columnas en datos de clientes:")
print(df_customers.columns.tolist())

print("\nColumnas en datos bancarios:")
print(df_bank.columns.tolist())

Columnas en datos de clientes:
['Income', 'Kidhome', 'Teenhome', 'Dt_Customer', 'NumWebVisitsMonth', 'ID']

Columnas en datos bancarios:
['age', 'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed', 'y', 'date', 'latitude', 'longitude', 'id_']


In [122]:
# Rename de las columnas para una mejor comprensión de su significado, ya que algunas columnas presentan un exceso de abreviación
# Renombrar columnas del dataset de clientes
customer_columns_rename = {
    'Income': 'Income',
    'Kidhome': 'Number_of_Kids',
    'Teenhome': 'Number_of_Teenagers',
    'Dt_Customer': 'Registration_Date',
    'NumWebVisitsMonth': 'Monthly_Web_Visits',
    'ID': 'Customer_ID'
}

# Renombrar columnas del dataset bancario
bank_columns_rename = {
    'age': 'Age',
    'job': 'Job',
    'marital': 'Marital_Status',
    'education': 'Education_Level',
    'default': 'Credit_Default',
    'housing': 'Mortgage_Loan',
    'loan': 'Personal_Loan',
    'contact': 'Contact_Type',
    'duration': 'Call_Duration',
    'campaign': 'Campaign_Contacts',
    'pdays': 'Days_Since_Last_Contact',
    'previous': 'Previous_Contacts',
    'poutcome': 'Previous_Campaign_Outcome',
    'emp.var.rate': 'Employment_Variation_Rate',
    'cons.price.idx': 'Consumer_Price_Index',
    'cons.conf.idx': 'Consumer_Confidence_Index',
    'euribor3m': 'Euribor_3M_Rate',
    'nr.employed': 'Number_of_Employees',
    'y': 'Subscribed_to_Service',
    'date': 'Contacted_Date',
    'latitude': 'Latitude',
    'longitude': 'Longitude',
    'id_': 'Customer_ID'
}

# Aplicar los cambios
df_customers = df_customers.rename(columns=customer_columns_rename)
df_bank = df_bank.rename(columns=bank_columns_rename)

# Verificar los cambios
print("Columnas en datos de clientes después del rename:")
print(df_customers.columns.tolist())
print("\nColumnas en datos bancarios después del rename:")
print(df_bank.columns.tolist())

Columnas en datos de clientes después del rename:
['Income', 'Number_of_Kids', 'Number_of_Teenagers', 'Registration_Date', 'Monthly_Web_Visits', 'Customer_ID']

Columnas en datos bancarios después del rename:
['Age', 'Job', 'Marital_Status', 'Education_Level', 'Credit_Default', 'Mortgage_Loan', 'Personal_Loan', 'Contact_Type', 'Call_Duration', 'Campaign_Contacts', 'Days_Since_Last_Contact', 'Previous_Contacts', 'Previous_Campaign_Outcome', 'Employment_Variation_Rate', 'Consumer_Price_Index', 'Consumer_Confidence_Index', 'Euribor_3M_Rate', 'Number_of_Employees', 'Subscribed_to_Service', 'Contacted_Date', 'Latitude', 'Longitude', 'Customer_ID']


### 2.2 Corrección de formato y transformación de columnas

In [123]:
# En el dataset de clientes:

# Convertir Registration_Date a formato dd/mm/yyyy
df_customers['Registration_Date'] = pd.to_datetime(df_customers['Registration_Date']).dt.strftime('%d/%m/%Y')

# Analizar la distribución de visitas mensuales para los rangos de categorización
print("Distribución de visitas mensuales a la web:")
print(df_customers['Monthly_Web_Visits'].value_counts().sort_index())
print("\nEstadísticas descriptivas de visitas mensuales:")
print(df_customers['Monthly_Web_Visits'].describe())

# Hacer una columna categorica Frequency a partir de las visitas mensuales
def categorize_frequency(visits):
    if visits <= 8:
        return 'Low'
    elif visits <= 16:
        return 'Medium' 
    elif visits <= 24:
        return 'High'
    else:
        return 'Very High'

df_customers['Frequency'] = df_customers['Monthly_Web_Visits'].apply(categorize_frequency)

display(df_customers.head(7))

Distribución de visitas mensuales a la web:
Monthly_Web_Visits
1     588
2     604
3     652
4     650
5     617
6     629
7     612
8     657
9     617
10    649
11    626
12    663
13    631
14    643
15    629
16    596
17    600
18    612
19    588
20    643
21    636
22    650
23    629
24    615
25    627
26    653
27    617
28    646
29    628
30    636
31    621
32    651
Name: count, dtype: int64

Estadísticas descriptivas de visitas mensuales:
count    20115.000000
mean        16.540790
std          9.235324
min          1.000000
25%          9.000000
50%         16.000000
75%         25.000000
max         32.000000
Name: Monthly_Web_Visits, dtype: float64


Unnamed: 0,Income,Number_of_Kids,Number_of_Teenagers,Registration_Date,Monthly_Web_Visits,Customer_ID,Frequency
0,161770,1,0,04/04/2012,29,089b39d8-e4d0-461b-87d4-814d71e0e079,Very High
1,85477,1,1,30/12/2012,7,e9d37224-cb6f-4942-98d7-46672963d097,Low
2,147233,1,1,02/02/2012,5,3f9f49b5-e410-4948-bf6e-f9244f04918b,Low
3,121393,1,2,21/12/2012,29,9991fafb-4447-451a-8be2-b0df6098d13e,Very High
4,63164,1,2,20/06/2012,20,eca60b76-70b6-4077-80ba-bc52e8ebb0eb,High
5,143854,0,1,24/01/2012,26,d63ede72-0b6d-45b1-8872-385ac6897f65,Very High
6,76439,1,0,15/08/2012,3,5e3483e5-236d-437d-8351-541f9d09b9dd,Low


In [124]:
# En el dataset de datos bancarios:

# Pasar los valores de Age de string a integer, ya que la edad no puede ser un número decimal.

# Primero reemplazar ".0" en los valores no nulos (al hacer el parsing directo de str a int no funciona del todo bien)
df_bank['Age'] = df_bank['Age'].astype(object)

df_bank.loc[df_bank['Age'].notna(), 'Age'] = df_bank.loc[df_bank['Age'].notna(), 'Age'].astype(str).str.replace('.0', '', regex=False)

# Conversión final a int
df_bank['Age'] = df_bank['Age'].astype(float).astype('Int64')  # Nullable integer


# Detectar columnas numéricas que usan coma decimal
columnas_numericas = ['Consumer_Price_Index', 'Consumer_Confidence_Index', 'Employment_Variation_Rate', 'Euribor_3M_Rate']

# Reemplazar comas por puntos en las columnas numéricas y convertir a float
for col in columnas_numericas:
    # Verificar si hay comas en los valores numericos y sustituir por punto decimal
    if df_bank[col].dtype == 'object' and df_bank[col].str.contains(',').any():
        df_bank[col] = df_bank[col].str.replace(',', '.').astype('float64')

# Verificar si Credit_Default tiene valores distintos a 0.0 o nulo. Si no, eliminar la columna ya que no aporta valor para el análisis.
valores_unicos = df_bank['Credit_Default'].unique()
valores_no_nulos = [x for x in valores_unicos if pd.notnull(x) and x != 0.0]

if len(valores_no_nulos) == 0:
    # Si solo tiene valores 0.0 o nulos, eliminar la columna
    df_bank = df_bank.drop('Credit_Default', axis=1)
    print("La columna Credit_Default ha sido eliminada")
else:
    print("La columna Credit_Default contiene los siguientes valores distintos a 0.0:", valores_no_nulos)

# Pasar Credit_Default, Mortgage_Loan y Personal_Loan a booleanos, en lugar de usar 1.0 y 0.0.
df_bank['Credit_Default'] = df_bank['Credit_Default'].map({1.0: True, 0.0: False})
df_bank['Mortgage_Loan'] = df_bank['Mortgage_Loan'].map({1.0: True, 0.0: False})
df_bank['Personal_Loan'] = df_bank['Personal_Loan'].map({1.0: True, 0.0: False})

# Convertir fechas en formato 'dd-mes-yyyy' a 'dd/mm/yyyy'
meses_num = {
    'enero': '01', 'febrero': '02', 'marzo': '03', 'abril': '04',
    'mayo': '05', 'junio': '06', 'julio': '07', 'agosto': '08',
    'septiembre': '09', 'octubre': '10', 'noviembre': '11', 'diciembre': '12'
}

def convertir_fecha(fecha):
    if pd.isna(fecha):
        return fecha
    dia, mes, anyo = fecha.split('-')
    mes = meses_num[mes.lower()]
    return f"{dia.zfill(2)}/{mes}/{anyo}"   #zfill(2) asegura que el día tenga 2 dígitos añadiendo un 0 a la izq en caso de que sea menor a 10

df_bank['Contacted_Date'] = df_bank['Contacted_Date'].apply(convertir_fecha)


# Latitude y Longitude tienen algunos valores decimales y otros string, por como se muestran en la tabla, asi que se convierten a float ambas columnas.
df_bank['Latitude'] = df_bank['Latitude'].astype(float)
df_bank['Longitude'] = df_bank['Longitude'].astype(float)

# Agrupar Latitud y Longitud en una columna de Coordinates. 
df_bank['Coordinates'] = df_bank['Latitude'].astype(str) + ', ' + df_bank['Longitude'].astype(str)# Crear columna booleana Contacted basada en Days_Since_Last_Contact
df_bank['Contacted'] = df_bank['Days_Since_Last_Contact'].apply(
    lambda x: True if x < 999 
    else False
)

# Categorizar la duración de las llamadas en una nueva columna Call_Duration_Categ
df_bank['Call_Duration_Categ'] = df_bank['Call_Duration'].apply(
    lambda x: 'Very short' if x < 150
    else 'Short' if x < 500 
    else 'Medium' if x < 1200 
    else 'Long' if x < 2000
    else 'Very long'
)



La columna Credit_Default contiene los siguientes valores distintos a 0.0: [np.float64(1.0)]


In [125]:
# Reordenar las columnas
columnas = list(df_bank.columns)

# Obtener índices de las columnas a mover
idx_coordinates = columnas.index('Coordinates')
idx_contacted = columnas.index('Contacted') 
idx_duration_categ = columnas.index('Call_Duration_Categ')

# Eliminar las columnas de sus posiciones actuales
columnas.remove('Coordinates')
columnas.remove('Contacted')
columnas.remove('Call_Duration_Categ')

# Insertar en las nuevas posiciones
idx_longitude = columnas.index('Longitude')
idx_days = columnas.index('Days_Since_Last_Contact')
idx_duration = columnas.index('Call_Duration')

columnas.insert(idx_longitude + 1, 'Coordinates')
columnas.insert(idx_days + 1, 'Contacted')
columnas.insert(idx_duration + 1, 'Call_Duration_Categ')

# Reordenar el DataFrame
df_bank = df_bank[columnas]

df_chunk = df_bank.head(7)
display(df_chunk)

Unnamed: 0,Age,Job,Marital_Status,Education_Level,Credit_Default,Mortgage_Loan,Personal_Loan,Contact_Type,Call_Duration,Call_Duration_Categ,...,Consumer_Price_Index,Consumer_Confidence_Index,Euribor_3M_Rate,Number_of_Employees,Subscribed_to_Service,Contacted_Date,Latitude,Longitude,Coordinates,Customer_ID
0,,housemaid,MARRIED,basic.4y,False,False,False,telephone,261,Short,...,93.994,-36.4,4.857,5191,no,02/08/2019,41.495,-71.233,"41.495, -71.233",089b39d8-e4d0-461b-87d4-814d71e0e079
1,57.0,services,MARRIED,high.school,,False,False,telephone,149,Very short,...,93.994,-36.4,,5191,no,14/09/2016,34.601,-83.923,"34.601, -83.923",e9d37224-cb6f-4942-98d7-46672963d097
2,37.0,services,MARRIED,high.school,False,True,False,telephone,226,Short,...,93.994,-36.4,4.857,5191,no,15/02/2019,34.939,-94.847,"34.939, -94.847",3f9f49b5-e410-4948-bf6e-f9244f04918b
3,40.0,admin.,MARRIED,basic.6y,False,False,False,telephone,151,Short,...,93.994,-36.4,,5191,no,29/11/2015,49.041,-70.308,"49.041, -70.308",9991fafb-4447-451a-8be2-b0df6098d13e
4,56.0,services,MARRIED,high.school,False,False,True,telephone,307,Short,...,93.994,-36.4,,5191,no,29/01/2017,38.033,-104.463,"38.033, -104.463",eca60b76-70b6-4077-80ba-bc52e8ebb0eb
5,45.0,services,MARRIED,basic.9y,,False,False,telephone,198,Short,...,93.994,-36.4,4.857,5191,no,26/09/2015,24.689,-101.643,"24.689, -101.643",d63ede72-0b6d-45b1-8872-385ac6897f65
6,59.0,admin.,MARRIED,professional.course,False,False,False,telephone,139,Very short,...,93.994,-36.4,4.857,5191,no,25/01/2019,31.456,-94.561,"31.456, -94.561",5e3483e5-236d-437d-8351-541f9d09b9dd


### 2.3 Análisis de valores nulos

In [126]:
# Análisis de valores nulos en datos de clientes
print("=== Valores Nulos en Datos de Clientes ===")
null_customers = df_customers.isnull().sum()
print(null_customers[null_customers > 0])

# Análisis de valores nulos en datos bancarios
print("\n=== Valores Nulos en Datos Bancarios ===")
null_bank = df_bank.isnull().sum()
print(null_bank[null_bank > 0])

=== Valores Nulos en Datos de Clientes ===
Series([], dtype: int64)

=== Valores Nulos en Datos Bancarios ===
Age                     5120
Job                      345
Marital_Status            85
Education_Level         1807
Credit_Default          8981
Mortgage_Loan           1026
Personal_Loan           1026
Consumer_Price_Index     471
Euribor_3M_Rate         9256
Contacted_Date           248
dtype: int64


### 2.4 Tratado de los nulos

In [127]:
# Clasificar tipos de columnas categoricas con valores nulos
categorical_cols = ['Job', 'Marital_Status', 'Education_Level', 'Credit_Default', 'Mortgage_Loan', 'Personal_Loan']

# === Reemplazo de nulos en columnas categóricas con "unknown"
df_bank[categorical_cols] = df_bank[categorical_cols].fillna("unknown")

# === Reemplazo de nulos en Age (y redondeo)
df_bank['Age'] = pd.to_numeric(df_bank['Age'], errors='coerce')
mean_value = round(df_bank['Age'].mean())
df_bank['Age'] = df_bank['Age'].fillna(mean_value).astype(int)

# === Reemplazo de nulos en Consumer_Price_Index y Euribor_3M_Rate (y redondeo)
mean_value = round(df_bank['Consumer_Price_Index'].mean(), 1)
df_bank['Consumer_Price_Index'] = df_bank['Consumer_Price_Index'].fillna(mean_value)

mean_value = round(df_bank['Euribor_3M_Rate'].mean(), 3)
df_bank['Euribor_3M_Rate'] = df_bank['Euribor_3M_Rate'].fillna(mean_value)

# === Reemplazo de nulos en columnas de fecha con la media de la fecha
# Asegurar que sea datetime, errors='coerce' convierte strings invalidos a NaT (NaN para fechas)
df_bank['Contacted_Date'] = pd.to_datetime(df_bank['Contacted_Date'], errors='coerce')

# Calcular la media temporal (convertir a timestamp -> sacar media -> convertir a formato dd/mm/yyyy)
mean_timestamp = df_bank['Contacted_Date'].dropna().astype(np.int64).mean()
mean_date = pd.to_datetime(mean_timestamp)

# Reemplazar los nulos por la fecha promedio
df_bank['Contacted_Date'] = df_bank['Contacted_Date'].fillna(mean_date)

# Volver a convertir la fecha a formato dd/mm/yyyy
df_bank['Contacted_Date'] = df_bank['Contacted_Date'].dt.strftime('%d/%m/%Y')



# Verifica que no queden nulos
print(df_bank.isnull().sum().sort_values(ascending=False))

# Examinar los tipos de datos actuales en cada columna, ya que en google sheets se interpretan algunos valores como string
print("=== Tipos de datos en cada columna ===")
print("\nLatitude:")
print(df_bank['Latitude'].apply(type).value_counts())
print("\nLongitude:")
print(df_bank['Longitude'].apply(type).value_counts())
print("\nConsumer_Price_Index:")
print(df_bank['Consumer_Price_Index'].apply(type).value_counts())
print("\nEuribor_3M_Rate:")
print(df_bank['Euribor_3M_Rate'].apply(type).value_counts())

df_chunk = df_bank.head(10)
display(df_chunk)

Age                          0
Job                          0
Marital_Status               0
Education_Level              0
Credit_Default               0
Mortgage_Loan                0
Personal_Loan                0
Contact_Type                 0
Call_Duration                0
Call_Duration_Categ          0
Campaign_Contacts            0
Days_Since_Last_Contact      0
Contacted                    0
Previous_Contacts            0
Previous_Campaign_Outcome    0
Employment_Variation_Rate    0
Consumer_Price_Index         0
Consumer_Confidence_Index    0
Euribor_3M_Rate              0
Number_of_Employees          0
Subscribed_to_Service        0
Contacted_Date               0
Latitude                     0
Longitude                    0
Coordinates                  0
Customer_ID                  0
dtype: int64
=== Tipos de datos en cada columna ===

Latitude:
Latitude
<class 'float'>    43000
Name: count, dtype: int64

Longitude:
Longitude
<class 'float'>    43000
Name: count, dtype: int6

Unnamed: 0,Age,Job,Marital_Status,Education_Level,Credit_Default,Mortgage_Loan,Personal_Loan,Contact_Type,Call_Duration,Call_Duration_Categ,...,Consumer_Price_Index,Consumer_Confidence_Index,Euribor_3M_Rate,Number_of_Employees,Subscribed_to_Service,Contacted_Date,Latitude,Longitude,Coordinates,Customer_ID
0,40,housemaid,MARRIED,basic.4y,False,False,False,telephone,261,Short,...,93.994,-36.4,4.857,5191,no,08/02/2019,41.495,-71.233,"41.495, -71.233",089b39d8-e4d0-461b-87d4-814d71e0e079
1,57,services,MARRIED,high.school,unknown,False,False,telephone,149,Very short,...,93.994,-36.4,3.617,5191,no,17/06/2017,34.601,-83.923,"34.601, -83.923",e9d37224-cb6f-4942-98d7-46672963d097
2,37,services,MARRIED,high.school,False,True,False,telephone,226,Short,...,93.994,-36.4,4.857,5191,no,17/06/2017,34.939,-94.847,"34.939, -94.847",3f9f49b5-e410-4948-bf6e-f9244f04918b
3,40,admin.,MARRIED,basic.6y,False,False,False,telephone,151,Short,...,93.994,-36.4,3.617,5191,no,17/06/2017,49.041,-70.308,"49.041, -70.308",9991fafb-4447-451a-8be2-b0df6098d13e
4,56,services,MARRIED,high.school,False,False,True,telephone,307,Short,...,93.994,-36.4,3.617,5191,no,17/06/2017,38.033,-104.463,"38.033, -104.463",eca60b76-70b6-4077-80ba-bc52e8ebb0eb
5,45,services,MARRIED,basic.9y,unknown,False,False,telephone,198,Short,...,93.994,-36.4,4.857,5191,no,17/06/2017,24.689,-101.643,"24.689, -101.643",d63ede72-0b6d-45b1-8872-385ac6897f65
6,59,admin.,MARRIED,professional.course,False,False,False,telephone,139,Very short,...,93.994,-36.4,4.857,5191,no,17/06/2017,31.456,-94.561,"31.456, -94.561",5e3483e5-236d-437d-8351-541f9d09b9dd
7,40,blue-collar,MARRIED,unknown,unknown,False,False,telephone,217,Short,...,93.994,-36.4,4.857,5191,no,17/06/2017,46.871,-122.235,"46.871, -122.235",87fdc08b-30ae-4dab-803f-561ecdf27ff0
8,24,technician,SINGLE,professional.course,False,True,False,telephone,380,Short,...,93.994,-36.4,4.857,5191,no,17/06/2017,44.632,-85.811,"44.632, -85.811",87b79988-2be5-419d-88f4-56655852c565
9,25,services,SINGLE,high.school,False,True,False,telephone,50,Very short,...,93.994,-36.4,3.617,5191,no,11/02/2016,30.297,-117.382,"30.297, -117.382",ea6b7d04-9271-4c0a-a01f-07795d164aba


In [128]:
# Información básica de los datos de clientes
print("=== Información de Datos de Clientes ===")
get_basic_info(df_customers)

# Información básica de los datos bancarios
print("\n=== Información de Datos Bancarios ===")
get_basic_info(df_bank)

=== Información de Datos de Clientes ===

Información del DataFrame:
<class 'pandas.core.frame.DataFrame'>
Index: 20115 entries, 0 to 20114
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Income               20115 non-null  int64 
 1   Number_of_Kids       20115 non-null  int64 
 2   Number_of_Teenagers  20115 non-null  int64 
 3   Registration_Date    20115 non-null  object
 4   Monthly_Web_Visits   20115 non-null  int64 
 5   Customer_ID          20115 non-null  object
 6   Frequency            20115 non-null  object
dtypes: int64(4), object(3)
memory usage: 1.2+ MB
None

Primeras 5 filas:
   Income  Number_of_Kids  Number_of_Teenagers Registration_Date  \
0  161770               1                    0        04/04/2012   
1   85477               1                    1        30/12/2012   
2  147233               1                    1        02/02/2012   
3  121393               1                

### 2.5 Eliminación de duplicados

In [129]:
# Detectar si hay filas duplicadas en el dataset de datos bancarios
banco_duplicados = df_bank[df_bank.duplicated()]
print(banco_duplicados)

# Y en el dataset de clientes
clientes_duplicados = df_customers[df_customers.duplicated()]
print(clientes_duplicados)



Empty DataFrame
Columns: [Age, Job, Marital_Status, Education_Level, Credit_Default, Mortgage_Loan, Personal_Loan, Contact_Type, Call_Duration, Call_Duration_Categ, Campaign_Contacts, Days_Since_Last_Contact, Contacted, Previous_Contacts, Previous_Campaign_Outcome, Employment_Variation_Rate, Consumer_Price_Index, Consumer_Confidence_Index, Euribor_3M_Rate, Number_of_Employees, Subscribed_to_Service, Contacted_Date, Latitude, Longitude, Coordinates, Customer_ID]
Index: []

[0 rows x 26 columns]
Empty DataFrame
Columns: [Income, Number_of_Kids, Number_of_Teenagers, Registration_Date, Monthly_Web_Visits, Customer_ID, Frequency]
Index: []


### 2.6 Agrupación de dataframes

In [130]:
# Unir los dataframes usando merge con la columna Customer_ID
df_completo = pd.merge(df_customers, df_bank, on='Customer_ID', how='inner')

# Definir el orden deseado de las columnas
column_names = ['Customer_ID', 'Income', 'Number_of_Kids', 'Number_of_Teenagers', 
                'Registration_Date', 'Monthly_Web_Visits', 'Frequency', 'Age', 'Job',
                'Marital_Status', 'Education_Level', 'Credit_Default', 'Mortgage_Loan',
                'Personal_Loan', 'Contact_Type', 'Call_Duration', 'Call_Duration_Categ',
                'Campaign_Contacts', 'Days_Since_Last_Contact', 'Contacted', 'Previous_Contacts', 
                'Previous_Campaign_Outcome', 'Employment_Variation_Rate',
                'Consumer_Price_Index', 'Consumer_Confidence_Index', 'Euribor_3M_Rate', 'Number_of_Employees',
                'Subscribed_to_Service', 'Contacted_Date', 'Latitude', 'Longitude', 'Coordinates']

# Reordenar las columnas del dataframe
df_completo = df_completo[column_names]

# Mostrar las primeras filas del dataframe combinado
print("\nPrimeras filas del dataframe combinado:")
display(df_completo.head())
get_basic_info(df_completo)



Primeras filas del dataframe combinado:


Unnamed: 0,Customer_ID,Income,Number_of_Kids,Number_of_Teenagers,Registration_Date,Monthly_Web_Visits,Frequency,Age,Job,Marital_Status,...,Employment_Variation_Rate,Consumer_Price_Index,Consumer_Confidence_Index,Euribor_3M_Rate,Number_of_Employees,Subscribed_to_Service,Contacted_Date,Latitude,Longitude,Coordinates
0,089b39d8-e4d0-461b-87d4-814d71e0e079,161770,1,0,04/04/2012,29,Very High,40,housemaid,MARRIED,...,1.1,93.994,-36.4,4.857,5191,no,08/02/2019,41.495,-71.233,"41.495, -71.233"
1,e9d37224-cb6f-4942-98d7-46672963d097,85477,1,1,30/12/2012,7,Low,57,services,MARRIED,...,1.1,93.994,-36.4,3.617,5191,no,17/06/2017,34.601,-83.923,"34.601, -83.923"
2,3f9f49b5-e410-4948-bf6e-f9244f04918b,147233,1,1,02/02/2012,5,Low,37,services,MARRIED,...,1.1,93.994,-36.4,4.857,5191,no,17/06/2017,34.939,-94.847,"34.939, -94.847"
3,9991fafb-4447-451a-8be2-b0df6098d13e,121393,1,2,21/12/2012,29,Very High,40,admin.,MARRIED,...,1.1,93.994,-36.4,3.617,5191,no,17/06/2017,49.041,-70.308,"49.041, -70.308"
4,eca60b76-70b6-4077-80ba-bc52e8ebb0eb,63164,1,2,20/06/2012,20,High,56,services,MARRIED,...,1.1,93.994,-36.4,3.617,5191,no,17/06/2017,38.033,-104.463,"38.033, -104.463"



Información del DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20018 entries, 0 to 20017
Data columns (total 32 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Customer_ID                20018 non-null  object 
 1   Income                     20018 non-null  int64  
 2   Number_of_Kids             20018 non-null  int64  
 3   Number_of_Teenagers        20018 non-null  int64  
 4   Registration_Date          20018 non-null  object 
 5   Monthly_Web_Visits         20018 non-null  int64  
 6   Frequency                  20018 non-null  object 
 7   Age                        20018 non-null  int64  
 8   Job                        20018 non-null  object 
 9   Marital_Status             20018 non-null  object 
 10  Education_Level            20018 non-null  object 
 11  Credit_Default             20018 non-null  object 
 12  Mortgage_Loan              20018 non-null  object 
 13  Personal_Loan     

### 2.7 Exportación del dataset limpio

In [131]:
# Exportar en la carpeta de resultados ambos dataframes como csv
df_completo.to_csv('../results/bank_clean.csv', index=False)