In [None]:
# Libraries required for the exploratory data analysis (EDA)
# -------------------------------------------------------------------------------
# Data handling
import pandas as pd

# File and path management
import os
import sys

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Pandas configuration and warning suppression
pd.set_option('display.max_columns', None)  # Show every column when a DataFrame is displayed
import warnings
# NOTE(review): this silences ALL warnings for the whole session (deprecations
# included); consider restricting the filter to specific warning categories.
warnings.filterwarnings("ignore")

# Data extraction and exploration helpers
# Resolve the parent of the current working directory. This is assumed to be the
# project root, which only holds when the kernel's working directory is the
# notebook's own folder — TODO confirm.
ruta_proyecto = os.path.abspath(os.path.join(os.getcwd(), ".."))
# Append that parent directory (not src/ itself) to sys.path if it is missing,
# so that `src` can be imported as a package below.
if ruta_proyecto not in sys.path:
    sys.path.append(ruta_proyecto)

# Import the project modules from the src package
from src import eda
from src import variables as va

## Prueba de las funciones de EDA ✨

In [2]:
# Load the raw bank-churn CSV into `df` through the project's extraction helper.
# The path is relative to the notebook's working directory; the recorded run
# reports 10000 rows and 12 columns.
df = eda.extraer_datos_csv("../datos/bruto/Bank_Customer_Churn_Prediction.csv")

✅ Datos extraidos correctamente. Filas: 10000, Columnas: 12


Unnamed: 0,customer_id,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
0,15634602,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,15647311,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,15619304,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,15701354,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,15737888,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
# Review the dataset structure: per the recorded output, the helper prints the
# dimensions, column names, dtypes and the duplicate-row count.
eda.revisar_estructura(df)

📌 Dimensiones del dataset:
  - Filas: 10000
  - Columnas: 12

📌 Nombres de las columnas:
['customer_id', 'credit_score', 'country', 'gender', 'age', 'tenure', 'balance', 'products_number', 'credit_card', 'active_member', 'estimated_salary', 'churn'] 

📌 Tipos de datos:
customer_id           int64
credit_score          int64
country              object
gender               object
age                   int64
tenure                int64
balance             float64
products_number       int64
credit_card           int64
active_member         int64
estimated_salary    float64
churn                 int64
dtype: object 

📌 Datos duplicados: 0 (0.0%)



In [4]:
# Identify duplicated rows. The recorded run reports none, and the table shown
# below the message is empty.
eda.get_duplicate_rows(df)

✅ No hay filas duplicadas en el dataset.


Unnamed: 0,customer_id,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn


In [5]:
# Review null values per column and keep the helper's return value in `df_nulos`.
# NOTE(review): the original comment also mentioned duplicates, but the recorded
# output only reports null counts; duplicates are checked in the earlier cells.
df_nulos = eda.revisar_valores_nulos(df)

📌 Columnas con valores nulos: 0
📌 Columnas sin valores nulos: 12


Unnamed: 0,Valores nulos,% Nulos,% No Nulos,Valores Unicos,Tipo de Dato


In [6]:
# Descriptive statistics: per the recorded output, one summary table for the
# numeric variables and one for the categorical variables.
eda.obtener_estadisticas(df)

📌 Estadisticas descriptivas de variables numericas:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
customer_id,10000.0,15690940.0,71936.186123,15565701.0,15628528.25,15690740.0,15753230.0,15815690.0
credit_score,10000.0,650.5288,96.653299,350.0,584.0,652.0,718.0,850.0
age,10000.0,38.9218,10.487806,18.0,32.0,37.0,44.0,92.0
tenure,10000.0,5.0128,2.892174,0.0,3.0,5.0,7.0,10.0
balance,10000.0,76485.89,62397.405202,0.0,0.0,97198.54,127644.2,250898.09
products_number,10000.0,1.5302,0.581654,1.0,1.0,1.0,2.0,4.0
credit_card,10000.0,0.7055,0.45584,0.0,0.0,1.0,1.0,1.0
active_member,10000.0,0.5151,0.499797,0.0,0.0,1.0,1.0,1.0
estimated_salary,10000.0,100090.2,57510.492818,11.58,51002.11,100193.9,149388.2,199992.48
churn,10000.0,0.2037,0.402769,0.0,0.0,0.0,0.0,1.0



📌 Estadisticas descriptivas de variables categoricas:


Unnamed: 0,count,unique,top,freq
country,10000,3,France,5014
gender,10000,2,Male,5457


In [7]:
# Review the categorical variables: per the recorded output, the helper lists
# each column's unique values and their frequencies.
eda.revisar_valores_unicos(df)

📌 Analisis de valores unicos en variables categoricas:


----------- ANALIZANDO: 'COUNTRY' -----------

Valores unicos: ['France' 'Spain' 'Germany']

Frecuencia de valores:
country
France     5014
Germany    2509
Spain      2477
Name: count, dtype: int64


----------- ANALIZANDO: 'GENDER' -----------

Valores unicos: ['Female' 'Male']

Frecuencia de valores:
gender
Male      5457
Female    4543
Name: count, dtype: int64



In [8]:
# Rename the columns using the project-level mapping `va.columnas_renombradas`.
# `rename` returns a new DataFrame that is re-bound to `df`; this avoids
# `inplace=True`, which mutates the frame already displayed by earlier cells and
# offers no performance benefit.
# NOTE(review): the recorded output shows the column 'puntacion_credito', which
# looks like a typo for 'puntuacion_credito'. The name comes from the mapping in
# src/variables, so it has to be corrected there (and wherever it is used).
df = df.rename(columns=va.columnas_renombradas)
display(df.head())

Unnamed: 0,id_cliente,puntacion_credito,pais,genero,edad,antiguedad,saldo,num_productos,tarjeta_credito,miembro_activo,salario_estimado,abandono
0,15634602,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,15647311,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,15619304,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,15701354,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,15737888,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [9]:
# Convert the binary indicator columns listed in `va.columnas_binarias` into
# labels (the recorded output shows 'Si' / 'No') and re-bind the result to `df`.
df = eda.transformar_binarios(df, va.columnas_binarias)

# The dataset had no null values before this step, so a null here means the
# transformation met a value it could not translate — presumably what happens
# if this cell is re-run on already-transformed data (TODO confirm against the
# helper's implementation). Fail loudly instead of carrying bad data forward.
assert not df[va.columnas_binarias].isna().any().any(), (
    "transformar_binarios introduced null values; "
    "restart the kernel and run all cells in order"
)

display(df[va.columnas_binarias].head())

Unnamed: 0,tarjeta_credito,miembro_activo,abandono
0,Si,Si,Si
1,No,Si,No
2,Si,No,Si
3,No,No,No
4,Si,Si,No


In [10]:
# Recompute the descriptive statistics after the renaming and the binary
# transformation: in the recorded output the binary columns now appear in the
# categorical summary instead of the numeric one.
eda.obtener_estadisticas(df)

📌 Estadisticas descriptivas de variables numericas:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id_cliente,10000.0,15690940.0,71936.186123,15565701.0,15628528.25,15690740.0,15753230.0,15815690.0
puntacion_credito,10000.0,650.5288,96.653299,350.0,584.0,652.0,718.0,850.0
edad,10000.0,38.9218,10.487806,18.0,32.0,37.0,44.0,92.0
antiguedad,10000.0,5.0128,2.892174,0.0,3.0,5.0,7.0,10.0
saldo,10000.0,76485.89,62397.405202,0.0,0.0,97198.54,127644.2,250898.09
num_productos,10000.0,1.5302,0.581654,1.0,1.0,1.0,2.0,4.0
salario_estimado,10000.0,100090.2,57510.492818,11.58,51002.11,100193.9,149388.2,199992.48



📌 Estadisticas descriptivas de variables categoricas:


Unnamed: 0,count,unique,top,freq
pais,10000,3,France,5014
genero,10000,2,Male,5457
tarjeta_credito,10000,2,Si,7055
miembro_activo,10000,2,Si,5151
abandono,10000,2,No,7963
