In [45]:
import pandas as pd
import numpy as np
import funciones as fn  
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer
from ydata_profiling import ProfileReport

In [46]:
# Load the raw lead-quality training set and preview its first rows.
ruta_datos = 'Datos_Train_Calidad_Lead.csv'
df = pd.read_csv(ruta_datos)

df.head()

Unnamed: 0,Cuota_Inicial__c,Presupuesto_inmueble_a_comprar__c,ingresos_totales__c,Ingresos_familiares__c,Calidad_de_lead__c,picklist_ciudad__c
0,7000000.0,300000000.0,5000000.0,5000000.0,Descalificado,Bogotá
1,2000000.0,200000000.0,2800000.0,2800000.0,Descalificado,Bogotá
2,4000000.0,,800000.0,800000.0,Descalificado,Bogotá
3,700.0,18000000.0,1800000.0,1800000.0,Descalificado,Bogotá
4,0.0,170000000.0,3000000.0,3000000.0,Descalificado,Bogotá


In [47]:
# Turn 'picklist_ciudad__c' into a binary flag: 1 for 'Bogotá', 0 for any
# other value (missing values compare unequal, so they also become 0).
es_bogota = df['picklist_ciudad__c'].eq('Bogotá')
df['picklist_ciudad__c'] = es_bogota.astype('int64')
print("Conversión completada")
print(df['picklist_ciudad__c'].value_counts())

Conversión completada
picklist_ciudad__c
1    552
0    151
Name: count, dtype: int64


In [48]:
# Encode the lead-quality label as an ordered integer code.
# Labels that are not keys of `mapping` come out of `.map` as NaN.
mapping = {
    'Alta': 1,
    'Media': 0,
    'Descalificado': -1,
}
calidad_codificada = df['Calidad_de_lead__c'].map(mapping)
df['Calidad_de_lead__c'] = calidad_codificada
df['Calidad_de_lead__c'].value_counts()

Calidad_de_lead__c
-1    536
 1    116
 0     51
Name: count, dtype: int64

In [49]:
# Mean-impute every column that currently contains at least one missing value.
# The list of affected columns is computed once and the fit is skipped when it
# is empty: SimpleImputer raises ValueError on a frame with zero columns, which
# used to make this cell fail when re-run after the nulls were already filled.
# NOTE(review): any column with nulls is imputed, including the encoded target
# if `.map` produced NaN for an unexpected label -- confirm that is intended.
imputer = SimpleImputer(strategy='mean')
columnas_con_nulos = df.columns[df.isnull().any()]
if len(columnas_con_nulos) > 0:
    df.loc[:, columnas_con_nulos] = imputer.fit_transform(df[columnas_con_nulos])

In [50]:
def eliminar_percentil_extremos(df, columna, percentil_inferior=1, percentil_superior=99):
    """Drop the rows whose value in ``columna`` lies outside a percentile band.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columna : str
        Name of the numeric column used to decide which rows are kept.
    percentil_inferior, percentil_superior : float, default 1 and 99
        Lower and upper percentiles (0-100) delimiting the band. Both bounds
        are inclusive.

    Returns
    -------
    pandas.DataFrame
        An independent copy holding only the rows inside the band, with the
        original index labels preserved.

    Notes
    -----
    The cut-offs are computed with ``np.nanpercentile`` so a missing value in
    ``columna`` no longer turns both bounds into NaN (which silently emptied
    the whole frame). Rows where ``columna`` is missing are still dropped,
    because NaN never falls inside the band.
    """
    valores = df[columna]
    limite_inferior = np.nanpercentile(valores, percentil_inferior)
    limite_superior = np.nanpercentile(valores, percentil_superior)
    dentro_de_banda = valores.between(limite_inferior, limite_superior)
    # Return a copy so later column assignments on the result do not raise
    # SettingWithCopyWarning.
    return df[dentro_de_banda].copy()

In [51]:
# Trim the extreme percentiles of each monetary column, one after another.
# The order matters: every pass computes its cut-offs on the rows that
# survived the previous passes.
for columna in (
    'Cuota_Inicial__c',
    'Presupuesto_inmueble_a_comprar__c',
    'ingresos_totales__c',
    'Ingresos_familiares__c',
):
    df = eliminar_percentil_extremos(df, columna=columna)

In [52]:
# Apply the project helper `fn.ajustar_valores` element by element to the
# down-payment and income columns. The helper lives in the local `funciones`
# module, so its exact adjustment is not visible here.
# NOTE(review): 'Presupuesto_inmueble_a_comprar__c' is not adjusted -- confirm
# that this is intentional.
for columna in ('Cuota_Inicial__c', 'ingresos_totales__c', 'Ingresos_familiares__c'):
    df[columna] = df[columna].apply(fn.ajustar_valores)

In [53]:
def identificar_outliers(df, columna, umbral=3):
    """Find the rows whose value in ``columna`` is a standard-deviation outlier.

    A value is flagged when its absolute distance from the column mean is
    strictly greater than ``umbral`` times the column's sample standard
    deviation (pandas' default ``ddof=1``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to inspect.
    columna : str
        Name of a numeric column.
    umbral : float, default 3
        Number of standard deviations beyond which a value is an outlier.

    Returns
    -------
    dict
        ``{'outliers_std': [...]}`` with the index labels of the flagged rows,
        in their original order. The list is empty when nothing is flagged,
        including when the standard deviation is NaN (e.g. a single row).
    """
    valores = df[columna]
    distancia_a_media = (valores - valores.mean()).abs()
    es_outlier = distancia_a_media > (umbral * valores.std())
    # Index the labels directly with the boolean mask instead of building a
    # filtered copy of the whole frame just to read its index.
    return {'outliers_std': df.index[es_outlier.to_numpy()].tolist()}

In [54]:
# Print, for every column of the frame, how many rows were flagged as
# standard-deviation outliers.
for columna in df.columns:
    resultado = identificar_outliers(df, columna)
    outliers = resultado['outliers_std']
    print(f'Columna {columna}: {len(outliers)} outliers encontrados')

Columna Cuota_Inicial__c: 16 outliers encontrados
Columna Presupuesto_inmueble_a_comprar__c: 44 outliers encontrados
Columna ingresos_totales__c: 13 outliers encontrados
Columna Ingresos_familiares__c: 10 outliers encontrados
Columna Calidad_de_lead__c: 0 outliers encontrados
Columna picklist_ciudad__c: 0 outliers encontrados


In [55]:
# Single RobustScaler instance created with default settings; the
# outlier-handling loop re-fits it on each column it processes.
scaler = RobustScaler()

In [56]:
# Robust-scale every feature column that still contains standard-deviation
# outliers.
#
# The scaler is fit on, and applied to, the WHOLE column. Fitting it only on
# the outlier rows and writing those scaled values back among the untouched
# ones (the previous behaviour) left a single column on two different scales
# and turned the largest values into numbers close to zero.
#
# The encoded target and the binary city flag are categorical codes, not
# magnitudes, so they are never rescaled.
columnas_codificadas = {'Calidad_de_lead__c', 'picklist_ciudad__c'}

for columna in df.columns:
    if columna in columnas_codificadas:
        continue
    outliers = identificar_outliers(df, columna)['outliers_std']
    if outliers:
        # scikit-learn expects a 2-D array: one feature, one row per sample.
        valores_columna = df[[columna]].to_numpy(dtype=float)
        df[columna] = scaler.fit_transform(valores_columna).ravel()

In [57]:
# Build the exploratory profiling report for the cleaned frame and write it to
# disk. The same text is used as the report title and as the output file name.
nombre_reporte = 'EDA Data Salesforce'
profile = ProfileReport(df, title=nombre_reporte, explorative=True)
profile.to_file(nombre_reporte)

Summarize dataset: 100%|██████████| 31/31 [00:01<00:00, 25.07it/s, Completed]                                                                   
Generate report structure: 100%|██████████| 1/1 [00:01<00:00,  1.06s/it]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  3.01it/s]
Export report to file: 100%|██████████| 1/1 [00:00<?, ?it/s]


In [58]:
# Persist the cleaned frame as CSV; index=False keeps the row index out of the file.
df.to_csv('Datos_Train_Calidad_Lead_Clean.csv', index=False)