# EDA

In [2]:
import pandas as pd

def merge_and_clean_data(files, column_order, months):
    """
    Fusionar y limpiar múltiples archivos CSV añadiendo una columna de mes y eliminando duplicados.

    Parámetros:
    files (lista de str): Lista de rutas de archivo a los archivos CSV.
    column_order (lista de str): Lista de columnas para ordenar el DataFrame.
    months (lista de str): Lista de nombres de meses correspondientes a cada archivo.

    Retorna:
    pd.DataFrame: El DataFrame fusionado y limpio.
    """
    dfs = []

    for file, month in zip(files, months):
        df = pd.read_csv(file)
        df['Month'] = month
        df = df[column_order]
        dfs.append(df)

    # Concatenar los DataFrames
    merged_df = pd.concat(dfs, ignore_index=True)

    # Eliminar filas duplicadas basadas en 'propertyCode', manteniendo la última ocurrencia
    merged_df = merged_df.drop_duplicates(subset='propertyCode', keep='last')

    return merged_df

# Uso de la función
path = 'C:/Users/nicol/OneDrive/Documentos/VSCLocal/Data/TFM/'
files = [path + "idealista_02.csv", path + "idealista_03.csv", path + "idealista_04.csv", path + "idealista_05.csv"]
months = ['Febrero', 'Marzo', 'Abril', 'Mayo']

column_order = ['index', 'propertyCode', 'thumbnail', 'externalReference', 'numPhotos','floor', 'price', 'propertyType', 'operation', 'size', 'exterior','rooms', 
                'bathrooms', 'address', 'province', 'municipality', 'district','country', 'neighborhood', 'latitude', 'longitude', 'showAddress','url', 'distance', 
                'description', 'hasVideo', 'status','newDevelopment', 'hasLift', 'priceByArea', 'detailedType','suggestedTexts', 'hasPlan', 'has3DTour', 'has360', 
                'hasStaging','topNewDevelopment', 'topPlus', 'highlight', 'parkingSpace','newDevelopmentFinished']

merged_df = merge_and_clean_data(files, column_order, months)

# Mostrar el tamaño del DataFrame fusionado
print(merged_df.shape)


(14993, 41)


In [3]:
from ydata_profiling import ProfileReport
df=merged_df
profile = ProfileReport(df, title="Informe de Perfil del DataFrame")
profile.to_notebook_iframe()

  from .autonotebook import tqdm as notebook_tqdm


ModuleNotFoundError: No module named 'ipywidgets'

In [6]:
import pandas as pd
from ydata_profiling import ProfileReport

# Cargar tu DataFrame
df = pd.read_csv('C:/Users/nicol/OneDrive/Documentos/VSCLocal/Data/TFM/datos_procesados_final.csv')

df= df[(df['province']=='Madrid') & (df['price']<=800000)]

variables=['propertyCode', 'price', 'floor', 'propertyType', 'size', 'exterior', 'rooms', 'bathrooms', 'province', 'district', 'neighborhood', 'latitude', 
            'longitude', 'distance', 'status', 'hasLift', 'priceByArea']

df=df[variables]

# Generar el informe de perfil
profile = ProfileReport(df, title="Informe de Perfil del DataFrame")

# Guardar el informe en un archivo HTML
profile.to_file("C:/Users/nicol/OneDrive/Documentos/VSCLocal/Data/TFM/informe_perfil_final.html")

(using `df.profile_report(correlations={"auto": {"calculate": False}})`
If this is problematic for your use case, please report this as an issue:
https://github.com/ydataai/ydata-profiling/issues
(include the error message: 'could not convert string to float: 'Villaverde'')
Summarize dataset: 100%|██████████| 151/151 [00:16<00:00,  9.02it/s, Completed]                                   
Generate report structure: 100%|██████████| 1/1 [00:06<00:00,  6.55s/it]
Render HTML: 100%|██████████| 1/1 [00:03<00:00,  3.05s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 50.35it/s]
