# Preprocessing

In [1089]:
import pandas as pd

# Load the training data from train.csv and display summary statistics with describe()

df = pd.read_csv('train.csv')
df.describe()

Unnamed: 0,Characteristics.LotSizeSquareFeet,ImageData.c1c6.summary.bathroom,ImageData.c1c6.summary.exterior,ImageData.c1c6.summary.interior,ImageData.c1c6.summary.kitchen,ImageData.c1c6.summary.property,ImageData.q1q6.summary.bathroom,ImageData.q1q6.summary.exterior,ImageData.q1q6.summary.interior,ImageData.q1q6.summary.kitchen,...,Structure.BathroomsFull,Structure.BathroomsHalf,Structure.BedroomsTotal,Structure.BelowGradeFinishedArea,Structure.BelowGradeUnfinishedArea,Structure.FireplacesTotal,Structure.GarageSpaces,Structure.LivingArea,Structure.Rooms.RoomsTotal,Structure.YearBuilt
count,1690.0,90744.0,87789.0,93597.0,92320.0,103055.0,90708.0,82565.0,93589.0,92292.0,...,100063.0,100049.0,105024.0,14235.0,11674.0,51216.0,88621.0,99509.0,105061.0,102256.0
mean,59491.45,3.116429,3.308723,3.211074,3.117166,3.182633,3.311895,3.499023,3.108145,3.20957,...,1.921669,0.441973,3.102167,701.495539,649.350608,0.941737,2.039585,1742.211448,7.174756,1969.155199
std,478465.2,0.72416,0.676499,0.677497,0.743597,0.682011,0.50811,0.594215,0.540139,0.645833,...,0.864013,0.548588,1.235208,635.400209,577.616585,0.713055,8.542624,1144.743077,2.846424,53.661479
min,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3743.75,2.8,3.1,2.9,2.8,2.9,3.0,3.1,2.8,2.8,...,1.0,0.0,2.0,0.0,150.0,1.0,1.0,1100.0,5.0,1952.0
50%,8712.0,3.2,3.4,3.3,3.2,3.2,3.3,3.5,3.0,3.2,...,2.0,0.0,3.0,696.0,565.5,1.0,2.0,1550.0,7.0,1973.0
75%,24520.75,3.6,3.7,3.6,3.6,3.6,3.7,3.9,3.4,3.8,...,2.0,1.0,4.0,1073.0,1000.0,1.0,2.0,2200.0,8.0,1998.0
max,9999999.0,6.0,6.0,6.0,6.0,6.0,5.1,5.4,5.1,5.0,...,75.0,9.0,18.0,7782.0,4896.0,21.0,999.0,51400.0,99.0,2024.0


The first step is to split the data into training and validation sets.

In [1090]:
# Split the data into training (80%) and validation (20%) sets

from sklearn.model_selection import train_test_split

train, valid = train_test_split(df, test_size=0.2, random_state=42)

# Every later cell assigns new columns to these frames. Taking explicit copies
# makes them independent of `df`, so those assignments do not raise
# SettingWithCopyWarning or depend on view/copy semantics of the split.
train = train.copy()
valid = valid.copy()

In [1091]:
# Cast the year-built and GIS coordinate columns to the generic object dtype in both splits
for frame in (train, valid):
    for column in ('Structure.YearBuilt', 'Location.GIS.Latitude', 'Location.GIS.Longitude'):
        frame[column] = frame[column].astype('object')


In [1092]:
import pandas as pd
import numpy as np

def convert_to_missing(df):
    """Return a copy of ``df`` in which every placeholder for "no value" is NaN.

    The placeholders are None and the strings 'NA', 'NaN', 'nan', '', 'none',
    'na', 'Na' and 'NULL'. Matching is exact (case-sensitive, whole cell).
    The input DataFrame is not modified.
    """
    placeholder_tokens = [None, 'NA', 'NaN', 'nan', '', 'none', 'na', 'Na', 'NULL']
    return df.replace(to_replace=placeholder_tokens, value=np.nan)

# Apply the function to the training and validation DataFrames
train = convert_to_missing(train)
valid = convert_to_missing(valid)

# Print the per-column count of missing values after the replacement
print(train.isna().sum())
print(valid.isna().sum())




Characteristics.LotFeatures               52372
Characteristics.LotSizeSquareFeet         84588
ImageData.c1c6.summary.bathroom           13386
ImageData.c1c6.summary.exterior           15699
ImageData.c1c6.summary.interior           11126
ImageData.c1c6.summary.kitchen            12142
ImageData.c1c6.summary.property            3485
ImageData.features_reso.results            2139
ImageData.q1q6.summary.bathroom           13415
ImageData.q1q6.summary.exterior           19907
ImageData.q1q6.summary.interior           11132
ImageData.q1q6.summary.kitchen            12165
ImageData.q1q6.summary.property            4498
ImageData.room_type_reso.results            453
ImageData.style.exterior.summary.label    19078
ImageData.style.stories.summary.label     19231
Listing.Dates.CloseDate                       0
Listing.ListingId                             0
Listing.Price.ClosePrice                      0
Location.Address.CensusBlock               5525
Location.Address.CensusTract            

In [1093]:
# Summarise, for the full dataset `df`, how many values are missing per column
# (absolute count and percentage), keeping only columns with at least one gap.

missing_values = df.isnull().sum()
missing_values_percent = missing_values / len(df) * 100
missing_values_table = (
    pd.DataFrame({"Missing Values": missing_values, "Percentage": missing_values_percent})
    .loc[lambda table: table["Missing Values"] > 0]
    .sort_values(by="Missing Values", ascending=False)
)
print(missing_values_table)


                                        Missing Values  Percentage
Location.Address.StreetDirectionSuffix          106796   99.403371
Characteristics.LotSizeSquareFeet               105747   98.426985
Location.Address.PostalCodePlus4                104166   96.955425
UnitTypes.UnitTypeType                          102461   95.368448
Tax.Zoning                                       99939   93.021026
Structure.BelowGradeUnfinishedArea               95763   89.134097
Structure.ParkingFeatures                        93880   87.381442
Structure.BelowGradeFinishedArea                 93202   86.750375
Location.Address.UnitNumber                      82894   77.155915
Location.Area.SubdivisionName                    70896   65.988440
Characteristics.LotFeatures                      65417   60.888707
Location.Address.StreetDirectionPrefix           58743   54.676694
Structure.FireplacesTotal                        56221   52.329272
ImageData.q1q6.summary.exterior                  24872   23.15

In [1094]:
# Inspect the unique values of the street direction suffix
print(train['Location.Address.StreetDirectionSuffix'].unique())

# Inspect the unique values of the street direction prefix
print(train['Location.Address.StreetDirectionPrefix'].unique())

# Merge prefix and suffix into a single street-direction column, then drop the
# two source columns, for both splits.
for frame in (train, valid):
    prefix = frame['Location.Address.StreetDirectionPrefix'].fillna('')
    suffix = frame['Location.Address.StreetDirectionSuffix'].fillna('')
    # Rows with neither prefix nor suffix produce '' and are marked as missing.
    # np.nan is used rather than None: replace(..., None) is version dependent
    # in pandas (older releases treat it as "no value given" and forward-fill).
    frame['Location.Address.StreetDirection'] = (prefix + suffix).replace('', np.nan)
    frame.drop(
        columns=['Location.Address.StreetDirectionPrefix', 'Location.Address.StreetDirectionSuffix'],
        inplace=True,
    )





[nan 'e' 'n' 'w' 's' 'se' 'sw' 'ne' 'nw']
[nan 'e' 's' 'n' 'w' 'sw' 'ne' 'nw' 'se']


In [1095]:
# Count the missing values of the new street-direction variable
print(train['Location.Address.StreetDirection'].isnull().sum())
print(valid['Location.Address.StreetDirection'].isnull().sum())

# Compute the fraction of missing values in each split
print(train['Location.Address.StreetDirection'].isnull().sum()/len(train))
print(valid['Location.Address.StreetDirection'].isnull().sum()/len(valid))

46632
11692
0.5425543054602148
0.5441176470588235


In [1096]:
# Drop the lot-size and ZIP+4 columns from both splits
for frame in (train, valid):
    frame.drop(columns=['Characteristics.LotSizeSquareFeet', 'Location.Address.PostalCodePlus4'], inplace=True)

In [1097]:
# Drop two more columns that are no longer needed
for frame in (train, valid):
    frame.drop(columns=['UnitTypes.UnitTypeType', 'Tax.Zoning'], inplace=True)

# Rebuild the missing-value summary (count and percentage) for the training split
missing_values = train.isnull().sum()
missing_values_percent = missing_values / len(train) * 100
missing_values_table = (
    pd.DataFrame({"Missing Values": missing_values, "Percentage": missing_values_percent})
    .loc[lambda table: table["Missing Values"] > 0]
    .sort_values(by="Missing Values", ascending=False)
)
print(missing_values_table)


                                        Missing Values  Percentage
Structure.BelowGradeUnfinishedArea               76661   89.193592
Structure.ParkingFeatures                        75135   87.418120
Structure.BelowGradeFinishedArea                 74587   86.780533
Location.Address.UnitNumber                      66274   77.108518
Location.Area.SubdivisionName                    56740   66.015893
Characteristics.LotFeatures                      52372   60.933810
Location.Address.StreetDirection                 46632   54.255431
Structure.FireplacesTotal                        45036   52.398515
ImageData.q1q6.summary.exterior                  19907   23.161410
ImageData.style.stories.summary.label            19231   22.374897
ImageData.style.exterior.summary.label           19078   22.196884
ImageData.c1c6.summary.exterior                  15699   18.265483
Structure.GarageSpaces                           14994   17.445229
ImageData.q1q6.summary.bathroom                  13415   15.60

In [1098]:
# Turn Structure.ParkingFeatures into a presence flag: 1 when a value exists, 0 otherwise
for frame in (train, valid):
    has_parking_info = frame['Structure.ParkingFeatures'].notnull()
    frame['Structure.ParkingFeatures'] = has_parking_info.astype(int)

# Rebuild the missing-value summary (count and percentage) for the training split
missing_values = train.isnull().sum()
missing_values_percent = missing_values / len(train) * 100
missing_values_table = (
    pd.DataFrame({"Missing Values": missing_values, "Percentage": missing_values_percent})
    .loc[lambda table: table["Missing Values"] > 0]
    .sort_values(by="Missing Values", ascending=False)
)
print(missing_values_table)



                                        Missing Values  Percentage
Structure.BelowGradeUnfinishedArea               76661   89.193592
Structure.BelowGradeFinishedArea                 74587   86.780533
Location.Address.UnitNumber                      66274   77.108518
Location.Area.SubdivisionName                    56740   66.015893
Characteristics.LotFeatures                      52372   60.933810
Location.Address.StreetDirection                 46632   54.255431
Structure.FireplacesTotal                        45036   52.398515
ImageData.q1q6.summary.exterior                  19907   23.161410
ImageData.style.stories.summary.label            19231   22.374897
ImageData.style.exterior.summary.label           19078   22.196884
ImageData.c1c6.summary.exterior                  15699   18.265483
Structure.GarageSpaces                           14994   17.445229
ImageData.q1q6.summary.bathroom                  13415   15.608093
ImageData.c1c6.summary.bathroom                  13386   15.57

In [1099]:
# Mark empty Location.Area.SubdivisionName strings as missing.
# np.nan is used rather than None: replace('', None) is version dependent in
# pandas (older releases treat None as "no value given" and forward-fill).
for frame in (train, valid):
    frame['Location.Area.SubdivisionName'] = frame['Location.Area.SubdivisionName'].replace('', np.nan)

# Rebuild the missing-value summary (count and percentage) for the training split
missing_values = train.isnull().sum()
missing_values_percent = missing_values / len(train) * 100
missing_values_table = pd.concat([missing_values, missing_values_percent], axis=1)
missing_values_table.columns = ["Missing Values", "Percentage"]
missing_values_table = missing_values_table[missing_values_table["Missing Values"] > 0]
missing_values_table = missing_values_table.sort_values(by="Missing Values", ascending=False)
print(missing_values_table)


                                        Missing Values  Percentage
Structure.BelowGradeUnfinishedArea               76661   89.193592
Structure.BelowGradeFinishedArea                 74587   86.780533
Location.Address.UnitNumber                      66274   77.108518
Location.Area.SubdivisionName                    56740   66.015893
Characteristics.LotFeatures                      52372   60.933810
Location.Address.StreetDirection                 46632   54.255431
Structure.FireplacesTotal                        45036   52.398515
ImageData.q1q6.summary.exterior                  19907   23.161410
ImageData.style.stories.summary.label            19231   22.374897
ImageData.style.exterior.summary.label           19078   22.196884
ImageData.c1c6.summary.exterior                  15699   18.265483
Structure.GarageSpaces                           14994   17.445229
ImageData.q1q6.summary.bathroom                  13415   15.608093
ImageData.c1c6.summary.bathroom                  13386   15.57

In [1100]:
# Drop Location.Address.UnitNumber from both splits
for frame in (train, valid):
    frame.drop(columns=['Location.Address.UnitNumber'], inplace=True)


In [1101]:
# Show the unique values of Location.Address.StateOrProvince, then drop the column
# from both splits (the recorded output lists a single value, 'il').
print(train['Location.Address.StateOrProvince'].unique())
for frame in (train, valid):
    frame.drop(columns=['Location.Address.StateOrProvince'], inplace=True)

['il']


In [1102]:
# Missing-value summary (count and percentage) for the training split
missing_values = train.isnull().sum()
missing_values_percent = missing_values / len(train) * 100
missing_values_table = (
    pd.DataFrame({"Missing Values": missing_values, "Percentage": missing_values_percent})
    .loc[lambda table: table["Missing Values"] > 0]
    .sort_values(by="Missing Values", ascending=False)
)
print(missing_values_table)

                                        Missing Values  Percentage
Structure.BelowGradeUnfinishedArea               76661   89.193592
Structure.BelowGradeFinishedArea                 74587   86.780533
Location.Area.SubdivisionName                    56740   66.015893
Characteristics.LotFeatures                      52372   60.933810
Location.Address.StreetDirection                 46632   54.255431
Structure.FireplacesTotal                        45036   52.398515
ImageData.q1q6.summary.exterior                  19907   23.161410
ImageData.style.stories.summary.label            19231   22.374897
ImageData.style.exterior.summary.label           19078   22.196884
ImageData.c1c6.summary.exterior                  15699   18.265483
Structure.GarageSpaces                           14994   17.445229
ImageData.q1q6.summary.bathroom                  13415   15.608093
ImageData.c1c6.summary.bathroom                  13386   15.574352
ImageData.q1q6.summary.kitchen                   12165   14.15

In [1103]:
# Structure.Basement becomes a flag: 0 when the value is missing or says "none", 1 otherwise.
# Both the plain string 'none' and the stringified list "['none']" are treated as
# "no basement". NOTE(review): the raw cell format is not visible here; the second
# form is included because the original comment names ['none'] as the value to
# zero out. Confirm against the data.
# np.nan is used instead of None because replace(..., None) is version dependent in pandas.
for frame in (train, valid):
    basement = frame['Structure.Basement'].replace(['none', "['none']"], np.nan)
    frame['Structure.Basement'] = basement.notnull().astype(int)


In [1104]:
# Missing Structure.GarageSpaces values are set to 0
for frame in (train, valid):
    frame['Structure.GarageSpaces'] = frame['Structure.GarageSpaces'].fillna(0)



In [1105]:
# cogemos de esta variable ImageData.style.stories.summary.label el digito del valor de la variable (siempre es el primero) y los nulos a 1
# lo hacemos con re.sub
import re

# Turn a stories label into a number by keeping only its digits
def extract_and_adjust(value):
    """Convert a label to a number built from the digits it contains.

    All digit characters of ``str(value)`` are concatenated and read as one
    number (so '1.5_stories' gives 15). A result above 10 is divided by 10
    (15 -> 1.5); otherwise it is returned as a float. When the text contains
    no digit at all (for example a missing value) the default 1 is returned.
    """
    digits_only = ''.join(re.findall(r'\d', str(value)))
    if not digits_only:
        return 1  # default when no digit is present
    parsed = float(digits_only)
    # Values above 10 come from labels whose decimal point was stripped
    return parsed / 10 if parsed > 10 else float(parsed)

# Apply the function to the stories label column of both splits
train['ImageData.style.stories.summary.label'] = train['ImageData.style.stories.summary.label'].apply(extract_and_adjust)
valid['ImageData.style.stories.summary.label'] = valid['ImageData.style.stories.summary.label'].apply(extract_and_adjust)

# Print the unique values of the converted variable
print(train['ImageData.style.stories.summary.label'].unique())

[2.5 1.  2.  1.5 3. ]


In [1106]:
from fancyimpute import IterativeImputer

# Columns whose missing values are imputed
columns_to_impute = [
    "ImageData.q1q6.summary.bathroom",
    "ImageData.c1c6.summary.bathroom",
    "ImageData.q1q6.summary.kitchen",
    "ImageData.c1c6.summary.kitchen",
    "ImageData.q1q6.summary.interior",
    "ImageData.c1c6.summary.interior",
    "ImageData.q1q6.summary.exterior",
    "ImageData.c1c6.summary.exterior",
    "ImageData.q1q6.summary.property",
    "ImageData.c1c6.summary.property",
    "Structure.LivingArea"
]

# Split the requested columns into those present in the training DataFrame and those absent
columns_in_train = [col for col in columns_to_impute if col in train.columns]
missing_columns = [col for col in columns_to_impute if col not in train.columns]

# Report any requested column that is absent
if missing_columns:
    print(f"Estas columnas no están en el DataFrame: {missing_columns}")

# Fit the imputer on the training split only, then apply the fitted imputer to the validation split
imputer = IterativeImputer()
imputed_data = imputer.fit_transform(train[columns_in_train])
data_val = imputer.transform(valid[columns_in_train])

# Write the imputed values back into the original columns
train[columns_in_train] = imputed_data
valid[columns_in_train] = data_val



In [1107]:
import numpy as np

# Combine the two image-score variants (c1c6 and q1q6) by averaging them after imputation.
# Previously the mean of the *property* scores was stored under the name
# 'ImageData.summary.exterior', and the exterior scores were never combined.
# Each mean now goes into the column that matches its source.
for frame in (train, valid):
    frame['ImageData.summary.property'] = (
        frame['ImageData.c1c6.summary.property'] + frame['ImageData.q1q6.summary.property']
    ) / 2
    frame['ImageData.summary.exterior'] = (
        frame['ImageData.c1c6.summary.exterior'] + frame['ImageData.q1q6.summary.exterior']
    ) / 2

    # Drop the property source columns, as before. The exterior source columns
    # are kept because the original cell never removed them.
    frame.drop(columns=['ImageData.c1c6.summary.property', 'ImageData.q1q6.summary.property'], inplace=True)



In [1108]:
# Combine ImageData.c1c6.summary.interior and ImageData.q1q6.summary.interior into
# their mean, then drop the two source columns
for frame in (train, valid):
    c1c6_score = frame['ImageData.c1c6.summary.interior'].astype(float)
    q1q6_score = frame['ImageData.q1q6.summary.interior'].astype(float)
    frame['ImageData.summary.interior'] = (c1c6_score + q1q6_score) / 2
    frame.drop(columns=['ImageData.c1c6.summary.interior', 'ImageData.q1q6.summary.interior'], inplace=True)

In [1109]:
# Combine the q1q6 and c1c6 bathroom scores into their mean, then drop the two source columns
for frame in (train, valid):
    q1q6_score = frame['ImageData.q1q6.summary.bathroom'].astype(float)
    c1c6_score = frame['ImageData.c1c6.summary.bathroom'].astype(float)
    frame['ImageData.summary.bathroom'] = (q1q6_score + c1c6_score) / 2
    frame.drop(columns=['ImageData.q1q6.summary.bathroom', 'ImageData.c1c6.summary.bathroom'], inplace=True)


In [1110]:
# Combine the q1q6 and c1c6 kitchen scores into their mean, then drop the two source columns
for frame in (train, valid):
    q1q6_score = frame['ImageData.q1q6.summary.kitchen'].astype(float)
    c1c6_score = frame['ImageData.c1c6.summary.kitchen'].astype(float)
    frame['ImageData.summary.kitchen'] = (q1q6_score + c1c6_score) / 2
    frame.drop(columns=['ImageData.q1q6.summary.kitchen', 'ImageData.c1c6.summary.kitchen'], inplace=True)

In [1111]:
# Missing Location.Address.CountyOrParish and Location.Address.StreetName values
# become the explicit category 'Unknown'
for frame in (train, valid):
    for column in ('Location.Address.CountyOrParish', 'Location.Address.StreetName'):
        frame[column] = frame[column].replace(np.nan, 'Unknown')


In [1112]:
# %pip (instead of !pip) installs into the environment of the running kernel
%pip install geopy



In [1113]:
# Missing ImageData.style.exterior.summary.label values become 'Unknown'
for frame in (train, valid):
    frame['ImageData.style.exterior.summary.label'] = frame['ImageData.style.exterior.summary.label'].replace(np.nan, 'Unknown')

In [1114]:
# Swap the contents of the latitude and longitude columns in both splits.
# NOTE(review): presumably the source data stores them interchanged; confirm.
# Running this cell twice swaps the columns back.
for frame in (train, valid):
    longitude_values = frame['Location.GIS.Longitude'].copy()
    latitude_values = frame['Location.GIS.Latitude'].copy()
    frame['Location.GIS.Latitude'] = longitude_values
    frame['Location.GIS.Longitude'] = latitude_values



In [1115]:
# Missing Location.Address.StreetSuffix values become 'Unknown'
for frame in (train, valid):
    frame['Location.Address.StreetSuffix'] = frame['Location.Address.StreetSuffix'].replace(np.nan, 'Unknown')


In [1116]:
# Structure.NewConstructionYN: 1 when the house is at most one year old, 0 otherwise.
# Structure.YearBuilt still holds the calendar year here (it is converted to an
# age only in a later cell), so the age is derived in place. Comparing the raw
# year with 1 flagged only rows whose year value is 0 or 1.
# Rows with a missing year get 0.
for frame in (train, valid):
    house_age = 2024 - pd.to_numeric(frame['Structure.YearBuilt'], errors='coerce')
    frame['Structure.NewConstructionYN'] = (house_age <= 1).astype(int)

In [1117]:
# Drop the street number and map empty street names to 'Unknown'
for frame in (train, valid):
    frame.drop(columns=['Location.Address.StreetNumber'], inplace=True)

for frame in (train, valid):
    frame['Location.Address.StreetName'] = frame['Location.Address.StreetName'].replace('', 'Unknown')


In [1118]:
import ast

# Parse strings that look like Python literals and keep the first element of lists
def convert_to_first_value(x):
    """Return the first element when ``x`` is (or parses to) a non-empty list.

    Strings are passed through ``ast.literal_eval``; a string that is not a
    valid literal is kept unchanged. Anything that is not a non-empty list
    after that step is returned as is.

    Note: a numeric-looking string such as "200" parses to the int 200, while
    "['158']" yields the string '158', so the output can mix ints and strings.
    """
    parsed = x
    if isinstance(x, str):
        try:
            parsed = ast.literal_eval(x)
        except (ValueError, SyntaxError):
            parsed = x  # not a literal: keep the raw string
    return parsed[0] if isinstance(parsed, list) and len(parsed) > 0 else parsed

# Apply the function to the 'Location.School.HighSchoolDistrict' column of both splits
train['Location.School.HighSchoolDistrict'] = train['Location.School.HighSchoolDistrict'].apply(convert_to_first_value)
valid['Location.School.HighSchoolDistrict'] = valid['Location.School.HighSchoolDistrict'].apply(convert_to_first_value)

# Print the unique values after the conversion
print(train['Location.School.HighSchoolDistrict'].unique())



[200 '158' 308 '154' 155 4 299 227 219 205 60 nan 125 202 203 '299' '60'
 '207' 99 101 12 '202' 225 120 '87' 307 '140' 68 204 207 430 '5' 100 233
 118 '208' '46' 211 '424' 303 302 217 229 215 25 201 '210' 129 '215' 220
 '214' 158 '155' '120' 212 '116' 5 '61' '129' 1 '127' 46 170 90 122 427
 '101' 117 137 209 111 '99' 228 131 214 '225' 156 230 '200' 127 88 '401'
 126 432 '205' '125' '212' 300 '201' 208 '204' 87 16 502 61 '201u' '4' '1'
 '126' 121 210 124 '218' '86' 94 130 218 'chris' 40 '8' 206 223 301 73 116
 '115' '3' 86 95 128 '200u' 113 500 143.5 '111' 145 424 11 '170' '255u'
 '128' '88' 10 '117' '228' 2 'other' '100' '209' 187 '230' '303' 426 '701'
 '301' 231 108 '229' '233' '211' '219' '427' 193 18 154 '203' '95' '275'
 '307' '193' 140 '207u' '308' '15' '220' 305 428 304 323 157 '365u' '227'
 15 '300' '122' 21 '108' 3 '304' 7 '96' '217' '6' 221 54 160 '6-j' '302'
 '305' '131' '2' 401 '113' '18' '118' 115 74 6 314 '137' '21' '121' '156'
 165 425 8 '206' 9 '124' 50 152 429 '161' '53

In [1119]:
# Silence all warnings from this point on
import warnings
warnings.filterwarnings('ignore')



In [1120]:
import pandas as pd
from geopy.distance import geodesic
from collections import defaultdict
import numpy as np

# Distance between two (latitude, longitude) coordinates
def calculate_distance(lat1, lon1, lat2, lon2):
    """Return the geodesic distance, in kilometres, between the two points."""
    origin = (lat1, lon1)
    destination = (lat2, lon2)
    return geodesic(origin, destination).km

# Group the training houses by census block, keeping their coordinates and school district
def precalculate_distances(train_df):
    """Index houses by ``Location.Address.CensusBlock``.

    Returns a ``defaultdict(list)`` mapping each census block to a list of
    ``(latitude, longitude, school_district)`` tuples, in row order. Rows whose
    latitude, longitude or census block is missing are skipped. The school
    district is stored as found, including missing values. No distance is
    computed here.
    """
    print('precalculate_distances')
    distances = defaultdict(list)

    latitudes = train_df['Location.GIS.Latitude']
    longitudes = train_df['Location.GIS.Longitude']
    blocks = train_df['Location.Address.CensusBlock']
    schools = train_df['Location.School.HighSchoolDistrict']

    # Only rows with coordinates and a census block are usable
    usable = latitudes.notna() & longitudes.notna() & blocks.notna()

    for house_lat, house_lon, block, school_name in zip(
        latitudes[usable], longitudes[usable], blocks[usable], schools[usable]
    ):
        distances[block].append((house_lat, house_lon, school_name))

    return distances

# School district of the nearest house, in the same census block, that has a known district
def get_nearest_school(lat, lon, postal_code, distances):
    """Return the district of the closest house with a known district.

    Parameters
    ----------
    lat, lon : coordinates of the house being filled.
    postal_code : key into ``distances`` (the caller passes the census block).
    distances : mapping of key -> list of ``(lat, lon, school_name)`` tuples.

    Returns
    -------
    The ``school_name`` of the nearest candidate, or ``'unknown'`` when the key
    is absent, the query coordinates are missing, or no candidate under that
    key has a usable district.

    Candidates with a missing district are discarded *before* the nearest one
    is chosen. A truthiness test alone does not do that, because NaN is truthy:
    the closest house (often the query house itself, at distance 0) would win
    even though its district is NaN.
    """
    if postal_code not in distances:
        return 'unknown'  # no house recorded under this key

    # Without coordinates a distance cannot be computed
    if pd.isna(lat) or pd.isna(lon):
        return 'unknown'

    # Keep only houses with a usable district (not NaN/None, not empty)
    candidates = [
        entry for entry in distances[postal_code]
        if pd.notna(entry[2]) and entry[2]
    ]
    if not candidates:
        return 'unknown'

    # min() returns the first of equally distant candidates, as a stable sort would
    nearest = min(candidates, key=lambda entry: calculate_distance(lat, lon, entry[0], entry[1]))
    return nearest[2]

# Fill the missing values of 'Location.School.HighSchoolDistrict'
def fill_missing_districts(df, distances):
    """Fill missing school districts from nearby houses, in place.

    For every row whose district is missing, ``get_nearest_school`` is called
    with the row's latitude, longitude and census block. Any value still
    missing afterwards is set to ``'unknown'``.

    Parameters
    ----------
    df : DataFrame to fill; it is modified in place and also returned.
    distances : lookup structure passed through to ``get_nearest_school``.

    Returns
    -------
    The same DataFrame object, with no missing district left.
    """
    print('fill_missing_districts')

    district_col = 'Location.School.HighSchoolDistrict'
    missing_idx = df[district_col].isna()

    # Skip the lookup when nothing is missing: a row-wise apply on an empty
    # selection returns a DataFrame instead of a Series, which cannot be
    # assigned to a single column reliably.
    if missing_idx.any():
        df.loc[missing_idx, district_col] = df[missing_idx].apply(
            lambda row: get_nearest_school(row['Location.GIS.Latitude'],
                                           row['Location.GIS.Longitude'],
                                           row['Location.Address.CensusBlock'],
                                           distances),
            axis=1
        )

    # Replace any remaining missing value with "unknown"
    df[district_col] = df[district_col].fillna('unknown')
    return df

# Build the per-census-block lookup from the training DataFrame
distances = precalculate_distances(train)

# Fill the missing districts in 'train' and 'valid' using the lookup built from 'train'
train = fill_missing_districts(train, distances)
valid = fill_missing_districts(valid, distances)

# Check that no missing district is left
assert train['Location.School.HighSchoolDistrict'].isna().sum() == 0, "Quedan valores faltantes en train"
assert valid['Location.School.HighSchoolDistrict'].isna().sum() == 0, "Quedan valores faltantes en valid"

print("Todos los valores faltantes han sido rellenados.")


precalculate_distances
fill_missing_districts
fill_missing_districts
Todos los valores faltantes han sido rellenados.


In [1121]:
# Missing half and full bathroom counts are set to 0
for frame in (train, valid):
    for column in ('Structure.BathroomsHalf', 'Structure.BathroomsFull'):
        frame[column] = frame[column].fillna(0)


In [1122]:
# Missing Structure.Rooms.RoomsTotal values are set to 0
for frame in (train, valid):
    frame['Structure.Rooms.RoomsTotal'] = frame['Structure.Rooms.RoomsTotal'].fillna(0)

In [1123]:
# Missing Structure.BedroomsTotal values are set to 0
for frame in (train, valid):
    frame['Structure.BedroomsTotal'] = frame['Structure.BedroomsTotal'].fillna(0)


In [1124]:
# Missing Structure.FireplacesTotal values are set to 0
for frame in (train, valid):
    frame['Structure.FireplacesTotal'] = frame['Structure.FireplacesTotal'].fillna(0)




In [1125]:
# Missing longitude/latitude values are filled with the mean of the same
# Location.Address.PostalCode, computed within each split
for frame in (train, valid):
    for column in ('Location.GIS.Longitude', 'Location.GIS.Latitude'):
        postal_code_mean = frame.groupby('Location.Address.PostalCode')[column].transform('mean')
        frame[column] = frame[column].fillna(postal_code_mean)

In [1126]:
def fill_mode(group):
    """Return the most frequent value of ``group``.

    When several values tie, the first one reported by ``Series.mode`` is used.
    When the mode is empty (for example, every value is missing) the first
    element of the group is returned instead.
    """
    modes = group.mode()
    return group.iloc[0] if modes.empty else modes.iloc[0]

# Fill only the *missing* census block / tract values with the most frequent
# value of the same postal code. Assigning the result of transform(fill_mode)
# directly would overwrite every row, including rows that already have a value,
# with the postal code's mode.
for frame in (train, valid):
    for column in ('Location.Address.CensusBlock', 'Location.Address.CensusTract'):
        postal_code_mode = frame.groupby('Location.Address.PostalCode')[column].transform(fill_mode)
        frame[column] = frame[column].fillna(postal_code_mode)


In [1127]:
# Drop the rows whose Location.Address.CensusBlock or Location.Address.CensusTract
# is still missing.
census_cols = ['Location.Address.CensusBlock', 'Location.Address.CensusTract']
train = train.dropna(subset=census_cols)
valid = valid.dropna(subset=census_cols)


In [1128]:
# Structure.BelowGradeUnfinishedArea / Structure.BelowGradeFinishedArea:
# replace missing values with 0 in both splits.
for split_df in (train, valid):
    for area_col in ('Structure.BelowGradeUnfinishedArea', 'Structure.BelowGradeFinishedArea'):
        split_df[area_col] = split_df[area_col].fillna(0)



In [1129]:
# Location.Area.SubdivisionName: missing values become the placeholder 'Unknown'
# (to be replaced by something better in the future).
subdivision_col = 'Location.Area.SubdivisionName'
train[subdivision_col] = train[subdivision_col].fillna('Unknown')
valid[subdivision_col] = valid[subdivision_col].fillna('Unknown')



In [1130]:
# Location.Address.StreetDirection: missing values become the placeholder 'Unknown'.
for split_df in (train, valid):
    split_df['Location.Address.StreetDirection'] = split_df['Location.Address.StreetDirection'].fillna('Unknown')

In [1131]:
# Structure.FireplacesTotal: replace missing values with 0.
# The same fill is already applied in an earlier cell of this notebook, so this
# one leaves the column unchanged when the notebook is run top to bottom.
for split_df in (train, valid):
    split_df['Structure.FireplacesTotal'] = split_df['Structure.FireplacesTotal'].fillna(0)


In [1132]:
# Replace the construction year by the age of the house (reference year - year built).
# The column keeps its original name, so running this cell twice would turn the
# ages back into years: run it exactly once per kernel session.
REFERENCE_YEAR = 2024
train['Structure.YearBuilt'] = REFERENCE_YEAR - train['Structure.YearBuilt']
valid['Structure.YearBuilt'] = REFERENCE_YEAR - valid['Structure.YearBuilt']

# Missing ages are imputed with the median age. The median is computed on the
# training split only and reused for validation, so that no statistic of the
# validation data leaks into its own preprocessing.
train_median_age = train['Structure.YearBuilt'].median()
train['Structure.YearBuilt'] = train['Structure.YearBuilt'].fillna(train_median_age)
valid['Structure.YearBuilt'] = valid['Structure.YearBuilt'].fillna(train_median_age)





In [1133]:
import pandas as pd
import ast  # Para convertir cadenas en listas
from sklearn.preprocessing import MultiLabelBinarizer
from scipy.stats import spearmanr
from scipy.stats import f_oneway

# Step 1: turn every cell of the column into a real Python list.
# Strings are parsed with ast.literal_eval and missing values become ['Unknown'].
# The type checks run before pd.isna() because pd.isna() on a list returns an
# array whose truth value is ambiguous; with the NaN check first, re-running
# this cell on already-parsed lists raised ValueError.
train['ImageData.features_reso.results'] = train['ImageData.features_reso.results'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str)
    else (x if isinstance(x, list) else (['Unknown'] if pd.isna(x) else x))
)

# Step 2: one-hot encode the lists. `mlb` stays fitted on train so that the
# validation cell that follows can reuse it.
mlb = MultiLabelBinarizer()
binary_features = mlb.fit_transform(train['ImageData.features_reso.results'])
binary_df = pd.DataFrame(binary_features, columns=mlb.classes_, index=train.index)

# Step 3: suffix the indicator columns and join them to train in a scratch frame.
# Indicator columns left in train by a previous run of this cell are dropped
# first (a no-op on the first run) so that re-running never duplicates columns.
# Note that this rebinds the notebook-level name `df`.
binary_df.columns = [col + '_features' for col in binary_df.columns]
train_base = train.drop(columns=binary_df.columns, errors='ignore')
df = pd.concat([train_base, binary_df], axis=1)

# Step 4: one-way ANOVA of the close price between the rows without (0) and
# with (1) each indicator.
anova_results = {}
for column in binary_df.columns:
    group_0 = df.loc[df[column] == 0, 'Listing.Price.ClosePrice']
    group_1 = df.loc[df[column] == 1, 'Listing.Price.ClosePrice']

    # Both groups need at least two observations for the test to be computed.
    if len(group_0) > 1 and len(group_1) > 1:
        f_stat, p_value = f_oneway(group_0, group_1)
        anova_results[column] = (f_stat, p_value)
    else:
        anova_results[column] = (None, None)  # not enough data for the ANOVA

# Step 5: sort by F statistic, largest first (untested indicators count as 0).
sorted_anova = sorted(anova_results.items(), key=lambda x: x[1][0] if x[1][0] is not None else 0, reverse=True)

# Step 6: keep the indicators whose p-value is below the significance threshold.
threshold_p_value = 0.05
important_features = [feature for feature, (f_stat, p_value) in anova_results.items() if p_value is not None and p_value < threshold_p_value]

# Report the results.
print("Resultados de ANOVA ordenados (por estadístico F):")
for feature, (f_stat, p_value) in sorted_anova:
    if f_stat is not None:
        print(f"{feature}: Estadístico F={f_stat:.3f}, p-valor={p_value:.3f}")

print("\nCaracterísticas importantes (p-valor < 0.05):", important_features)

# Only the significant indicators are added to the training split.
significant_binary_df = binary_df[important_features]
train = pd.concat([train_base, significant_binary_df], axis=1)


Resultados de ANOVA ordenados (por estadístico F):
Appliances.BuiltInRefrigerator_features: Estadístico F=7089.580, p-valor=0.000
InteriorOrRoomFeatures.KitchenIsland_features: Estadístico F=6485.576, p-valor=0.000
Appliances.Oven_features: Estadístico F=6245.462, p-valor=0.000
InteriorOrRoomFeatures.DoubleVanity_features: Estadístico F=5151.486, p-valor=0.000
Appliances.WineCooler_features: Estadístico F=4857.708, p-valor=0.000
Appliances.Cooktop_features: Estadístico F=4840.727, p-valor=0.000
InteriorOrRoomFeatures.BreakfastBar_features: Estadístico F=4724.784, p-valor=0.000
Appliances.GasCooktop_features: Estadístico F=4717.824, p-valor=0.000
Appliances.DoubleOven_features: Estadístico F=3781.025, p-valor=0.000
Heating.Fireplaces_features: Estadístico F=3627.259, p-valor=0.000
InteriorOrRoomFeatures.BuiltInFeatures_features: Estadístico F=3494.007, p-valor=0.000
InteriorOrRoomFeatures.TrayCeilings_features: Estadístico F=3380.117, p-valor=0.000
DoorFeatures.FrenchDoors_features: Est

In [1134]:
import pandas as pd
import ast  # Para convertir cadenas en listas
from sklearn.preprocessing import MultiLabelBinarizer

# Step 1: turn every cell of the validation column into a real Python list.
# Strings are parsed with ast.literal_eval and missing values become ['Unknown'].
# The type checks run before pd.isna() because pd.isna() on a list returns an
# array whose truth value is ambiguous (ValueError when the cell is re-run).
valid['ImageData.features_reso.results'] = valid['ImageData.features_reso.results'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str)
    else (x if isinstance(x, list) else (['Unknown'] if pd.isna(x) else x))
)

# Step 2: encode with the MultiLabelBinarizer fitted on train (`mlb` from the
# previous cell), so both splits share the same classes.
binary_features_valid = mlb.transform(valid['ImageData.features_reso.results'])
binary_df_valid = pd.DataFrame(binary_features_valid, columns=mlb.classes_, index=valid.index)

# Step 3: mirror the training cell. The indicator columns get the same
# '_features' suffix and only the columns selected there (`important_features`)
# are kept; otherwise validation ends up with columns that train does not have
# and lacks the ones train does have.
binary_df_valid.columns = [col + '_features' for col in binary_df_valid.columns]
binary_df_valid = binary_df_valid[important_features]
# Dropping first is a no-op on the first run and prevents duplicated columns on re-runs.
valid = pd.concat([valid.drop(columns=binary_df_valid.columns, errors='ignore'), binary_df_valid], axis=1)

# Show the validation frame with the new indicator columns.
print(valid.head())



                             Characteristics.LotFeatures  \
32045                                                NaN   
64911                                                NaN   
60627  ['pond(s)', 'water view', 'sidewalks', 'street...   
97370                                                NaN   
2323                                                 NaN   

       ImageData.c1c6.summary.exterior  \
32045                          3.06213   
64911                          3.50000   
60627                          3.00000   
97370                          4.00000   
2323                           1.00000   

                         ImageData.features_reso.results  \
32045  [Appliances.Dishwasher, Appliances.ElectricRan...   
64911  [Appliances.GasRange, Appliances.Range, Applia...   
60627  [Appliances.Dishwasher, Appliances.GasRange, A...   
97370  [Appliances.Dishwasher, Appliances.Refrigerato...   
2323   [Appliances.Dishwasher, Appliances.Microwave, ...   

       ImageData.q1q6.sum

In [1135]:
import pandas as pd
import ast  # Para convertir cadenas en listas
from sklearn.preprocessing import MultiLabelBinarizer
from scipy.stats import spearmanr
from scipy.stats import f_oneway

# Step 1: turn every cell of the column into a real Python list.
# Strings are parsed with ast.literal_eval and missing values become ['Unknown'].
# The type checks run before pd.isna() because pd.isna() on a list returns an
# array whose truth value is ambiguous; with the NaN check first, re-running
# this cell on already-parsed lists raised ValueError.
train['Characteristics.LotFeatures'] = train['Characteristics.LotFeatures'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str)
    else (x if isinstance(x, list) else (['Unknown'] if pd.isna(x) else x))
)

# Step 2: one-hot encode the lists. `mlb` stays fitted on train so that the
# validation cell that follows can reuse it.
mlb = MultiLabelBinarizer()
binary_features = mlb.fit_transform(train['Characteristics.LotFeatures'])
binary_df = pd.DataFrame(binary_features, columns=mlb.classes_, index=train.index)

# Step 3: suffix the indicator columns and join them to train in a scratch frame.
# Indicator columns left in train by a previous run of this cell are dropped
# first (a no-op on the first run) so that re-running never duplicates columns.
# Note that this rebinds the notebook-level name `df`.
binary_df.columns = [col + '_characteristic' for col in binary_df.columns]
train_base = train.drop(columns=binary_df.columns, errors='ignore')
df = pd.concat([train_base, binary_df], axis=1)

# Step 4: one-way ANOVA of the close price between the rows without (0) and
# with (1) each indicator.
anova_results = {}
for column in binary_df.columns:
    group_0 = df.loc[df[column] == 0, 'Listing.Price.ClosePrice']
    group_1 = df.loc[df[column] == 1, 'Listing.Price.ClosePrice']

    # Both groups need at least two observations for the test to be computed.
    if len(group_0) > 1 and len(group_1) > 1:
        f_stat, p_value = f_oneway(group_0, group_1)
        anova_results[column] = (f_stat, p_value)
    else:
        anova_results[column] = (None, None)  # not enough data for the ANOVA

# Step 5: sort by F statistic, largest first (untested indicators count as 0).
sorted_anova = sorted(anova_results.items(), key=lambda x: x[1][0] if x[1][0] is not None else 0, reverse=True)

# Step 6: keep the indicators whose p-value is below the significance threshold.
threshold_p_value = 0.05
important_features = [feature for feature, (f_stat, p_value) in anova_results.items() if p_value is not None and p_value < threshold_p_value]

# Report the results.
print("Resultados de ANOVA ordenados (por estadístico F):")
for feature, (f_stat, p_value) in sorted_anova:
    if f_stat is not None:
        print(f"{feature}: Estadístico F={f_stat:.3f}, p-valor={p_value:.3f}")

print("\nCaracterísticas importantes (p-valor < 0.05):", important_features)

# Only the significant indicators are added to the training split.
significant_binary_df = binary_df[important_features]
train = pd.concat([train_base, significant_binary_df], axis=1)

Resultados de ANOVA ordenados (por estadístico F):
landscaped_characteristic: Estadístico F=1776.998, p-valor=0.000
Unknown_characteristic: Estadístico F=517.760, p-valor=0.000
outdoor lighting_characteristic: Estadístico F=409.883, p-valor=0.000
mature trees_characteristic: Estadístico F=341.788, p-valor=0.000
wooded_characteristic: Estadístico F=293.684, p-valor=0.000
cul-de-sac_characteristic: Estadístico F=278.130, p-valor=0.000
water view_characteristic: Estadístico F=194.392, p-valor=0.000
lake front_characteristic: Estadístico F=163.086, p-valor=0.000
beach_characteristic: Estadístico F=159.081, p-valor=0.000
common grounds_characteristic: Estadístico F=155.106, p-valor=0.000
horses allowed_characteristic: Estadístico F=137.428, p-valor=0.000
waterfront_characteristic: Estadístico F=100.232, p-valor=0.000
fenced yard_characteristic: Estadístico F=89.390, p-valor=0.000
lake access_characteristic: Estadístico F=85.743, p-valor=0.000
golf course lot_characteristic: Estadístico F=84

In [1136]:
import pandas as pd
import ast  # Para convertir cadenas en listas
from sklearn.preprocessing import MultiLabelBinarizer

# Step 1: turn every cell of the validation column into a real Python list.
# Strings are parsed with ast.literal_eval and missing values become ['Unknown'].
# The type checks run before pd.isna() because pd.isna() on a list returns an
# array whose truth value is ambiguous (ValueError when the cell is re-run).
valid['Characteristics.LotFeatures'] = valid['Characteristics.LotFeatures'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str)
    else (x if isinstance(x, list) else (['Unknown'] if pd.isna(x) else x))
)

# Step 2: encode with the MultiLabelBinarizer fitted on train (`mlb` from the
# previous cell), so both splits share the same classes.
binary_features_valid = mlb.transform(valid['Characteristics.LotFeatures'])
binary_df_valid = pd.DataFrame(binary_features_valid, columns=mlb.classes_, index=valid.index)

# Step 3: mirror the training cell. Apply the same suffix and keep only the
# indicators selected there (`important_features`), so that validation ends up
# with the same columns as train.
binary_df_valid.columns = [col + '_characteristic' for col in binary_df_valid.columns]
binary_df_valid = binary_df_valid[important_features]
# Dropping first is a no-op on the first run and prevents duplicated columns on re-runs.
valid = pd.concat([valid.drop(columns=binary_df_valid.columns, errors='ignore'), binary_df_valid], axis=1)

# Show the validation frame with the new indicator columns.
print(valid.head())


                          Characteristics.LotFeatures  \
32045                                       [Unknown]   
64911                                       [Unknown]   
60627  [pond(s), water view, sidewalks, streetlights]   
97370                                       [Unknown]   
2323                                        [Unknown]   

       ImageData.c1c6.summary.exterior  \
32045                          3.06213   
64911                          3.50000   
60627                          3.00000   
97370                          4.00000   
2323                           1.00000   

                         ImageData.features_reso.results  \
32045  [Appliances.Dishwasher, Appliances.ElectricRan...   
64911  [Appliances.GasRange, Appliances.Range, Applia...   
60627  [Appliances.Dishwasher, Appliances.GasRange, A...   
97370  [Appliances.Dishwasher, Appliances.Refrigerato...   
2323   [Appliances.Dishwasher, Appliances.Microwave, ...   

       ImageData.q1q6.summary.exterior  \
3

In [1137]:
import pandas as pd
import ast  # Para convertir cadenas en listas
from sklearn.preprocessing import MultiLabelBinarizer
from scipy.stats import spearmanr

# Step 1: turn every cell of the column into a real Python list.
# Strings are parsed with ast.literal_eval and missing values become ['Unknown'].
# The type checks run before pd.isna() because pd.isna() on a list returns an
# array whose truth value is ambiguous; with the NaN check first, re-running
# this cell on already-parsed lists raised ValueError.
train['Structure.Cooling'] = train['Structure.Cooling'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str)
    else (x if isinstance(x, list) else (['Unknown'] if pd.isna(x) else x))
)

# Step 2: one-hot encode the lists. `mlb` stays fitted on train so that the
# validation cell that follows can reuse it.
mlb = MultiLabelBinarizer()
binary_features = mlb.fit_transform(train['Structure.Cooling'])
binary_df = pd.DataFrame(binary_features, columns=mlb.classes_, index=train.index)

# Step 3: suffix the indicator columns and join them to train in a scratch frame.
# Indicator columns left in train by a previous run of this cell are dropped
# first (a no-op on the first run) so that re-running never duplicates columns.
# Note that this rebinds the notebook-level name `df`.
binary_df.columns = [col + '_cooling' for col in binary_df.columns]
train_base = train.drop(columns=binary_df.columns, errors='ignore')
df = pd.concat([train_base, binary_df], axis=1)

# Step 4: one-way ANOVA of the close price between the rows without (0) and
# with (1) each indicator. `f_oneway` is imported by an earlier cell.
anova_results = {}
for column in binary_df.columns:
    group_0 = df.loc[df[column] == 0, 'Listing.Price.ClosePrice']
    group_1 = df.loc[df[column] == 1, 'Listing.Price.ClosePrice']

    # Both groups need at least two observations for the test to be computed.
    if len(group_0) > 1 and len(group_1) > 1:
        f_stat, p_value = f_oneway(group_0, group_1)
        anova_results[column] = (f_stat, p_value)
    else:
        anova_results[column] = (None, None)  # not enough data for the ANOVA

# Step 5: sort by F statistic, largest first (untested indicators count as 0).
sorted_anova = sorted(anova_results.items(), key=lambda x: x[1][0] if x[1][0] is not None else 0, reverse=True)

# Step 6: keep the indicators whose p-value is below the significance threshold.
threshold_p_value = 0.05
important_features = [feature for feature, (f_stat, p_value) in anova_results.items() if p_value is not None and p_value < threshold_p_value]

# Report the results.
print("Resultados de ANOVA ordenados (por estadístico F):")
for feature, (f_stat, p_value) in sorted_anova:
    if f_stat is not None:
        print(f"{feature}: Estadístico F={f_stat:.3f}, p-valor={p_value:.3f}")

print("\nCaracterísticas importantes (p-valor < 0.05):", important_features)

# Only the significant indicators are added to the training split.
significant_binary_df = binary_df[important_features]
train = pd.concat([train_base, significant_binary_df], axis=1)

Resultados de ANOVA ordenados (por estadístico F):
zoned_cooling: Estadístico F=9452.232, p-valor=0.000
none_cooling: Estadístico F=1343.623, p-valor=0.000
central air_cooling: Estadístico F=1180.513, p-valor=0.000
space pac_cooling: Estadístico F=939.757, p-valor=0.000
window/wall unit - 1_cooling: Estadístico F=738.796, p-valor=0.000
window/wall units - 2_cooling: Estadístico F=511.205, p-valor=0.000
office only_cooling: Estadístico F=103.947, p-valor=0.000
dual_cooling: Estadístico F=96.869, p-valor=0.000
central individual_cooling: Estadístico F=90.263, p-valor=0.000
geothermal_cooling: Estadístico F=71.052, p-valor=0.000
partial_cooling: Estadístico F=68.916, p-valor=0.000
wall sleeve_cooling: Estadístico F=51.258, p-valor=0.000
window unit(s)_cooling: Estadístico F=49.365, p-valor=0.000
other_cooling: Estadístico F=38.410, p-valor=0.000
window/wall units - 3+_cooling: Estadístico F=37.940, p-valor=0.000
power roof vents_cooling: Estadístico F=34.183, p-valor=0.000
Unknown_cooling

In [1138]:
import pandas as pd
import ast  # Para convertir cadenas en listas
from sklearn.preprocessing import MultiLabelBinarizer

# Step 1: turn every cell of the validation column into a real Python list.
# Strings are parsed with ast.literal_eval and missing values become ['Unknown'].
# The type checks run before pd.isna() because pd.isna() on a list returns an
# array whose truth value is ambiguous (ValueError when the cell is re-run).
valid['Structure.Cooling'] = valid['Structure.Cooling'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str)
    else (x if isinstance(x, list) else (['Unknown'] if pd.isna(x) else x))
)

# Step 2: encode with the MultiLabelBinarizer fitted on train (`mlb` from the
# previous cell), so both splits share the same classes.
binary_features_valid = mlb.transform(valid['Structure.Cooling'])
binary_df_valid = pd.DataFrame(binary_features_valid, columns=mlb.classes_, index=valid.index)

# Step 3: mirror the training cell. Apply the same suffix and keep only the
# indicators selected there (`important_features`), so that validation ends up
# with the same columns as train.
binary_df_valid.columns = [col + '_cooling' for col in binary_df_valid.columns]
binary_df_valid = binary_df_valid[important_features]
# Dropping first is a no-op on the first run and prevents duplicated columns on re-runs.
valid = pd.concat([valid.drop(columns=binary_df_valid.columns, errors='ignore'), binary_df_valid], axis=1)

# Show the validation frame with the new indicator columns.
print(valid.head())


                          Characteristics.LotFeatures  \
32045                                       [Unknown]   
64911                                       [Unknown]   
60627  [pond(s), water view, sidewalks, streetlights]   
97370                                       [Unknown]   
2323                                        [Unknown]   

       ImageData.c1c6.summary.exterior  \
32045                          3.06213   
64911                          3.50000   
60627                          3.00000   
97370                          4.00000   
2323                           1.00000   

                         ImageData.features_reso.results  \
32045  [Appliances.Dishwasher, Appliances.ElectricRan...   
64911  [Appliances.GasRange, Appliances.Range, Applia...   
60627  [Appliances.Dishwasher, Appliances.GasRange, A...   
97370  [Appliances.Dishwasher, Appliances.Refrigerato...   
2323   [Appliances.Dishwasher, Appliances.Microwave, ...   

       ImageData.q1q6.summary.exterior  \
3

In [1139]:
import pandas as pd
import ast  # Para convertir cadenas en listas
from sklearn.preprocessing import MultiLabelBinarizer
from scipy.stats import spearmanr

# Step 1: turn every cell of the column into a real Python list.
# Strings are parsed with ast.literal_eval and missing values become ['Unknown'].
# The type checks run before pd.isna() because pd.isna() on a list returns an
# array whose truth value is ambiguous; with the NaN check first, re-running
# this cell on already-parsed lists raised ValueError.
train['ImageData.room_type_reso.results'] = train['ImageData.room_type_reso.results'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str)
    else (x if isinstance(x, list) else (['Unknown'] if pd.isna(x) else x))
)

# Step 2: one-hot encode the lists. `mlb` stays fitted on train so that the
# validation cell that follows can reuse it.
mlb = MultiLabelBinarizer()
binary_features = mlb.fit_transform(train['ImageData.room_type_reso.results'])
binary_df = pd.DataFrame(binary_features, columns=mlb.classes_, index=train.index)

# Step 3: suffix the indicator columns and join them to train in a scratch frame.
# Indicator columns left in train by a previous run of this cell are dropped
# first (a no-op on the first run) so that re-running never duplicates columns.
# Note that this rebinds the notebook-level name `df`.
binary_df.columns = [col + '_room' for col in binary_df.columns]
train_base = train.drop(columns=binary_df.columns, errors='ignore')
df = pd.concat([train_base, binary_df], axis=1)

# Step 4: one-way ANOVA of the close price between the rows without (0) and
# with (1) each indicator. `f_oneway` is imported by an earlier cell.
anova_results = {}
for column in binary_df.columns:
    group_0 = df.loc[df[column] == 0, 'Listing.Price.ClosePrice']
    group_1 = df.loc[df[column] == 1, 'Listing.Price.ClosePrice']

    # Both groups need at least two observations for the test to be computed.
    if len(group_0) > 1 and len(group_1) > 1:
        f_stat, p_value = f_oneway(group_0, group_1)
        anova_results[column] = (f_stat, p_value)
    else:
        anova_results[column] = (None, None)  # not enough data for the ANOVA

# Step 5: sort by F statistic, largest first (untested indicators count as 0).
sorted_anova = sorted(anova_results.items(), key=lambda x: x[1][0] if x[1][0] is not None else 0, reverse=True)

# Step 6: keep the indicators whose p-value is below the significance threshold.
threshold_p_value = 0.05
important_features = [feature for feature, (f_stat, p_value) in anova_results.items() if p_value is not None and p_value < threshold_p_value]

# Report the results.
print("Resultados de ANOVA ordenados (por estadístico F):")
for feature, (f_stat, p_value) in sorted_anova:
    if f_stat is not None:
        print(f"{feature}: Estadístico F={f_stat:.3f}, p-valor={p_value:.3f}")

print("\nCaracterísticas importantes (p-valor < 0.05):", important_features)

# Only the significant indicators are added to the training split.
significant_binary_df = binary_df[important_features]
train = pd.concat([train_base, significant_binary_df], axis=1)

Resultados de ANOVA ordenados (por estadístico F):
DiningArea_room: Estadístico F=2657.804, p-valor=0.000
Bar_room: Estadístico F=2399.485, p-valor=0.000
FloorPlan_room: Estadístico F=2372.253, p-valor=0.000
Patio_room: Estadístico F=2290.901, p-valor=0.000
Office_room: Estadístico F=2273.603, p-valor=0.000
MudRoom_room: Estadístico F=2145.328, p-valor=0.000
WineCellar_room: Estadístico F=1691.565, p-valor=0.000
ExerciseRoom_room: Estadístico F=1393.427, p-valor=0.000
LivingRoom_room: Estadístico F=1367.187, p-valor=0.000
GameRoom_room: Estadístico F=1337.106, p-valor=0.000
EntranceFoyer_room: Estadístico F=975.302, p-valor=0.000
MediaRoom_room: Estadístico F=933.523, p-valor=0.000
Stairs_room: Estadístico F=889.516, p-valor=0.000
WalkInClosets_room: Estadístico F=771.168, p-valor=0.000
AerialView_room: Estadístico F=763.306, p-valor=0.000
SittingRoom_room: Estadístico F=672.304, p-valor=0.000
Deck_room: Estadístico F=619.940, p-valor=0.000
Bedroom_room: Estadístico F=479.373, p-valor=

In [1140]:
import pandas as pd
import ast  # Para convertir cadenas en listas
from sklearn.preprocessing import MultiLabelBinarizer

# Step 1: turn every cell of the validation column into a real Python list.
# Strings are parsed with ast.literal_eval and missing values become ['Unknown'].
# The type checks run before pd.isna() because pd.isna() on a list returns an
# array whose truth value is ambiguous (ValueError when the cell is re-run).
valid['ImageData.room_type_reso.results'] = valid['ImageData.room_type_reso.results'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str)
    else (x if isinstance(x, list) else (['Unknown'] if pd.isna(x) else x))
)

# Step 2: encode with the MultiLabelBinarizer fitted on train (`mlb` from the
# previous cell), so both splits share the same classes.
binary_features_valid = mlb.transform(valid['ImageData.room_type_reso.results'])
binary_df_valid = pd.DataFrame(binary_features_valid, columns=mlb.classes_, index=valid.index)

# Step 3: mirror the training cell. Apply the same suffix and keep only the
# indicators selected there (`important_features`), so that validation ends up
# with the same columns as train.
binary_df_valid.columns = [col + '_room' for col in binary_df_valid.columns]
binary_df_valid = binary_df_valid[important_features]
# Dropping first is a no-op on the first run and prevents duplicated columns on re-runs.
valid = pd.concat([valid.drop(columns=binary_df_valid.columns, errors='ignore'), binary_df_valid], axis=1)

# Show the validation frame with the new indicator columns.
print(valid.head())


                          Characteristics.LotFeatures  \
32045                                       [Unknown]   
64911                                       [Unknown]   
60627  [pond(s), water view, sidewalks, streetlights]   
97370                                       [Unknown]   
2323                                        [Unknown]   

       ImageData.c1c6.summary.exterior  \
32045                          3.06213   
64911                          3.50000   
60627                          3.00000   
97370                          4.00000   
2323                           1.00000   

                         ImageData.features_reso.results  \
32045  [Appliances.Dishwasher, Appliances.ElectricRan...   
64911  [Appliances.GasRange, Appliances.Range, Applia...   
60627  [Appliances.Dishwasher, Appliances.GasRange, A...   
97370  [Appliances.Dishwasher, Appliances.Refrigerato...   
2323   [Appliances.Dishwasher, Appliances.Microwave, ...   

       ImageData.q1q6.summary.exterior  \
3

In [1141]:
import pandas as pd
import ast  # Para convertir cadenas en listas
from sklearn.preprocessing import MultiLabelBinarizer
from scipy.stats import spearmanr

# Step 1: turn every cell of the column into a real Python list.
# Strings are parsed with ast.literal_eval and missing values become ['Unknown'].
# The type checks run before pd.isna() because pd.isna() on a list returns an
# array whose truth value is ambiguous; with the NaN check first, re-running
# this cell on already-parsed lists raised ValueError.
train['Structure.Heating'] = train['Structure.Heating'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str)
    else (x if isinstance(x, list) else (['Unknown'] if pd.isna(x) else x))
)

# Step 2: one-hot encode the lists. `mlb` stays fitted on train so that the
# validation cell that follows can reuse it.
mlb = MultiLabelBinarizer()
binary_features = mlb.fit_transform(train['Structure.Heating'])
binary_df = pd.DataFrame(binary_features, columns=mlb.classes_, index=train.index)

# Step 3: suffix the indicator columns and join them to train in a scratch frame.
# Indicator columns left in train by a previous run of this cell are dropped
# first (a no-op on the first run) so that re-running never duplicates columns.
# Note that this rebinds the notebook-level name `df`.
binary_df.columns = [col + '_heating' for col in binary_df.columns]
train_base = train.drop(columns=binary_df.columns, errors='ignore')
df = pd.concat([train_base, binary_df], axis=1)

# Step 4: one-way ANOVA of the close price between the rows without (0) and
# with (1) each indicator. `f_oneway` is imported by an earlier cell.
anova_results = {}
for column in binary_df.columns:
    group_0 = df.loc[df[column] == 0, 'Listing.Price.ClosePrice']
    group_1 = df.loc[df[column] == 1, 'Listing.Price.ClosePrice']

    # Both groups need at least two observations for the test to be computed.
    if len(group_0) > 1 and len(group_1) > 1:
        f_stat, p_value = f_oneway(group_0, group_1)
        anova_results[column] = (f_stat, p_value)
    else:
        anova_results[column] = (None, None)  # not enough data for the ANOVA

# Step 5: sort by F statistic, largest first (untested indicators count as 0).
sorted_anova = sorted(anova_results.items(), key=lambda x: x[1][0] if x[1][0] is not None else 0, reverse=True)

# Step 6: keep the indicators whose p-value is below the significance threshold.
threshold_p_value = 0.05
important_features = [feature for feature, (f_stat, p_value) in anova_results.items() if p_value is not None and p_value < threshold_p_value]

# Report the results.
print("Resultados de ANOVA ordenados (por estadístico F):")
for feature, (f_stat, p_value) in sorted_anova:
    if f_stat is not None:
        print(f"{feature}: Estadístico F={f_stat:.3f}, p-valor={p_value:.3f}")

print("\nCaracterísticas importantes (p-valor < 0.05):", important_features)

# Only the significant indicators are added to the training split.
significant_binary_df = binary_df[important_features]
train = pd.concat([train_base, significant_binary_df], axis=1)

Resultados de ANOVA ordenados (por estadístico F):
zoned_heating: Estadístico F=6192.548, p-valor=0.000
sep heating systems - 2+_heating: Estadístico F=3037.015, p-valor=0.000
radiant_heating: Estadístico F=799.974, p-valor=0.000
indv controls_heating: Estadístico F=690.720, p-valor=0.000
Unknown_heating: Estadístico F=452.416, p-valor=0.000
forced air_heating: Estadístico F=229.524, p-valor=0.000
electric_heating: Estadístico F=145.744, p-valor=0.000
natural gas_heating: Estadístico F=117.996, p-valor=0.000
baseboard_heating: Estadístico F=117.567, p-valor=0.000
heat pump_heating: Estadístico F=113.167, p-valor=0.000
geothermal_heating: Estadístico F=59.991, p-valor=0.000
radiator(s)_heating: Estadístico F=59.379, p-valor=0.000
none_heating: Estadístico F=47.579, p-valor=0.000
solar_heating: Estadístico F=32.651, p-valor=0.000
other_heating: Estadístico F=30.456, p-valor=0.000
propane_heating: Estadístico F=21.472, p-valor=0.000
floor furnace_heating: Estadístico F=10.483, p-valor=0.0

In [1142]:
import pandas as pd
import ast  # Para convertir cadenas en listas
from sklearn.preprocessing import MultiLabelBinarizer

# Step 1: turn every cell of the validation column into a real Python list.
# Strings are parsed with ast.literal_eval and missing values become ['Unknown'].
# The type checks run before pd.isna() because pd.isna() on a list returns an
# array whose truth value is ambiguous (ValueError when the cell is re-run).
valid['Structure.Heating'] = valid['Structure.Heating'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str)
    else (x if isinstance(x, list) else (['Unknown'] if pd.isna(x) else x))
)

# Step 2: encode with the MultiLabelBinarizer fitted on train (`mlb` from the
# previous cell), so both splits share the same classes.
binary_features_valid = mlb.transform(valid['Structure.Heating'])
binary_df_valid = pd.DataFrame(binary_features_valid, columns=mlb.classes_, index=valid.index)

# Step 3: mirror the training cell. Apply the same suffix and keep only the
# indicators selected there (`important_features`), so that validation ends up
# with the same columns as train.
binary_df_valid.columns = [col + '_heating' for col in binary_df_valid.columns]
binary_df_valid = binary_df_valid[important_features]
# Dropping first is a no-op on the first run and prevents duplicated columns on re-runs.
valid = pd.concat([valid.drop(columns=binary_df_valid.columns, errors='ignore'), binary_df_valid], axis=1)

# Show the validation frame with the new indicator columns.
print(valid.head())


                          Characteristics.LotFeatures  \
32045                                       [Unknown]   
64911                                       [Unknown]   
60627  [pond(s), water view, sidewalks, streetlights]   
97370                                       [Unknown]   
2323                                        [Unknown]   

       ImageData.c1c6.summary.exterior  \
32045                          3.06213   
64911                          3.50000   
60627                          3.00000   
97370                          4.00000   
2323                           1.00000   

                         ImageData.features_reso.results  \
32045  [Appliances.Dishwasher, Appliances.ElectricRan...   
64911  [Appliances.GasRange, Appliances.Range, Applia...   
60627  [Appliances.Dishwasher, Appliances.GasRange, A...   
97370  [Appliances.Dishwasher, Appliances.Refrigerato...   
2323   [Appliances.Dishwasher, Appliances.Microwave, ...   

       ImageData.q1q6.summary.exterior  \
3

In [1143]:
# Missing-value report for the training split: per-column count and percentage
# of NaNs, restricted to the columns that have at least one, largest count first.
missing_values = train.isnull().sum()
missing_values_percent = missing_values / len(train) * 100
missing_values_table = pd.concat(
    [missing_values, missing_values_percent],
    axis=1,
    keys=["Missing Values", "Percentage"],
)
has_missing = missing_values_table["Missing Values"] > 0
missing_values_table = missing_values_table[has_missing].sort_values(by="Missing Values", ascending=False)
print(missing_values_table)


Empty DataFrame
Columns: [Missing Values, Percentage]
Index: []


In [1144]:
# Missing-value report for the validation split: per-column count and percentage
# of NaNs, restricted to the columns that have at least one, largest count first.
missing_values = valid.isnull().sum()
# The percentage is relative to the validation split itself; dividing by
# len(train) understated it because train is the larger split.
missing_values_percent = missing_values / len(valid) * 100
missing_values_table = pd.concat([missing_values, missing_values_percent], axis=1)
missing_values_table.columns = ["Missing Values", "Percentage"]
missing_values_table = missing_values_table[missing_values_table["Missing Values"] > 0]
missing_values_table = missing_values_table.sort_values(by="Missing Values", ascending=False)
print(missing_values_table)

Empty DataFrame
Columns: [Missing Values, Percentage]
Index: []


In [1145]:
# Save the preprocessed training split; the row index is not written to the file.
train.to_csv(path_or_buf='train_cleaned.csv', index=False)


In [1146]:
# Save the preprocessed validation split; the row index is not written to the file.
valid.to_csv(path_or_buf='valid_cleaned.csv', index=False)