# Preprocessing

In [75]:
import pandas as pd

df = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
df.describe()

  df = pd.read_csv('train.csv')
  test = pd.read_csv('test.csv')


Unnamed: 0,Characteristics.LotSizeSquareFeet,ImageData.c1c6.summary.bathroom,ImageData.c1c6.summary.exterior,ImageData.c1c6.summary.interior,ImageData.c1c6.summary.kitchen,ImageData.c1c6.summary.property,ImageData.q1q6.summary.bathroom,ImageData.q1q6.summary.exterior,ImageData.q1q6.summary.interior,ImageData.q1q6.summary.kitchen,...,Structure.BathroomsFull,Structure.BathroomsHalf,Structure.BedroomsTotal,Structure.BelowGradeFinishedArea,Structure.BelowGradeUnfinishedArea,Structure.FireplacesTotal,Structure.GarageSpaces,Structure.LivingArea,Structure.Rooms.RoomsTotal,Structure.YearBuilt
count,1690.0,90744.0,87789.0,93597.0,92320.0,103055.0,90708.0,82565.0,93589.0,92292.0,...,100063.0,100049.0,105024.0,14235.0,11674.0,51216.0,88621.0,99509.0,105061.0,102256.0
mean,59491.45,3.116429,3.308723,3.211074,3.117166,3.182633,3.311895,3.499023,3.108145,3.20957,...,1.921669,0.441973,3.102167,701.495539,649.350608,0.941737,2.039585,1742.211448,7.174756,1969.155199
std,478465.2,0.72416,0.676499,0.677497,0.743597,0.682011,0.50811,0.594215,0.540139,0.645833,...,0.864013,0.548588,1.235208,635.400209,577.616585,0.713055,8.542624,1144.743077,2.846424,53.661479
min,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3743.75,2.8,3.1,2.9,2.8,2.9,3.0,3.1,2.8,2.8,...,1.0,0.0,2.0,0.0,150.0,1.0,1.0,1100.0,5.0,1952.0
50%,8712.0,3.2,3.4,3.3,3.2,3.2,3.3,3.5,3.0,3.2,...,2.0,0.0,3.0,696.0,565.5,1.0,2.0,1550.0,7.0,1973.0
75%,24520.75,3.6,3.7,3.6,3.6,3.6,3.7,3.9,3.4,3.8,...,2.0,1.0,4.0,1073.0,1000.0,1.0,2.0,2200.0,8.0,1998.0
max,9999999.0,6.0,6.0,6.0,6.0,6.0,5.1,5.4,5.1,5.0,...,75.0,9.0,18.0,7782.0,4896.0,21.0,999.0,51400.0,99.0,2024.0


The first step is to divide the data in Train and Validation

In [76]:
from sklearn.model_selection import train_test_split
train, valid = train_test_split(df, test_size=0.2, random_state=42)

Convert some variables to the same correct format in order to avoid future possible problems

In [77]:
train['Structure.YearBuilt'] = train['Structure.YearBuilt'].astype('object')
train['Location.GIS.Latitude'] = train['Location.GIS.Latitude'].astype('object')
train['Location.GIS.Longitude'] = train['Location.GIS.Longitude'].astype('object')

valid['Structure.YearBuilt'] = valid['Structure.YearBuilt'].astype('object')
valid['Location.GIS.Latitude'] = valid['Location.GIS.Latitude'].astype('object')
valid['Location.GIS.Longitude'] = valid['Location.GIS.Longitude'].astype('object')

test['Structure.YearBuilt'] = test['Structure.YearBuilt'].astype('object')
test['Location.GIS.Latitude'] = test['Location.GIS.Latitude'].astype('object')
test['Location.GIS.Longitude'] = test['Location.GIS.Longitude'].astype('object')


We convert all missings to the same format. 

In [78]:
import pandas as pd
import numpy as np

def convert_to_missing(df):
    missing_values = [None, 'NA', 'NaN', 'nan', '', 'none', 'na', 'Na', 'NULL']
    df = df.replace(missing_values, np.nan)
    return df

train = convert_to_missing(train)
valid = convert_to_missing(valid)
test = convert_to_missing(test)


  df = df.replace(missing_values, np.nan)


We study the missing values in the dataset. 

In [79]:
missing_values = df.isnull().sum()
missing_values_percent = missing_values / len(df) * 100
missing_values_table = pd.concat([missing_values, missing_values_percent], axis=1)
missing_values_table.columns = ["Missing Values", "Percentage"]
missing_values_table = missing_values_table[missing_values_table["Missing Values"] > 0]
missing_values_table = missing_values_table.sort_values(by="Missing Values", ascending=False)
print(missing_values_table)


                                        Missing Values  Percentage
Location.Address.StreetDirectionSuffix          106796   99.403371
Characteristics.LotSizeSquareFeet               105747   98.426985
Location.Address.PostalCodePlus4                104166   96.955425
UnitTypes.UnitTypeType                          102461   95.368448
Tax.Zoning                                       99939   93.021026
Structure.BelowGradeUnfinishedArea               95763   89.134097
Structure.ParkingFeatures                        93880   87.381442
Structure.BelowGradeFinishedArea                 93202   86.750375
Location.Address.UnitNumber                      82894   77.155915
Location.Area.SubdivisionName                    70896   65.988440
Characteristics.LotFeatures                      65417   60.888707
Location.Address.StreetDirectionPrefix           58743   54.676694
Structure.FireplacesTotal                        56221   52.329272
ImageData.q1q6.summary.exterior                  24872   23.15

We decided to join the variable StreetDirectionSuffix with StreetDirectionPrefix since both give the same information. Moreover, StreetDirectionSuffix have a lot of missing. 

We decided to analyze the missing values individually in order to justify all changes and to make the most logical missing treatment. 

In [80]:
print(train['Location.Address.StreetDirectionSuffix'].unique())
print(train['Location.Address.StreetDirectionPrefix'].unique())

train['Location.Address.StreetDirection'] = train['Location.Address.StreetDirectionPrefix'].fillna('') + train['Location.Address.StreetDirectionSuffix'].fillna('')
valid['Location.Address.StreetDirection'] = valid['Location.Address.StreetDirectionPrefix'].fillna('') + valid['Location.Address.StreetDirectionSuffix'].fillna('')
test['Location.Address.StreetDirection'] = test['Location.Address.StreetDirectionPrefix'].fillna('') + test['Location.Address.StreetDirectionSuffix'].fillna('')

train.drop(columns=['Location.Address.StreetDirectionPrefix', 'Location.Address.StreetDirectionSuffix'], inplace=True)
valid.drop(columns=['Location.Address.StreetDirectionPrefix', 'Location.Address.StreetDirectionSuffix'], inplace=True)
test.drop(columns=['Location.Address.StreetDirectionPrefix', 'Location.Address.StreetDirectionSuffix'], inplace=True)

train['Location.Address.StreetDirection'] = train['Location.Address.StreetDirection'].replace('', None)
valid['Location.Address.StreetDirection'] = valid['Location.Address.StreetDirection'].replace('', None)
test['Location.Address.StreetDirection'] = test['Location.Address.StreetDirection'].replace('', None)



[nan 'e' 'n' 'w' 's' 'se' 'sw' 'ne' 'nw']
[nan 'e' 's' 'n' 'w' 'sw' 'ne' 'nw' 'se']


The first variable we analyzed was Location.Address.PostalCodePlus4. By observing the variable we can conclude that it has a large quantity of missing values, moreover, we considered that it is not an important variable that would influence our model, as we have the Postal Code variable which has less missings. 

In [81]:
train.drop(columns=['Characteristics.LotSizeSquareFeet', 'Location.Address.PostalCodePlus4'], inplace=True)
valid.drop(columns=['Characteristics.LotSizeSquareFeet', 'Location.Address.PostalCodePlus4'], inplace=True)
test.drop(columns=['Characteristics.LotSizeSquareFeet', 'Location.Address.PostalCodePlus4'], inplace=True)

We also observed that UnitTypes and TaxZones are variables that have a huge amount of missing data. Moreover, we considered that imputing these missing values could introduce noise and lead to biased results, as these features might not provide enough meaningful information to justify the complexity of imputation. Therefore, we decided to drop these columns to maintain data quality, improve model performance, and ensure consistency across the training, validation, and test datasets. By

In [82]:
train.drop(columns=['UnitTypes.UnitTypeType', 'Tax.Zoning'], inplace=True)
valid.drop(columns=['UnitTypes.UnitTypeType', 'Tax.Zoning'], inplace=True)
test.drop(columns=['UnitTypes.UnitTypeType', 'Tax.Zoning'], inplace=True)

We observed that the missing values in the Structure.ParkingFeatures field were not truly missing. Instead, they actually indicated that the house either does not have a parking space or does not possess any specific attribute related to parking. Given this, we decided that the best approach was to treat these missing values as a separate category, where a missing value would be interpreted as "no parking" or "no information available."
To implement this, we filled the missing values with 0 (indicating "no parking features") and converted the column to a binary format using .notnull().astype(int).

In [83]:
train['Structure.ParkingFeatures'] = train['Structure.ParkingFeatures'].notnull().astype(int)
valid['Structure.ParkingFeatures'] = valid['Structure.ParkingFeatures'].notnull().astype(int)
test['Structure.ParkingFeatures'] = test['Structure.ParkingFeatures'].notnull().astype(int)


First of all, we ensured that missing data is recognized as None, making it consistent with other missing values in the dataset. After taht, we filled the missing values with 'Unknown' since we decided that this varaible might be important and we can not impute it. By adding 'Unknown', we avoid introducing noice or misinterpreting the missing data. 

In [84]:
train['Location.Area.SubdivisionName'] = train['Location.Area.SubdivisionName'].replace('', None)
valid['Location.Area.SubdivisionName'] = valid['Location.Area.SubdivisionName'].replace('', None)
test['Location.Area.SubdivisionName'] = test['Location.Area.SubdivisionName'].replace('', None)

train['Location.Area.SubdivisionName'] = train['Location.Area.SubdivisionName'].fillna('Unknown')
valid['Location.Area.SubdivisionName'] = valid['Location.Area.SubdivisionName'].fillna('Unknown')
test['Location.Area.SubdivisionName'] = test['Location.Area.SubdivisionName'].fillna('Unknown')



We considered that having the unit number of the house is not an important value since it does not give us rellevant information, but it can contribute to meaningful patterns or predictions.

In [85]:
train.drop(columns=['Location.Address.UnitNumber'], inplace=True)
valid.drop(columns=['Location.Address.UnitNumber'], inplace=True)
test.drop(columns=['Location.Address.UnitNumber'], inplace=True)


We decided to remove the StateOrProvince since it is the same value for all rows, so it does not introduce new information. 

In [86]:
print(train['Location.Address.StateOrProvince'].unique())
train.drop(columns=['Location.Address.StateOrProvince'], inplace=True)
valid.drop(columns=['Location.Address.StateOrProvince'], inplace=True)
test.drop(columns=['Location.Address.StateOrProvince'], inplace=True)

['il']


As we have missing values in the Structure.Basement variable, we considered that the best way of imputing it was by changing the values corresponding to ['none'] to None. This allowed us to treat the absence of a basement consistently as missing data, rather than as a categorical value.

After this, we transformed the variable into a binary representation. The idea behind this transformation is that the presence or absence of a basement is a binary condition that can be effectively represented as 1 for presence and 0 for absence. This binary encoding serves multiple purposes:

-Simplicity: A binary representation simplifies the model’s interpretation of the data. Machine learning algorithms often handle binary features more efficiently, particularly when the feature represents the presence or absence of a condition.

-Avoiding Assumptions: By converting to binary, we avoid making assumptions about the meaning of missing values. Treating None as a missing entry allows the model to learn that the absence of a basement is a separate condition from the existence of one.



In [87]:
train['Structure.Basement'] = train['Structure.Basement'].replace(['none'], None)
valid['Structure.Basement'] = valid['Structure.Basement'].replace(['none'], None)  
test['Structure.Basement'] = test['Structure.Basement'].replace(['none'], None) 
train['Structure.Basement'] = train['Structure.Basement'].notnull().astype(int)
valid['Structure.Basement'] = valid['Structure.Basement'].notnull().astype(int)
test['Structure.Basement'] = test['Structure.Basement'].notnull().astype(int)


We also considered that the absence of value in the variable Structure.GarageSpaces does not necessarily imply a missing or unknown value. Instead, it could simply indicate that the property does not have any garage spaces. Therefore, we decided to impute the missing values with 0, which is a more logical representation in this case.

In [88]:
train['Structure.GarageSpaces'] = train['Structure.GarageSpaces'].fillna(0)
valid['Structure.GarageSpaces'] = valid['Structure.GarageSpaces'].fillna(0)
test['Structure.GarageSpaces'] = test['Structure.GarageSpaces'].fillna(0)



While analyzing the variable ImageData.style.stories.summary.label, we observed that the values were stored as strings, but the string values actually represented numerical information (for example, they could be describing the number of stories in a building). We realized that these strings were not informative by themselves, as they seemed to be names or identifiers rather than direct numeric values.

To address this, we decided to convert the values into numeric.

In [89]:
import re
def extract_and_adjust(value):
    number_str = re.sub(r'\D', '', str(value))
    if number_str:
        number = float(number_str)
        if number > 10:
            return number / 10
        else:
            return float(number)
    return 1  

train['ImageData.style.stories.summary.label'] = train['ImageData.style.stories.summary.label'].apply(extract_and_adjust)
valid['ImageData.style.stories.summary.label'] = valid['ImageData.style.stories.summary.label'].apply(extract_and_adjust)
test['ImageData.style.stories.summary.label'] = test['ImageData.style.stories.summary.label'].apply(extract_and_adjust)

print(train['ImageData.style.stories.summary.label'].unique())

[2.5 1.  2.  1.5 3. ]


We also observed that the Q and C variables, contain some values, we decided to use the MICE method to imputed this values. It provides a more sophisticated way to handle missing data. MICE works by filling in the missing values multiple times based on other available data, which helps to preserve the relationships between different features. 

In [90]:
from fancyimpute import IterativeImputer

columns_to_impute = [
    "ImageData.q1q6.summary.bathroom",
    "ImageData.c1c6.summary.bathroom",
    "ImageData.q1q6.summary.kitchen",
    "ImageData.c1c6.summary.kitchen",
    "ImageData.q1q6.summary.interior",
    "ImageData.c1c6.summary.interior",
    "ImageData.q1q6.summary.exterior",
    "ImageData.c1c6.summary.exterior",
    "ImageData.q1q6.summary.property",
    "ImageData.c1c6.summary.property",
    "Structure.LivingArea"
]

columns_in_train = [col for col in columns_to_impute if col in train.columns]
missing_columns = [col for col in columns_to_impute if col not in train.columns]

if missing_columns:
    print(f"Estas columnas no están en el DataFrame: {missing_columns}")

imputer = IterativeImputer()
imputed_data = imputer.fit_transform(train[columns_in_train])
data_val = imputer.transform(valid[columns_in_train])
test[columns_in_train] = imputer.transform(test[columns_in_train])


train[columns_in_train] = imputed_data
valid[columns_in_train] = data_val
test[columns_in_train] = test[columns_in_train]





We observed that the variables CountyOrParish and StreetName had missing values. Since these variables represent categorical data, dropping the missing rows or imputing numeric values wouldn't make sense in this context. Instead, we decided to replace the missing values with the string 'Unknown'. This approach allows us to retain the information in the dataset without introducing biases or incorrect assumptions.

In [91]:
train['Location.Address.CountyOrParish'] = train['Location.Address.CountyOrParish'].replace(np.nan, 'Unknown')
train['Location.Address.StreetName'] = train['Location.Address.StreetName'].replace(np.nan, 'Unknown')

valid['Location.Address.CountyOrParish'] = valid['Location.Address.CountyOrParish'].replace(np.nan, 'Unknown')
valid['Location.Address.StreetName'] = valid['Location.Address.StreetName'].replace(np.nan, 'Unknown')

test['Location.Address.CountyOrParish'] = test['Location.Address.CountyOrParish'].replace(np.nan, 'Unknown')
test['Location.Address.StreetName'] = test['Location.Address.StreetName'].replace(np.nan, 'Unknown')


We noticed that the variable ImageData.style.exterior.summary.label contains missing values. Since this variable appears to describe the style or summary of a property’s exterior, it is categorical in nature. Rather than dropping rows with missing data—which would result in a loss of potentially valuable information—or attempting to impute these values with an arbitrary label, we replaced the missing entries with 'Unknown'.

In [92]:
train['ImageData.style.exterior.summary.label'] = train['ImageData.style.exterior.summary.label'].replace(np.nan, 'Unknown')
valid['ImageData.style.exterior.summary.label'] = valid['ImageData.style.exterior.summary.label'].replace(np.nan, 'Unknown')
test['ImageData.style.exterior.summary.label'] = test['ImageData.style.exterior.summary.label'].replace(np.nan, 'Unknown')

By analyzing the values of Latitude and Longitude, we realized that they were swapped in the dataset. 

In [93]:
train['Location.GIS.Latitude'], train['Location.GIS.Longitude'] = train['Location.GIS.Longitude'], train['Location.GIS.Latitude']
valid['Location.GIS.Latitude'], valid['Location.GIS.Longitude'] = valid['Location.GIS.Longitude'], valid['Location.GIS.Latitude']
test['Location.GIS.Latitude'], test['Location.GIS.Longitude'] = test['Location.GIS.Longitude'], test['Location.GIS.Latitude']

By examining the Location.Address.StreetSuffix variable, we identified that some of its values were missing. Given that this variable represents the suffix of the street name and could provide contextual information about the location, we decided to replace missing values with "Unknown". 

In [94]:
train['Location.Address.StreetSuffix'] = train['Location.Address.StreetSuffix'].replace(np.nan, 'Unknown')
valid['Location.Address.StreetSuffix'] = valid['Location.Address.StreetSuffix'].replace(np.nan, 'Unknown')
test['Location.Address.StreetSuffix'] = test['Location.Address.StreetSuffix'].replace(np.nan, 'Unknown')


We found it more insightful to calculate the age of the house instead of directly using the year of construction. The age provides a clearer understanding of the property's condition, potential maintenance requirements, and market value over time. To do this, we transformed the Structure.YearBuilt column into the age of the house by subtracting the year built from the current year.

In [95]:
train['Structure.YearBuilt'] = 2024 - train['Structure.YearBuilt']
valid['Structure.YearBuilt'] = 2024 - valid['Structure.YearBuilt']
test['Structure.YearBuilt'] = 2024 - test['Structure.YearBuilt']

train['Structure.YearBuilt'] = train['Structure.YearBuilt'].fillna(train['Structure.YearBuilt'].median())
valid['Structure.YearBuilt'] = valid['Structure.YearBuilt'].fillna(valid['Structure.YearBuilt'].median())
test['Structure.YearBuilt'] = test['Structure.YearBuilt'].fillna(test['Structure.YearBuilt'].median())


We found missing values in the NewConstructionYN varaible. We considered that a good option to identify whether a property is a new construction, we used the YearBuilt (the calculated age). If its age is 1 year or less, it can be considered a new property. 

In [96]:
train['Structure.NewConstructionYN'] = (train['Structure.YearBuilt'] <= 1).astype(int)
valid['Structure.NewConstructionYN'] = (valid['Structure.YearBuilt'] <= 1).astype(int)
test['Structure.NewConstructionYN'] = (test['Structure.YearBuilt'] <= 1).astype(int)

We decided to drop the Location.Address.StreetNumber column because it contains specific, unique identifiers for each property that do not contribute valuable information for predictive modeling, as they don't capture patterns that could generalize across the dataset. As for the Location.Address.StreetName column, we replaced empty strings with the placeholder 'Unknown' to handle missing values, ensuring that the dataset remains consistent and complete. This approach prevents errors or inconsistencies caused by missing data while maintaining

In [97]:
train.drop(columns=['Location.Address.StreetNumber'], inplace=True)
valid.drop(columns=['Location.Address.StreetNumber'], inplace=True)
test.drop(columns=['Location.Address.StreetNumber'], inplace=True)

train['Location.Address.StreetName'] = train['Location.Address.StreetName'].replace('', 'Unknown')
valid['Location.Address.StreetName'] = valid['Location.Address.StreetName'].replace('', 'Unknown')
test['Location.Address.StreetName'] = test['Location.Address.StreetName'].replace('', 'Unknown')


As some values of the variable HighSchoolDistrict are value into lists, we transformed it to correct values by removing the lists. 

In [98]:
import ast

def convert_to_first_value(x):
    if isinstance(x, str):
        try:
            x = ast.literal_eval(x)
        except (ValueError, SyntaxError):
            pass 
    if isinstance(x, list) and len(x) > 0:
        return x[0]  
    return x  

train['Location.School.HighSchoolDistrict'] = train['Location.School.HighSchoolDistrict'].apply(convert_to_first_value)
valid['Location.School.HighSchoolDistrict'] = valid['Location.School.HighSchoolDistrict'].apply(convert_to_first_value)
test['Location.School.HighSchoolDistrict'] = test['Location.School.HighSchoolDistrict'].apply(convert_to_first_value)

print(train['Location.School.HighSchoolDistrict'].unique())



[200 '158' 308 '154' 155 4 299 227 219 205 60 nan 125 202 203 '299' '60'
 '207' 99 101 12 '202' 225 120 '87' 307 '140' 68 204 207 430 '5' 100 233
 118 '208' '46' 211 '424' 303 302 217 229 215 25 201 '210' 129 '215' 220
 '214' 158 '155' '120' 212 '116' 5 '61' '129' 1 '127' 46 170 90 122 427
 '101' 117 137 209 111 '99' 228 131 214 '225' 156 230 '200' 127 88 '401'
 126 432 '205' '125' '212' 300 '201' 208 '204' 87 16 502 61 '201u' '4' '1'
 '126' 121 210 124 '218' '86' 94 130 218 'chris' 40 '8' 206 223 301 73 116
 '115' '3' 86 95 128 '200u' 113 500 143.5 '111' 145 424 11 '170' '255u'
 '128' '88' 10 '117' '228' 2 'other' '100' '209' 187 '230' '303' 426 '701'
 '301' 231 108 '229' '233' '211' '219' '427' 193 18 154 '203' '95' '275'
 '307' '193' 140 '207u' '308' '15' '220' 305 428 304 323 157 '365u' '227'
 15 '300' '122' 21 '108' 3 '304' 7 '96' '217' '6' 221 54 160 '6-j' '302'
 '305' '131' '2' 401 '113' '18' '118' 115 74 6 314 '137' '21' '121' '156'
 165 425 8 '206' 9 '124' 50 152 429 '161' '53

To impute the missing values in the HighSchoolDistrict variable, we decided to use a nearest neighbor approach. Specifically, we identified the most similar property in the dataset based on geographical proximity and assigned the same HighSchoolDistrict value to the properties with missing values. We filtered the values by Census and then, with the Latitude and Longitude we calculated the nearest house. 

In [99]:
import pandas as pd
from geopy.distance import geodesic
from collections import defaultdict
import numpy as np

def calculate_distance(lat1, lon1, lat2, lon2):
    return geodesic((lat1, lon1), (lat2, lon2)).km

def precalculate_distances(train_df):
    print('precalculate_distances')
    distances = defaultdict(list)
    
    for idx, house in train_df.iterrows():
        house_lat = house['Location.GIS.Latitude']
        house_lon = house['Location.GIS.Longitude']
        postal_code = house['Location.Address.CensusBlock']
        school_name = house['Location.School.HighSchoolDistrict']
        
        if not pd.isna(house_lat) and not pd.isna(house_lon) and not pd.isna(postal_code):
            distances[postal_code].append((house_lat, house_lon, school_name))  
    
    return distances

def get_nearest_school(lat, lon, postal_code, distances):
    if postal_code not in distances:
        return 'unknown' 
    
    distances_sorted = sorted(distances[postal_code], key=lambda x: calculate_distance(lat, lon, x[0], x[1]))
    
    for _, _, school_name in distances_sorted:
        if school_name:  
            return school_name
    
    return 'unknown'  
def fill_missing_districts(df, distances):
    print('fill_missing_districts')

    missing_idx = df['Location.School.HighSchoolDistrict'].isna()
    df.loc[missing_idx, 'Location.School.HighSchoolDistrict'] = df[missing_idx].apply(
        lambda row: get_nearest_school(row['Location.GIS.Latitude'], 
                                       row['Location.GIS.Longitude'], 
                                       row['Location.Address.CensusBlock'], 
                                       distances),
        axis=1
    )

    df['Location.School.HighSchoolDistrict'] = df['Location.School.HighSchoolDistrict'].fillna('unknown')
    return df

distances = precalculate_distances(train)

train = fill_missing_districts(train, distances)
valid = fill_missing_districts(valid, distances)
test = fill_missing_districts(test, distances)

assert train['Location.School.HighSchoolDistrict'].isna().sum() == 0, "Quedan valores faltantes en train"
assert valid['Location.School.HighSchoolDistrict'].isna().sum() == 0, "Quedan valores faltantes en valid"
assert test['Location.School.HighSchoolDistrict'].isna().sum() == 0, "Quedan valores faltantes en test"

print("Todos los valores faltantes han sido rellenados.")


precalculate_distances
fill_missing_districts
fill_missing_districts
fill_missing_districts
Todos los valores faltantes han sido rellenados.


To treat the missings in the variables: BathroomsHalf and BathroomsFull, we considered that the missing data is indicating that the house does not have this attribute so we imputed using a 0. We made this assumption for the RoomsTotal, BedroomsTotal and FiteplaceTotal. 

In [100]:
train['Structure.BathroomsHalf'] = train['Structure.BathroomsHalf'].fillna(0)
valid['Structure.BathroomsHalf'] = valid['Structure.BathroomsHalf'].fillna(0)
test['Structure.BathroomsHalf'] = test['Structure.BathroomsHalf'].fillna(0)

train['Structure.BathroomsFull'] = train['Structure.BathroomsFull'].fillna(0)
valid['Structure.BathroomsFull'] = valid['Structure.BathroomsFull'].fillna(0)
test['Structure.BathroomsFull'] = test['Structure.BathroomsFull'].fillna(0)


In [101]:
train['Structure.Rooms.RoomsTotal'] = train['Structure.Rooms.RoomsTotal'].fillna(0)
valid['Structure.Rooms.RoomsTotal'] = valid['Structure.Rooms.RoomsTotal'].fillna(0)
test['Structure.Rooms.RoomsTotal'] = test['Structure.Rooms.RoomsTotal'].fillna(0)

In [102]:
train['Structure.BedroomsTotal'] = train['Structure.BedroomsTotal'].fillna(0)
valid['Structure.BedroomsTotal'] = valid['Structure.BedroomsTotal'].fillna(0)    
test['Structure.BedroomsTotal'] = test['Structure.BedroomsTotal'].fillna(0)                  


In [103]:
train['Structure.FireplacesTotal'] = train['Structure.FireplacesTotal'].fillna(0)
valid['Structure.FireplacesTotal'] = valid['Structure.FireplacesTotal'].fillna(0)
test['Structure.FireplacesTotal'] = test['Structure.FireplacesTotal'].fillna(0)




To impute the missing values for longitude and latitude, we chose to calculate the mean coordinates of the properties within the same postal code. This approach assumes that houses located in the same postal code are geographically close to each other, and therefore, using the average latitude and longitude of nearby properties provides a reasonable estimate for the missing values.

In [104]:
train['Location.GIS.Longitude'] = train['Location.GIS.Longitude'].fillna(train.groupby('Location.Address.PostalCode')['Location.GIS.Longitude'].transform('mean'))
train['Location.GIS.Latitude'] = train['Location.GIS.Latitude'].fillna(train.groupby('Location.Address.PostalCode')['Location.GIS.Latitude'].transform('mean'))
test['Location.GIS.Longitude'] = test['Location.GIS.Longitude'].fillna(test.groupby('Location.Address.PostalCode')['Location.GIS.Longitude'].transform('mean'))

valid['Location.GIS.Longitude'] = valid['Location.GIS.Longitude'].fillna(valid.groupby('Location.Address.PostalCode')['Location.GIS.Longitude'].transform('mean'))
valid['Location.GIS.Latitude'] = valid['Location.GIS.Latitude'].fillna(valid.groupby('Location.Address.PostalCode')['Location.GIS.Latitude'].transform('mean'))
test['Location.GIS.Latitude'] = test['Location.GIS.Latitude'].fillna(test.groupby('Location.Address.PostalCode')['Location.GIS.Latitude'].transform('mean'))

In [105]:
mean_longitude = train['Location.GIS.Longitude'].mean()
mean_latitude = train['Location.GIS.Latitude'].mean()

train['Location.GIS.Longitude'] = train['Location.GIS.Longitude'].fillna(mean_longitude)
train['Location.GIS.Latitude'] = train['Location.GIS.Latitude'].fillna(mean_latitude)

test['Location.GIS.Longitude'] = test['Location.GIS.Longitude'].fillna(mean_longitude)
test['Location.GIS.Latitude'] = test['Location.GIS.Latitude'].fillna(mean_latitude)

valid['Location.GIS.Longitude'] = valid['Location.GIS.Longitude'].fillna(mean_longitude)
valid['Location.GIS.Latitude'] = valid['Location.GIS.Latitude'].fillna(mean_latitude)


o impute the missing values for CensusBlock and CensusTract, we decided to group the data by PostalCode, as homes within the same postal code are likely to have similar characteristics, including their census data. Once the data is grouped by postal code, we used the mode of the CensusBlock and CensusTract within each group to impute the missing values.

In [106]:
def fill_mode(group):
    mode_value = group.mode()
    if not mode_value.empty:
        return mode_value.iloc[0]
    return group.iloc[0]  

train['Location.Address.CensusBlock'] = train.groupby('Location.Address.PostalCode')['Location.Address.CensusBlock'].transform(fill_mode)
train['Location.Address.CensusTract'] = train.groupby('Location.Address.PostalCode')['Location.Address.CensusTract'].transform(fill_mode)

valid['Location.Address.CensusBlock'] = valid.groupby('Location.Address.PostalCode')['Location.Address.CensusBlock'].transform(fill_mode)
valid['Location.Address.CensusTract'] = valid.groupby('Location.Address.PostalCode')['Location.Address.CensusTract'].transform(fill_mode)

test['Location.Address.CensusBlock'] = test.groupby('Location.Address.PostalCode')['Location.Address.CensusBlock'].transform(fill_mode)
test['Location.Address.CensusTract'] = test.groupby('Location.Address.PostalCode')['Location.Address.CensusTract'].transform(fill_mode)


In [107]:
# Cambiamos los NA en unknown 'Location.Address.CensusBlock', 'Location.Address.CensusTract'
train['Location.Address.CensusBlock'] = train['Location.Address.CensusBlock'].fillna('Unknown')
train['Location.Address.CensusTract'] = train['Location.Address.CensusTract'].fillna('Unknown')

valid['Location.Address.CensusBlock'] = valid['Location.Address.CensusBlock'].fillna('Unknown')
valid['Location.Address.CensusTract'] = valid['Location.Address.CensusTract'].fillna('Unknown')

test['Location.Address.CensusBlock'] = test['Location.Address.CensusBlock'].fillna('Unknown')
test['Location.Address.CensusTract'] = test['Location.Address.CensusTract'].fillna('Unknown')


In this case, the missing values in the BelowGradeUnfinishedArea and BelowGradeFinishedArea variables were imputed with a value of 0. These two variables represent the areas of the house that are either unfinished or finished below the ground level (such as basements). The decision to fill missing values with 0 implies that, if no information is available for these fields, the house is assumed to not have any area in these categories.

In [108]:
train['Structure.BelowGradeUnfinishedArea'] = train['Structure.BelowGradeUnfinishedArea'].fillna(0)
train['Structure.BelowGradeFinishedArea'] = train['Structure.BelowGradeFinishedArea'].fillna(0)
valid['Structure.BelowGradeUnfinishedArea'] = valid['Structure.BelowGradeUnfinishedArea'].fillna(0)
valid['Structure.BelowGradeFinishedArea'] = valid['Structure.BelowGradeFinishedArea'].fillna(0)
test['Structure.BelowGradeUnfinishedArea'] = test['Structure.BelowGradeUnfinishedArea'].fillna(0)
test['Structure.BelowGradeFinishedArea'] = test['Structure.BelowGradeFinishedArea'].fillna(0)



In order to solve problems with missings related to the StreetDirection, we filled with 'Unknown'.

In [109]:
train['Location.Address.StreetDirection'] = train['Location.Address.StreetDirection'].fillna('Unknown')
valid['Location.Address.StreetDirection'] = valid['Location.Address.StreetDirection'].fillna('Unknown')
test['Location.Address.StreetDirection'] = test['Location.Address.StreetDirection'].fillna('Unknown')

Finally, we decided the change the structure of the variables which the values are in lists. For this varaibles, as there were so much values in the lists, we decided to apply one hot encoding and for each new varaible, calculate the correlation with the target. If they are correlated, we mantained the column. 

In [110]:
import pandas as pd
import ast  
from sklearn.preprocessing import MultiLabelBinarizer
from scipy.stats import spearmanr
from scipy.stats import f_oneway


train['ImageData.features_reso.results'] = train['ImageData.features_reso.results'].apply(
    lambda x: ['Unknown'] if pd.isna(x) else (ast.literal_eval(x) if isinstance(x, str) else x)
)

mlb = MultiLabelBinarizer()
binary_features = mlb.fit_transform(train['ImageData.features_reso.results'])
binary_df = pd.DataFrame(binary_features, columns=mlb.classes_, index=train.index)

binary_df.columns = [col + '_features' for col in binary_df.columns]
df = pd.concat([train, binary_df], axis=1)

anova_results = {}
for column in binary_df.columns:
    group_0 = df[df[column] == 0]['Listing.Price.ClosePrice']
    group_1 = df[df[column] == 1]['Listing.Price.ClosePrice']
    
    if len(group_0) > 1 and len(group_1) > 1:  
        f_stat, p_value = f_oneway(group_0, group_1)
        anova_results[column] = (f_stat, p_value)
    else:
        anova_results[column] = (None, None)  

sorted_anova = sorted(anova_results.items(), key=lambda x: x[1][0] if x[1][0] is not None else 0, reverse=True)

threshold_p_value = 0.05
important_features = [feature for feature, (f_stat, p_value) in anova_results.items() if p_value is not None and p_value < threshold_p_value]

print("Resultados de ANOVA ordenados (por estadístico F):")
for feature, (f_stat, p_value) in sorted_anova:
    if f_stat is not None:
        print(f"{feature}: Estadístico F={f_stat:.3f}, p-valor={p_value:.3f}")

print("\nCaracterísticas importantes (p-valor < 0.05):", important_features)

significant_binary_df = binary_df[important_features]
train = pd.concat([train, significant_binary_df], axis=1)


Resultados de ANOVA ordenados (por estadístico F):
Appliances.BuiltInRefrigerator_features: Estadístico F=7082.821, p-valor=0.000
InteriorOrRoomFeatures.KitchenIsland_features: Estadístico F=6473.108, p-valor=0.000
Appliances.Oven_features: Estadístico F=6236.259, p-valor=0.000
InteriorOrRoomFeatures.DoubleVanity_features: Estadístico F=5141.779, p-valor=0.000
Appliances.WineCooler_features: Estadístico F=4849.899, p-valor=0.000
Appliances.Cooktop_features: Estadístico F=4833.361, p-valor=0.000
InteriorOrRoomFeatures.BreakfastBar_features: Estadístico F=4713.352, p-valor=0.000
Appliances.GasCooktop_features: Estadístico F=4710.792, p-valor=0.000
Appliances.DoubleOven_features: Estadístico F=3776.813, p-valor=0.000
Heating.Fireplaces_features: Estadístico F=3617.861, p-valor=0.000
InteriorOrRoomFeatures.BuiltInFeatures_features: Estadístico F=3491.208, p-valor=0.000
InteriorOrRoomFeatures.TrayCeilings_features: Estadístico F=3376.119, p-valor=0.000
DoorFeatures.FrenchDoors_features: Est

In [111]:
import pandas as pd
import ast  
from sklearn.preprocessing import MultiLabelBinarizer

valid['ImageData.features_reso.results'] = valid['ImageData.features_reso.results'].apply(
    lambda x: ['Unknown'] if pd.isna(x) else (ast.literal_eval(x) if isinstance(x, str) else x)
)


binary_features_valid = mlb.transform(valid['ImageData.features_reso.results'])

binary_df_valid = pd.DataFrame(binary_features_valid, columns=mlb.classes_, index=valid.index)
binary_df_valid.columns = [col + '_features' for col in binary_df_valid.columns]

valid = pd.concat([valid, binary_df_valid[important_features]], axis=1)
print(valid.head())



                             Characteristics.LotFeatures  \
32045                                                NaN   
64911                                                NaN   
60627  ['pond(s)', 'water view', 'sidewalks', 'street...   
97370                                                NaN   
2323                                                 NaN   

       ImageData.c1c6.summary.bathroom  ImageData.c1c6.summary.exterior  \
32045                          3.30000                          3.06213   
64911                          3.08973                          3.50000   
60627                          3.10000                          3.00000   
97370                          3.70000                          4.00000   
2323                           1.00000                          1.00000   

       ImageData.c1c6.summary.interior  ImageData.c1c6.summary.kitchen  \
32045                              3.4                             3.1   
64911                              3.0  

In [112]:
import pandas as pd
import ast  
from sklearn.preprocessing import MultiLabelBinarizer

test['ImageData.features_reso.results'] = test['ImageData.features_reso.results'].apply(
    lambda x: ['Unknown'] if pd.isna(x) else (ast.literal_eval(x) if isinstance(x, str) else x)
)

binary_features_test = mlb.transform(test['ImageData.features_reso.results'])

binary_df_test = pd.DataFrame(binary_features_test, columns=mlb.classes_, index=test.index)

binary_df_test.columns = [col + '_features' for col in binary_df_test.columns]

test = pd.concat([test, binary_df_test[important_features]], axis=1)

#Eliminem columnes que no ens interessen
train.drop(columns=['ImageData.features_reso.results'], inplace=True)
valid.drop(columns=['ImageData.features_reso.results'], inplace=True)
test.drop(columns=['ImageData.features_reso.results'], inplace=True)

print(test.head())




                        Characteristics.LotFeatures  \
0                                               NaN   
1                                               NaN   
2                                               NaN   
3                                               NaN   
4  ['fenced yard', 'golf course lot', 'landscaped']   

   ImageData.c1c6.summary.bathroom  ImageData.c1c6.summary.exterior  \
0                         2.000000                         3.300000   
1                         3.144666                         3.262419   
2                         3.300000                         3.100000   
3                         3.144666                         3.262419   
4                         3.900000                         3.600000   

   ImageData.c1c6.summary.interior  ImageData.c1c6.summary.kitchen  \
0                          2.20000                        2.000000   
1                          3.22855                        3.146163   
2                          3.100

In [113]:
import pandas as pd
import ast  
from sklearn.preprocessing import MultiLabelBinarizer
from scipy.stats import spearmanr
from scipy.stats import f_oneway

train['Characteristics.LotFeatures'] = train['Characteristics.LotFeatures'].apply(
    lambda x: ['Unknown'] if pd.isna(x) else (ast.literal_eval(x) if isinstance(x, str) else x)
)

mlb = MultiLabelBinarizer()
binary_features = mlb.fit_transform(train['Characteristics.LotFeatures'])
binary_df = pd.DataFrame(binary_features, columns=mlb.classes_, index=train.index)

binary_df.columns = [col + '_characteristic' for col in binary_df.columns]
df = pd.concat([train, binary_df], axis=1)

anova_results = {}
for column in binary_df.columns:
    group_0 = df[df[column] == 0]['Listing.Price.ClosePrice']
    group_1 = df[df[column] == 1]['Listing.Price.ClosePrice']
    
    if len(group_0) > 1 and len(group_1) > 1: 
        f_stat, p_value = f_oneway(group_0, group_1)
        anova_results[column] = (f_stat, p_value)
    else:
        anova_results[column] = (None, None)  

sorted_anova = sorted(anova_results.items(), key=lambda x: x[1][0] if x[1][0] is not None else 0, reverse=True)

threshold_p_value = 0.05
important_features = [feature for feature, (f_stat, p_value) in anova_results.items() if p_value is not None and p_value < threshold_p_value]

print("Resultados de ANOVA ordenados (por estadístico F):")
for feature, (f_stat, p_value) in sorted_anova:
    if f_stat is not None:
        print(f"{feature}: Estadístico F={f_stat:.3f}, p-valor={p_value:.3f}")

print("\nCaracterísticas importantes (p-valor < 0.05):", important_features)

significant_binary_df = binary_df[important_features]
train = pd.concat([train, significant_binary_df], axis=1)

Resultados de ANOVA ordenados (por estadístico F):
landscaped_characteristic: Estadístico F=1774.523, p-valor=0.000
Unknown_characteristic: Estadístico F=516.286, p-valor=0.000
outdoor lighting_characteristic: Estadístico F=409.464, p-valor=0.000
mature trees_characteristic: Estadístico F=340.916, p-valor=0.000
wooded_characteristic: Estadístico F=293.270, p-valor=0.000
cul-de-sac_characteristic: Estadístico F=277.676, p-valor=0.000
water view_characteristic: Estadístico F=194.088, p-valor=0.000
lake front_characteristic: Estadístico F=162.888, p-valor=0.000
beach_characteristic: Estadístico F=158.939, p-valor=0.000
common grounds_characteristic: Estadístico F=155.250, p-valor=0.000
horses allowed_characteristic: Estadístico F=140.909, p-valor=0.000
waterfront_characteristic: Estadístico F=100.119, p-valor=0.000
fenced yard_characteristic: Estadístico F=89.008, p-valor=0.000
lake access_characteristic: Estadístico F=85.647, p-valor=0.000
golf course lot_characteristic: Estadístico F=84

In [114]:
import pandas as pd
import ast  
from sklearn.preprocessing import MultiLabelBinarizer

valid['Characteristics.LotFeatures'] = valid['Characteristics.LotFeatures'].apply(
    lambda x: ['Unknown'] if pd.isna(x) else (ast.literal_eval(x) if isinstance(x, str) else x)
)

binary_features_valid = mlb.transform(valid['Characteristics.LotFeatures'])
binary_df_valid = pd.DataFrame(binary_features_valid, columns=mlb.classes_, index=valid.index)

binary_df_valid.columns = [col + '_characteristic' for col in binary_df_valid.columns]
valid = pd.concat([valid, binary_df_valid[important_features]], axis=1)


print(valid.head())


                          Characteristics.LotFeatures  \
32045                                       [Unknown]   
64911                                       [Unknown]   
60627  [pond(s), water view, sidewalks, streetlights]   
97370                                       [Unknown]   
2323                                        [Unknown]   

       ImageData.c1c6.summary.bathroom  ImageData.c1c6.summary.exterior  \
32045                          3.30000                          3.06213   
64911                          3.08973                          3.50000   
60627                          3.10000                          3.00000   
97370                          3.70000                          4.00000   
2323                           1.00000                          1.00000   

       ImageData.c1c6.summary.interior  ImageData.c1c6.summary.kitchen  \
32045                              3.4                             3.1   
64911                              3.0                    

In [115]:
import pandas as pd
import ast  
from sklearn.preprocessing import MultiLabelBinarizer


test['Characteristics.LotFeatures'] = test['Characteristics.LotFeatures'].apply(
    lambda x: ['Unknown'] if pd.isna(x) else (ast.literal_eval(x) if isinstance(x, str) else x)
)

binary_features_test = mlb.transform(test['Characteristics.LotFeatures'])

binary_df_test = pd.DataFrame(binary_features_test, columns=mlb.classes_, index=test.index)

binary_df_test.columns = [col + '_characteristic' for col in binary_df_test.columns]

test = pd.concat([test, binary_df_test[important_features]], axis=1)

#Eliminem columnes que no ens interessen
train.drop(columns=['Characteristics.LotFeatures'], inplace=True)
valid.drop(columns=['Characteristics.LotFeatures'], inplace=True)
test.drop(columns=['Characteristics.LotFeatures'], inplace=True)


print(test.head())


   ImageData.c1c6.summary.bathroom  ImageData.c1c6.summary.exterior  \
0                         2.000000                         3.300000   
1                         3.144666                         3.262419   
2                         3.300000                         3.100000   
3                         3.144666                         3.262419   
4                         3.900000                         3.600000   

   ImageData.c1c6.summary.interior  ImageData.c1c6.summary.kitchen  \
0                          2.20000                        2.000000   
1                          3.22855                        3.146163   
2                          3.10000                        2.900000   
3                          3.22855                        3.146163   
4                          4.00000                        4.000000   

   ImageData.c1c6.summary.property  ImageData.q1q6.summary.bathroom  \
0                         2.300000                         2.800000   
1         

In [116]:
import pandas as pd
import ast  
from sklearn.preprocessing import MultiLabelBinarizer
from scipy.stats import spearmanr


train['Structure.Cooling'] = train['Structure.Cooling'].apply(
    lambda x: ['Unknown'] if pd.isna(x) else (ast.literal_eval(x) if isinstance(x, str) else x)
)

mlb = MultiLabelBinarizer()
binary_features = mlb.fit_transform(train['Structure.Cooling'])
binary_df = pd.DataFrame(binary_features, columns=mlb.classes_, index=train.index)

binary_df.columns = [col + '_cooling' for col in binary_df.columns]
df = pd.concat([train, binary_df], axis=1)

anova_results = {}
for column in binary_df.columns:
    group_0 = df[df[column] == 0]['Listing.Price.ClosePrice']
    group_1 = df[df[column] == 1]['Listing.Price.ClosePrice']
    
    if len(group_0) > 1 and len(group_1) > 1:  
        f_stat, p_value = f_oneway(group_0, group_1)
        anova_results[column] = (f_stat, p_value)
    else:
        anova_results[column] = (None, None)  

sorted_anova = sorted(anova_results.items(), key=lambda x: x[1][0] if x[1][0] is not None else 0, reverse=True)

threshold_p_value = 0.05
important_features = [feature for feature, (f_stat, p_value) in anova_results.items() if p_value is not None and p_value < threshold_p_value]

print("Resultados de ANOVA ordenados (por estadístico F):")
for feature, (f_stat, p_value) in sorted_anova:
    if f_stat is not None:
        print(f"{feature}: Estadístico F={f_stat:.3f}, p-valor={p_value:.3f}")

print("\nCaracterísticas importantes (p-valor < 0.05):", important_features)

significant_binary_df = binary_df[important_features]
train = pd.concat([train, significant_binary_df], axis=1)

Resultados de ANOVA ordenados (por estadístico F):
zoned_cooling: Estadístico F=9443.304, p-valor=0.000
none_cooling: Estadístico F=1344.719, p-valor=0.000
central air_cooling: Estadístico F=1164.438, p-valor=0.000
space pac_cooling: Estadístico F=938.965, p-valor=0.000
window/wall unit - 1_cooling: Estadístico F=739.328, p-valor=0.000
window/wall units - 2_cooling: Estadístico F=507.839, p-valor=0.000
office only_cooling: Estadístico F=103.862, p-valor=0.000
dual_cooling: Estadístico F=96.782, p-valor=0.000
central individual_cooling: Estadístico F=90.181, p-valor=0.000
geothermal_cooling: Estadístico F=70.990, p-valor=0.000
partial_cooling: Estadístico F=68.847, p-valor=0.000
wall sleeve_cooling: Estadístico F=51.210, p-valor=0.000
window unit(s)_cooling: Estadístico F=49.309, p-valor=0.000
other_cooling: Estadístico F=39.261, p-valor=0.000
window/wall units - 3+_cooling: Estadístico F=37.974, p-valor=0.000
Unknown_cooling: Estadístico F=36.916, p-valor=0.000
power roof vents_cooling

In [117]:
import pandas as pd
import ast  
from sklearn.preprocessing import MultiLabelBinarizer

valid['Structure.Cooling'] = valid['Structure.Cooling'].apply(
    lambda x: ['Unknown'] if pd.isna(x) else (ast.literal_eval(x) if isinstance(x, str) else x)
)

binary_features_valid = mlb.transform(valid['Structure.Cooling'])
binary_df_valid = pd.DataFrame(binary_features_valid, columns=mlb.classes_, index=valid.index)

binary_df_valid.columns = [col + '_cooling' for col in binary_df_valid.columns]
valid = pd.concat([valid, binary_df_valid[important_features]], axis=1)


print(valid.head())


       ImageData.c1c6.summary.bathroom  ImageData.c1c6.summary.exterior  \
32045                          3.30000                          3.06213   
64911                          3.08973                          3.50000   
60627                          3.10000                          3.00000   
97370                          3.70000                          4.00000   
2323                           1.00000                          1.00000   

       ImageData.c1c6.summary.interior  ImageData.c1c6.summary.kitchen  \
32045                              3.4                             3.1   
64911                              3.0                             2.9   
60627                              3.0                             3.0   
97370                              3.6                             3.8   
2323                               1.0                             1.0   

       ImageData.c1c6.summary.property  ImageData.q1q6.summary.bathroom  \
32045                        



In [118]:
import pandas as pd
import ast  
from sklearn.preprocessing import MultiLabelBinarizer


test['Structure.Cooling'] = test['Structure.Cooling'].apply(
    lambda x: ['Unknown'] if pd.isna(x) else (ast.literal_eval(x) if isinstance(x, str) else x)
)

binary_features_test = mlb.transform(test['Structure.Cooling'])

binary_df_test = pd.DataFrame(binary_features_test, columns=mlb.classes_, index=test.index)

binary_df_test.columns = [col + '_cooling' for col in binary_df_test.columns]

test = pd.concat([test, binary_df_test[important_features]], axis=1)

# Eliminamos las columnas originales
train.drop(columns=['Structure.Cooling'], inplace=True)
valid.drop(columns=['Structure.Cooling'], inplace=True)
test.drop(columns=['Structure.Cooling'], inplace=True)

print(test.head())


   ImageData.c1c6.summary.bathroom  ImageData.c1c6.summary.exterior  \
0                         2.000000                         3.300000   
1                         3.144666                         3.262419   
2                         3.300000                         3.100000   
3                         3.144666                         3.262419   
4                         3.900000                         3.600000   

   ImageData.c1c6.summary.interior  ImageData.c1c6.summary.kitchen  \
0                          2.20000                        2.000000   
1                          3.22855                        3.146163   
2                          3.10000                        2.900000   
3                          3.22855                        3.146163   
4                          4.00000                        4.000000   

   ImageData.c1c6.summary.property  ImageData.q1q6.summary.bathroom  \
0                         2.300000                         2.800000   
1         

In [119]:
import pandas as pd
import ast  
from sklearn.preprocessing import MultiLabelBinarizer
from scipy.stats import spearmanr

train['ImageData.room_type_reso.results'] = train['ImageData.room_type_reso.results'].apply(
    lambda x: ['Unknown'] if pd.isna(x) else (ast.literal_eval(x) if isinstance(x, str) else x)
)

mlb = MultiLabelBinarizer()
binary_features = mlb.fit_transform(train['ImageData.room_type_reso.results'])
binary_df = pd.DataFrame(binary_features, columns=mlb.classes_, index=train.index)

binary_df.columns = [col + '_room' for col in binary_df.columns]

df = pd.concat([train, binary_df], axis=1)

anova_results = {}
for column in binary_df.columns:
    group_0 = df[df[column] == 0]['Listing.Price.ClosePrice']
    group_1 = df[df[column] == 1]['Listing.Price.ClosePrice']
    
    if len(group_0) > 1 and len(group_1) > 1:  
        f_stat, p_value = f_oneway(group_0, group_1)
        anova_results[column] = (f_stat, p_value)
    else:
        anova_results[column] = (None, None) 

sorted_anova = sorted(anova_results.items(), key=lambda x: x[1][0] if x[1][0] is not None else 0, reverse=True)

threshold_p_value = 0.05
important_features = [feature for feature, (f_stat, p_value) in anova_results.items() if p_value is not None and p_value < threshold_p_value]

print("Resultados de ANOVA ordenados (por estadístico F):")
for feature, (f_stat, p_value) in sorted_anova:
    if f_stat is not None:
        print(f"{feature}: Estadístico F={f_stat:.3f}, p-valor={p_value:.3f}")

print("\nCaracterísticas importantes (p-valor < 0.05):", important_features)


significant_binary_df = binary_df[important_features]
train = pd.concat([train, significant_binary_df], axis=1)

Resultados de ANOVA ordenados (por estadístico F):
DiningArea_room: Estadístico F=2654.193, p-valor=0.000
Bar_room: Estadístico F=2394.913, p-valor=0.000
FloorPlan_room: Estadístico F=2366.405, p-valor=0.000
Patio_room: Estadístico F=2286.801, p-valor=0.000
Office_room: Estadístico F=2267.915, p-valor=0.000
MudRoom_room: Estadístico F=2143.360, p-valor=0.000
WineCellar_room: Estadístico F=1690.276, p-valor=0.000
ExerciseRoom_room: Estadístico F=1389.853, p-valor=0.000
LivingRoom_room: Estadístico F=1358.083, p-valor=0.000
GameRoom_room: Estadístico F=1335.366, p-valor=0.000
EntranceFoyer_room: Estadístico F=971.622, p-valor=0.000
MediaRoom_room: Estadístico F=932.663, p-valor=0.000
Stairs_room: Estadístico F=886.683, p-valor=0.000
WalkInClosets_room: Estadístico F=768.773, p-valor=0.000
AerialView_room: Estadístico F=763.297, p-valor=0.000
SittingRoom_room: Estadístico F=669.426, p-valor=0.000
Deck_room: Estadístico F=617.846, p-valor=0.000
Bedroom_room: Estadístico F=475.152, p-valor=

In [120]:
import pandas as pd
import ast 
from sklearn.preprocessing import MultiLabelBinarizer

valid['ImageData.room_type_reso.results'] = valid['ImageData.room_type_reso.results'].apply(
    lambda x: ['Unknown'] if pd.isna(x) else (ast.literal_eval(x) if isinstance(x, str) else x)
)

binary_features_valid = mlb.transform(valid['ImageData.room_type_reso.results'])
binary_df_valid = pd.DataFrame(binary_features_valid, columns=mlb.classes_, index=valid.index)

binary_df_valid.columns = [col + '_room' for col in binary_df_valid.columns]
valid = pd.concat([valid, binary_df_valid[important_features]], axis=1)


print(valid.head())


       ImageData.c1c6.summary.bathroom  ImageData.c1c6.summary.exterior  \
32045                          3.30000                          3.06213   
64911                          3.08973                          3.50000   
60627                          3.10000                          3.00000   
97370                          3.70000                          4.00000   
2323                           1.00000                          1.00000   

       ImageData.c1c6.summary.interior  ImageData.c1c6.summary.kitchen  \
32045                              3.4                             3.1   
64911                              3.0                             2.9   
60627                              3.0                             3.0   
97370                              3.6                             3.8   
2323                               1.0                             1.0   

       ImageData.c1c6.summary.property  ImageData.q1q6.summary.bathroom  \
32045                        

In [121]:
import pandas as pd
import ast  
from sklearn.preprocessing import MultiLabelBinarizer

test['ImageData.room_type_reso.results'] = test['ImageData.room_type_reso.results'].apply(
    lambda x: ['Unknown'] if pd.isna(x) else (ast.literal_eval(x) if isinstance(x, str) else x)
)

binary_features_test = mlb.transform(test['ImageData.room_type_reso.results'])

binary_df_test = pd.DataFrame(binary_features_test, columns=mlb.classes_, index=test.index)

binary_df_test.columns = [col + '_room' for col in binary_df_test.columns]

test = pd.concat([test, binary_df_test[important_features]], axis=1)

# Eliminamos las columnas originales
train.drop(columns=['ImageData.room_type_reso.results'], inplace=True)
valid.drop(columns=['ImageData.room_type_reso.results'], inplace=True)
test.drop(columns=['ImageData.room_type_reso.results'], inplace=True)




print(test.head())


   ImageData.c1c6.summary.bathroom  ImageData.c1c6.summary.exterior  \
0                         2.000000                         3.300000   
1                         3.144666                         3.262419   
2                         3.300000                         3.100000   
3                         3.144666                         3.262419   
4                         3.900000                         3.600000   

   ImageData.c1c6.summary.interior  ImageData.c1c6.summary.kitchen  \
0                          2.20000                        2.000000   
1                          3.22855                        3.146163   
2                          3.10000                        2.900000   
3                          3.22855                        3.146163   
4                          4.00000                        4.000000   

   ImageData.c1c6.summary.property  ImageData.q1q6.summary.bathroom  \
0                         2.300000                         2.800000   
1         

In [122]:
import pandas as pd
import ast
from sklearn.preprocessing import MultiLabelBinarizer
from scipy.stats import spearmanr

train['Structure.Heating'] = train['Structure.Heating'].apply(
    lambda x: ['Unknown'] if pd.isna(x) else (ast.literal_eval(x) if isinstance(x, str) else x)
)

mlb = MultiLabelBinarizer()
binary_features = mlb.fit_transform(train['Structure.Heating'])
binary_df = pd.DataFrame(binary_features, columns=mlb.classes_, index=train.index)

binary_df.columns = [col + '_heating' for col in binary_df.columns]
df = pd.concat([train, binary_df], axis=1)

anova_results = {}
for column in binary_df.columns:
    group_0 = df[df[column] == 0]['Listing.Price.ClosePrice']
    group_1 = df[df[column] == 1]['Listing.Price.ClosePrice']
    
    if len(group_0) > 1 and len(group_1) > 1:  
        f_stat, p_value = f_oneway(group_0, group_1)
        anova_results[column] = (f_stat, p_value)
    else:
        anova_results[column] = (None, None)  

sorted_anova = sorted(anova_results.items(), key=lambda x: x[1][0] if x[1][0] is not None else 0, reverse=True)

threshold_p_value = 0.05
important_features = [feature for feature, (f_stat, p_value) in anova_results.items() if p_value is not None and p_value < threshold_p_value]

print("Resultados de ANOVA ordenados (por estadístico F):")
for feature, (f_stat, p_value) in sorted_anova:
    if f_stat is not None:
        print(f"{feature}: Estadístico F={f_stat:.3f}, p-valor={p_value:.3f}")

print("\nCaracterísticas importantes (p-valor < 0.05):", important_features)

significant_binary_df = binary_df[important_features]
train = pd.concat([train, significant_binary_df], axis=1)

Resultados de ANOVA ordenados (por estadístico F):
zoned_heating: Estadístico F=6186.943, p-valor=0.000
sep heating systems - 2+_heating: Estadístico F=3034.277, p-valor=0.000
radiant_heating: Estadístico F=799.102, p-valor=0.000
indv controls_heating: Estadístico F=690.032, p-valor=0.000
Unknown_heating: Estadístico F=471.522, p-valor=0.000
forced air_heating: Estadístico F=226.300, p-valor=0.000
electric_heating: Estadístico F=144.879, p-valor=0.000
baseboard_heating: Estadístico F=117.648, p-valor=0.000
natural gas_heating: Estadístico F=113.716, p-valor=0.000
heat pump_heating: Estadístico F=113.047, p-valor=0.000
geothermal_heating: Estadístico F=59.934, p-valor=0.000
radiator(s)_heating: Estadístico F=59.256, p-valor=0.000
none_heating: Estadístico F=47.574, p-valor=0.000
solar_heating: Estadístico F=32.616, p-valor=0.000
other_heating: Estadístico F=31.174, p-valor=0.000
propane_heating: Estadístico F=20.840, p-valor=0.000
floor furnace_heating: Estadístico F=10.993, p-valor=0.0

In [123]:
import pandas as pd
import ast  
from sklearn.preprocessing import MultiLabelBinarizer

valid['Structure.Heating'] = valid['Structure.Heating'].apply(
    lambda x: ['Unknown'] if pd.isna(x) else (ast.literal_eval(x) if isinstance(x, str) else x)
)

binary_features_valid = mlb.transform(valid['Structure.Heating'])
binary_df_valid = pd.DataFrame(binary_features_valid, columns=mlb.classes_, index=valid.index)

binary_df_valid.columns = [col + '_heating' for col in binary_df_valid.columns]
valid = pd.concat([valid, binary_df_valid[important_features]], axis=1)

print(valid.head())


       ImageData.c1c6.summary.bathroom  ImageData.c1c6.summary.exterior  \
32045                          3.30000                          3.06213   
64911                          3.08973                          3.50000   
60627                          3.10000                          3.00000   
97370                          3.70000                          4.00000   
2323                           1.00000                          1.00000   

       ImageData.c1c6.summary.interior  ImageData.c1c6.summary.kitchen  \
32045                              3.4                             3.1   
64911                              3.0                             2.9   
60627                              3.0                             3.0   
97370                              3.6                             3.8   
2323                               1.0                             1.0   

       ImageData.c1c6.summary.property  ImageData.q1q6.summary.bathroom  \
32045                        

In [124]:
import pandas as pd
import ast  
from sklearn.preprocessing import MultiLabelBinarizer

test['Structure.Heating'] = test['Structure.Heating'].apply(
    lambda x: ['Unknown'] if pd.isna(x) else (ast.literal_eval(x) if isinstance(x, str) else x)
)

binary_features_test = mlb.transform(test['Structure.Heating'])

binary_df_test = pd.DataFrame(binary_features_test, columns=mlb.classes_, index=test.index)

binary_df_test.columns = [col + '_heating' for col in binary_df_test.columns]

test = pd.concat([test, binary_df_test[important_features]], axis=1)

# Eliminamos las columnas originales
train.drop(columns=['Structure.Heating'], inplace=True)
valid.drop(columns=['Structure.Heating'], inplace=True)
test.drop(columns=['Structure.Heating'], inplace=True)

print(test.head())


   ImageData.c1c6.summary.bathroom  ImageData.c1c6.summary.exterior  \
0                         2.000000                         3.300000   
1                         3.144666                         3.262419   
2                         3.300000                         3.100000   
3                         3.144666                         3.262419   
4                         3.900000                         3.600000   

   ImageData.c1c6.summary.interior  ImageData.c1c6.summary.kitchen  \
0                          2.20000                        2.000000   
1                          3.22855                        3.146163   
2                          3.10000                        2.900000   
3                          3.22855                        3.146163   
4                          4.00000                        4.000000   

   ImageData.c1c6.summary.property  ImageData.q1q6.summary.bathroom  \
0                         2.300000                         2.800000   
1         

In [125]:
missing_values = train.isnull().sum()
missing_values_percent = missing_values / len(train) * 100
missing_values_table = pd.concat([missing_values, missing_values_percent], axis=1)
missing_values_table.columns = ["Missing Values", "Percentage"]
missing_values_table = missing_values_table[missing_values_table["Missing Values"] > 0]
missing_values_table = missing_values_table.sort_values(by="Missing Values", ascending=False)
print(missing_values_table)


Empty DataFrame
Columns: [Missing Values, Percentage]
Index: []


In [126]:
missing_values = valid.isnull().sum()
missing_values_percent = missing_values / len(train) * 100
missing_values_table = pd.concat([missing_values, missing_values_percent], axis=1)
missing_values_table.columns = ["Missing Values", "Percentage"]
missing_values_table = missing_values_table[missing_values_table["Missing Values"] > 0]
missing_values_table = missing_values_table.sort_values(by="Missing Values", ascending=False)
print(missing_values_table)

Empty DataFrame
Columns: [Missing Values, Percentage]
Index: []


In [127]:
missing_values = test.isnull().sum()
missing_values_percent = missing_values / len(train) * 100
missing_values_table = pd.concat([missing_values, missing_values_percent], axis=1)
missing_values_table.columns = ["Missing Values", "Percentage"]
missing_values_table = missing_values_table[missing_values_table["Missing Values"] > 0]
missing_values_table = missing_values_table.sort_values(by="Missing Values", ascending=False)
print(missing_values_table)

Empty DataFrame
Columns: [Missing Values, Percentage]
Index: []


In [129]:
train.to_csv('train_cleaned_final.csv', index=False)


In [None]:
valid.to_csv('valid_cleaned.csv', index=False)

In [None]:
test.to_csv('test_cleaned.csv', index=False)

In [None]:
#Numero rows del test
print(test.shape[0])

22039
