In [None]:
import pandas as pd

In [None]:
!pip install dython

In [None]:
# Load the listings, expose the postal code under the short name 'Zip', and
# reduce it to an integer built from its first five characters.
train_data = (
    pd.read_csv('train_data.txt')
    .rename(columns={'Location.Address.PostalCode': 'Zip'})
    .assign(Zip=lambda frame: frame['Zip'].astype(str).str.slice(stop=5).astype(int))
)

In [None]:
# Load the income spreadsheet and display it for a first visual check.
df_income = pd.read_excel('income.xlsx')
df_income

In [None]:
# Show the column names exactly as they appear in the spreadsheet; the cells
# below refer to them verbatim.
print(df_income.columns)

In [None]:
# Keep only the first five characters of the combined 'Zip / Population' text.
df_income['Zip / Population'] = df_income['Zip / Population'].str.slice(stop=5)
# 'Rank' is not referenced anywhere below, so remove it.
df_income = df_income.drop('Rank', axis=1)

In [None]:
# Give both columns short, stable names in a single rename, then store the
# ZIP code as an integer so it matches the dtype of train_data['Zip'].
df_income = df_income.rename(
    columns={
        'Zip / Population': 'Zip',
        'Median Household Income ▼': 'Median Household Income',
    }
)
df_income = df_income.astype({'Zip': int})

In [None]:
# Turn the formatted income text into integers by removing the dollar sign
# and the comma separators before casting.
df_income = df_income.assign(**{
    'Median Household Income': lambda frame: (
        frame['Median Household Income']
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype(int)
    ),
})

In [None]:
# Display the cleaned income table (integer 'Zip' and 'Median Household Income').
df_income

In [None]:
# Attach the income figures to every listing by ZIP code.
# A left join keeps all listings; a listing whose ZIP is absent from df_income
# gets missing values in the income columns.
# The lookup table is de-duplicated first: a ZIP that appeared more than once
# in df_income would otherwise duplicate the matching listings in the result.
income_by_zip = df_income.drop_duplicates(subset='Zip', keep='first')
train_data = pd.merge(
    train_data,
    income_by_zip,
    on='Zip',
    how='left',
    validate='many_to_one',  # documents and enforces the one-row-per-ZIP assumption
)
# Persist the merged data so later work can start from this file.
train_data.to_csv('data_with_income.csv', index=False)

# Preprocessing

In [None]:
#%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
import pandas as pd
from collections import Counter
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# Show floating-point values with three decimal places in pandas output.
pd.set_option('display.precision', 3)

In [None]:
# extra imports
from pandas import read_csv
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import LocalOutlierFactor
from sklearn import preprocessing
from pandas.plotting import scatter_matrix
from scipy.stats import boxcox
from statsmodels.genmod.generalized_linear_model import GLM


# BASIC INSPECTION OF THE DATASET

In [None]:
# Summary statistics of the merged data set.
train_data.describe()

In [None]:
# (rows, columns) after the income merge.
train_data.shape

In [None]:
# Distribution of the merged income column.
train_data['Median Household Income'].hist(figsize=(8,8));

In [None]:
# Same histogram, leaving out rows whose income equals 0 or 99999999
# (presumably placeholder codes rather than real incomes - confirm with the data source).
train_data.loc[
    ~train_data['Median Household Income'].isin([99999999, 0]),
    'Median Household Income',
].hist(bins=15, figsize=(8, 8));

In [None]:
# Summary statistics of the income column on its own.
train_data['Median Household Income'].describe()

In [None]:
# Summary statistics of the close price, which later cells correlate every feature against.
train_data['Listing.Price.ClosePrice'].describe()

# Preprocessing

In [None]:
from dython.nominal import associations
from dython.nominal import correlation_ratio
import seaborn as sns
import matplotlib.pyplot as plt


# Columns included in the pairwise association matrix.
numerical_columns = [
    # image-derived summary scores
    "ImageData.c1c6.summary.bathroom",
    "ImageData.c1c6.summary.exterior",
    "ImageData.c1c6.summary.interior",
    "ImageData.c1c6.summary.kitchen",
    "ImageData.c1c6.summary.property",
    "ImageData.q1q6.summary.bathroom",
    "ImageData.q1q6.summary.exterior",
    "ImageData.q1q6.summary.interior",
    "ImageData.q1q6.summary.kitchen",
    "ImageData.q1q6.summary.property",
    "ImageData.style.stories.summary.label",
    # price and location
    "Listing.Price.ClosePrice",
    "Location.GIS.Latitude",
    "Location.GIS.Longitude",
    # structure
    "Structure.BathroomsFull",
    "Structure.BathroomsHalf",
    "Structure.BedroomsTotal",
    "Structure.FireplacesTotal",
    "Structure.Parking",
    "Structure.Rooms.RoomsTotal",
    "Structure.YearBuilt",
    # merged / derived
    "Median Household Income",
    'Characteristics.SurfaceArea',
]

# Let dython decide which of the selected columns are nominal; plot=False
# because the heatmap is drawn explicitly below.
associations_result = associations(
    train_data.loc[:, numerical_columns],
    nominal_columns='auto',
    plot=False,
)

# The association matrix is stored under the 'corr' key of the result.
correlation_matrix = associations_result['corr']

# Annotated heatmap of the matrix, values shown with two decimals.
plt.figure(figsize=(10, 8))
sns.heatmap(data=correlation_matrix, cmap='coolwarm', fmt='.2f', annot=True)
plt.title("Correlation Heatmap")
plt.show()

In [None]:
# Remove the columns that are not carried into the rest of the pipeline.
train_data = train_data.drop(
    columns=[
        # per-room image scores
        "ImageData.c1c6.summary.bathroom",
        "ImageData.c1c6.summary.exterior",
        "ImageData.c1c6.summary.interior",
        "ImageData.c1c6.summary.kitchen",
        "ImageData.q1q6.summary.bathroom",
        "ImageData.q1q6.summary.exterior",
        "ImageData.q1q6.summary.interior",
        "ImageData.q1q6.summary.kitchen",
        # coordinates
        "Location.GIS.Latitude",
        "Location.GIS.Longitude",
        # remaining lot / structure / address attributes
        "Characteristics.LotSizeSquareFeet",
        "Structure.GarageSpaces",
        "Structure.LivingArea",
        "Structure.ParkingFeatures",
        "Structure.YearBuilt",
        'Location.Address.StreetSuffix',
    ]
)

train_data.head()

In [None]:
# Replace the missing values of three numeric columns with that column's mean.
train_data = train_data.assign(**{
    column_name: train_data[column_name].fillna(train_data[column_name].mean())
    for column_name in (
        'ImageData.c1c6.summary.property',
        'ImageData.q1q6.summary.property',
        'Structure.Rooms.RoomsTotal',
    )
})

In [None]:
# Look at the raw cell format of one of the list-valued columns before parsing it.
train_data['ImageData.features_reso.results'].head(10)

# Preprocessing 

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
import ast

# Categorical columns whose cells hold a list of strings stored as text
columns_to_transform = ['ImageData.features_reso.results', 'ImageData.room_type_reso.results',
                        'Structure.Heating', 'Structure.Cooling']


def _parse_list_cell(cell):
    """Return the list stored in one cell of a list-valued column.

    - a string is parsed with ``ast.literal_eval``;
    - a value that is already a list is returned unchanged, so running this
      cell a second time does not wipe the lists parsed by the first run;
    - anything else (for example a missing value) becomes an empty list.
    """
    if isinstance(cell, str):
        return ast.literal_eval(cell)
    if isinstance(cell, list):
        return cell
    return []


# Parse every cell of every list-valued column
for col in columns_to_transform:
    train_data[col] = [_parse_list_cell(cell) for cell in train_data[col]]

print(train_data[columns_to_transform].head())

In [None]:

mlb = MultiLabelBinarizer()

# Build one indicator frame per list-valued column. The same binarizer is
# refitted for each column, so mlb.classes_ is read immediately after the
# matching fit_transform call; the frames keep train_data's index.
one_hot_features, one_hot_room_type, one_hot_heating, one_hot_cooling = (
    pd.DataFrame(
        mlb.fit_transform(train_data[list_column]),
        columns=mlb.classes_,
        index=train_data.index,
    )
    for list_column in (
        'ImageData.features_reso.results',
        'ImageData.room_type_reso.results',
        'Structure.Heating',
        'Structure.Cooling',
    )
)

# Put the four indicator frames side by side in a single frame.
# NOTE(review): a label that occurs in more than one source column would
# produce duplicate column names here - verify against the data.
one_hot_categorical = pd.concat(
    [one_hot_features, one_hot_room_type, one_hot_heating, one_hot_cooling],
    axis=1,
)

# train_data itself is not modified in this cell.
print(train_data.head())

Compare the correlation of each one-hot encoded feature with the target, `Listing.Price.ClosePrice`.

In [None]:
# Target column that every indicator is compared against.
close_price = train_data.loc[:, 'Listing.Price.ClosePrice']

# Column-wise correlation between each one-hot indicator and the close price.
correlation_with_close_price = one_hot_categorical.corrwith(close_price, axis=0)

print(correlation_with_close_price)

In [None]:
correlation_with_close_price = one_hot_categorical.corrwith(
    train_data['Listing.Price.ClosePrice']
)

# Keep the indicators whose absolute correlation with the close price exceeds 0.2.
columns_to_keep = correlation_with_close_price.loc[
    correlation_with_close_price.abs().gt(0.2)
].index

df_one_hot_filtered = one_hot_categorical.loc[:, columns_to_keep]

print(df_one_hot_filtered.shape)

# Append the retained indicators to the main frame as extra columns.
train_data = pd.concat([train_data, df_one_hot_filtered], axis=1)

In [None]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder()

# One-hot encode the property type; the encoder returns a sparse matrix, so
# convert it to a dense array before wrapping it in a DataFrame.
one_hot = encoder.fit_transform(train_data[['Property.PropertyType']])
one_hot = one_hot.toarray()

# Carry train_data's index so that the concat below aligns row by row. Without
# it the new frame gets a fresh 0..n-1 index and the concat is only correct
# while train_data happens to have exactly that index. The other derived
# frames in this notebook are built with index=train_data.index as well.
encoded_property_type_df = pd.DataFrame(
    one_hot,
    columns=encoder.get_feature_names_out(['Property.PropertyType']),
    index=train_data.index,
)

train_data = pd.concat([train_data, encoded_property_type_df], axis=1)
# Drop the original categorical column together with 'zoned' (presumably one of
# the indicator columns appended earlier - confirm). The label used to be
# listed four times; naming it once removes the same columns.
train_data = train_data.drop(columns=['Property.PropertyType', 'zoned'])

In [None]:
from sklearn.impute import KNNImputer

# Columns handed to the imputer: the property-type indicators, a few listing
# attributes, and the column we actually want to fill (listed last).
# NOTE(review): the columns are passed to the imputer as they are, without any
# rescaling - confirm this is intended, given their very different magnitudes.
columns_for_knn = [
    *encoded_property_type_df.columns,
    'Zip',
    'Listing.Price.ClosePrice',
    'Structure.BathroomsFull',
    'Structure.BathroomsHalf',
    'Structure.BedroomsTotal',
    'Structure.Rooms.RoomsTotal',
    'Median Household Income',
]

knn_data = train_data.loc[:, columns_for_knn]

# Fill missing values from the five nearest rows.
knn_imputer = KNNImputer(n_neighbors=5)
imputed_data = knn_imputer.fit_transform(knn_data)

# Re-wrap the array with the original labels so the column can be assigned by index.
df_imputed = pd.DataFrame(data=imputed_data, index=train_data.index, columns=columns_for_knn)

# Only the income column is written back; the other columns keep their original values.
train_data['Median Household Income'] = df_imputed['Median Household Income']

In [None]:
# Columns handed to the imputer for the surface area: the property-type
# indicators, the (already imputed) income, a few listing attributes, and the
# column to fill (listed last).
columns_for_knn = [
    *encoded_property_type_df.columns,
    'Median Household Income',
    'Listing.Price.ClosePrice',
    'Structure.BathroomsFull',
    'Structure.BathroomsHalf',
    'Structure.BedroomsTotal',
    'Characteristics.SurfaceArea',
]

knn_data = train_data.loc[:, columns_for_knn]

# Fill missing values from the five nearest rows.
knn_imputer = KNNImputer(n_neighbors=5)
imputed_data = knn_imputer.fit_transform(knn_data)

# Re-wrap the array with the original labels so the column can be assigned by index.
df_imputed = pd.DataFrame(data=imputed_data, index=train_data.index, columns=columns_for_knn)

# Only the surface area is written back.
train_data['Characteristics.SurfaceArea'] = df_imputed['Characteristics.SurfaceArea']

# Inspect the listings flagged as commercial sales.
df_commercial_sale = train_data.loc[train_data['Property.PropertyType_commercial sale'] == 1]
df_commercial_sale.head()

In [None]:
def non_outliers_using_IQR(col_name, X, lower_q=0.10, upper_q=0.90, whisker=1.5):
    """Return the rows of ``X`` whose ``col_name`` value lies inside quantile fences.

    The fences are ``[low - whisker * spread, high + whisker * spread]``, where
    ``low`` and ``high`` are the ``lower_q`` and ``upper_q`` quantiles of the
    column and ``spread = high - low``.

    Despite the function name, the defaults use the 10th and 90th percentiles,
    not the quartiles, so the filter is more permissive than the textbook IQR
    rule. Pass ``lower_q=0.25, upper_q=0.75`` for the classic rule.

    Parameters
    ----------
    col_name : label
        Column of ``X`` to test.
    X : pandas.DataFrame
        Data to filter; it is not modified.
    lower_q, upper_q : float, default 0.10 and 0.90
        Quantiles (between 0 and 1) that define the spread.
    whisker : float, default 1.5
        How many spreads beyond each quantile a value may lie and still be kept.

    Returns
    -------
    pandas.DataFrame
        The rows of ``X`` inside the fences (bounds included). Rows whose value
        in ``col_name`` is missing are excluded too, because comparisons with
        NaN evaluate to False.

    Raises
    ------
    ValueError
        If the quantiles are not ordered within [0, 1] or ``whisker`` is negative.
    """
    if not 0 <= lower_q <= upper_q <= 1:
        raise ValueError("lower_q and upper_q must satisfy 0 <= lower_q <= upper_q <= 1")
    if whisker < 0:
        raise ValueError("whisker must be non-negative")

    values = X[col_name]

    # Quantile-based spread of the column
    low = values.quantile(lower_q)
    high = values.quantile(upper_q)
    spread = high - low

    # Keep only the values between the two fences
    non_outliers_mask = values.between(low - whisker * spread, high + whisker * spread)

    return X[non_outliers_mask]

# Columns whose extreme values are filtered out
columns_to_check = [
    "Listing.Price.ClosePrice", "Structure.BathroomsFull", "Characteristics.SurfaceArea",
]

# Filter one column at a time. Each pass works on the rows that survived the
# previous pass, so the bounds are recomputed on the already-reduced data.
for col in columns_to_check:
    train_data = non_outliers_using_IQR(X=train_data, col_name=col)

In [None]:
# Remove the raw list-valued columns (their indicators were added above)
# together with the date, city and ZIP columns.
train_data = train_data.drop(
    [
        'ImageData.room_type_reso.results',
        'ImageData.features_reso.results',
        'Listing.Dates.CloseDate',
        'Location.Address.City',
        'Zip',
        'Structure.Heating',
        'Structure.Cooling',
    ],
    axis='columns',
)
print(train_data.columns)

In [None]:
# Save the cleaned data; the index is not written to the file.
train_data.to_csv('cleaned_data_without_scaling.csv', index=False)