In [None]:
import os
import re
import math
import typing
import warnings
from enum import Enum

import numpy as np
import pandas as pd
import seaborn as sns
import pingouin as pg
import matplotlib as mpl
from scipy.stats import zscore
import matplotlib.pyplot as plt

import sklearn
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from sklearn.linear_model import Ridge, Lasso, LinearRegression, HuberRegressor, RANSACRegressor, TheilSenRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import silhouette_score, mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import GridSearchCV, RepeatedKFold, cross_val_score, train_test_split, RandomizedSearchCV

from sklearn.exceptions import DataConversionWarning
from IPython.display import display, HTML
display(HTML("<style>.container { width: 90% !important}; </style>"))

In [None]:
# `message` is a regular expression matched against the start of the warning
# text, so it has to be spelled exactly like the LAPACK "internal gelsd" warning.
warnings.filterwarnings(
    action='ignore',
    message='^internal gelsd'
)

# The three filters below silence whole warning categories for the rest of the notebook.
warnings.filterwarnings(
    action='ignore',
    category=UserWarning
)

warnings.filterwarnings(
    action='ignore',
    category=RuntimeWarning
)

warnings.filterwarnings(
    action='ignore', 
    category=DataConversionWarning
)

# Seed NumPy's global generator so that NumPy-based randomness is repeatable.
np.random.seed(1)

pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 50)

# Disables pandas' SettingWithCopyWarning for the whole notebook.
pd.options.mode.chained_assignment = None  # default='warn'

In [None]:
# Matplotlib set label size
%matplotlib inline

mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=14)
mpl.rc('ytick', labelsize=14)

plt.rc('font', size=12)
plt.rc('figure', figsize=(12, 5))

In [None]:
sns.set_style('whitegrid')
sns.set_context(
    'notebook',
    font_scale=1,
    rc={
        'lines.linewidth': 2,
        'font.family': [u'times']
    }
)

## Loading Data

In [None]:
train_set = pd.read_csv('./data/train_set.csv', index_col=0)
test_set = pd.read_csv('./data/test_set.csv', index_col=0)

In [None]:
train_set.Price.hist()

In [None]:
# Bucket the price into five ranges; the bucket is the default stratification
# column of `train_val_split` and is dropped again right after the split.
# pd.cut labels every value outside (0, 10e6] as NaN.
# NOTE(review): a NaN bucket would likely break the stratified split - confirm no price exceeds 10e6.
train_set['price_range'] = pd.cut(train_set.Price, bins=[.0, .5e6, 1e6, 1.5e6, 2e6, 10e6], labels=[1, 2, 3, 4, 5])
train_set['price_range'].hist(bins=5)

In [None]:
def train_val_split(train_set: pd.DataFrame, stratify_col: str = 'price_range'):
    """
    Split a frame into a train part (85%) and a validation part (15%),
    stratified on one of its columns.

    - train_set: the full frame; it must contain `stratify_col`
    - stratify_col: column whose class proportions are preserved in both parts

    Returns (train_part, valid_part). Both keep every column of `train_set`,
    `stratify_col` included exactly once, so the caller can drop it afterwards.
    """
    # The stratification column already lives inside `train_set`, so the frame is
    # split on its own. Splitting the column as a second array and concatenating
    # it back would leave two columns with the same name in each part.
    train_part, valid_part = train_test_split(
        train_set,
        stratify=train_set[stratify_col],
        test_size=.15,
        random_state=0
    )

    return train_part, valid_part

In [None]:
train_set, valid_set = train_val_split(train_set)

In [None]:
train_set.drop(['price_range'], axis=1, inplace=True)
valid_set.drop(['price_range'], axis=1, inplace=True)

## EDA

In [None]:
train_set.describe()

In [None]:
sns.countplot(x='Type', data=train_set)

In [None]:
train_set.plot(
    kind='scatter',
    x='Longtitude',
    y='Lattitude',
    alpha=.3,
    figsize=(20, 10),
    c='Price', 
    cmap=plt.get_cmap('jet'),
    colorbar=True,
    sharex=False
)

In [None]:
# The log-price is written into `train_set` itself, so the column stays in the
# frame for every later cell that reads or copies `train_set`.
train_set.loc[:, 'Price_log'] = np.log(train_set['Price'])
corr = train_set.corr(numeric_only=True)

# Getting the Upper Triangle of the co-relation matrix
matrix = np.triu(corr)

# The upper triangle (diagonal included) is passed as the mask: cells whose
# mask value is non-zero are hidden by the heatmap.
sns.heatmap(
    corr,
    annot=True,
    mask=matrix
)

In [None]:
corr_sorted = corr['Price'].sort_values()
corr_sorted

In [None]:
corr_sorted = corr['Price_log'].sort_values()
corr_sorted

In [None]:
# Will do pair plot with most correlated values
sns.pairplot(
    train_set[corr_sorted.index[:3].tolist() + corr_sorted.index[-3:].tolist()],
    kind='reg',
    plot_kws={'line_kws':{'color':'red'}, 'scatter_kws': {'alpha': 0.1}}
)

## Data Processing

In [None]:
print(train_set.isnull().sum(axis=0).to_string())

In [None]:
train_set_processed = train_set.copy()
valid_set_processed = valid_set.copy()
test_set_processed = test_set.copy()

We decided not to handle outliers explicitly, but to build models that are robust to them, since the outliers are valid data points in our dataset.

### Handling NaNs

#### Car

In [None]:
# How is the distribution of price when car is not set
sns.histplot(
    train_set_processed[train_set_processed.Car.isnull()]['Price']
)

In [None]:
def handle_null_car(train_df, valid_df, test_df, year_not_car: int = 1940):
    """
    Fill the missing `Car` values of the three frames (the frames are modified in place).

    - A house built before `year_not_car` whose `Car` is null is assumed to have
      effectively no car spot, so it gets 0.
    - Every other null gets the median `Car` of the train frame, computed before
      any value is filled so that validation and test never influence it.

    Values that are already present are never overwritten.

    Returns the three frames in the order they were passed.
    """
    # The median comes from the train frame only
    train_set_median = train_df['Car'].median()

    for df in (train_df, valid_df, test_df):
        # Only the nulls of very aged houses are set to 0; a known `Car` value
        # of an old house must be kept as it is.
        aged_house_without_car = df['Car'].isnull() & (df['YearBuilt'] < year_not_car)
        df.loc[aged_house_without_car, 'Car'] = 0

        # Assign the filled column back instead of `fillna(inplace=True)` on the
        # selected column, which does not update the frame under Copy-on-Write.
        df['Car'] = df['Car'].fillna(train_set_median)

    return train_df, valid_df, test_df

In [None]:
train_set_processed, valid_set_processed, test_set_processed = handle_null_car(train_set_processed, valid_set_processed, test_set_processed)

####  BuildingArea

In [None]:
def get_df_valid_size(df: pd.DataFrame, column: str):
    """Return the rows of `df` whose `column` is usable: not null and strictly positive."""
    values = df[column]
    is_usable = values.notnull() & values.gt(0)
    return df.loc[is_usable]

In [None]:
def get_df_invalid_size(df: pd.DataFrame, column: str):
    """Return the rows of `df` whose `column` is unusable: null, zero or negative."""
    values = df[column]
    is_unusable = values.isnull() | values.le(0)
    return df.loc[is_unusable]

In [None]:
# We can see that half of the dataset is not useful
building_area_train_base = get_df_valid_size(train_set_processed, 'BuildingArea')
building_area_train_target = get_df_invalid_size(train_set_processed, 'BuildingArea')

print('Valid building area shape: ', building_area_train_base.shape)
print('Invalid building area shape: ', building_area_train_target.shape)

We can try to find an easy relationship to the building area.

In [None]:
building_area_train_base.head(3)

In [None]:
from typing import List

def create_logs(df: pd.DataFrame, columns: List[str] = ['BuildingArea', 'Rooms', 'Bedroom2', 'Bathroom', 'Car', 'Landsize', 'YearBuilt']):
    """
    Add a `<name>_log` column holding log(value + 1) for every name in `columns`.

    `df` is modified in place and also returned. After the new columns are
    added, every +inf / -inf anywhere in the frame is replaced by 0.
    """
    for source_name in columns:
        shifted = df[source_name] + 1
        df[f'{source_name}_log'] = np.log(shifted)

    # Applies to the whole frame, not only to the freshly created columns
    df.replace(to_replace=[np.inf, -np.inf], value=0, inplace=True)

    return df

In [None]:
# And we find that there are log-log relationships of nearly 60%
building_area_train_base = create_logs(building_area_train_base)

building_area_train_base[
    ['BuildingArea', 'BuildingArea_log', 
    'Rooms', 'Bathroom', 'Bedroom2', 'Car',
    'Rooms_log', 'Bedroom2_log', 'Bathroom_log', 'Car_log']
].corr()\
    .sort_values('BuildingArea_log')

So for this model, seems the variables that have great influence in the building area log are:
- Car
- Bathroom Logarithm
- Bedroom2 Logarithm
- Rooms Logarithm

In [None]:
building_area_features = ['Car', 'Bathroom_log', 'Bedroom2_log', 'Rooms_log']

In [None]:
Y_building_area_train_base = building_area_train_base.BuildingArea_log
X_building_area_train_base = building_area_train_base[building_area_features]

In [None]:
building_area_scaler = StandardScaler()
X_building_area_train_base = building_area_scaler.fit_transform(X_building_area_train_base)

In [None]:
# Huber Regressor is robust against outliers
ba_linear_model = HuberRegressor().fit(X_building_area_train_base, Y_building_area_train_base)

In [None]:
# Can see in the train set which is r2 scoring
from sklearn.metrics import r2_score
round(r2_score(Y_building_area_train_base, ba_linear_model.predict(X_building_area_train_base)), 2)

And now we will use this model to impute the missing values in the building area.

In [None]:
from copy import deepcopy

def input_nan_logarithms(
    model, 
    scaler, 
    df: pd.DataFrame, 
    column_target: str, 
    column_features,
    log: bool = True,
    logging: bool = True,
):
    
    """
    Impute the invalid values (null or <= 0) of a column with a fitted regression model.
    Optionally prints the RMSE and the R2 on the rows that already have a valid value.
    - model: the model trained
    - scaler: StandardScaler fitted
    - df: the dataframe for which we want to replace the nans
    - column_target: for which column do we want to replace the NaNs
    - column_features: list of features used from the df
    - log: if the model predicts the `<column_target>_log` column built by
      `create_logs`, i.e. log(value + 1); predictions are then mapped back
      with exp(prediction) - 1
    - logging: if we want to print out scores

    Returns a copy of `df` with the invalid values replaced; `df` itself is not modified.
    """
    df = df.copy()
    column_target_model = column_target + '_log' if log else column_target

    def to_original_scale(values):
        # `create_logs` stores log(value + 1), so the inverse is exp(x) - 1
        return np.expm1(values) if log else values

    # Get the valid and invalid dataframes, with their log features
    base_df = create_logs(get_df_valid_size(df, column_target))
    target_df = create_logs(get_df_invalid_size(df, column_target))

    # Score on the rows that already have a valid value. Skipped when there are
    # none, because the scaler cannot transform an empty array.
    if logging and len(base_df) > 0:
        prediction_base = model.predict(scaler.transform(base_df[column_features]))
        comparison = [
            to_original_scale(base_df[column_target_model]),
            to_original_scale(prediction_base),
        ]

        print('-'*20)
        # Square root taken explicitly: the `squared` argument of
        # mean_squared_error is not available in recent scikit-learn releases.
        print('RMSE: ', np.sqrt(mean_squared_error(*comparison)))
        print('R2 score: ', r2_score(*comparison))
        print('-'*20)

    # Fill the invalid rows, if any, with the prediction in the original scale
    if len(target_df) > 0:
        prediction_target = model.predict(scaler.transform(target_df[column_features]))
        df.loc[
            target_df.index.values,
            column_target
        ] = to_original_scale(prediction_target)

    return df

In [None]:
train_set_processed = input_nan_logarithms(ba_linear_model, building_area_scaler, train_set_processed, 'BuildingArea', building_area_features, logging=False)
valid_set_processed = input_nan_logarithms(ba_linear_model, building_area_scaler, valid_set_processed, 'BuildingArea', building_area_features)
test_set_processed = input_nan_logarithms(ba_linear_model, building_area_scaler, test_set_processed, 'BuildingArea', building_area_features)

#### Landsize

This one does not have nulls, but it has land sizes of 0, which we can assume were entered when no value for the land size was available.

Same analysis as before, but could include now the Building area & building area logs

In [None]:
# We can see that half of the dataset is not useful
landsize_train_base = get_df_valid_size(train_set_processed, 'Landsize')
landsize_train_target = get_df_invalid_size(train_set_processed, 'Landsize')

print('Valid landsize shape: ', landsize_train_base.shape)
print('Invalid landsize shape: ', landsize_train_target.shape)

In [None]:
landsize_train_base = create_logs(landsize_train_base)

landsize_train_base[
    ['Landsize', 'Landsize_log', 'BuildingArea', 'BuildingArea_log', 
    'Rooms', 'Bathroom', 'Bedroom2', 'Car',
    'Rooms_log', 'Bedroom2_log', 'Bathroom_log', 'Car_log']
].corr()\
    .sort_values('Landsize_log')

Have tried training for both Landsize & Landsize log, and I get better R2 scoring for the Landsize.

In [None]:
landsize_features = ['Bathroom', 'BuildingArea', 'Car', 'Rooms', 'Bedroom2']

In [None]:
Y_landsize_train_base = landsize_train_base.Landsize
X_landsize_train_base = landsize_train_base[landsize_features]

In [None]:
landsize_scaler = StandardScaler()
X_landsize_train_base = landsize_scaler.fit_transform(X_landsize_train_base)

In [None]:
ls_linear_model = HuberRegressor().fit(X_landsize_train_base, Y_landsize_train_base)

In [None]:
round(r2_score(Y_landsize_train_base, ls_linear_model.predict(X_landsize_train_base)), 2)

Will try to fill the NaNs with the model.

In [None]:
_ = input_nan_logarithms(ls_linear_model, landsize_scaler, train_set_processed, 'Landsize', landsize_features, log=False, logging=False)
_ = input_nan_logarithms(ls_linear_model, landsize_scaler, valid_set_processed, 'Landsize', landsize_features, log=False)
_ = input_nan_logarithms(ls_linear_model, landsize_scaler, test_set_processed, 'Landsize', landsize_features, log=False)

In [None]:
# We can see a huge MSE, so we will go for another way to fill the nans
pd.DataFrame(train_set_processed\
    .groupby(['Regionname', 'Type'])\
    .median(numeric_only=True)['Landsize'])\
    .T

In [None]:
# First, we will handle this grouping
train_set_processed['Regionname'].value_counts()

In [None]:
other_region_name_cols = ['Eastern Victoria', 'Western Victoria', 'Northern Victoria']

# Collapse the three listed regions into a single 'Other' level, applying the
# same relabelling to every split.
for split_frame in (train_set_processed, valid_set_processed, test_set_processed):
    is_other_region = split_frame['Regionname'].isin(other_region_name_cols)
    split_frame.loc[is_other_region, 'Regionname'] = 'Other'

In [None]:
# And we can check again
train_set_processed['Regionname'].value_counts()

In [None]:
landsize_by_region_type = pd.DataFrame(train_set_processed\
    .groupby(['Regionname', 'Type'])\
    .median(numeric_only=True)['Landsize'])  # median to do not be influenced by outliers of the 0

landsize_by_region_type.T

In [None]:
# And this one will be used in the case we get a 0
landsize_by_region = pd.DataFrame(train_set_processed\
    .groupby(['Regionname'])\
    .median(numeric_only=True)['Landsize'])

landsize_by_region.T

Will use this as a hash map, of values that will be replaced for the NaN of the dataframes.

In [None]:
def handle_null_landsize(df: pd.DataFrame):
    """
    Replace the null or zero `Landsize` values of `df` (modified in place) with
    the medians stored in the notebook-level lookup tables.

    - First choice: the value of `landsize_by_region_type` for the row's
      (Regionname, Type) pair.
    - Fallback: the value of `landsize_by_region` for the row's Regionname, used
      when the first choice is not strictly positive or when the pair is absent
      from the lookup table.
    - A row for which neither table provides a value keeps its current
      `Landsize` instead of raising a KeyError.

    Returns `df`.
    """
    needs_fill = df['Landsize'].isnull() | (df['Landsize'] == 0)
    if not needs_fill.any():
        return df

    # The first (and only) column of each lookup table holds the median
    median_by_region_type = landsize_by_region_type.iloc[:, 0].to_dict()  # keys: (region, type)
    median_by_region = landsize_by_region.iloc[:, 0].to_dict()            # keys: region

    regions = df.loc[needs_fill, 'Regionname'].tolist()
    types = df.loc[needs_fill, 'Type'].tolist()

    by_pair = np.array(
        [median_by_region_type.get(pair, np.nan) for pair in zip(regions, types)],
        dtype=float
    )
    by_region = np.array(
        [median_by_region.get(region, np.nan) for region in regions],
        dtype=float
    )

    # A NaN never compares greater than 0, so a missing pair also falls back to the region
    replacement = np.where(by_pair > 0.0, by_pair, by_region)

    # Keep the current value when no replacement is available at all
    current = df.loc[needs_fill, 'Landsize'].to_numpy(dtype=float)
    df.loc[needs_fill, 'Landsize'] = np.where(np.isnan(replacement), current, replacement)

    return df

In [None]:
train_set_processed = handle_null_landsize(train_set_processed)
valid_set_processed = handle_null_landsize(valid_set_processed)
test_set_processed = handle_null_landsize(test_set_processed)

#### CouncilArea

We can try to use **k-nearest neighbors**, as the council area depends heavily on the longitude and latitude of the property (the `Longtitude` and `Lattitude` columns).

In [None]:
train_set_processed.head(3)

In [None]:
test_set_processed.isnull().sum(axis=0)

In [None]:
train_set_base = train_set_processed.dropna(subset=['CouncilArea'])
train_set_target = train_set_processed[train_set_processed['CouncilArea'].isna()]

# Explicative variables
X_train_base_set_council = train_set_base[['Lattitude', 'Longtitude']]
X_train_target_set_council = train_set_target[['Lattitude', 'Longtitude']]
X_train_target_set_council_index = X_train_target_set_council.index

# Target variables
Y_train_base_set_council = train_set_base['CouncilArea'].values.ravel()

In [None]:
# Scale the inputs
scaler = StandardScaler()

X_train_base_set_council = scaler.fit_transform(X_train_base_set_council)
X_train_target_set_council = scaler.transform(X_train_target_set_council)

In [None]:
# Train the model
n_council_area = train_set_processed['CouncilArea'].nunique()
knn_council_area = KNeighborsClassifier(n_council_area)

knn_council_area.fit(X_train_base_set_council, Y_train_base_set_council)

In [None]:
valid_set_base = valid_set_processed.dropna(subset=['CouncilArea'])
X_valid_base_set_council = valid_set_base[['Lattitude', 'Longtitude']]
Y_valid_base_set_council = valid_set_base['CouncilArea'].values.ravel()

# The classifier was fitted on standardized coordinates, so the validation
# coordinates must go through the same fitted scaler before predicting.
X_valid_base_set_council = scaler.transform(X_valid_base_set_council)

accuracy_score(Y_valid_base_set_council, knn_council_area.predict(X_valid_base_set_council))

We can see that this is not giving us very good results.

In [None]:
train_set_processed['CouncilArea'].value_counts()

In [None]:
# Assign the filled column back: `fillna(inplace=True)` on a selected column
# does not update the frame when pandas Copy-on-Write is active.
train_set_processed['CouncilArea'] = train_set_processed['CouncilArea'].fillna('Unknown')
valid_set_processed['CouncilArea'] = valid_set_processed['CouncilArea'].fillna('Unknown')
test_set_processed['CouncilArea'] = test_set_processed['CouncilArea'].fillna('Unknown')

### YearBuilt

In [None]:
# We can see that half of the dataset is not useful
yearbuilt_train_base = get_df_valid_size(train_set_processed, 'YearBuilt')
yearbuilt_train_target = get_df_invalid_size(train_set_processed, 'YearBuilt')

print('Valid year built shape: ', yearbuilt_train_base.shape)
print('Invalid year built shape: ', yearbuilt_train_target.shape)

In [None]:
yearbuilt_train_base = create_logs(yearbuilt_train_base)

yearbuilt_train_base[
    ['YearBuilt', 'YearBuilt_log', 'Landsize', 'Landsize_log', 'BuildingArea', 'BuildingArea_log', 
    'Rooms', 'Bathroom', 'Bedroom2', 'Car',
    'Rooms_log', 'Bedroom2_log', 'Bathroom_log', 'Car_log']
].corr()\
    .sort_values('YearBuilt_log')

So we could try out a model with:
- Car Logarithm
- Car
- Bathroom Logarithm
- Bathroom

In [None]:
yearbuilt_features = ['Car_log', 'Car', 'Bathroom_log', 'Bathroom']
Y_yearbuilt_train_base = yearbuilt_train_base.YearBuilt_log
X_yearbuilt_train_base = yearbuilt_train_base[yearbuilt_features]

In [None]:
yearbuilt_scaler = StandardScaler()
X_yearbuilt_train_base = yearbuilt_scaler.fit_transform(X_yearbuilt_train_base)
yb_linear_model = HuberRegressor().fit(X_yearbuilt_train_base, Y_yearbuilt_train_base)

In [None]:
# And we can see whether the model has a good R2 score on the train set
round(r2_score(Y_yearbuilt_train_base, yb_linear_model.predict(X_yearbuilt_train_base)), 2)

In [None]:
train_set_processed = input_nan_logarithms(yb_linear_model, yearbuilt_scaler, train_set_processed, 'YearBuilt', yearbuilt_features, logging=False)
valid_set_processed = input_nan_logarithms(yb_linear_model, yearbuilt_scaler, valid_set_processed, 'YearBuilt', yearbuilt_features)
test_set_processed = input_nan_logarithms(yb_linear_model, yearbuilt_scaler, test_set_processed, 'YearBuilt', yearbuilt_features)

With this, we have handled all the Nulls that we had for this df.

In [None]:
print(train_set_processed.isnull().sum(axis=0).to_string())

## Handling Categorical Data

There are some categorical variables that have too many distinct values.

So instead of using all of them, we are going to group them depending if they tend to have high/medium/low price values.

In [None]:
def plot_distribution_grouped_field(
    df,
    field: str, 
    expensive_divisor: float = 1.5e6, 
    premium_divisor: float = 2e6,
    font_size: int = 6
):
    """
    Plot the mean `Price` of every value of `field`, sorted from cheapest to
    most expensive, together with two horizontal reference lines.

    - df: frame holding `field` and a `Price` column
    - field: categorical column to group by
    - expensive_divisor: height of the green dashed line
    - premium_divisor: height of the orange dotted line
    - font_size: size of the x tick labels

    Returns (grouped_pricing, grouped_names): the sorted Series of mean prices
    indexed by group, and the list of group names in that same order.
    """
    # Select `Price` before aggregating: averaging the whole frame fails on
    # text columns with pandas >= 2.0 and wastes work on every other column.
    grouped_pricing = df\
        .groupby(field)['Price']\
        .mean()\
        .sort_values()
    
    grouped_names = grouped_pricing.index.values.tolist()

    fig, ax = plt.subplots(figsize=(20, 10))

    sns.scatterplot(
        x=grouped_names,
        y=grouped_pricing,
        ax=ax
    )

    ax.axhline(
        y=expensive_divisor, 
        color='green',
        linestyle='--',
        label='Expensive divisor'
    )

    ax.axhline(
        y=premium_divisor, 
        color='orange',
        linestyle='dotted',
        label='Premium divisor'
    )

    # One tick per group: keep the labels small and vertical
    plt.xticks(
        size=font_size,
        rotation=90
    );

    ax.legend()
    
    return grouped_pricing, grouped_names

In [None]:
def classify_category(
    group_pricing, 
    df: pd.DataFrame,
    input_column_name: str,
    output_column_name: str,
    premium_threshold = 2e6,
    expensive_threshold = 1.5e6,
    
):
    """
    Turn a categorical column into a price class: 0 (normal), 1 (expensive) or 2 (premium).

    - group_pricing: Series of prices indexed by the values of `input_column_name`
    - df: frame to classify; `output_column_name` is written in place
    - input_column_name: categorical column of `df`
    - output_column_name: column receiving the class, stored as float
    - premium_threshold: groups priced at or above it get class 2
    - expensive_threshold: groups priced at or above it, and below
      `premium_threshold`, get class 1; groups below it get class 0

    Values of `input_column_name` that are absent from `group_pricing` get class 0.

    Returns `df`.
    """
    # Class of every known group. The three rules are applied in the same order
    # as the thresholds are described above; a later rule wins on overlap.
    group_class = pd.Series(np.nan, index=group_pricing.index, dtype=float)
    group_class.loc[group_pricing >= premium_threshold] = 2
    group_class.loc[
        (group_pricing >= expensive_threshold) &
        (group_pricing < premium_threshold)
    ] = 1
    group_class.loc[group_pricing < expensive_threshold] = 0

    # Unknown groups map to NaN and are considered normal (0). The result is
    # assigned as a whole column instead of `fillna(inplace=True)` on a
    # selection, which does not update the frame under Copy-on-Write.
    df[output_column_name] = df[input_column_name]\
        .map(group_class)\
        .fillna(0)\
        .astype(float)
    
    return df

### SellerG

In [None]:
train_set_processed.head(3)

In [None]:
seller_pricing, _ = plot_distribution_grouped_field(train_set_processed, field='SellerG')

In [None]:
train_set_processed = classify_category(
    df=train_set_processed,
    group_pricing=seller_pricing,
    input_column_name='SellerG',
    output_column_name='seller_class',
    premium_threshold=2e6,
    expensive_threshold=1.5e6,
)

valid_set_processed = classify_category(
    df=valid_set_processed,
    group_pricing=seller_pricing,
    input_column_name='SellerG',
    output_column_name='seller_class',
    premium_threshold=2e6,
    expensive_threshold=1.5e6,
)

test_set_processed = classify_category(
    df=test_set_processed,
    group_pricing=seller_pricing,
    input_column_name='SellerG',
    output_column_name='seller_class',
    premium_threshold=2e6,
    expensive_threshold=1.5e6,
)

Moreover, instead of bucketizing, we can create a value ranging from 0 to 1 that tells how high the pricing of each group (seller, suburb, ...) is relative to the most expensive group.

TODO: Do not standard scale this value

In [None]:
def get_scoring_mean_price(df: pd.DataFrame, column_grouped: str, price_column: str = 'Price'):
    """
    Mean price per group, plus that mean as a proportion of the highest group mean.

    - df: frame holding `column_grouped` and `price_column`
    - column_grouped: categorical column to group by
    - price_column: numeric column to average

    Returns a frame indexed by group and sorted by ascending mean price, with
    two columns: `price_column` (the mean) and `price_mean_proportion`
    (mean / highest mean, so the most expensive group gets 1).
    """
    # Select the price before aggregating: averaging the whole frame fails on
    # text columns with pandas >= 2.0.
    df_means = df\
        .groupby(column_grouped)[price_column]\
        .mean()\
        .sort_values()\
        .to_frame()

    max_df_price_grouped = df_means[price_column].max()
    df_means['price_mean_proportion'] = df_means[price_column] / max_df_price_grouped
    
    return df_means

In [None]:
def get_scoring_median_price(df: pd.DataFrame, column_grouped: str, price_column: str = 'Price'):
    """
    Median price per group, plus that median as a proportion of the highest group median.

    - df: frame holding `column_grouped` and `price_column`
    - column_grouped: categorical column to group by
    - price_column: numeric column whose median is taken

    Returns a frame indexed by group and sorted by ascending median price, with
    two columns: `price_column` (the median) and `price_median_proportion`
    (median / highest median, so the most expensive group gets 1).
    """
    # Select the price before aggregating: taking the median of the whole frame
    # fails on text columns with pandas >= 2.0.
    df_medians = df\
        .groupby(column_grouped)[price_column]\
        .median()\
        .sort_values()\
        .to_frame()

    max_df_price_grouped = df_medians[price_column].max()
    df_medians['price_median_proportion'] = df_medians[price_column] / max_df_price_grouped
    
    return df_medians

In [None]:
seller_mean = get_scoring_mean_price(train_set_processed, 'SellerG')
seller_median = get_scoring_median_price(train_set_processed, 'SellerG')

In [None]:
seller_mean.price_mean_proportion.hist(bins=50)

In [None]:
seller_median.price_median_proportion.hist(bins=50)

In [None]:
def set_price_proportions(
    df: pd.DataFrame, 
    df_price_mean: pd.DataFrame,
    target_column_df: str,
    suffix: str = '_price_mean_prop'
):
    """
    Add to `df` (in place) the price proportion of the group each row belongs to.

    - df: frame to extend
    - df_price_mean: lookup table indexed by group, whose second column holds
      the proportion (the layout returned by `get_scoring_mean_price` and
      `get_scoring_median_price`)
    - target_column_df: column of `df` holding the group of each row
    - suffix: appended to `target_column_df` to name the new column; the
      resulting name is lower-cased

    Rows whose group is not in the lookup table, or whose stored proportion is
    missing, get 0.

    Returns `df`.
    """
    new_column = str(target_column_df + suffix).lower()

    # The proportion is read by position: second column of the lookup table
    proportion_by_group = df_price_mean.iloc[:, 1]

    # One vectorized lookup instead of a row-by-row loop guarded by a bare
    # `except`; groups that were never seen map to NaN and become 0.
    df[new_column] = df[target_column_df]\
        .map(proportion_by_group)\
        .fillna(.0)\
        .astype(float)
    
    return df

In [None]:
# And we place those values into the dataframe
train_set_processed = set_price_proportions(train_set_processed, seller_mean, 'SellerG')
valid_set_processed = set_price_proportions(valid_set_processed, seller_mean, 'SellerG')
test_set_processed = set_price_proportions(test_set_processed, seller_mean, 'SellerG')

In [None]:
train_set_processed = set_price_proportions(train_set_processed, seller_median, 'SellerG', '_price_median_prop')
valid_set_processed = set_price_proportions(valid_set_processed, seller_median, 'SellerG', '_price_median_prop')
test_set_processed = set_price_proportions(test_set_processed, seller_median, 'SellerG', '_price_median_prop')

### Suburb

In [None]:
suburb_pricing, _ = plot_distribution_grouped_field(
    train_set_processed,
    field='Suburb', 
    expensive_divisor=1.5e6, 
    premium_divisor=1.9e6
)

In [None]:
train_set_processed = classify_category(
    df=train_set_processed,
    group_pricing=suburb_pricing,
    input_column_name='Suburb',
    output_column_name='suburb_class',
    premium_threshold=1.9e6,
    expensive_threshold=1.5e6,
)

valid_set_processed = classify_category(
    df=valid_set_processed,
    group_pricing=suburb_pricing,
    input_column_name='Suburb',
    output_column_name='suburb_class',
    premium_threshold=1.9e6,
    expensive_threshold=1.5e6,
)

test_set_processed = classify_category(
    df=test_set_processed,
    group_pricing=suburb_pricing,
    input_column_name='Suburb',
    output_column_name='suburb_class',
    premium_threshold=1.9e6,
    expensive_threshold=1.5e6,
)

In [None]:
suburb_mean = get_scoring_mean_price(train_set_processed, 'Suburb')
suburb_median = get_scoring_median_price(train_set_processed, 'Suburb')

In [None]:
suburb_mean.price_mean_proportion.hist(bins=50)

In [None]:
suburb_median.price_median_proportion.hist(bins=50)

In [None]:
train_set_processed = set_price_proportions(train_set_processed, suburb_mean, 'Suburb')
valid_set_processed = set_price_proportions(valid_set_processed, suburb_mean, 'Suburb')
test_set_processed = set_price_proportions(test_set_processed, suburb_mean, 'Suburb')

In [None]:
train_set_processed = set_price_proportions(train_set_processed, suburb_median, 'Suburb', '_price_median_prop')
valid_set_processed = set_price_proportions(valid_set_processed, suburb_median, 'Suburb', '_price_median_prop')
test_set_processed = set_price_proportions(test_set_processed, suburb_median, 'Suburb', '_price_median_prop')

In [None]:
# And remove the Suburb column once it is no longer used
# train_set_processed.drop(['Suburb'], axis=1, inplace=True)
# valid_set_processed.drop(['Suburb'], axis=1, inplace=True)
# test_set_processed.drop(['Suburb'], axis=1, inplace=True)

### Council Area

In [None]:
council_pricing, _ = plot_distribution_grouped_field(
    train_set_processed, 
    'CouncilArea', 
    expensive_divisor=1.1e6, 
    premium_divisor=1.4e6
)

In [None]:
# The premium threshold has to be the higher of the two: with premium below
# expensive, the 'expensive' class (1) can never be assigned.
# NOTE(review): the plot of this section draws its divisors at 1.1e6 / 1.4e6 -
# confirm whether the classification should use those values instead.
train_set_processed = classify_category(
    df=train_set_processed,
    group_pricing=council_pricing,
    input_column_name='CouncilArea',
    output_column_name='council_class',
    premium_threshold=1.3e6,
    expensive_threshold=1e6,
)

valid_set_processed = classify_category(
    df=valid_set_processed,
    group_pricing=council_pricing,
    input_column_name='CouncilArea',
    output_column_name='council_class',
    premium_threshold=1.3e6,
    expensive_threshold=1e6,
)

test_set_processed = classify_category(
    df=test_set_processed,
    group_pricing=council_pricing,
    input_column_name='CouncilArea',
    output_column_name='council_class',
    premium_threshold=1.3e6,
    expensive_threshold=1e6,
)

In [None]:
council_mean = get_scoring_mean_price(train_set_processed, 'CouncilArea')
council_median = get_scoring_median_price(train_set_processed, 'CouncilArea')
council_mean.price_mean_proportion.hist(bins=15)

In [None]:
train_set_processed = set_price_proportions(train_set_processed, council_mean, 'CouncilArea')
valid_set_processed = set_price_proportions(valid_set_processed, council_mean, 'CouncilArea')
test_set_processed = set_price_proportions(test_set_processed, council_mean, 'CouncilArea')

train_set_processed = set_price_proportions(train_set_processed, council_median, 'CouncilArea', '_price_median_prop')
valid_set_processed = set_price_proportions(valid_set_processed, council_median, 'CouncilArea', '_price_median_prop')
test_set_processed = set_price_proportions(test_set_processed, council_median, 'CouncilArea', '_price_median_prop')

In [None]:
# train_set_processed.drop(['CouncilArea'], axis=1, inplace=True)
# valid_set_processed.drop(['CouncilArea'], axis=1, inplace=True)
# test_set_processed.drop(['CouncilArea'], axis=1, inplace=True)

### Address

There are a lot of different addresses.
We could check whether some of the houses have been re-sold.

In [None]:
def add_feature_resold(
    data_duplicated: pd.DataFrame, 
    data_source: pd.DataFrame
):
    """
    Write a `Resold` flag into `data_source` for every row of `data_duplicated`.

    The rows of `data_duplicated` are walked in order: a row whose `Address`
    equals the address currently being tracked gets 1, any other row gets 0
    and its address becomes the tracked one. The flag is written in
    `data_source` at the index label of the row.

    Returns `data_source`.
    """
    tracked_address = ''

    for row_label, address in data_duplicated['Address'].items():
        # Nothing is tracked yet (or the tracked address is empty): never a repeat
        is_repeat = bool(tracked_address) and tracked_address == address

        # Set it directly this new feature on extended dataframe
        data_source.loc[row_label, 'Resold'] = 1 if is_repeat else 0

        if not is_repeat:
            tracked_address = address

    return data_source

def check_resold_houses(data: pd.DataFrame):
    # Will assume that the ones with same values in address, room, bedroom and bathroom is the same house being sold
    multiple_sold = data[
        data.duplicated(
            subset=['Address', 'Rooms', 'Bedroom2', 'Bathroom'], 
            keep=False
        )
    ].sort_values(['Address', 'Date'])
    
    # And now we add the feature of being sold
    if len(multiple_sold) > 0:
        data = add_feature_resold(multiple_sold, data)
        data['Resold'] = data['Resold'].fillna(0).astype('int8')
    else:
        data['Resold'] = 0
    
    # An example with the ones sold more than once
    display(data.loc[multiple_sold.index].head(2))
    
    address_feature = ['Resold']
    
    return data, address_feature

In [None]:
# Flag repeat sales independently within each split. The second return value
# (the list of added feature names) is discarded.
train_set_processed, _ = check_resold_houses(train_set_processed)
valid_set_processed, _ = check_resold_houses(valid_set_processed)
test_set_processed, _ = check_resold_houses(test_set_processed)

In [None]:
# The raw address is not used after the Resold flag has been derived from it.
for _split in (train_set_processed, valid_set_processed, test_set_processed):
    _split.drop(columns=['Address'], inplace=True)

### Type, Regionname & Method

Simply creating one-hot encoding for them.

In [None]:
# Fix the category levels from the training split so that all three splits get
# exactly the same dummy columns and drop the same baseline level. Encoding each
# split on its own yields different columns whenever a level is missing in one of them.
# Levels unseen in training become all-zero rows.
_method_levels = sorted(train_set_processed['Method'].dropna().unique())

train_set_processed, valid_set_processed, test_set_processed = (
    pd.get_dummies(
        _split.assign(Method=pd.Categorical(_split['Method'], categories=_method_levels)),
        columns=['Method'],
        drop_first=True,
    )
    for _split in (train_set_processed, valid_set_processed, test_set_processed)
)

In [None]:
# Fix the category levels from the training split so that all three splits get
# exactly the same dummy columns and drop the same baseline level. Encoding each
# split on its own yields different columns whenever a level is missing in one of them.
# Levels unseen in training become all-zero rows.
_type_levels = sorted(train_set_processed['Type'].dropna().unique())

train_set_processed, valid_set_processed, test_set_processed = (
    pd.get_dummies(
        _split.assign(Type=pd.Categorical(_split['Type'], categories=_type_levels)),
        columns=['Type'],
        drop_first=True,
    )
    for _split in (train_set_processed, valid_set_processed, test_set_processed)
)

In [None]:
# Fix the category levels from the training split so that all three splits get
# exactly the same dummy columns and drop the same baseline level. Encoding each
# split on its own yields different columns whenever a level is missing in one of them.
# Levels unseen in training become all-zero rows.
_region_levels = sorted(train_set_processed['Regionname'].dropna().unique())

train_set_processed, valid_set_processed, test_set_processed = (
    pd.get_dummies(
        _split.assign(Regionname=pd.Categorical(_split['Regionname'], categories=_region_levels)),
        columns=['Regionname'],
        drop_first=True,
    )
    for _split in (train_set_processed, valid_set_processed, test_set_processed)
)

### Postcode & Date

Postcode is a categorical variable: its numbers have no meaningful order. So we will remove it.

For the date, we will extract the year and quarter in which the property was sold, and the difference between the year sold and the year built.

In [None]:
train_set_processed.head(3)

In [None]:
# Postcode is removed from every split
for _split in (train_set_processed, valid_set_processed, test_set_processed):
    _split.drop(columns=['Postcode'], inplace=True)

In [None]:
# Parse the sale date and derive the year and quarter of the sale for each split.
# NOTE(review): no explicit format/dayfirst is passed to to_datetime — confirm the
# raw date layout is parsed as intended.
for _split in (train_set_processed, valid_set_processed, test_set_processed):
    _split['Date'] = pd.to_datetime(_split['Date'])
    _split['year_sold'] = _split['Date'].dt.year
    _split['quarter_sold'] = _split['Date'].dt.quarter

In [None]:
# Age of the building at the time of sale (year sold minus year built)
for _split in (train_set_processed, valid_set_processed, test_set_processed):
    _split['years_to_sell'] = _split['year_sold'] - _split['YearBuilt']

In [None]:
# The raw Date column is removed once year_sold / quarter_sold have been derived
for _split in (train_set_processed, valid_set_processed, test_set_processed):
    _split.drop(columns=['Date'], inplace=True)

# Feature Engineering

Apart from the features we already created, we are going to add some more.

In [None]:
# Work on copies so the *_processed frames stay as they are
train_set_extended, valid_set_extended, test_set_extended = (
    _split.copy()
    for _split in (train_set_processed, valid_set_processed, test_set_processed)
)

### Ratios

In [None]:
train_set_extended

In [None]:
# Adding ratios
# Each ratio is (numerator + 1) / (denominator + 1); the +1 on both sides keeps the
# ratio finite when a count is zero. The parentheses are required: without them
# `a + 1 / b + 1` evaluates as `a + (1 / b) + 1`, which is not a ratio at all.
_ratio_definitions = {
    'bed_bath_ratio': ('Bedroom2', 'Bathroom'),
    'car_bed_ratio': ('Car', 'Bedroom2'),
    'bed_room_ratio': ('Bedroom2', 'Rooms'),
    'bath_room_ratio': ('Bathroom', 'Rooms'),
    'room_building_area_ratio': ('Rooms', 'BuildingArea'),
    'bed_building_area_ratio': ('Bedroom2', 'BuildingArea'),
    'bath_building_area_ratio': ('Bathroom', 'BuildingArea'),
}

for _split in (train_set_extended, valid_set_extended, test_set_extended):
    for _ratio_name, (_numerator, _denominator) in _ratio_definitions.items():
        _split[_ratio_name] = (_split[_numerator] + 1) / (_split[_denominator] + 1)

### Logs & Sqrt

In [None]:
# Apply create_logs (defined elsewhere in the notebook) to every split
train_set_extended, valid_set_extended, test_set_extended = (
    create_logs(_split)
    for _split in (train_set_extended, valid_set_extended, test_set_extended)
)

In [None]:
# Log target: log(Price + 1). Only the train and validation splits get it here.
for _split in (train_set_extended, valid_set_extended):
    _split['Price_log'] = np.log(_split['Price'] + 1)

In [None]:
# Log-price scoring tables per suburb, always computed from the training split.
suburb_means_log = get_scoring_mean_price(train_set_extended, 'Suburb', 'Price_log')
suburb_median_log = get_scoring_median_price(train_set_extended, 'Suburb', 'Price_log')

train_set_extended = set_price_proportions(train_set_extended, suburb_means_log, 'Suburb', '_price_mean_log_prop')
train_set_extended = set_price_proportions(train_set_extended, suburb_median_log, 'Suburb', '_price_median_log_prop')
valid_set_extended = set_price_proportions(valid_set_extended, suburb_means_log, 'Suburb', '_price_mean_log_prop')
valid_set_extended = set_price_proportions(valid_set_extended, suburb_median_log, 'Suburb', '_price_median_log_prop')
test_set_extended = set_price_proportions(test_set_extended, suburb_means_log, 'Suburb', '_price_mean_log_prop')
test_set_extended = set_price_proportions(test_set_extended, suburb_median_log, 'Suburb', '_price_median_log_prop')

# Same per council area.
council_area_means_log = get_scoring_mean_price(train_set_extended, 'CouncilArea', 'Price_log')
council_area_median_log = get_scoring_median_price(train_set_extended, 'CouncilArea', 'Price_log')

train_set_extended = set_price_proportions(train_set_extended, council_area_means_log, 'CouncilArea', '_price_mean_log_prop')
train_set_extended = set_price_proportions(train_set_extended, council_area_median_log, 'CouncilArea', '_price_median_log_prop')
valid_set_extended = set_price_proportions(valid_set_extended, council_area_means_log, 'CouncilArea', '_price_mean_log_prop')
valid_set_extended = set_price_proportions(valid_set_extended, council_area_median_log, 'CouncilArea', '_price_median_log_prop')
test_set_extended = set_price_proportions(test_set_extended, council_area_means_log, 'CouncilArea', '_price_mean_log_prop')
test_set_extended = set_price_proportions(test_set_extended, council_area_median_log, 'CouncilArea', '_price_median_log_prop')

# Same per seller. The tables must be built from train_set_extended like the two
# groups above: Price_log is added to the extended frame, not to train_set_processed.
seller_means_log = get_scoring_mean_price(train_set_extended, 'SellerG', 'Price_log')
seller_median_log = get_scoring_median_price(train_set_extended, 'SellerG', 'Price_log')

train_set_extended = set_price_proportions(train_set_extended, seller_means_log, 'SellerG', '_price_mean_log_prop')
train_set_extended = set_price_proportions(train_set_extended, seller_median_log, 'SellerG', '_price_median_log_prop')
valid_set_extended = set_price_proportions(valid_set_extended, seller_means_log, 'SellerG', '_price_mean_log_prop')
valid_set_extended = set_price_proportions(valid_set_extended, seller_median_log, 'SellerG', '_price_median_log_prop')
test_set_extended = set_price_proportions(test_set_extended, seller_means_log, 'SellerG', '_price_mean_log_prop')
test_set_extended = set_price_proportions(test_set_extended, seller_median_log, 'SellerG', '_price_median_log_prop')

In [None]:
# NOTE(review): the previous cell already makes this exact call for the test split
# (same table, column and suffix) — confirm set_price_proportions is safe to apply
# twice, or drop this cell.
test_set_extended = set_price_proportions(test_set_extended, suburb_means_log, 'Suburb', '_price_mean_log_prop')


In [None]:
test_set_extended.columns.values

In [None]:
# Drop the raw SellerG, Suburb and CouncilArea columns from every split
to_drop = ['SellerG', 'Suburb', 'CouncilArea']

for _split in (train_set_extended, valid_set_extended, test_set_extended):
    _split.drop(columns=to_drop, inplace=True)

In [None]:
# Square-root transforms of the size/distance style columns, stored as <column>_sqr
for _split in (train_set_extended, valid_set_extended, test_set_extended):
    for _column in ('Distance', 'Landsize', 'BuildingArea', 'Propertycount'):
        _split[_column + '_sqr'] = np.sqrt(_split[_column])

### Property Count

We will also bucketize this feature.

In [None]:
# Create buckets of property count
# The bin edges are the 9-quantile edges of the training split and are reused
# unchanged for the validation and test splits.
_, pc_bins = pd.qcut(train_set_extended['Propertycount'], 9, labels=False, retbins=True)

for _split in (train_set_extended, valid_set_extended, test_set_extended):
    _split['prop_count_bkt'] = pd.cut(
        _split['Propertycount'],
        bins=pc_bins,
        labels=range(len(pc_bins) - 1),
        include_lowest=True
    )

# Values outside the training range fall in no bin and come back as NaN (one of 250
# test rows); they are sent to bucket 0. The result is assigned back explicitly:
# an in-place fillna on a column selection is not guaranteed to reach the frame.
test_set_extended['prop_count_bkt'] = test_set_extended['prop_count_bkt'].fillna(0)

In [None]:
# Store the bucket labels as small integers
for _split in (train_set_extended, valid_set_extended, test_set_extended):
    _split['prop_count_bkt'] = _split['prop_count_bkt'].astype('int8')

In [None]:
# Price relative to property count may be informative: areas with more properties might be cheaper.
# Scoring tables per property-count bucket, from the training split, on the default price column...
prop_count_bkt_mean = get_scoring_mean_price(train_set_extended, 'prop_count_bkt')
prop_count_bkt_median = get_scoring_median_price(train_set_extended, 'prop_count_bkt')

# ...and on the log price.
prop_count_bkt_mean_log = get_scoring_mean_price(train_set_extended, 'prop_count_bkt', 'Price_log')
prop_count_bkt_median_log = get_scoring_median_price(train_set_extended, 'prop_count_bkt', 'Price_log')

In [None]:
# One pass per scoring table; each pass adds the matching proportion column to all three splits.
for _pc_table, _pc_suffix in (
    (prop_count_bkt_mean, '_price_mean_prop'),
    (prop_count_bkt_median, '_price_median_prop'),
    (prop_count_bkt_mean_log, '_price_mean_log_prop'),
    (prop_count_bkt_median_log, '_price_median_log_prop'),
):
    train_set_extended = set_price_proportions(train_set_extended, _pc_table, 'prop_count_bkt', _pc_suffix)
    valid_set_extended = set_price_proportions(valid_set_extended, _pc_table, 'prop_count_bkt', _pc_suffix)
    test_set_extended = set_price_proportions(test_set_extended, _pc_table, 'prop_count_bkt', _pc_suffix)

### Longtitude & Lattitude

We will assume that there are location groups that are of relevant importance for the price.

In [None]:
# Coordinate-only views of each split (column 0 = Longtitude, column 1 = Lattitude),
# used as the input of the location clustering below.
train_long_lat = train_set_extended[['Longtitude', 'Lattitude']]
valid_long_lat = valid_set_extended[['Longtitude', 'Lattitude']]
test_long_lat = test_set_extended[['Longtitude', 'Lattitude']]

In [None]:
# Elbow analysis: fit KMeans on the training coordinates for k = 1..19 and
# record the inertia of each fit, keyed by k.
sse = {}
for k in range(1, 20):
    kmeans = KMeans(
        n_clusters=k, 
        max_iter=1000, 
        n_init='auto'
    ).fit(train_long_lat)
        
    # Sum of squared errors (KMeans inertia) for this k
    sse[k] = kmeans.inertia_
    
# Inertia against k; the "elbow" of the curve suggests a reasonable cluster count
plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of cluster")
plt.ylabel("SSE")
plt.show()

In [None]:
# Number of location clusters.
# NOTE(review): the count is the number of distinct YearBuilt values in `train_set`
# divided by `divisor`, which has no visible link to the coordinates being
# clustered — confirm this is intentional rather than read off the elbow plot.
divisor = 12
n_features = math.floor((train_set['YearBuilt'].nunique() / divisor))

# Despite its name, `n_features` is used as the number of clusters.
print('Number of features: ', n_features)
k_means = KMeans(
    n_clusters=n_features, 
    n_init='auto',
    max_iter=1000
)

# No random_state is passed, so the result presumably depends on the global NumPy seed.
k_means.fit(train_long_lat)

In [None]:
# The model fitted on the training coordinates labels all three splits:
# train uses the labels from the fit, valid and test are assigned by predict.
labels_train = k_means.labels_
labels_valid = k_means.predict(valid_long_lat)
labels_test = k_means.predict(test_long_lat)

In [None]:
# Cluster centres ("centroides") and training labels ("etiquetas") of the fitted model.
# `etiquetas` is not used in this cell.
centroides = k_means.cluster_centers_
etiquetas = k_means.labels_

unique_labels = np.unique(labels_train)
 
# One scatter series per cluster: x = first coordinate column, y = second
for label in unique_labels:
    plt.scatter(
        train_long_lat.iloc[labels_train == label, 0], 
        train_long_lat.iloc[labels_train == label, 1], 
        label = f"{label} cluster",
    )

# Overlay the cluster centres as small black dots
plt.scatter(
    centroides[:,0],
    centroides[:,1], 
    label='centroides', 
    color = 'k', 
    s=10
)

plt.legend(
    prop={'size': 8}
)
plt.show()

In [None]:
# Store the cluster label of each row as the `location_group` feature.
# The label arrays are assigned positionally, in the row order of each split.
train_set_extended['location_group'] = labels_train
valid_set_extended['location_group'] = labels_valid
test_set_extended['location_group'] = labels_test

In [None]:
# train_set_extended['location_group'] = train_set_extended['location_group'].astype('category')
# valid_set_extended['location_group'] = valid_set_extended['location_group'].astype('category')
# test_set_extended['location_group'] = test_set_extended['location_group'].astype('category')

Having those clusters identified is useful for inspecting the distribution of prices in each one, and it gives a model a reference value around which to adjust (+/-) its prediction.

We could also create another cluster for the properties whose price is at or above a very high percentile (the code below computes the 99.9th percentile; the assignment itself is currently disabled).

In [None]:
# Candidate price cutoff for an extra cluster (label 9): the 99.9th percentile
# of the training prices, shown rounded to two decimals.
percentile_price_train = train_set_extended.Price.quantile(.999)
round(percentile_price_train, 2)

In [None]:
# Box plot of the training prices
train_set_extended[['Price']].boxplot()

In [None]:
"""train_set_extended.loc[
    (train_set_extended.Price > percentile_price_train).values,
    'location_group'
] = 9"""

For now, for each of the known clusters we will impute the mean/median price in its location, and divide it by the max in each of the respective locations.

In [None]:
# Summary statistics of Price and Price_log per location group, transposed so
# that each group is one column
location_group_info = (
    train_set_extended
    .groupby('location_group')[['Price', 'Price_log']]
    .describe()
    .T
)

location_group_info

In [None]:
# Scoring tables per location group, from the training split, on the default price column...
location_means = get_scoring_mean_price(train_set_extended, 'location_group')
location_median = get_scoring_median_price(train_set_extended, 'location_group')

# ...and on the log price.
location_means_log = get_scoring_mean_price(train_set_extended, 'location_group', 'Price_log')
location_median_log = get_scoring_median_price(train_set_extended, 'location_group', 'Price_log')

In [None]:
# One pass per scoring table; each pass adds the matching proportion column to all three splits.
for _loc_table, _loc_suffix in (
    (location_means, '_price_mean_prop'),
    (location_median, '_price_median_prop'),
    (location_means_log, '_price_mean_log_prop'),
    (location_median_log, '_price_median_log_prop'),
):
    train_set_extended = set_price_proportions(train_set_extended, _loc_table, 'location_group', _loc_suffix)
    valid_set_extended = set_price_proportions(valid_set_extended, _loc_table, 'location_group', _loc_suffix)
    test_set_extended = set_price_proportions(test_set_extended, _loc_table, 'location_group', _loc_suffix)

### Sum Up of Variables

In [None]:
print(train_set_extended.dtypes.to_string())

In [None]:
# Total number of +/-inf cells in the training frame
train_set_extended.isin([-np.inf, np.inf]).sum().sum()

# Model Creation: Price Log

In [None]:
# Modelling works on copies of the extended frames
train_set_final, valid_set_final, test_set_final = (
    _split.copy()
    for _split in (train_set_extended, valid_set_extended, test_set_extended)
)

In [None]:
# Raise the row limit so the whole correlation table is shown
pd.set_option('display.max_rows', 500)

# Correlation of every column with Price and Price_log, ordered by the Price correlation
corr_final = train_set_final.corr().sort_values('Price')[['Price', 'Price_log']]
corr_final

In [None]:
# More like a normal distribution
train_set_final.Price_log.hist(bins=30)

So it could be easier to predict the logarithm of the price and convert the prediction back to a price afterwards.

In [None]:
# Keep the features whose absolute correlation with Price_log exceeds 0.1,
# leaving out the two target columns themselves
final_features = [
    feature
    for feature in corr_final[corr_final['Price_log'].abs() > .1].index.values
    if feature not in ('Price', 'Price_log')
]
final_features

In [None]:
# Will add categorical that were not considered before.
# Only names not already selected are appended, so a feature is never listed twice
# (duplicates would produce duplicated model columns) and re-running the cell is harmless.
final_features += [
    feature
    for feature in ['prop_count_bkt', 'location_group']
    if feature not in final_features
]

In [None]:
# Log-price targets as column vectors
Y_train_set_final = train_set_final['Price_log'].to_numpy().reshape(-1, 1)
Y_valid_set_final = valid_set_final['Price_log'].to_numpy().reshape(-1, 1)

# Standardise the selected features: the scaler is fitted on the training split
# only and then applied unchanged to the validation and test splits
final_scaler = StandardScaler()
X_train_set_final = final_scaler.fit_transform(train_set_final[final_features])
X_valid_set_final = final_scaler.transform(valid_set_final[final_features])
X_test_set_final = final_scaler.transform(test_set_final[final_features])

In [None]:
test_set_final

In [None]:
test_set_final

As requested in the problem, we use Linear Regression or K-Nearest Neighbors.

### Linear Regressor

Can try different linear models.

In [None]:
lr_final_model = LinearRegression().fit(X_train_set_final, Y_train_set_final)

# The model predicts log prices; predictions and targets are mapped back with exp before scoring.
lr_pred = np.exp(lr_final_model.predict(X_valid_set_final))
# scikit-learn metrics take (y_true, y_pred). RMSE and MAE are symmetric, but R2 is
# not, so the ground truth must come first. squared=False yields the root MSE.
print('Root Mean Squared Error', mean_squared_error(np.exp(Y_valid_set_final), lr_pred, squared=False))
print('Mean Absolute Error', mean_absolute_error(np.exp(Y_valid_set_final), lr_pred))
print('R2 Score', r2_score(np.exp(Y_valid_set_final), lr_pred))

In [None]:
lr_final_model = HuberRegressor().fit(X_train_set_final, Y_train_set_final)

# The model predicts log prices; predictions and targets are mapped back with exp before scoring.
lr_pred = np.exp(lr_final_model.predict(X_valid_set_final))
# scikit-learn metrics take (y_true, y_pred). RMSE and MAE are symmetric, but R2 is
# not, so the ground truth must come first. squared=False yields the root MSE.
print('Root Mean Squared Error', mean_squared_error(np.exp(Y_valid_set_final), lr_pred, squared=False))
print('Mean Absolute Error', mean_absolute_error(np.exp(Y_valid_set_final), lr_pred))
print('R2 Score', r2_score(np.exp(Y_valid_set_final), lr_pred))

In [None]:
lr_final_model = Lasso().fit(X_train_set_final, Y_train_set_final)

# The model predicts log prices; predictions and targets are mapped back with exp before scoring.
lr_pred = np.exp(lr_final_model.predict(X_valid_set_final))
# scikit-learn metrics take (y_true, y_pred). RMSE and MAE are symmetric, but R2 is
# not, so the ground truth must come first. squared=False yields the root MSE.
print('Root Mean Squared Error', mean_squared_error(np.exp(Y_valid_set_final), lr_pred, squared=False))
print('Mean Absolute Error', mean_absolute_error(np.exp(Y_valid_set_final), lr_pred))
print('R2 Score', r2_score(np.exp(Y_valid_set_final), lr_pred))

In [None]:
lr_final_model = Ridge().fit(X_train_set_final, Y_train_set_final)

# The model predicts log prices; predictions and targets are mapped back with exp before scoring.
lr_pred = np.exp(lr_final_model.predict(X_valid_set_final))
# scikit-learn metrics take (y_true, y_pred). RMSE and MAE are symmetric, but R2 is
# not, so the ground truth must come first. squared=False yields the root MSE.
print('Root Mean Squared Error', mean_squared_error(np.exp(Y_valid_set_final), lr_pred, squared=False))
print('Mean Absolute Error', mean_absolute_error(np.exp(Y_valid_set_final), lr_pred))
print('R2 Score', r2_score(np.exp(Y_valid_set_final), lr_pred))

In [None]:
lr_final_model = RANSACRegressor(random_state=42).fit(X_train_set_final, Y_train_set_final)

# The model predicts log prices; predictions and targets are mapped back with exp before scoring.
lr_pred = np.exp(lr_final_model.predict(X_valid_set_final))
# scikit-learn metrics take (y_true, y_pred). RMSE and MAE are symmetric, but R2 is
# not, so the ground truth must come first. squared=False yields the root MSE.
print('Root Mean Squared Error', mean_squared_error(np.exp(Y_valid_set_final), lr_pred, squared=False))
print('Mean Absolute Error', mean_absolute_error(np.exp(Y_valid_set_final), lr_pred))
print('R2 Score', r2_score(np.exp(Y_valid_set_final), lr_pred))

In [None]:
lr_final_model = TheilSenRegressor(random_state=42).fit(X_train_set_final, Y_train_set_final)

# The model predicts log prices; predictions and targets are mapped back with exp before scoring.
lr_pred = np.exp(lr_final_model.predict(X_valid_set_final))
# scikit-learn metrics take (y_true, y_pred). RMSE and MAE are symmetric, but R2 is
# not, so the ground truth must come first. squared=False yields the root MSE.
print('Root Mean Squared Error', mean_squared_error(np.exp(Y_valid_set_final), lr_pred, squared=False))
print('Mean Absolute Error', mean_absolute_error(np.exp(Y_valid_set_final), lr_pred))
print('R2 Score', r2_score(np.exp(Y_valid_set_final), lr_pred))

In [None]:
# Plain linear regression scored best above but is sensitive to outliers, so a
# robust regressor is cross-validated here.
# NOTE(review): the original note names Huber, yet the estimator evaluated is
# RANSACRegressor — confirm which one was intended.
from sklearn.model_selection import cross_val_score
lr_scores = cross_val_score(
    # cross_val_score clones and refits the estimator on every fold, so it is
    # passed unfitted; fitting it on the full training set first was wasted work.
    RANSACRegressor(random_state=42),
    X_train_set_final, Y_train_set_final,
    scoring='neg_root_mean_squared_error',
    cv=10
)

In [None]:
# Mean and spread of the fold scores. The scorer is 'neg_root_mean_squared_error',
# so the values are already (negated) RMSEs and no square root must be applied.
# They are errors in log-price space, not in price units, so they are not directly
# comparable with the validation figures printed above.
-lr_scores.mean(), lr_scores.std()

## Decision Tree

In [None]:
dt_final_model = DecisionTreeRegressor(random_state=42).fit(X_train_set_final, Y_train_set_final)

In [None]:
# Score the decision tree's own predictions (dt_pred), not the last linear model's,
# in price units. Metrics take (y_true, y_pred); squared=False yields the root MSE.
dt_pred = np.exp(dt_final_model.predict(X_valid_set_final))
print('Root Mean Squared Error', mean_squared_error(np.exp(Y_valid_set_final), dt_pred, squared=False))
print('Mean Absolute Error', mean_absolute_error(np.exp(Y_valid_set_final), dt_pred))

In [None]:
# Train RMSE followed by validation RMSE of the unconstrained tree, in price
# units; a large gap between the two indicates overfitting
print(mean_squared_error(
    y_true=np.exp(Y_train_set_final),
    y_pred=np.exp(dt_final_model.predict(X_train_set_final)),
    squared=False,
))
print(mean_squared_error(
    y_true=np.exp(Y_valid_set_final),
    y_pred=np.exp(dt_final_model.predict(X_valid_set_final)),
    squared=False,
))

In [None]:
# Grid search over depth / leaf-size / split-size / feature-count limits to
# constrain the tree and reduce overfitting
space_dt = { 
    'max_depth': [12, 15, 20, 25],
    'min_samples_leaf': [16, 20],
    'min_samples_split': [30, 40],
    'max_features': [8, 10, 12],
}

# Single metric (negated MSE on the log-price target); it is also the refit criterion
scoring = ['neg_mean_squared_error']
# No random_state is set on the searched estimator
search_model_dt = DecisionTreeRegressor()

grid_search_final_dt = GridSearchCV(
    estimator=search_model_dt,
    param_grid=space_dt, 
    scoring=scoring, 
    n_jobs=-1, 
    refit=scoring[0],
    cv=3, 
    verbose=-1,
)

grid_search_final_dt.fit(X_train_set_final, Y_train_set_final)

In [None]:
grid_search_final_dt.best_estimator_

In [None]:
# Training RMSE of the tuned tree, in price units
mean_squared_error(
    y_true=np.exp(Y_train_set_final),
    y_pred=np.exp(grid_search_final_dt.predict(X_train_set_final)),
    squared=False,
)

In [None]:
# Validation RMSE of the tuned tree, in price units
mean_squared_error(
    y_true=np.exp(Y_valid_set_final),
    y_pred=np.exp(grid_search_final_dt.predict(X_valid_set_final)),
    squared=False,
)

## Random Forest


In [None]:
# Grid search over forest size and per-tree limits to reduce overfitting.
# The grid has 4 * 6 * 2 * 2 * 5 = 480 combinations, each fitted on 3 folds.
space_rf = { 
    'n_estimators': [30, 60, 80, 100],
    'max_depth': [3, 4, 6, 8, 10, 12],
    'min_samples_leaf': [18, 20],
    'min_samples_split': [30, 40],
    'max_features': [4, 6, 8, 10, 12],
}

# Single metric (negated MSE on the log-price target); it is also the refit criterion
scoring = ['neg_mean_squared_error']
# No random_state is set on the searched estimator
search_model_rf = RandomForestRegressor()

grid_search_final_rf = GridSearchCV(
    estimator=search_model_rf,
    param_grid=space_rf, 
    scoring=scoring, 
    n_jobs=-1, 
    refit=scoring[0],
    cv=3, 
    verbose=-1,
)

grid_search_final_rf.fit(X_train_set_final, Y_train_set_final)

In [None]:
grid_search_final_rf.best_estimator_

In [None]:
grid_search_final_rf.best_params_

In [None]:
{**grid_search_final_rf.best_params_, 'max_depth': 14}

In [None]:
# Start from the grid-search optimum and manually override depth, tree count and max_features.
# NOTE(review): max_depth=14 and max_features=14 lie outside the searched grid, and
# max_features presumably must not exceed the number of model columns — confirm.
best_hp = {**grid_search_final_rf.best_params_, 'max_depth': 14, 'n_estimators': 80, 'max_features': 14}
# Every hyper-parameter value is cast to a plain int before the model is built
best_model = RandomForestRegressor(**{key: int(value) for key, value in best_hp.items()}).fit(X_train_set_final, Y_train_set_final)

# Train RMSE, then validation RMSE, both in price units
print(mean_squared_error(np.exp(best_model.predict(X_train_set_final)), np.exp(Y_train_set_final), squared=False))
print(mean_squared_error(np.exp(best_model.predict(X_valid_set_final)), np.exp(Y_valid_set_final), squared=False))

In [None]:
# Training RMSE of the grid-searched forest, in price units
mean_squared_error(
    y_true=np.exp(Y_train_set_final),
    y_pred=np.exp(grid_search_final_rf.predict(X_train_set_final)),
    squared=False,
)

In [None]:
# Validation RMSE of the grid-searched forest, in price units
mean_squared_error(
    y_true=np.exp(Y_valid_set_final),
    y_pred=np.exp(grid_search_final_rf.predict(X_valid_set_final)),
    squared=False,
)

Best so far: 331276.7979632767, and in train 312422.0375526496.

{'max_depth': 12,
 'max_features': 10,
 'min_samples_leaf': 18,
 'min_samples_split': 30,
 'n_estimators': 60}



### XGBoost

In [None]:
import xgboost as xgb

First, we will do feature selection to reduce amount of variance that the model can use.

In [None]:
# TODO: Train with all variables, then feature selection, and try again
xgb_final_model = xgb.XGBRegressor().fit(X_train_set_final, Y_train_set_final)

In [None]:
# Use every column of the extended training frame (not only the correlation-filtered
# ones) so that feature importances can be computed over all of them.
xgb_scaler = StandardScaler()

# Target is the log price; both price columns are removed from the inputs
Y_train_set_xgb = train_set_extended.Price_log
X_train_set_xgb = train_set_extended.drop(['Price_log', 'Price'], axis=1)

X_train_set_xgb = xgb_scaler.fit_transform(X_train_set_xgb)

In [None]:
xgb_final_model.fit(X_train_set_xgb, Y_train_set_xgb)

In [None]:
fig, ax = plt.subplots(figsize=(30, 15))

xgb.plot_importance(xgb_final_model, ax=ax)
plt.show()

In [None]:
# Column names in the same order as the columns the model was fitted on...
feature_names_xgb = train_set_extended.drop(['Price_log', 'Price'], axis=1).columns.values
# ...re-ordered from most to least important. argsort() is ascending, so it is
# reversed: a leading slice of this array must yield the strongest features,
# not the weakest ones.
feature_names_xgb = feature_names_xgb[xgb_final_model.feature_importances_.argsort()[::-1]]

In [None]:
# Keep the first 22 names of the importance-ordered array (22 is a hand-picked count)
feature_names_xgb = feature_names_xgb[:22]

# Rebuild the training matrix from the selected columns and refit the scaler on
# them; the later validation/test transforms rely on this refit
X_train_set_xgb = train_set_extended[feature_names_xgb]
X_train_set_xgb = xgb_scaler.fit_transform(X_train_set_xgb)

In [None]:
feature_names_xgb

In [None]:
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV, RandomizedSearchCV

# Exhaustive search: 3 * 6 * 4 * 5 * 4 = 1440 combinations, each fitted on 3 folds
param_grid = { 
    # Fraction of columns randomly sampled for each tree.
    "colsample_bytree": [ 0.3, 0.5 , 0.8 ],
    # reg_alpha provides l1 regularization to the weight, higher values result in more conservative models
    "reg_alpha": [0, 0.5, 1, 5, 10, 20],
    # reg_lambda provides l2 regularization to the weight, higher values result in more conservative models
    "reg_lambda": [0, 0.5, 1, 5],
    'max_depth': [3, 6, 10, 15, 20],
    'n_estimators': [50, 100, 300, 500]
}

# Single metric (negated MSE on the log-price target)
scoring = ['neg_mean_squared_error']

grid_search_xgb = GridSearchCV(
    estimator=xgb.XGBRegressor(),
    param_grid=param_grid, 
    scoring=scoring, 
    refit=scoring[0], # the only metric in `scoring`, also used to pick the refitted model
    n_jobs=-1, 
    cv=3, 
    verbose=0
)

grid_search_xgb = grid_search_xgb.fit(X_train_set_xgb, Y_train_set_xgb)

In [None]:
best_model_xgb = xgb.XGBRegressor(**grid_search_xgb.best_params_).fit(X_train_set_xgb, Y_train_set_xgb)

In [None]:
# Training RMSE of the tuned XGBoost model, in price units
mean_squared_error(
    y_true=np.exp(Y_train_set_xgb),
    y_pred=np.exp(best_model_xgb.predict(X_train_set_xgb)),
    squared=False,
)

In [None]:
# Validation target (log price) and the same selected feature columns as in training
Y_valid_set_xgb = valid_set_extended.Price_log
X_valid_set_xgb = valid_set_extended[feature_names_xgb]

In [None]:
# Scale with the scaler fitted on the training features (transform only, no refit).
# This cell re-scales X_valid_set_xgb in place, so run it only once per rebuild.
X_valid_set_xgb = xgb_scaler.transform(X_valid_set_xgb)

# Validation RMSE of the tuned XGBoost model, in price units
mean_squared_error(np.exp(best_model_xgb.predict(X_valid_set_xgb)), np.exp(Y_valid_set_xgb), squared=False)

- 198566 in train set score, 297286 in valid set score. 18 features
- 223900 in train set score, 296663 in valid set score. 22 features

In [None]:
# Test features: same selected columns, scaled with the training-fitted scaler
X_test_set_xgb = test_set_extended[feature_names_xgb]
X_test_set_xgb = xgb_scaler.transform(X_test_set_xgb)

## Model Creation: Price

### Final Model

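We have seen Decision Trees & Random Forests tend to overfit a lot. So we will go to the simple solution we had: Linear Regression (Huber implementation). Note: the submission cell below currently predicts with the tuned XGBoost model (`best_model_xgb`), not with a linear model.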

In [None]:
# Predict the test prices with the tuned XGBoost model and write the submission file.
# The target was built as log(Price + 1), so expm1 (exp(x) - 1) is its exact
# inverse; a plain exp would return Price + 1.
submission = pd.DataFrame(np.expm1(best_model_xgb.predict(X_test_set_xgb)))
# Row position becomes the 'index' column expected in the output file
submission = submission.reset_index()
submission.columns = ['index', 'Price']
submission.to_csv('Submissions.csv', index=False)
submission