In [None]:
# Libraries
import numpy as np 
import pandas as pd

import plotly.graph_objs as go
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from tqdm import tqdm
tqdm.pandas()

import warnings
warnings.filterwarnings("ignore")
import gc

from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder

import optuna

import xgboost as xgb
import lightgbm as lgb
import catboost as cb

# ANSI escape codes used to switch bold text on/off in printed scores.
bold = ['\033[1m', '\033[0m']

# Read files
path = '/kaggle/input/playground-series-s3e1/'
train = pd.read_csv(path+'train.csv')
test = pd.read_csv(path+'test.csv')
ss = pd.read_csv(path+'sample_submission.csv')

from sklearn.datasets import fetch_california_housing as fch
# Load the original California housing data once and reuse the result,
# instead of calling the loader separately for data, feature names and target.
housing = fch()
sklearn_df = pd.DataFrame(housing['data'], columns=housing['feature_names'])
sklearn_df['MedHouseVal'] = housing['target']

# Show all columns
pd.set_option('display.max_columns', None)

## Disclaimer
All credit goes to @DMITRYUAROV, from whom I've copied this notebook. I have made literally only one change, at the very bottom: the post-processing no longer rounds predictions to the nearest available value, but simply clips them between a min and a max. I have found that this gives an even better score in practice, and it also makes more sense intuitively.

![](https://www.bankrate.com/2022/06/14113029/buying-a-house-in-california.jpg?auto=webp&optimize=high&crop=16:9&width=912)

<a href="https://www.bankrate.com/real-estate/buying-a-house-in-california/">Photo source</a>

# <div style="background-color:#e29930;text-align:center;color:white;font-size:150%;font-family:Calibri;border-radius:10px"> <b>Preprocessing</b></div>

<b><span style='color:#444444;font-size:200%;font-family:Calibri'>|</span><span style='color:#e29930;font-size:200%;font-family:Calibri'> Generated data</span></b>

<p style="text-align:justify;">
It is worth starting by flagging which rows are generated and which come from the original dataset. As correctly noted in the <a href="https://www.kaggle.com/competitions/playground-series-s3e1/discussion/376043"><b>discussion</b></a>, this really improves the results of the models. We then merge the data and drop the ids.
</p>



In [None]:
# Flag the origin of every row: 0 = original sklearn data, 1 = competition data.
sklearn_df['is_generated'] = 0
for generated in (train, test):
    generated['is_generated'] = 1

# The id column is only a row identifier, so it is removed from both frames.
train.drop(columns='id', inplace=True)
test.drop(columns='id', inplace=True)

# Append the original dataset to the competition training data.
train = pd.concat([train, sklearn_df], ignore_index=True)

# Manual coordinate overrides for three training rows (addressed by index label).
coordinate_fixes = {
    33228: [32.74, -117],
    34363: [32.71, -117],
    20991: [34.2, -119],
}
for row_label, fixed_latlon in coordinate_fixes.items():
    train.loc[row_label, ['Latitude', 'Longitude']] = fixed_latlon

# Stack train on top of test so features are engineered on both at once.
df = pd.concat([train, test], axis=0, ignore_index=True)

<b><span style='color:#444444;font-size:200%;font-family:Calibri'>|</span><span style='color:#e29930;font-size:200%;font-family:Calibri'> Encoding trick</span></b>

Idea from this <a href="https://www.kaggle.com/competitions/playground-series-s3e1/discussion/376210"><b>discussion</b></a>


In [None]:
emb_size = 20
precision = 1e6

# (N, 2) coordinate matrix with a trailing axis added -> (N, 2, 1)
latlon = df[['Latitude', 'Longitude']].values[..., np.newaxis]

# Geometric series of frequencies 1, m, m**2, ... where m**emb_size == precision,
# shaped (1, 1, emb_size) so it broadcasts against every coordinate.
m = np.exp(np.log(precision) / emb_size)
angle_freq = (m ** np.arange(emb_size)).reshape(1, 1, emb_size)

# (N, 2, emb_size): every coordinate multiplied by every frequency
latlon = latlon * angle_freq
# Cosine on the even frequency slots, sine on the odd ones (written in place).
np.cos(latlon[..., 0::2], out=latlon[..., 0::2])
np.sin(latlon[..., 1::2], out=latlon[..., 1::2])
# Flatten to (N, 2 * emb_size): latitude slots first, then longitude slots.
latlon = latlon.reshape(-1, 2 * emb_size)

In [None]:
# Keep only the first two encoded slots (cosine and sine of scaled Latitude).
df['exp_latlon1'] = latlon[:, 0]
df['exp_latlon2'] = latlon[:, 1]

<b><span style='color:#444444;font-size:200%;font-family:Calibri'>|</span><span style='color:#e29930;font-size:200%;font-family:Calibri'> Coordinates with PCA and UMAP</span></b>

In [None]:
from sklearn.decomposition import PCA

# Project (Latitude, Longitude) onto its principal axes.
coordinates = df[['Latitude', 'Longitude']].values
pca = PCA().fit(coordinates)
# Transform once and reuse the result for both output columns.
pca_coords = pca.transform(coordinates)
df['pca_lat'] = pca_coords[:, 0]
df['pca_lon'] = pca_coords[:, 1]

In [None]:
from umap import UMAP
# Fit a 2-D UMAP embedding on the raw coordinates.
umap = UMAP(n_components=2, n_neighbors=50, random_state=228).fit(coordinates)
# Transform once: this is the expensive step, and using a single result
# guarantees both columns come from the same embedding.
umap_coords = umap.transform(coordinates)
df['umap_lat'] = umap_coords[:, 0]
df['umap_lon'] = umap_coords[:, 1]

<b><span style='color:#444444;font-size:200%;font-family:Calibri'>|</span><span style='color:#e29930;font-size:200%;font-family:Calibri'> Cartesian coordinates rotation</span></b>

<p style="text-align:justify;">
These features have given an incredibly large improvement in the metric.
</p>

In [None]:
# Linear combinations of Longitude/Latitude weighted by the cosine and sine
# of 15, 30 and 45 degrees. The original code used radians(44) for the
# cosine of the 45-degree feature, which is fixed here.
# NOTE(review): the *_y features add the sine term; a strict rotation would
# subtract it. Kept as-is because the models were tuned on these features.
for angle in (15, 30, 45):
    cos_a = np.cos(np.radians(angle))
    sin_a = np.sin(np.radians(angle))

    df[f'rot_{angle}_x'] = (cos_a * df['Longitude']) + (sin_a * df['Latitude'])

    # At 45 degrees cos == sin, so the y combination would just duplicate
    # the x one; only 15 and 30 degrees get a *_y feature.
    if angle != 45:
        df[f'rot_{angle}_y'] = (cos_a * df['Latitude']) + (sin_a * df['Longitude'])

<b><span style='color:#444444;font-size:200%;font-family:Calibri'>|</span><span style='color:#e29930;font-size:200%;font-family:Calibri'> Location of coordinates</span></b>

<p style="text-align:justify;">
Thanks to this wonderful library, we can easily determine in which locality the place is located.
</p>

In [None]:
pip install reverse_geocoder

In [None]:
import reverse_geocoder as rg

# Look up each (Latitude, Longitude) pair and keep the 'admin2' field of
# every match as the row's place.
coordinates = list(zip(df['Latitude'], df['Longitude']))
results = rg.search(coordinates)
df['place'] = [match['admin2'] for match in results]

# Places that keep their own category; all others are grouped later.
places = [
    'Los Angeles County',
    'Orange County',
    'Kern County',
    'Alameda County',
    'San Francisco County',
    'Ventura County',
    'Santa Clara County',
    'Fresno County',
    'Santa Barbara County',
    'Contra Costa County',
    'Yolo County',
    'Monterey County',
    'Riverside County',
    'Napa County',
]

def replace(x):
    """Return `x` unchanged if it is listed in `places`, otherwise 'Other'."""
    return x if x in places else 'Other'
    
# Collapse places outside the `places` list into 'Other', then turn the
# resulting labels into integer codes.
df['place'] = df['place'].map(replace)
le = LabelEncoder()
df['place'] = le.fit_transform(df['place'])

<b><span style='color:#444444;font-size:200%;font-family:Calibri'>|</span><span style='color:#e29930;font-size:200%;font-family:Calibri'> Distance to cities and coast points</span></b>

In [None]:
from haversine import haversine

# (latitude, longitude) reference points for five cities.
Sac = (38.576931, -121.494949)
SF = (37.780080, -122.420160)
SJ = (37.334789, -121.888138)
LA = (34.052235, -118.243683)
SD = (32.715759, -117.163818)

city_centers = {'Sac': Sac, 'SF': SF, 'SJ': SJ, 'LA': LA, 'SD': SD}

# Haversine distance (unit='ft') from every row to each city centre.
for city, center in city_centers.items():
    df['dist_' + city] = df.apply(
        lambda row: haversine((row['Latitude'], row['Longitude']), center, unit='ft'),
        axis=1,
    )

# Distance to whichever of the five cities is closest.
df['dist_nearest_city'] = df[['dist_' + city for city in city_centers]].min(axis=1)

In [None]:
from shapely.geometry import LineString, Point

# Polyline through hand-picked (latitude, longitude) points.
coast_vertices = [
    (32.6644, -117.1613), (33.2064, -117.3831),
    (33.7772, -118.2024), (34.4634, -120.0144),
    (35.4273, -120.8819), (35.9284, -121.4892),
    (36.9827, -122.0289), (37.6114, -122.4916),
    (38.3556, -123.0603), (39.7926, -123.8217),
    (40.7997, -124.1881), (41.7558, -124.1976),
]
coast_points = LineString(coast_vertices)

# Distance from each row's (Latitude, Longitude) point to the polyline.
# The inputs are raw degrees, so the result is in degree units, not feet.
df['dist_to_coast'] = df.apply(
    lambda row: Point(row['Latitude'], row['Longitude']).distance(coast_points),
    axis=1,
)

<b><span style='color:#444444;font-size:200%;font-family:Calibri'>|</span><span style='color:#e29930;font-size:200%;font-family:Calibri'> Final preprocessing</span></b>

In [None]:
# The test rows were appended last, so they are the final len(test) rows of df.
n_test = len(test)
train = df.iloc[:-n_test]
test = (
    df.iloc[-n_test:]
    .drop(columns='MedHouseVal')
    .reset_index(drop=True)
)

# Feature matrix and target used by every model below.
X = train.drop(columns='MedHouseVal')
y = train['MedHouseVal']

seed = 228
FOLDS = 10

In [None]:
def rmse(predictions, targets):
    """Return the root of the mean squared difference between the two inputs.

    Both arguments must support element-wise subtraction and expose a
    `.mean()` method (e.g. NumPy arrays or pandas Series).
    """
    squared_error = (predictions - targets) ** 2
    return np.sqrt(squared_error.mean())

def f_importance_plot(f_imp):
    """Draw a horizontal bar chart of average feature importances.

    `f_imp` is a DataFrame with a 'feature' column (bar labels) and an
    'avg_imp' column (bar lengths). The figure height grows with the
    number of rows so that every feature gets its own readable bar.
    """
    plt.figure(figsize=(15, 0.35 * len(f_imp)))
    plt.title('Feature importances', size=25, y=1.05,
              fontname='Calibri', fontweight='bold', color='#444444')

    bars = sns.barplot(data=f_imp, x='avg_imp', y='feature',
                       palette='Blues_d', linestyle="-",
                       linewidth=1, edgecolor="black")

    # Hide the x axis entirely; keep only the feature names on the y axis.
    plt.xlabel('')
    plt.xticks([])
    plt.ylabel('')
    plt.yticks(size=11, color='#444444')

    # Remove every frame edge except a thin left one.
    for side in ('right', 'top', 'bottom'):
        bars.spines[side].set_visible(False)
    bars.spines['left'].set_linewidth(0.5)

    plt.show()

# <div style="background-color:#e29930;text-align:center;color:white;font-size:150%;font-family:Calibri;border-radius:10px"> <b>LGBM</b></div>

In [None]:
# LightGBM parameters: GBDT regression, evaluated with RMSE.
lgb_params = {
    'max_depth': 11,
    'learning_rate': 0.01,
    'min_data_in_leaf': 36, 
    'num_leaves': 100, 
    'feature_fraction': 0.8, 
    'bagging_fraction': 0.89, 
    'bagging_freq': 5, 
    'lambda_l2': 28,
    
    'seed': seed,
    'objective': 'regression',
    'boosting_type': 'gbdt',
    'device': 'cpu', 
    'gpu_platform_id': 0,
    'gpu_device_id': 0,
    'n_jobs': -1,
    'metric': 'rmse',
    'verbose': -1
}

# One importance column per fold is added to this frame inside the loop.
f_imp = pd.DataFrame({'feature': X.columns})
# Test predictions are accumulated as the mean over all folds.
predictions, scores = np.zeros(len(test)), []

k = KFold(n_splits=FOLDS, random_state=seed, shuffle=True)
for fold, (train_idx, val_idx) in enumerate(k.split(X, y)):
    print(f'\n--- FOLD {fold+1} ---')
        
    lgb_train = lgb.Dataset(data=X.iloc[train_idx], 
                            label=y.iloc[train_idx],
                            categorical_feature=['place'])
    lgb_valid = lgb.Dataset(data=X.iloc[val_idx], 
                            label=y.iloc[val_idx],
                            categorical_feature=['place'],
                            reference=lgb_train)

    # Logging and early stopping are configured through callbacks only.
    # The `verbose_eval` keyword was dropped from `lgb.train` in
    # LightGBM 4.0, so passing it would raise a TypeError there.
    model = lgb.train(params=lgb_params, 
                      train_set=lgb_train, 
                      num_boost_round=50000,
                      valid_sets=[lgb_train, lgb_valid], 
                      valid_names=['train', 'val'],
                      callbacks=[lgb.log_evaluation(1000),
                                 lgb.early_stopping(1000, verbose=False)])
    
    f_imp['fold_'+str(fold+1)] = model.feature_importance()
    b_itr = model.best_iteration
    
    # Score the held-out fold using the best iteration.
    val_preds = model.predict(X.iloc[val_idx], num_iteration=b_itr)
    val_score = rmse(y.iloc[val_idx], val_preds)
    scores.append(val_score)
    
    # Each fold contributes 1/FOLDS of the final test prediction.
    predictions += model.predict(test, num_iteration=b_itr) / FOLDS
    print(f'--- RMSE: {bold[0]}{round(val_score, 6)}{bold[1]} | best iteration: {bold[0]}{b_itr}{bold[1]} ---')
    
    # Free per-fold objects before the next fold.
    del lgb_train, lgb_valid, val_preds, val_score, model
    gc.collect()

print('*'*45)
print(f'Mean RMSE: {bold[0]}{round(np.mean(scores), 6)}{bold[1]}')

# Average the per-fold importances (every column except 'feature') and plot.
f_imp['avg_imp'] = f_imp[f_imp.columns[1:]].mean(axis=1)
f_imp.sort_values('avg_imp', ascending=False, inplace=True)
f_importance_plot(f_imp)

In [None]:
# Keep a copy of the LightGBM test predictions for the final blend,
# because `predictions` is re-initialised by the next model's cell.
lgbm_preds = predictions.copy()
# Write a LightGBM-only submission file.
ss['MedHouseVal'] = predictions
ss.to_csv('lgbm_submission7.csv', index=False)

# <div style="background-color:#e29930;text-align:center;color:white;font-size:150%;font-family:Calibri;border-radius:10px"> <b>CatBoost</b></div>

In [None]:
# CatBoost parameters handed to cb.train below (RMSE loss and metric, CPU).
cb_params = {
    'depth': 9,
    'learning_rate': 0.01,
    'rsm': 0.88,
    'subsample': 0.795,
    'l2_leaf_reg': 8,
    'min_data_in_leaf': 35,
    'random_strength': 0.63,
    
    'use_best_model': True,
    'task_type': 'CPU',
    'bootstrap_type': 'Bernoulli',
    'grow_policy': 'SymmetricTree',
    'random_seed': seed,
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE'
}

# One importance column per fold is added to this frame inside the loop.
f_imp = pd.DataFrame({'feature': X.columns})
# Test predictions are accumulated as the mean over all folds.
predictions, scores = np.zeros(len(test)), []

# Same splitter settings (FOLDS, seed, shuffle) as the other model cells.
k = KFold(n_splits=FOLDS, random_state=seed, shuffle=True)
for fold, (train_idx, val_idx) in enumerate(k.split(X, y)):
    print(f'\n--- FOLD {fold+1} ---')
    
    # 'place' is declared categorical for both the training and validation pools.
    cb_train = cb.Pool(data=X.iloc[train_idx],
                       label=y.iloc[train_idx],
                       cat_features=['place'])
    cb_valid = cb.Pool(data=X.iloc[val_idx],
                       label=y.iloc[val_idx],
                       cat_features=['place'])
    
    # Train with the validation pool as the evaluation set and early stopping.
    model = cb.train(params=cb_params,
                     dtrain=cb_train,
                     num_boost_round=50000,
                     evals=cb_valid, 
                     early_stopping_rounds=777,
                     verbose=3000)
    
    f_imp['fold_'+str(fold+1)] = model.get_feature_importance()
    b_itr = model.get_best_iteration()
    
    # Score the held-out fold. rmse is symmetric in its two arguments, so
    # passing (targets, predictions) in this order does not change the result.
    val_preds = model.predict(cb_valid)
    val_score = rmse(y.iloc[val_idx], val_preds)
    scores.append(val_score)
    
    # Each fold contributes 1/FOLDS of the final test prediction.
    predictions += model.predict(test) / FOLDS
    print(f'--- RMSE: {bold[0]}{round(val_score, 6)}{bold[1]} | best iteration: {bold[0]}{b_itr}{bold[1]} ---')
    
    # Free per-fold objects before the next fold.
    del cb_train, cb_valid, val_preds, val_score, model
    gc.collect()

print('*'*45)
print(f'Mean RMSE: {bold[0]}{round(np.mean(scores), 6)}{bold[1]}')

# Average the per-fold importances (every column except 'feature') and plot.
f_imp['avg_imp'] = f_imp[f_imp.columns[1:]].mean(axis=1)
f_imp.sort_values('avg_imp', ascending=False, inplace=True)
f_importance_plot(f_imp)

In [None]:
# Keep a copy of the CatBoost test predictions for the final blend,
# because `predictions` is re-initialised by the next model's cell.
cb_preds = predictions.copy()
# Write a CatBoost-only submission file.
ss['MedHouseVal'] = predictions
ss.to_csv('cb_submission3.csv', index=False)

# <div style="background-color:#e29930;text-align:center;color:white;font-size:150%;font-family:Calibri;border-radius:10px"> <b>XGB</b></div>

In [None]:
def df_xgb(df):
    """Return a copy of `df` with 'place' replaced by one-hot columns.

    One `place_<value>` indicator column is appended for every distinct
    value of 'place', and the original 'place' column is removed. The
    input frame itself is left untouched.
    """
    place_dummies = pd.get_dummies(df['place'], prefix='place')
    return pd.concat([df, place_dummies], axis=1).drop('place', axis=1)

# One-hot encode 'place' for XGBoost.
X = df_xgb(X)
# Align the test columns to the training columns: a place value missing from
# the test rows would otherwise leave test without that indicator column and
# make the feature sets differ at prediction time.
test = df_xgb(test).reindex(columns=X.columns, fill_value=0)

In [None]:
# XGBoost parameters: gbtree regression on squared error, evaluated with RMSE.
xgb_params = {
    'max_depth': 9,
    'eta': 0.01,
    'colsample_bytree': 0.66,
    'subsample': 0.76,
    'min_child_weight': 22,
    'lambda': 16, 
    'gamma': 1,
    
    'tree_method': 'gpu_hist',
    'booster': 'gbtree',
    'predictor':'gpu_predictor',
    'seed': seed,
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse'
}

# One importance column per fold is merged into this frame inside the loop.
f_imp = pd.DataFrame({'feature': X.columns})
# Test predictions are accumulated as the mean over all folds.
predictions, scores = np.zeros(len(test)), []

k = KFold(n_splits=FOLDS, random_state=seed, shuffle=True)
for fold, (train_idx, val_idx) in enumerate(k.split(X, y)):
    print(f'\n--- FOLD {fold+1} ---')
    
    dtrain = xgb.DMatrix(X.iloc[train_idx], label=y.iloc[train_idx])
    dvalid = xgb.DMatrix(X.iloc[val_idx], label=y.iloc[val_idx])
    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]

    # Early stopping watches the 'eval' set; save_best=True makes train()
    # return the booster truncated at its best iteration.
    model = xgb.train(params=xgb_params, 
                      dtrain=dtrain, 
                      num_boost_round=50000,
                      evals=watchlist, 
                      verbose_eval=5000,
                      callbacks=[xgb.callback.EarlyStopping(rounds=1000,
                                                            data_name='eval',
                                                            maximize=False,
                                                            save_best=True)])
    
    # get_score only lists features that were used in a split, so merge onto
    # the full feature list and fill the missing ones with 0.
    fi = model.get_score(importance_type='weight')
    fi = pd.DataFrame({'feature':fi.keys(),f'importance_{fold}':fi.values()})
    f_imp = f_imp.merge(fi, on='feature', how='left').fillna(0)
    # `Booster.best_ntree_limit` was removed in XGBoost 2.0. `best_iteration`
    # is 0-based, so +1 gives the number of boosting rounds, which equals the
    # old attribute when num_parallel_tree is left at its default of 1.
    b_itr = model.best_iteration + 1
    
    val_preds = model.predict(dvalid)
    val_score = rmse(y.iloc[val_idx], val_preds)
    scores.append(val_score)
    
    # Each fold contributes 1/FOLDS of the final test prediction.
    predictions += model.predict(xgb.DMatrix(test)) / FOLDS
    print(f'--- RMSE: {bold[0]}{round(val_score, 6)}{bold[1]} | best iteration: {bold[0]}{b_itr}{bold[1]} ---')
    
    # Free per-fold objects before the next fold.
    del dtrain, dvalid, watchlist, val_preds, val_score, model
    gc.collect()

print('*'*45)
print(f'Mean RMSE: {bold[0]}{round(np.mean(scores), 6)}{bold[1]}')

# Average the per-fold importances (every column except 'feature') and plot.
f_imp['avg_imp'] = f_imp[f_imp.columns[1:]].mean(axis=1)
f_imp.sort_values('avg_imp', ascending=False, inplace=True)
f_importance_plot(f_imp)

In [None]:
# Keep a copy of the XGBoost test predictions for the final blend.
xgb_preds = predictions.copy()
# Write an XGBoost-only submission file.
ss['MedHouseVal'] = predictions
ss.to_csv('xgb_submission5.csv', index=False)

# <div style="background-color:#e29930;text-align:center;color:white;font-size:150%;font-family:Calibri;border-radius:10px"> <b>Blending + rounding</b></div>

In [None]:
# Weighted average of the three models' test predictions (weights sum to 1.0).
ss['MedHouseVal'] = lgbm_preds*0.35 + cb_preds*0.3 + xgb_preds*0.35

<p style="text-align:justify;">
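In the original notebook the price was rounded to the nearest of the available values, which gives a small improvement in the result. Here the blended predictions are instead clipped to a fixed minimum and maximum value, which also gives a small improvement.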
</p>

In [None]:
# Bounds for the blended predictions.
# NOTE(review): presumably the smallest and largest target values in the
# training data; confirm against train['MedHouseVal'].
max_val = 5.00001
min_val = 0.14999
# Clip with a single column assignment. The chained form
# `ss['MedHouseVal'][mask] = value` writes through an intermediate Series,
# which raises SettingWithCopyWarning and has no effect under copy-on-write.
ss['MedHouseVal'] = ss['MedHouseVal'].clip(lower=min_val, upper=max_val)

In [None]:
# Write the final blended submission.
ss.to_csv('blend_submission5.csv', index=False)