In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from tqdm import tqdm

%matplotlib inline

In [None]:
# Read the competition files; the 'id' column is dropped from the train and
# test frames immediately after loading.
train_df = pd.read_csv('/kaggle/input/playground-series-s3e1/train.csv').drop(columns='id')
test_df = pd.read_csv('/kaggle/input/playground-series-s3e1/test.csv').drop(columns='id')
submission_df = pd.read_csv('/kaggle/input/playground-series-s3e1/sample_submission.csv')

## EDA

In [None]:
# Preview the first five rows of the training frame.
train_df.head(5)

In [None]:
# Row counts of the train, test and sample-submission frames.
len(train_df), len(test_df), len(submission_df)

In [None]:
# Build a pandas-profiling report object for the training frame. The object is
# only stored in `profile`; rendering it is left commented out below.
profile = ProfileReport(train_df, title="Pandas Profiling Report")
# profile

In [None]:
# Print each feature's correlation with the target, strongest first, for both
# correlation methods stored in the profiling report.
correlations = profile.description_set["correlations"]
for method in ('pearson', 'spearman'):
    ranked = correlations[method]['MedHouseVal'].sort_values(ascending=False)
    print(ranked, '\n\n')

In [None]:
!pip install dython

In [None]:
from dython.nominal import associations

# Run dython's `associations` on the whole training frame and display the entry
# stored under 'corr' in its return value
# (presumably the pairwise association matrix — confirm against the dython docs).
associations(train_df)['corr']

## Feature Engineering

TODO:
- additional training data - DONE
- additional feature MedInc / Population - DONE
- additional features long/lat-itude - DONE
- remove outliers - DONE

TODO v2:
- categorical features long/lat-itude - DONE
- scaling for NN - DONE

In [None]:
# Fetch the original California housing frame and append it to the competition
# training rows. `is_generated` is 1 for competition rows (train and test) and
# 0 for rows coming from the original dataset.
original_df = fetch_california_housing(as_frame=True)['frame']
original_df['is_generated'] = 0
train_df['is_generated'] = 1
test_df['is_generated'] = 1
train_df = pd.concat([train_df, original_df], ignore_index=True)

### (Long/Lat)itude features engineering

based on: https://bmanikan.medium.com/feature-engineering-all-i-learned-about-geo-spatial-features-649871d16796

In [None]:
# Polar-coordinate view of (Latitude, Longitude): `r` is the Euclidean norm of
# the pair and `theta` is arctan2(Latitude, Longitude).
train_df = train_df.assign(
    r=np.sqrt(train_df['Latitude']**2 + train_df['Longitude']**2),
    theta=np.arctan2(train_df['Latitude'], train_df['Longitude']),
)

test_df = test_df.assign(
    r=np.sqrt(test_df['Latitude']**2 + test_df['Longitude']**2),
    theta=np.arctan2(test_df['Latitude'], test_df['Longitude']),
)

In [None]:
# Add the training coordinates rotated by 15, 30 and 45 degrees.
# Columns are created in the order rot_<angle>_x, rot_<angle>_y per angle.
for angle in (15, 30, 45):
    cos_a = np.cos(np.radians(angle))
    sin_a = np.sin(np.radians(angle))
    train_df[f'rot_{angle}_x'] = (cos_a * train_df['Longitude']) + (sin_a * train_df['Latitude'])
    train_df[f'rot_{angle}_y'] = (cos_a * train_df['Latitude']) - (sin_a * train_df['Longitude'])

In [None]:
# Add the test coordinates rotated by 15, 30 and 45 degrees.
# Columns are created in the order rot_<angle>_x, rot_<angle>_y per angle.
for angle in (15, 30, 45):
    cos_a = np.cos(np.radians(angle))
    sin_a = np.sin(np.radians(angle))
    test_df[f'rot_{angle}_x'] = (cos_a * test_df['Longitude']) + (sin_a * test_df['Latitude'])
    test_df[f'rot_{angle}_y'] = (cos_a * test_df['Latitude']) - (sin_a * test_df['Longitude'])

In [None]:
from sklearn.decomposition import PCA

def pca(data, fit_data=None):
    '''
    Project (Latitude, Longitude) onto principal-component axes.

    input:
        data: dataframe containing 'Latitude' and 'Longitude' columns to transform
        fit_data: optional dataframe with the same two columns used to fit the
                  PCA; when omitted the PCA is fitted on `data` itself
    output: tuple (pca_x, pca_y) holding the first and second component scores
            for every row of `data`
    '''
    columns = ['Latitude', 'Longitude']
    source = data if fit_data is None else fit_data
    # Fit on both coordinates so the components span the Latitude/Longitude plane.
    pca_obj = PCA().fit(source[columns].values)
    # Transform once and split the two component columns.
    projected = pca_obj.transform(data[columns].values)
    return projected[:, 0], projected[:, 1]

train_df['pca_x'], train_df['pca_y'] = pca(train_df)
# Reuse the axes fitted on the training rows so train and test share one projection.
test_df['pca_x'], test_df['pca_y'] = pca(test_df, fit_data=train_df)

In [None]:
!pip install reverse_geocoder --quiet

In [None]:
import reverse_geocoder as rg

def geocoder(data):
    '''
    Reverse-geocode every row of `data`.

    input: dataframe with 'Latitude' and 'Longitude' columns
    output: the result of `rg.search` for the list of (Latitude, Longitude)
            pairs, taken in row order
    '''
    latitudes = data['Latitude'].values
    longitudes = data['Longitude'].values
    points = [(lat, lon) for lat, lon in zip(latitudes, longitudes)]
    return rg.search(points)  # default mode = 2

In [None]:
def get_values(df, column, threshold=0.005):
    '''
    Return the values of `column` whose relative frequency is below `threshold`.

    input:
        df: dataframe containing `column`
        column: name of the column to inspect
        threshold: relative-frequency cut-off (fraction of rows); values that
                   occur in a smaller share of rows are returned
    output: list of the rare values
    '''
    # Compute the normalized counts once and reuse them for the mask.
    frequencies = df[column].value_counts(normalize=True)
    return frequencies[frequencies < threshold].index.tolist()

# Reverse-geocode the training rows and keep the 'admin2' field of each result.
geocoder_data = geocoder(train_df)
train_df['admin2'] = [val['admin2'] for val in geocoder_data]

# Values that are rare in the training data are collapsed into 'Other'.
# The set of kept values is derived from the training frame only and reused for
# the test frame, so both frames end up with the same label vocabulary.
frequent_admin2 = set(train_df['admin2']) - set(get_values(train_df, 'admin2'))
# .loc writes into the frame itself; chained assignment (df[col][mask] = ...)
# may silently write to a temporary copy.
train_df.loc[~train_df['admin2'].isin(frequent_admin2), 'admin2'] = 'Other'

geocoder_data = geocoder(test_df)
test_df['admin2'] = [val['admin2'] for val in geocoder_data]
test_df.loc[~test_df['admin2'].isin(frequent_admin2), 'admin2'] = 'Other'

test_df['admin2'].value_counts()

In [None]:
# import geopy.distance

# sacramento = (38.576931, -121.494949)
# san_francisco = (37.780080, -122.420160)
# san_jose = (37.334789, -121.888138)
# los_angeles = (34.052235, -118.243683)
# san_diego = (32.715759, -117.163818)

# train_df['sacramento_distance'] = [geopy.distance.geodesic(sacramento, row).km for row in (list(zip(train_df['Latitude'].values, train_df['Longitude'].values)))]
# train_df['san_francisco_distance'] = [geopy.distance.geodesic(san_francisco, row).km for row in (list(zip(train_df['Latitude'].values, train_df['Longitude'].values)))]
# train_df['san_jose_distance'] = [geopy.distance.geodesic(san_jose, row).km for row in (list(zip(train_df['Latitude'].values, train_df['Longitude'].values)))]
# train_df['los_angeles_distance'] = [geopy.distance.geodesic(los_angeles, row).km for row in (list(zip(train_df['Latitude'].values, train_df['Longitude'].values)))]
# train_df['san_diego_distance'] = [geopy.distance.geodesic(san_diego, row).km for row in (list(zip(train_df['Latitude'].values, train_df['Longitude'].values)))]

# test_df['sacramento_distance'] = [geopy.distance.geodesic(sacramento, row).km for row in (list(zip(test_df['Latitude'].values, test_df['Longitude'].values)))]
# test_df['san_francisco_distance'] = [geopy.distance.geodesic(san_francisco, row).km for row in (list(zip(test_df['Latitude'].values, test_df['Longitude'].values)))]
# test_df['san_jose_distance'] = [geopy.distance.geodesic(san_jose, row).km for row in (list(zip(test_df['Latitude'].values, test_df['Longitude'].values)))]
# test_df['los_angeles_distance'] = [geopy.distance.geodesic(los_angeles, row).km for row in (list(zip(test_df['Latitude'].values, test_df['Longitude'].values)))]
# test_df['san_diego_distance'] = [geopy.distance.geodesic(san_diego, row).km for row in (list(zip(test_df['Latitude'].values, test_df['Longitude'].values)))]

In [None]:
# import json
# import pandas as pd
# import numpy as np
# from tqdm.auto import tqdm
# tqdm.pandas()

# # Reads California coastline coordinates.
# # (https://earthworks.stanford.edu/catalog/stanford-vx275xn8886) @kaivanbrunt
# path = r'/kaggle/input/coastline-data/stanford-vx275xn8886-geojson.json'
# with open(path, 'r') as f:
#     coastline = json.load(f)
#     features = coastline['features']

# # Unpacks California coastline coordinates and builds a dataframe. shape=(25693, 2) 
# cstl_coords = [features[i]['geometry']['coordinates'] for i in range(len(features))]
# cstl_coords = np.hstack(cstl_coords).reshape((-1, 2))
# cstl_df = pd.DataFrame(cstl_coords, columns=['Longitude', 'Latitude'])

# # Finds the shortest distance to the coastline (Euclidian Distance).
# def f(lat, lon, df):
#     return (((df.Latitude - lat)**2 + (df.Longitude - lon)**2)**.5).min()

# train_df['dist_to_cstl'] = train_df.progress_apply(lambda x: f(x.Latitude, x.Longitude, cstl_df), axis=1)
# test_df['dist_to_cstl'] = test_df.progress_apply(lambda x: f(x.Latitude, x.Longitude, cstl_df), axis=1)

In [None]:
# Number of houses in block : Population / AveOccup (size of block)
# Total income of block : MedInc * Population (total wealth of each block - could adjust to discount children)
# Ratio of occupants to bedrooms : AveOccup / AveBedrms (could help identify summer houses)
# Number of unused bedrooms : AveBedrms - AveOccup (could correspond to guest rooms)
# Total number of rooms : AveBedrms + AveRooms (indicates size of house)
# Number of non-bedrooms rooms : AveRooms - AveBedrms (how many bathrooms, kitchens etc.)
# Ratio of bedrooms to rooms : AveBedrms/AveRooms (could be useful)
# Ratio of occupants to rooms : AveOccup / AveRooms (could be useful)
# Distance to nearest block : min_{L2 dist} (Latitude, Longitude) (how remote is this block)
# Number of blocks in 10 mile/km radius (how connected is this block)

In [None]:
# Derived block-level features, added in place to both frames with identical
# formulas and in the same column order.
for frame in (train_df, test_df):
    frame['size_of_block'] = frame['Population'] / frame['AveOccup']
    frame['income_of_block'] = frame['MedInc'] * frame['Population']
    frame['occupants/bedrooms'] = frame['AveOccup'] / frame['AveBedrms']
    frame['unused_bedrooms'] = frame['AveBedrms'] - frame['AveOccup']
    frame['total_rooms'] = frame['AveBedrms'] + frame['AveRooms']
    frame['non_bedrooms'] = frame['AveRooms'] - frame['AveBedrms']
    frame['bedrooms/rooms'] = frame['AveBedrms'] / frame['AveRooms']
    frame['occupants/rooms'] = frame['AveOccup'] / frame['AveRooms']

## Modeling

TODO:
- CV vs Hold-out - DONE
- RF (baseline) - DONE
- CatBoost - DONE

TODO v2:
- Feedforward NN - DONE (didn't work best tho)
- Ensemble CatBoost / LightGBM/ NN - PARTLY DONE

In [None]:
# Separate the feature matrix from the regression target.
X = train_df.drop(columns=['MedHouseVal'])
y = train_df['MedHouseVal']

## Cross-validating CatBoost

In [None]:
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
from tqdm import tqdm

# 10-fold cross-validation of CatBoost. Every fold model is kept in `models`
# (reset here) and the per-fold validation RMSE is collected in `rmses`.
rmses = []
models = []
kf = KFold(n_splits=10, random_state=42, shuffle=True)

# kf.split is a generator without a length; `total` lets tqdm show fold progress.
for train_index, val_index in tqdm(kf.split(X), total=kf.get_n_splits()):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    model = CatBoostRegressor(iterations=15_000, loss_function='RMSE')
    # The validation fold doubles as the early-stopping set.
    model.fit(X_train, y_train, eval_set=(X_val, y_val), cat_features=['admin2'],
              early_stopping_rounds=1500, use_best_model=True, verbose=5000)
    pred = model.predict(X_val)

    models.append(model)
    # RMSE as sqrt(MSE): the `squared=False` keyword is deprecated and later
    # removed in scikit-learn, while this form works on every version.
    rmses.append(np.sqrt(mean_squared_error(y_val, pred)))

# Reported value is the mean of the per-fold RMSEs.
print(f'Total RMSE: {np.mean(rmses)}')

In [None]:
# Total RMSE: 0.5090353897181753

In [None]:
# 1.0 basic additional features: 
# 2.0 basic features + distance to cities: 
# 3.0 basic feats + dist to cities + dist to coast: 

## CatBoost feature importances

In [None]:
# Average the feature importances over the CatBoost fold models only. Models
# are selected by type and the vector length follows the data, so the cell
# does not depend on hard-coded counts or on the position of models in the list.
cb_models = [m for m in models if isinstance(m, CatBoostRegressor)]
feature_names = X.columns
feature_importances = np.mean([m.get_feature_importance() for m in cb_models], axis=0)

for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
    print('{}: {}'.format(name, score))

## Cross-validating LightGBM

In [None]:
from lightgbm import LGBMRegressor
import lightgbm as lgbm

# LightGBM hyper-parameters forwarded to LGBMRegressor as keyword arguments.
params= {
 'lambda_l1': 1.945,
 'num_leaves': 87,
 'feature_fraction': 0.79,
 'bagging_fraction': 0.93,
 'bagging_freq': 4,
 'min_data_in_leaf': 103,
 'max_depth': 17,
}

# 10-fold cross-validation of LightGBM; fold models are appended to the shared
# `models` list after the ones already stored there.
rmses = []
kf = KFold(n_splits=10, random_state=42, shuffle=True)
# Convert 'admin2' to the pandas category dtype before fitting.
X['admin2'] = X['admin2'].astype('category')

# kf.split is a generator without a length; `total` lets tqdm show fold progress.
for train_index, val_index in tqdm(kf.split(X), total=kf.get_n_splits()):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    model = LGBMRegressor(learning_rate=0.01, n_estimators=15_000, metric='rmse', **params)
    # The validation fold doubles as the early-stopping set.
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], categorical_feature=['admin2'],
              callbacks=[lgbm.early_stopping(1500, verbose=True)])
    pred = model.predict(X_val)

    models.append(model)
    # RMSE as sqrt(MSE): the `squared=False` keyword is deprecated and later
    # removed in scikit-learn, while this form works on every version.
    rmses.append(np.sqrt(mean_squared_error(y_val, pred)))

# Reported value is the mean of the per-fold RMSEs.
print(f'Total RMSE: {np.mean(rmses)}')

## LightGBM feature importances

In [None]:
# Average the feature importances over the LightGBM fold models only.
# The accumulator length follows the number of feature columns (a fixed-size
# zero vector fails to broadcast when the feature count differs), and models
# are picked by type instead of by their position in `models`.
lgbm_models = [m for m in models if isinstance(m, LGBMRegressor)]
feature_names = X.columns
feature_importances = np.mean([m.feature_importances_ for m in lgbm_models], axis=0)

for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
    print('{}: {}'.format(name, score))

## Cross-validating XGBoost

In [None]:
from xgboost import XGBRegressor

# 10-fold cross-validation of XGBoost; fold models are appended to the shared
# `models` list after the ones already stored there.
rmses = []
kf = KFold(n_splits=10, random_state=42, shuffle=True)
# 'admin2' must be a pandas category column for enable_categorical=True.
X['admin2'] = X['admin2'].astype('category')

# XGBoost hyper-parameters forwarded to XGBRegressor as keyword arguments.
params= {
    'max_depth': 9,
    'colsample_bytree': 0.66,
    'subsample': 0.9,
    'min_child_weight': 22,
    'reg_lambda': 16,
    'tree_method': 'hist'
}

# kf.split is a generator without a length; `total` lets tqdm show fold progress.
for train_index, val_index in tqdm(kf.split(X), total=kf.get_n_splits()):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # early_stopping_rounds is passed to the constructor: passing it to fit()
    # is deprecated and no longer accepted by newer xgboost releases.
    model = XGBRegressor(learning_rate=0.01, n_estimators=15_000,
                         eval_metric='rmse', enable_categorical=True,
                         early_stopping_rounds=1500, **params)
    # The validation fold doubles as the early-stopping set.
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=5000)
    pred = model.predict(X_val)

    models.append(model)
    # RMSE as sqrt(MSE): the `squared=False` keyword is deprecated and later
    # removed in scikit-learn, while this form works on every version.
    rmses.append(np.sqrt(mean_squared_error(y_val, pred)))

# Reported value is the mean of the per-fold RMSEs.
print(f'Total RMSE: {np.mean(rmses)}')

In [None]:
# Total RMSE: 0.5072187072363529

In [None]:
# 1.0 basic additional features: 0.5095460287618219
# 2.0 basic features + distance to cities: 
# 3.0 basic feats + dist to cities + dist to coast: 

## XGBoost feature importances

In [None]:
# Average the feature importances over the XGBoost fold models only.
# Selecting by type keeps LightGBM models (also stored after index 10 in
# `models`) out of the average, and the divisor follows the number of models
# actually used.
xgb_models = [m for m in models if isinstance(m, XGBRegressor)]
feature_names = X.columns
feature_importances = np.mean([m.feature_importances_ for m in xgb_models], axis=0)

for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
    print('{}: {}'.format(name, score))

## Feedforward NN

In [None]:
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization
from keras import backend as K
from sklearn.preprocessing import MinMaxScaler


def root_mean_squared_error(y_true, y_pred):
    """Loss built from Keras backend ops: sqrt of the mean squared difference
    between `y_pred` and `y_true`."""
    squared_error = K.square(y_pred - y_true)
    return K.sqrt(K.mean(squared_error))
    

def create_nn_model(input_dim=18):
    """Build and compile the feed-forward regression network.

    Architecture: three Dense(512, relu) layers followed by a single linear
    output unit, compiled with the Adam optimizer and the
    `root_mean_squared_error` loss.

    Parameters
    ----------
    input_dim : int, default 18
        Number of input features; pass the column count of the matrix the
        model will be fitted on.

    Returns
    -------
    The compiled Sequential model.
    """
    model = Sequential()

    # Only the first layer declares the input width.
    model.add(Dense(512, activation='relu', input_shape=(input_dim,)))
    model.add(Dense(512, activation='relu'))
    model.add(Dense(512, activation='relu'))
    model.add(Dense(1))

    model.compile(optimizer='adam',loss=root_mean_squared_error)

    return model

In [None]:
# models = []
# rmses = []
# kf = KFold(n_splits=10, random_state=42, shuffle=True)

# for train_index, val_index in tqdm(kf.split(X)):
#     X_train, X_val = X.iloc[train_index], X.iloc[val_index]
#     y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    
#     minmax = MinMaxScaler()
#     X_train = minmax.fit_transform(X_train)
#     X_val = minmax.transform(X_val)
    
#     model = create_nn_model()
#     model.fit(X_train, y_train, batch_size=128, epochs=100, validation_data=(X_val, y_val))
#     pred = model.predict(X_val)
    
#     models.append(model)
#     rmses.append(mean_squared_error(y_val, pred, squared=False))

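Unfortunately, the neural network didn't perform best, so for the final submission let's ensemble only the gradient-boosted models trained above (the submission cell averages every model stored in `models`).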

## Submission

In [None]:
# test_df['admin2'] = test_df['admin2'].astype('category')

# cb_pred = np.mean(np.stack([model.predict(test_df) for model in models[:10]]), axis=0)
# lgbm_pred = np.mean(np.stack([model.predict(test_df) for model in models[10:20]]), axis=0)
# xgb_pred = np.mean(np.stack([model.predict(test_df) for model in models[20:]]), axis=0)

In [None]:
# test_pred = 0.45 * xgb_pred + 0.55 * cb_pred

In [None]:
# Encode the test 'admin2' column with exactly the training categories.
# Category codes are positional, so independently inferred category sets could
# map the same label to different codes for models that consume the codes.
# Labels not seen in training are mapped to 'Other' (they become missing if
# 'Other' is not among the training categories).
train_categories = pd.Categorical(X['admin2']).categories
test_admin2 = test_df['admin2'].astype(object)
test_admin2 = test_admin2.where(test_admin2.isin(train_categories), 'Other')
test_df['admin2'] = pd.Categorical(test_admin2, categories=train_categories)

# Average the predictions of every stored model, feeding the features in the
# same column order that was used for training.
test_pred = np.mean(np.stack([model.predict(test_df[X.columns]) for model in models]), axis=0)
submission = pd.DataFrame(data={'id': submission_df.id, 'MedHouseVal': test_pred})
# Clip to the target range seen in training; assigning the result back avoids
# in-place mutation through attribute access, which may act on a copy.
submission['MedHouseVal'] = submission['MedHouseVal'].clip(0, train_df['MedHouseVal'].max())

In [None]:
# Write the submission file to the working directory without the row index.
submission.to_csv(path_or_buf='submission_cb_xgb_additional_basic_features.csv', index=False)