In [None]:
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import warnings
import glob
import math
import json
import sys
import os

warnings.filterwarnings("ignore")

from sklearn.datasets import fetch_california_housing

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

from sklearn.metrics.pairwise import haversine_distances
from sklearn.metrics import mean_squared_error

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA

import lightgbm as lgbm
import xgboost as xgb
import catboost

%matplotlib inline

In [None]:
# Kaggle input locations: competition files and the US cities lookup table.
data_dir = "/kaggle/input/playground-series-s3e1/"
city_dir = "/kaggle/input/uscities/"

# When True, a 20% hold-out split is carved out and scored by the evaluation
# cells; when False, all rows are used for training and a submission is written.
validation = False

# Switches selecting which model families are trained and ensembled.
XGB = True
LGBM = True
CATBOOST = True

# Number of K-fold splits used by each model's cross-validation loop.
FOLDS = 10

# List the files available in the competition data directory.
print(os.listdir(data_dir))

In [None]:
# Read the three competition CSV files from the input directory.
df_train, df_test, df_sample_submission = (
    pd.read_csv(os.path.join(data_dir, file_name))
    for file_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [None]:
# Fetch the original California housing dataset and turn it into a frame whose
# last column, MedHouseVal, holds the target.
original = fetch_california_housing()

df_original = pd.DataFrame(
    original.data,
    columns = original.feature_names
).assign(MedHouseVal = original.target)

In [None]:
# Feature engineering functions.

# Columns kept from the US cities table.
columns = ["city", "lat", "lng", "density", "state_name", "population"]

# Californian cities with a population above 500,000; these are the reference
# points for the per-city distance features.
df_cities = (
    pd.read_csv(os.path.join(city_dir, "uscities.csv"))[columns]
    .loc[lambda cities: cities["state_name"] == "California"]
    .reset_index(drop = True)
    .loc[lambda cities: cities["population"] > 500_000]
)

def compute_distance(loc1, loc2):
    """Return the haversine distance between two points, scaled by 6371.

    Both points are sequences of two angles in degrees; they are converted to
    radians and handed to ``haversine_distances``. The resulting angular
    distance is multiplied by ``6371000 / 1000`` (presumably the Earth's mean
    radius in kilometres).

    NOTE(review): scikit-learn documents the first coordinate of each point as
    the latitude and the second as the longitude -- confirm callers pass the
    coordinates in that order.
    """
    sphere_radius = 6371000 / 1000

    points_rad = [
        [math.radians(angle) for angle in loc1],
        [math.radians(angle) for angle in loc2],
    ]

    # The pairwise matrix is 2x2; entry [0][1] is the loc1 -> loc2 distance.
    pairwise = haversine_distances(points_rad)

    return (pairwise * sphere_radius)[0][1]

def add_distance_features(df):
    """Add one ``to_<city>`` column per city in ``df_cities`` holding the distance to it.

    Coordinates are passed to ``haversine_distances`` as (latitude, longitude)
    pairs in radians, which is the order that function expects. The previous
    implementation passed (longitude, latitude), which does not yield
    great-circle distances. Each column is computed with one vectorised call
    instead of a Python-level call per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``Latitude`` and ``Longitude`` columns in degrees. It is
        modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the distance columns added.
    """
    # Sphere radius used to turn the angular distance into a length
    # (6371, i.e. kilometres for the Earth's mean radius).
    earth_radius_km = 6371000 / 1000

    # (n_rows, 2) array of [latitude, longitude] in radians.
    points_rad = np.radians(df[["Latitude", "Longitude"]].to_numpy(dtype = float))

    # One row per city name; the first occurrence wins, as before.
    cities = df_cities.drop_duplicates(subset = "city")[["city", "lat", "lng"]]

    for city, lat, lng in tqdm(cities.itertuples(index = False, name = None), total = len(cities)):
        city_rad = np.radians([[lat, lng]])

        # Result has shape (n_rows, 1): distance of every row to this city.
        df[f"to_{city}"] = haversine_distances(points_rad, city_rad)[:, 0] * earth_radius_km

    return df

def exp_features(df):
    """Add two sinusoidal embedding columns, ``exp_latlon1`` and ``exp_latlon2``.

    A 20-frequency embedding is built for both coordinates (frequencies grow
    geometrically from 1 towards 1e6, even slots use cosine and odd slots use
    sine), then flattened to 40 columns per row. Only the first two flattened
    columns are kept, and both belong to ``Latitude``:
    ``exp_latlon1 = cos(Latitude)`` and ``exp_latlon2 = sin(Latitude * step)``,
    where ``step = 1e6 ** (1 / 20)``.

    The frame is modified in place and returned.
    """
    emb_size = 20
    precision = 1e6

    # Geometric ladder of frequencies: step ** 0, step ** 1, ..., step ** 19.
    step = np.exp(np.log(precision) / emb_size)
    frequencies = (step ** np.arange(emb_size)).reshape(1, 1, emb_size)

    # Shape (n_rows, 2, emb_size): every coordinate scaled by every frequency.
    scaled = df[["Latitude", "Longitude"]].to_numpy()[..., np.newaxis] * frequencies

    scaled[..., 0::2] = np.cos(scaled[..., 0::2])
    scaled[..., 1::2] = np.sin(scaled[..., 1::2])

    flattened = scaled.reshape(-1, 2 * emb_size)

    df["exp_latlon1"] = list(flattened[:, 0])
    df["exp_latlon2"] = list(flattened[:, 1])

    return df

def distance_coastline_feature(df, path = r"/kaggle/input/coastlinedata/try.json", chunk_size = 256):
    """Add ``dist_to_cstl``: the shortest Euclidean distance (in degrees) to the coastline.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``Latitude`` and ``Longitude`` columns. Modified in place.
    path : str, optional
        GeoJSON-like file whose ``features[i]["geometry"]["coordinates"]``
        hold (longitude, latitude) pairs. Defaults to the Kaggle coastline file
        (https://earthworks.stanford.edu/catalog/stanford-vx275xn8886, @kaivanbrunt).
    chunk_size : int, optional
        Number of rows of ``df`` compared against the coastline at once; bounds
        the size of the temporary (chunk_size x n_coastline_points) arrays.
        Must be at least 1.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the ``dist_to_cstl`` column added.
    """
    with open(path, "r") as f:
        features = json.load(f)["features"]

    # Flatten every feature's coordinates to rows of (longitude, latitude).
    # Reshaping per feature keeps this working when features hold different
    # numbers of points.
    cstl_coords = np.concatenate([
        np.asarray(feature["geometry"]["coordinates"], dtype = float).reshape(-1, 2)
        for feature in features
    ])

    # Ignore incomplete coastline points, matching the NaN-skipping minimum of
    # the previous pandas implementation.
    cstl_coords = cstl_coords[~np.isnan(cstl_coords).any(axis = 1)]

    cstl_lon = cstl_coords[:, 0]
    cstl_lat = cstl_coords[:, 1]

    lat = df["Latitude"].to_numpy(dtype = float)
    lon = df["Longitude"].to_numpy(dtype = float)

    # Maybe find the haversine distance.
    # Finds the shortest distance to the coastline (Euclidian Distance),
    # processing a block of rows at a time to keep memory bounded.
    nearest = np.empty(len(df), dtype = float)
    for start in range(0, len(df), chunk_size):
        rows = slice(start, start + chunk_size)

        d_lat = cstl_lat[np.newaxis, :] - lat[rows, np.newaxis]
        d_lon = cstl_lon[np.newaxis, :] - lon[rows, np.newaxis]

        nearest[rows] = np.sqrt((d_lat ** 2 + d_lon ** 2).min(axis = 1))

    df["dist_to_cstl"] = nearest

    return df

def pca_coords(df):
    """Add ``pca_lat`` and ``pca_lon``: the coordinates projected on their principal axes.

    A two-component PCA is fitted on the ``Latitude``/``Longitude`` columns of
    ``df`` itself and the projection is computed once. The same plain array is
    used for fitting and transforming, so scikit-learn sees consistent input.

    NOTE(review): the PCA is refitted on every frame passed in, so frames
    processed separately (e.g. train and test) are projected with different
    fits. Confirm this is intended.

    The frame is modified in place and returned.
    """
    coordinates = df[["Latitude", "Longitude"]].values

    pca_obj = PCA().fit(coordinates)
    projected = pca_obj.transform(coordinates)

    df["pca_lat"] = projected[:, 0]
    df["pca_lon"] = projected[:, 1]

    return df

def polar_coords(df):
    """Add polar coordinates ``r`` and ``theta`` computed from Latitude/Longitude.

    ``r`` is the Euclidean norm of (Latitude, Longitude) and ``theta`` is
    ``arctan2(Latitude, Longitude)``. The frame is modified in place and returned.
    """
    lat, lon = df["Latitude"], df["Longitude"]

    df["r"] = np.sqrt(lat ** 2 + lon ** 2)
    df["theta"] = np.arctan2(lat, lon)

    return df

def rotate_coords(df):
    """Add the coordinates rotated by 15, 30 and 45 degrees.

    For each angle ``a`` two columns are created:
    ``rot_<a>_x = cos(a) * Longitude + sin(a) * Latitude`` and
    ``rot_<a>_y = cos(a) * Latitude - sin(a) * Longitude``.
    The frame is modified in place and returned.
    """
    for degrees in (15, 30, 45):
        cos_a = np.cos(np.radians(degrees))
        sin_a = np.sin(np.radians(degrees))

        df[f"rot_{degrees}_x"] = (cos_a * df["Longitude"]) + (sin_a * df["Latitude"])
        df[f"rot_{degrees}_y"] = (cos_a * df["Latitude"]) - (sin_a * df["Longitude"])

    return df


In [None]:
def build_features(df):
    """Run every feature-engineering step on ``df`` and return the result.

    The steps are applied in a fixed order; each one receives the frame
    returned by the previous step.
    """
    pipeline = (
        exp_features,
        rotate_coords,
        polar_coords,
        pca_coords,
        distance_coastline_feature,
        add_distance_features,
    )

    for step in pipeline:
        df = step(df)

    return df


# Provenance flag: 0 for rows of the original dataset, 1 for competition rows.
df_original["is_generated"] = 0
df_test["is_generated"] = 1
df_train["is_generated"] = 1

# Stack the competition training rows (without their id) on top of the
# original dataset and renumber the index.
df_train = pd.concat(
    [df_train.drop(columns = "id"), df_original],
    ignore_index = True
)

# Engineer features for the training frame first, then for the test frame.
df_train, df_test = build_features(df_train), build_features(df_test)

In [None]:
target = "MedHouseVal"

# Model inputs: every training column except the target and the row id.
features = [name for name in df_train.columns if name not in ("MedHouseVal", "id")]

In [None]:
# Split the training frame into model inputs and target.
df_X, df_y = df_train[features], df_train[target]

In [None]:
if not validation:
    # No hold-out: train on everything.
    X_train, y_train = df_X, df_y
else:
    # 80/20 shuffled split with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(
        df_X,
        df_y,
        train_size = 0.8,
        test_size = 0.2,
        random_state = 42,
        shuffle = True
    )

    # Score only on competition rows (is_generated == 1).
    indices = (X_test["is_generated"] == 1)
    X_test, y_test = X_test[indices], y_test[indices]

In [None]:
if XGB:
    # Booster hyperparameters passed straight to xgb.train.
    xgb_params = {
        'max_depth': 9,
        'eta': 0.01,
        'colsample_bytree': 0.66,
        'subsample': 0.76,
        'min_child_weight': 22,
        'lambda': 16, 
        'gamma': 1,

        'tree_method': 'hist',
        'booster': 'gbtree',
        'predictor':'cpu_predictor',
        'seed': 42,
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse'
    }
    
    skf = KFold(
        n_splits = FOLDS, 
        random_state = 1, 
        shuffle = True
    )

    # Loop-invariant: the test matrix is only used to print per-fold test
    # predictions. Selecting `features` explicitly keeps the column set and
    # order identical to the training matrix.
    xgb_test_full = xgb.DMatrix(df_test[features])

    xgb_scores = []
    xgb_models = []
    for i, (train_index, test_index) in enumerate(skf.split(X_train, y_train)):
        valid_x = X_train.iloc[test_index]
        valid_y = y_train.iloc[test_index]
        
        # Validate only on competition rows (is_generated == 1).
        indices = (valid_x["is_generated"] == 1)
        valid_x = valid_x[indices]
        valid_y = valid_y[indices]

        xgb_train = xgb.DMatrix(
            X_train.iloc[train_index], 
            label = y_train.iloc[train_index],
        )

        xgb_valid = xgb.DMatrix(
            valid_x,
            label = valid_y,   
        )

        watchlist = [(xgb_train, "train"), (xgb_valid, "eval")]

        # Train with early stopping on the "eval" set; save_best = True makes
        # the callback hand back the best model rather than the last one.
        model = xgb.train(
            params = xgb_params, 
            dtrain = xgb_train, 
            num_boost_round = 50000,
            evals = watchlist, 
            verbose_eval = 1000,
            callbacks = [
                xgb.callback.EarlyStopping(
                    rounds = 1000,
                    data_name = "eval",
                    maximize = False,
                    save_best = True
                )
            ]
        )

        val_preds = model.predict(xgb_valid)

        # RMSE as sqrt(MSE): the `squared` keyword of mean_squared_error is
        # deprecated and absent from recent scikit-learn releases.
        val_score = np.sqrt(mean_squared_error(valid_y, val_preds))
        
        print(model.predict(xgb_test_full))

        xgb_scores.append(val_score)
        xgb_models.append(model)

        print(f"Score: {val_score}")
        print("-" * 10)

In [None]:
if validation and XGB:
    # Score only on competition rows (is_generated == 1).
    indices = (X_test["is_generated"] == 1)
    X_test = X_test[indices]
    y_test = y_test[indices]
    
    xgb_test = xgb.DMatrix(
        X_test
    )

    # Average the hold-out predictions of all fold models.
    predictions = []
    for model in xgb_models:
        predictions.append(model.predict(xgb_test))

    # RMSE as sqrt(MSE): the `squared` keyword of mean_squared_error is
    # deprecated and absent from recent scikit-learn releases.
    print(np.sqrt(mean_squared_error(y_test, np.mean(predictions, axis = 0))))

# 0.5380623336060028

In [None]:
if LGBM:
    skf = KFold(
        n_splits = FOLDS, 
        random_state = 1, 
        shuffle = True
    )
    
    # Hyperparameters passed to LGBMRegressor.
    params = {
        'n_estimators': 1000, 
        'reg_lambda': 0.8435272531761764, 
        'reg_alpha': 0.0047770992003183695, 
        'colsample_bytree': 0.5, 
        'learning_rate': 0.01, 
        'subsample': 0.8, 
        'max_depth': 100, 
        'min_child_samples': 194, 
        'num_leaves': 894
    }

    lgbm_scores = []
    lgbm_models = []
    for i, (train_index, test_index) in enumerate(skf.split(X_train, y_train)):
        valid_x = X_train.iloc[test_index]
        valid_y = y_train.iloc[test_index]
        
        # Validate only on competition rows (is_generated == 1).
        indices = (valid_x["is_generated"] == 1)
        valid_x = valid_x[indices]
        valid_y = valid_y[indices]

        model = lgbm.LGBMRegressor(**params)

        # Early stopping is requested through the callback API: the
        # `early_stopping_rounds` and `verbose` arguments of fit() were
        # removed in LightGBM 4. verbose = False keeps the callback quiet.
        model.fit(
            X_train.iloc[train_index], 
            y_train.iloc[train_index],
            eval_set=[(
                valid_x,
                valid_y 
            )],
            callbacks = [
                lgbm.early_stopping(stopping_rounds = 100, verbose = False)
            ]
        )
        
        preds = model.predict(valid_x)
        
        # RMSE as sqrt(MSE): the `squared` keyword of mean_squared_error is
        # deprecated and absent from recent scikit-learn releases.
        rmse = np.sqrt(mean_squared_error(valid_y, preds))
        
        print(rmse)
        
        lgbm_scores.append(rmse)
        lgbm_models.append(model)

In [None]:
if validation and LGBM:
    # Score only on competition rows (is_generated == 1).
    indices = (X_test["is_generated"] == 1)
    X_test = X_test[indices]
    y_test = y_test[indices]
    
    # Average the hold-out predictions of all fold models.
    predictions = []
    for model in lgbm_models:
        predictions.append(model.predict(X_test))
        
    # RMSE as sqrt(MSE): the `squared` keyword of mean_squared_error is
    # deprecated and absent from recent scikit-learn releases.
    rmse = np.sqrt(
        mean_squared_error(
            y_test, 
            np.mean(predictions, axis = 0)
        )
    )
    
    print(rmse)
    
# 0.5424722699766628 | Hyperparameter opt with 5 folds. 
# | 0.5383166352453341

In [None]:
if CATBOOST:
    # Constructor arguments for CatBoostRegressor.
    # NOTE(review): early_stopping_rounds and verbose are given here AND again
    # in fit() below (early_stopping_rounds is 1000 here but 100 there) --
    # confirm which value is intended.
    params = {
        "random_seed": 1234,    
        "iterations": 15000,
        "early_stopping_rounds": 1000,
        "use_best_model": True,
        "eval_metric": 'RMSE',
        "verbose": 1000
    }
    
    skf = KFold(
        n_splits = FOLDS, 
        random_state = 1, 
        shuffle = True
    )

    catboost_scores = []
    catboost_models = []
    for i, (train_index, test_index) in enumerate(skf.split(X_train, y_train)):
        valid_x = X_train.iloc[test_index]
        valid_y = y_train.iloc[test_index]
        
        # Validate only on competition rows (is_generated == 1).
        indices = (valid_x["is_generated"] == 1)
        valid_x = valid_x[indices]
        valid_y = valid_y[indices]

        model = catboost.CatBoostRegressor(**params)
        model.fit(
            X_train.iloc[train_index], 
            y_train.iloc[train_index],
            eval_set=[(
                valid_x,
                valid_y 
            )],
            early_stopping_rounds = 100,
            verbose = 1000
        )
        
        preds = model.predict(valid_x)
        
        # RMSE as sqrt(MSE): the `squared` keyword of mean_squared_error is
        # deprecated and absent from recent scikit-learn releases.
        rmse = np.sqrt(mean_squared_error(valid_y, preds))
        
        print(rmse)
        
        catboost_scores.append(rmse)
        catboost_models.append(model)

In [None]:
if validation and CATBOOST:
    # Score only on competition rows (is_generated == 1).
    indices = (X_test["is_generated"] == 1)
    X_test = X_test[indices]
    y_test = y_test[indices]
    
    # Average the hold-out predictions of all fold models.
    predictions = []
    for model in catboost_models:
        predictions.append(model.predict(X_test))
        
    # RMSE as sqrt(MSE): the `squared` keyword of mean_squared_error is
    # deprecated and absent from recent scikit-learn releases.
    rmse = np.sqrt(
        mean_squared_error(
            y_test, 
            np.mean(predictions, axis = 0)
        )
    )
    
    print(rmse)
    
# 0.5424722699766628 | Hyperparameter opt with 5 folds. 
# 0.5418564539103299

In [None]:
if validation and (CATBOOST and XGB and LGBM):
    # Score only on competition rows (is_generated == 1).
    indices = (X_test["is_generated"] == 1)
    X_test = X_test[indices]
    y_test = y_test[indices]
    
    # Pool the hold-out predictions of every fold model from all three
    # families; each individual model gets equal weight in the mean below.
    predictions = []
    for model in catboost_models:
        predictions.append(model.predict(X_test))
    
    for model in xgb_models:
        predictions.append(
            model.predict(
                xgb.DMatrix(X_test)
            )
        )
    
    for model in lgbm_models:
        predictions.append(model.predict(X_test))
        
    # RMSE as sqrt(MSE): the `squared` keyword of mean_squared_error is
    # deprecated and absent from recent scikit-learn releases.
    rmse = np.sqrt(
        mean_squared_error(
            y_test, 
            np.mean(predictions, axis = 0)
        )
    )
    
    print(rmse)
    
# 0.5424722699766628 | Hyperparameter opt with 5 folds. 

In [None]:
if not validation:
    # One entry per model family: the mean test prediction of its fold models.
    predictions = []

    # Select the model inputs by name instead of "everything except id": the
    # submission cell adds a MedHouseVal column to df_test, which would
    # otherwise leak into the inputs when this cell is re-run.
    test_features = df_test[features]
    
    if XGB:
        xgb_test = xgb.DMatrix(
            test_features
        )

        xgb_predictions = []
        for model in xgb_models:
            pred = model.predict(xgb_test)
            
            print(pred[-5:])
            
            xgb_predictions.append(pred)
        
        predictions.append(np.mean(xgb_predictions, axis = 0))
    
    if LGBM:
        lgbm_predictions = []
        for model in lgbm_models:
            pred = model.predict(
                test_features
            )
            
            print(pred[-5:])
            
            lgbm_predictions.append(pred)
        
        predictions.append(np.mean(lgbm_predictions, axis = 0))
        
    if CATBOOST:
        catboost_predictions = []
        for model in catboost_models:
            pred = model.predict(
                test_features
            )
            
            print(pred[-5:])
            
            catboost_predictions.append(pred)
        
        predictions.append(np.mean(catboost_predictions, axis = 0))

In [None]:
if not validation:
    # Equal-weight blend of the per-family mean predictions.
    preds = np.mean(predictions, axis = 0)
    
    print(preds)
    
    df_test["MedHouseVal"] = preds
        
    # Take an explicit copy so the clipping below operates on an independent
    # frame rather than on a slice of df_test.
    submission = df_test[["id", "MedHouseVal"]].copy()
    
    # https://www.kaggle.com/competitions/playground-series-s3e1/discussion/376396
    # Assign the clipped column back: an in-place clip on `submission.MedHouseVal`
    # is not guaranteed to modify `submission` itself.
    submission["MedHouseVal"] = submission["MedHouseVal"].clip(0, 5)

    submission.to_csv("submission.csv", index = False)
    
    print(submission.head())
    print(submission.tail())