In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from lightgbm.sklearn import LGBMRegressor
import lightgbm as lgbm
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer, RobustScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.ensemble import StackingRegressor, RandomForestRegressor
from sklearn.datasets import fetch_california_housing
from tqdm import tqdm
import xgboost as xgb
from catboost import CatBoostRegressor
from scipy.spatial.distance import euclidean, cityblock
from sklearn.metrics.pairwise import cosine_similarity, haversine_distances
from colorama import Fore, Back, Style

from plotly.offline import plot, iplot, init_notebook_mode
import plotly.graph_objs as go
init_notebook_mode(connected=True)

### Loading the data + sklearn data

In [None]:
# Load the competition train/test tables from the Kaggle input directory.
# The commented-out lines are a disabled experiment that pulled in the original
# scikit-learn California housing data and tagged rows with an "is_generated" flag.
#ch_X, ch_y = fetch_california_housing(as_frame = True, return_X_y = True)
train = pd.read_csv("/kaggle/input/playground-series-s3e1/train.csv")
test = pd.read_csv("/kaggle/input/playground-series-s3e1/test.csv")
#train["is_generated"] = True
#test["is_generated"] = True

In [None]:
# Working copy of the training data plus an "is_train" marker on both tables,
# so they can be concatenated for shared feature engineering and split again later.
# (Disabled experiment: merging the original California housing rows.)
#ch = pd.concat([ch_X, ch_y], axis = 1)
#ch["is_generated"] = False
#df_ch = pd.concat([train, ch], ignore_index = True).drop(columns = ["id"])
df_ch = train.assign(is_train = 1)
test = test.assign(is_train = 0)

### Outliers

In [None]:
# Remove extreme rows: AveRooms > 100 first, then AveOccup > 250.
# NOTE: reset_index() is called WITHOUT drop=True on purpose here. The first call
# stores the old index in an "index" column and the second one in "level_0";
# both helper columns are removed again when train and test are concatenated.
rooms_outlier = df_ch["AveRooms"] > 100
df_ch_rooms = df_ch[rooms_outlier]
df_ch = df_ch[~rooms_outlier].reset_index()

occup_outlier = df_ch["AveOccup"] > 250
df_ch_occup = df_ch[occup_outlier]
df_ch = df_ch[~occup_outlier].reset_index()

# One box plot per ratio feature to eyeball what is left after the cut.
for col in ("AveRooms", "AveBedrms", "AveOccup"):
    ax = df_ch[col].plot.box()
    plt.show()

### Feature Engineering

In [None]:
import geopandas as gpd

# Download the 2022 TIGER/Line coastline shapefile and keep the Pacific segments.
coastline = gpd.read_file('https://www2.census.gov/geo/tiger/TIGER2022/COASTLINE/tl_2022_us_coastline.zip')
pacific_coastline = coastline[coastline.NAME == 'Pacific'].reset_index(drop=True)

# Flatten the coordinate sequences of all geometries into one array
# (column 0 is filtered as longitude, column 1 as latitude below).
points = np.array([xy for geom in pacific_coastline['geometry'] for xy in geom.coords])
points = np.unique(points, axis=0)  # eliminate redundancy

# Keep only the vertices inside the bounding box.
lon_bounds = (-125, -115)
lat_bounds = (32, 42)
inside_lon = (points[:, 0] >= lon_bounds[0]) & (points[:, 0] <= lon_bounds[1])
inside_lat = (points[:, 1] >= lat_bounds[0]) & (points[:, 1] <= lat_bounds[1])
points = points[inside_lon & inside_lat]
print(F'Number of points = {len(points)}')

# Visual sanity check: coastline vertices drawn over a low-resolution USA outline.
# NOTE(review): gpd.datasets was deprecated and later removed in newer geopandas
# releases - confirm the environment's geopandas version still ships it.
countries = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
usa = countries[countries.name == 'United States of America']
usa.plot(figsize=(16, 16), color='lightgray')
plt.plot(points[:, 0], points[:, 1], 'c.')
plt.axis([*lon_bounds, *lat_bounds])
plt.show()

In [None]:
# Stack train and test so geographic features are computed once for both.
# "index" and "level_0" are leftovers of the two reset_index() calls in the
# outlier cell; errors="ignore" keeps this cell working if that cell is skipped
# or later changed to reset_index(drop=True).
full_df = pd.concat([df_ch, test], ignore_index = True).drop(
    columns = ["index", "level_0"], errors = "ignore")
full_df

In [None]:
from scipy.spatial import cKDTree

def min_distance_to_coastline(df, lon_col, lat_col, coastline_lons, coastline_lats):
    '''Compute minimum distance to coastline for any given Longitude and Latitude.

    Parameters
    ----------
    df : DataFrame holding the query locations.
    lon_col, lat_col : names of the longitude / latitude columns in ``df``.
    coastline_lons, coastline_lats : equal-length sequences with the
        coordinates of the coastline vertices.

    Returns
    -------
    DataFrame with a single ``min_distance`` column, indexed like ``df`` so it
    can be joined back without relying on row order. The distance is the plain
    Euclidean distance in the units of the input coordinates (no geodesic
    correction is applied).
    '''
    lons = df[lon_col].to_numpy()
    lats = df[lat_col].to_numpy()

    # Nearest-neighbour lookup against all coastline vertices.
    tree = cKDTree(np.column_stack((coastline_lons, coastline_lats)))
    # query() already returns the Euclidean distance to the nearest vertex,
    # so there is no need to recompute it from the returned indices.
    distances, _ = tree.query(np.column_stack((lons, lats)))

    return pd.DataFrame({'min_distance': distances}, index=df.index)

In [None]:
min_distances = min_distance_to_coastline(full_df, "Longitude", "Latitude", points[:,0], points[:,1])
# Assign by position (rows come back in the same order as full_df). Using
# assign() instead of concat(axis=1) means re-running this cell overwrites the
# column rather than appending a second "min_distance" column.
full_df = full_df.assign(min_distance = min_distances["min_distance"].to_numpy())

In [None]:
# Split the combined frame back into its train and test parts using the marker
# column, then discard the marker. The test part also loses the (empty) target
# column and gets a fresh 0..n-1 index.
train_rows = full_df["is_train"] == 1
test_rows = full_df["is_train"] == 0

df_ch = full_df[train_rows].drop(columns = "is_train")
test = (
    full_df[test_rows]
    .drop(columns = ["is_train", "MedHouseVal"])
    .reset_index(drop = True)
)
test["id"] = test["id"].astype(int)

In [None]:
def add_r_theta(dfs):
    '''Add polar-coordinate features to every frame in ``dfs`` (in place).

    For each frame, ``r`` is sqrt(Latitude**2 + Longitude**2) and ``theta`` is
    arctan2(Latitude, Longitude). Nothing is returned.
    '''
    for frame in dfs:
        lat = frame['Latitude']
        lon = frame['Longitude']
        frame['r'] = np.sqrt(lat**2 + lon**2)
        frame['theta'] = np.arctan2(lat, lon)
    
# Adds the "r" and "theta" columns to both frames in place (no return value).
add_r_theta([df_ch, test])

In [None]:
# Fit the PCA on the training coordinates only; the same fitted object is
# applied to the test set afterwards.
coords = df_ch.loc[:, ['Latitude', 'Longitude']].to_numpy()
pca = PCA()
pca.fit(coords)

In [None]:
def add_pca(df, pca):
    '''Add the first two PCA components of (Latitude, Longitude) to ``df`` in place.

    ``pca`` is an already fitted transformer exposing ``transform``; its output
    columns 0 and 1 are stored as ``pca_x`` and ``pca_y``. Nothing is returned.
    '''
    # Project once and reuse the result for both columns.
    components = pca.transform(df[['Latitude', 'Longitude']].values)
    df['pca_x'] = components[:, 0]
    df['pca_y'] = components[:, 1]

# Project train and test with the same PCA (fitted on the training coordinates).
for frame in (df_ch, test):
    add_pca(frame, pca)

In [None]:
# "pop_bed": log of Population / AveBedrms, shifted by 50 before taking the log.
for frame in (df_ch, test):
    frame["pop_bed"] = np.log(frame["Population"] / frame["AveBedrms"] + 50)

def crt_crds(df):
    '''Add linear mixes of Longitude/Latitude at 15, 30 and 45 degrees.

    Columns ``rot_15_x``, ``rot_15_y``, ``rot_30_x`` and ``rot_45_x`` are written
    to ``df`` in place, and the same frame is returned.

    Note: ``rot_15_y`` adds both terms (cos*Latitude + sin*Longitude), so it is
    not the y-component of a true rotation (that would subtract the sine term).
    It is kept exactly as is because it is used as a model feature.
    '''
    lon = df['Longitude']
    lat = df['Latitude']

    def _mix(angle_deg, first, second):
        # cos(angle) * first + sin(angle) * second
        angle = np.radians(angle_deg)
        return (np.cos(angle) * first) + (np.sin(angle) * second)

    df['rot_15_x'] = _mix(15, lon, lat)
    df['rot_15_y'] = _mix(15, lat, lon)
    df['rot_30_x'] = _mix(30, lon, lat)
    df['rot_45_x'] = _mix(45, lon, lat)
    return df

# crt_crds writes the columns in place and returns the same frame.
df_ch, test = crt_crds(df_ch), crt_crds(test)


def compute_euclidean(row):
    '''Compute the Euclidean distance between the longitude and latitude columns.

    ``row`` is any mapping/Series with scalar "Longitude" and "Latitude" entries.
    For two scalars the Euclidean distance is simply |Longitude - Latitude|.
    It is computed directly because scipy.spatial.distance.euclidean expects
    1-D vectors; passing scalars relied on a deprecated SciPy behaviour that
    raises "Input vector should be 1-D." in newer releases.
    '''
    return abs(row["Longitude"] - row["Latitude"])

# |Longitude - Latitude| for every row. This is the value the row-wise
# euclidean(lon, lat) call produces for two scalars, computed column-wise to
# avoid a slow DataFrame.apply(axis=1) and SciPy's 1-D input requirement.
df_ch["euclidean_long_lat"] = (df_ch["Longitude"] - df_ch["Latitude"]).abs()
test["euclidean_long_lat"] = (test["Longitude"] - test["Latitude"]).abs()

In [None]:
# Distribution of the log-transformed Population / AveBedrms ratio (same formula as "pop_bed").
pop_bed_ratio = df_ch["Population"] / df_ch["AveBedrms"]
ax = np.log(pop_bed_ratio + 50).plot.hist(bins = 500)

### Features & Target

In [None]:
# Model inputs, in the column order used for training and prediction.
# Disabled candidates: 'rot_15_x', "is_generated", 'AveBedrms', 'Population'.
features = [
    # raw census features
    'MedInc',
    'HouseAge',
    'AveRooms',
    # engineered geographic features
    'pca_x',
    'pca_y',
    'r',
    'theta',
    'rot_15_y',
    'rot_30_x',
    "min_distance",
    'rot_45_x',
    "euclidean_long_lat",
    # engineered population feature
    "pop_bed",
    # remaining raw features
    'AveOccup',
    'Latitude',
    'Longitude',
]

# Regression target.
target = 'MedHouseVal'

### Feature Scaling

In [None]:
# Alternatives tried: StandardScaler(), RobustScaler(), Normalizer()
scaler = MinMaxScaler()

# Fit the scaler on the training features only. Keeping df_ch's index makes the
# target assignment below align by label without depending on a 0..n-1 index.
df_ch_s = pd.DataFrame(scaler.fit_transform(df_ch[features]),
                       columns = features, index = df_ch.index)
df_ch_s["MedHouseVal"] = df_ch[target]

# Apply the scaling learned on train to test. Re-fitting on test (fit_transform)
# would scale the two sets with different min/max values.
test_s = pd.DataFrame(scaler.transform(test[features]),
                      columns = features, index = test.index)

### Cross Validation

In [None]:
# 5-fold cross-validation of a CatBoost regressor. The fitted fold models are
# collected in `clfs` (later cells treat the first five entries as CatBoost).
kf = KFold(n_splits = 5, random_state = 0, shuffle = True)
clfs = []
rmses = []

# Hyperparameters are identical for every fold, so build them once.
params = {
    'iterations': 20000,
    'loss_function': 'RMSE',
    'random_seed': 0
}

for i, (train_index, val_index) in tqdm(enumerate(kf.split(df_ch)), total = kf.get_n_splits()): # df_ch_s # _s = scaled

    # KFold yields positional indices, so select rows with .iloc; .loc only
    # works while the frame happens to carry a 0..n-1 index.
    X_train, X_val = df_ch[features].iloc[train_index], df_ch[features].iloc[val_index]
    y_train, y_val = df_ch[target].iloc[train_index], df_ch[target].iloc[val_index]

    clf = CatBoostRegressor(**params)#, task_type = "GPU")

    # Train and evaluate on plain arrays so both sets are described the same way.
    clf.fit(X_train.values, y_train.values, eval_set = [(X_val.values, y_val.values)],
            early_stopping_rounds = 2500, verbose = 2000)

    preds = clf.predict(X_val.values)

    # sqrt(MSE) instead of mean_squared_error(..., squared=False): the `squared`
    # argument is deprecated/removed in newer scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(y_val, preds))
    print(f'{Fore.GREEN}{Style.BRIGHT}RMSE on fold {i}: {rmse}{Style.RESET_ALL}')

    clfs.append(clf)
    rmses.append(rmse)

print(f'{Fore.GREEN}{Style.BRIGHT}mean RMSE across all folds: {np.mean(rmses)}{Style.RESET_ALL}')

In [None]:
import warnings
warnings.filterwarnings('ignore')

# 5-fold cross-validation of an XGBoost regressor on the same folds as CatBoost.
# Later cells read clfs[:5] as the CatBoost models and clfs[5:] as the XGBoost
# models, so drop XGBoost models left over from a previous run of this cell
# before appending new ones.
del clfs[kf.get_n_splits():]
rmses = []

for i, (train_index, val_index) in tqdm(enumerate(kf.split(df_ch)), total = kf.get_n_splits()):

    # KFold yields positional indices, so select rows with .iloc; .loc only
    # works while the frame happens to carry a 0..n-1 index.
    X_train, X_val = df_ch[features].iloc[train_index], df_ch[features].iloc[val_index]
    y_train, y_val = df_ch[target].iloc[train_index], df_ch[target].iloc[val_index]

    # early_stopping_rounds is given to the estimator: passing it to fit() has
    # been deprecated since XGBoost 1.6 and is rejected by newer releases.
    clf = xgb.XGBRegressor(n_estimators=10000,
                           max_depth=9,
                           learning_rate=0.01,
                           colsample_bytree=0.66,
                           subsample=0.9,
                           min_child_weight=22,
                           reg_lambda=16,
                           #tree_method='gpu_hist',
                           early_stopping_rounds=100,
                           seed=1)

    clf.fit(X_train, y_train, eval_set = [(X_val, y_val)], verbose = 200)

    preds = clf.predict(X_val.values)

    # sqrt(MSE) instead of mean_squared_error(..., squared=False): the `squared`
    # argument is deprecated/removed in newer scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(y_val, preds))
    print(f'{Fore.GREEN}{Style.BRIGHT}RMSE on fold {i}: {rmse}{Style.RESET_ALL}')

    clfs.append(clf)
    rmses.append(rmse)

print(f'{Fore.GREEN}{Style.BRIGHT}mean RMSE across all folds: {np.mean(rmses)}{Style.RESET_ALL}')

### Plotting Feature Importance

In [None]:
plt.figure(figsize = (8, 6))

# Sum the feature importances of the first five models in clfs (CatBoost folds).
cat_imp = np.zeros(len(features))
for model in clfs[:5]:
    cat_imp += model.feature_importances_

# Horizontal bars in ascending order of summed importance.
cat_order = np.argsort(cat_imp)
plt.barh([features[j] for j in cat_order], cat_imp[cat_order],
         color = "#33cc33", edgecolor = "#000000")
plt.title("Catboost")
plt.show()

In [None]:
plt.figure(figsize = (8, 6))

# Sum the feature importances of the models after the first five in clfs (XGBoost folds).
xgb_imp = np.zeros(len(features))
for model in clfs[5:]:
    xgb_imp += model.feature_importances_

# Horizontal bars in ascending order of summed importance.
xgb_order = np.argsort(xgb_imp)
plt.barh([features[j] for j in xgb_order], xgb_imp[xgb_order],
         color = "#33cc33", edgecolor = "#000000")
plt.title("XGBoost")
plt.show()

### Blending Models

In [None]:
# One test-set prediction vector per fold model: the first five entries of clfs
# are the CatBoost models, the remaining ones the XGBoost models.
test_matrix = test[features].values

cat_preds = [model.predict(test_matrix) for model in clfs[:5]]
xgb_preds = [model.predict(test_matrix) for model in clfs[5:]]

In [None]:
# Average the per-fold predictions into one vector per model family.
# np.atleast_2d keeps this cell idempotent: on a re-run the names already hold
# the averaged 1-D vectors, which are treated as a single "fold" and returned
# unchanged instead of being collapsed to a scalar.
cat_preds = np.atleast_2d(cat_preds).mean(0)
xgb_preds = np.atleast_2d(xgb_preds).mean(0)

In [None]:
# Weighted blend of the two model families (weights sum to 1).
CAT_WEIGHT, XGB_WEIGHT = 0.4, 0.6
blended_preds = CAT_WEIGHT * cat_preds + XGB_WEIGHT * xgb_preds
blended_preds

In [None]:
submission = pd.DataFrame(data = {'id': test.id, 'MedHouseVal': blended_preds})

# Post-processing of high predictions: each prediction above 4.7 has a 70%
# chance of being scaled up by 10%; afterwards everything is capped at 5.
# A seeded generator makes the random selection, and therefore the submission
# file, identical on every run.
rng = np.random.default_rng(0)
idx = submission.MedHouseVal.gt(4.7)
random_idx = rng.choice([True, False], size = idx.sum(), p = [0.7, 0.3])
selected_idx = idx[idx].index[random_idx]
submission.loc[selected_idx, "MedHouseVal"] = submission.loc[selected_idx, "MedHouseVal"] * 1.1
submission["MedHouseVal"] = submission["MedHouseVal"].clip(upper = 5)
submission.head()

In [None]:
# Distribution of the final (post-processed) predictions.
submission["MedHouseVal"].hist(bins = 100)

In [None]:
# Write the submission file to the working directory, without the DataFrame index column.
submission.to_csv('submission.csv', index=False)