In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import altair as alt
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_files:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

  from pandas import MultiIndex, Int64Index


### Model 1 (clustering + train on entire set)

In [2]:
# Load the competition train/test tables used by Model 1.
# NOTE(review): the Model 2 section further down reads train.csv/test.csv from
# /kaggle/input/widsdatathon2022/ instead — confirm which location is intended.
train_df, test_df = map(pd.read_csv, ('../data/train.csv', '../data/test.csv'))

In [3]:
train_df.columns

Index(['Year_Factor', 'State_Factor', 'building_class', 'facility_type',
       'floor_area', 'year_built', 'energy_star_rating', 'ELEVATION',
       'january_min_temp', 'january_avg_temp', 'january_max_temp',
       'february_min_temp', 'february_avg_temp', 'february_max_temp',
       'march_min_temp', 'march_avg_temp', 'march_max_temp', 'april_min_temp',
       'april_avg_temp', 'april_max_temp', 'may_min_temp', 'may_avg_temp',
       'may_max_temp', 'june_min_temp', 'june_avg_temp', 'june_max_temp',
       'july_min_temp', 'july_avg_temp', 'july_max_temp', 'august_min_temp',
       'august_avg_temp', 'august_max_temp', 'september_min_temp',
       'september_avg_temp', 'september_max_temp', 'october_min_temp',
       'october_avg_temp', 'october_max_temp', 'november_min_temp',
       'november_avg_temp', 'november_max_temp', 'december_min_temp',
       'december_avg_temp', 'december_max_temp', 'cooling_degree_days',
       'heating_degree_days', 'precipitation_inches', 'snowfall_inc

In [4]:
# Model 1 is trained without any State_6 rows.
train_df = train_df[train_df["State_Factor"] != "State_6"]

In [None]:
# Positions 8..43 of the training frame are the 36 monthly temperature columns
# (january_min_temp ... december_max_temp in the column listing above).
temp_cols = train_df.columns[8:44].tolist()

from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import KMeans

model = KMeans()
visualizer = KElbowVisualizer(model, k=(1, 15))

# Temperature profile + id for each split, tagged with the split it came from.
mod_df1 = train_df[temp_cols+["id"]].copy()
mod_df2 = test_df[temp_cols+["id"]].copy()
mod_df1['set'] = 'train'
mod_df2['set'] = 'test'

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# stacks the two frames the same way and keeps the original row labels.
mod_df = pd.concat([mod_df1, mod_df2])

visualizer.fit(mod_df[temp_cols])  # Fit the data to the visualizer
visualizer.show();

In [None]:
# Seed KMeans so the cluster labels are the same after every kernel restart
# (random_state=1 matches the seed used for the imputer in this notebook).
kmeans = KMeans(n_clusters=4, random_state=1)
kmeans.fit(mod_df[temp_cols])

# Attach the cluster label of each train/test row to a copy of the stacked frame.
clusters_df = mod_df.copy()
clusters_df["cluster"] = kmeans.predict(mod_df[temp_cols])
clusters_df

In [None]:
clusters_df.groupby(["set"]).cluster.value_counts(normalize=True)

In [None]:
# Keep only the row id, the train/test marker and the cluster label.
# Selecting the three columns by name gives the same frame as dropping the
# first 36 (temperature) columns, but unlike the positional in-place drop it
# can be re-run without removing the remaining columns.
clusters_df = clusters_df[["id", "set", "cluster"]]
clusters_df

In [None]:
# clusters_df stacks train and test rows. Merge each frame only against the
# rows of its own split so an id that happened to exist in both splits could
# neither duplicate rows nor attach the other split's cluster label.
train_df = train_df.merge(
    clusters_df.loc[clusters_df["set"] == "train"], how='left', on='id'
)
test_df = test_df.merge(
    clusters_df.loc[clusters_df["set"] == "test"], how='left', on='id'
)

In [None]:
test_df

In [None]:
# Making a new column by concating specific columns of df

def make_building_ids(data, columns):
    """Join the string form of several columns into one '_'-separated key.

    Parameters
    ----------
    data : DataFrame holding every column named in `columns`.
    columns : iterable of column names, joined in the given order.

    Returns
    -------
    A Series with one joined string per row of `data`, or None when
    `columns` is empty.
    """
    remaining = iter(columns)
    first = next(remaining, None)
    if first is None:
        return None

    building_ids = data[first].astype(str)
    for name in remaining:
        building_ids += '_' + data[name].astype(str)
    return building_ids

# Columns whose combined values are used as a building identifier.
buidling_columns = [
    'State_Factor',
    'building_class',
    'facility_type',
    'year_built',
    'floor_area',
]

# Add the same building_ID key to both splits.
for frame in (train_df, test_df):
    frame['building_ID'] = make_building_ids(frame, buidling_columns)

In [None]:
train_df

In [None]:
# Regex pattern -> coarse building class. The mapping is applied to
# facility_type with Series.replace(regex=True); key order is kept as authored
# (prefix patterns first, then the single-name facility types).
replace = {
    "Education_.*": "Commercial_Education",
    ".*(f|F)ood_.*": "Commercial_Food",
    "Health_Care_.*": "Commercial_Health",
    "Lodging_.*": "Commercial_Hotel",
    "Office_.*": "Commercial_Office",
    "(Public_|Religious).*": "Commercial_Public",
    "Retail_.*": "Commercial_Retail",
    "Service_.*": "Commercial_Service",
    "Warehouse_.*": "Commercial_Warehouse",
    ".*_Unit_.*": "Residential_Unit",
    "Mixed_Use_.*": "Residential_Mixed",
    "Commercial_Unknown": "Commercial_Other",
    "Data_Center": "Commercial_DC",
    "Industrial": "Commercial_Industrial",
    "Laboratory": "Commercial_Lab",
    "Nursing_Home": "Commercial_Home",
    "Parking_Garage": "Commercial_Parking",
    "Multifamily_Uncategorized": "Residential_Multifamily",
}

# building_class is overwritten with the grouped facility type in both splits.
for frame in (train_df, test_df):
    frame["building_class"] = frame["facility_type"].replace(replace, regex=True)

In [None]:
# Keep rows built between 1845 and 2016 (both inclusive). A missing year_built
# is treated as 2016 for this check only, so those rows are kept as well.
year = train_df["year_built"].fillna(2016)
train_df = train_df.loc[year.between(1845, 2016)]

In [None]:
# Categorical encoding by mean values (each category is replaced by the mean
# site_eui of the training rows that share it).
# Attribution: Inspired from another notebook in the competition
#
# One loop replaces the four copy-pasted blocks: no temporary *_encoded columns
# are written onto the filtered training slice, and Series.map does one lookup
# per row instead of a DataFrame.replace with one dict entry per category
# (building_ID has one entry per distinct building).
# Categories that do not occur in the training frame are left unchanged, so an
# unseen test category keeps its original label, as before.
# NOTE(review): the means are computed on the whole training frame before the
# cross-validation split below, so validation folds contribute to their own
# encodings — confirm this is acceptable.
# month_ID is not encoded: that feature is currently disabled.
for encoded_column in ["State_Factor", "building_class", "facility_type", "building_ID"]:
    category_means = train_df.groupby(encoded_column)["site_eui"].mean()

    encoded_frames = []
    for frame in (train_df, test_df):
        seen_in_train = frame[encoded_column].isin(category_means.index)
        encoded = frame[encoded_column].map(category_means)
        encoded_frames.append(
            frame.assign(**{encoded_column: encoded.where(seen_in_train, frame[encoded_column])})
        )
    train_df, test_df = encoded_frames

In [None]:
# Split the test rows by the Python type left in building_ID after the mean
# encoding above: float values were replaced by a training mean, str values
# were left as the original text key.
building_id_values = test_df["building_ID"].values
filter_num = [isinstance(value, float) for value in building_id_values]
filter_str = [isinstance(value, str) for value in building_id_values]
test_num = test_df.loc[filter_num].copy()
test_str = test_df.loc[filter_str].copy()

In [None]:
test_num.head(5)

### ML Pipeline

In [None]:
# Columns passed to the imputer and the regressors for Model 1
# (month_ID is currently disabled).
features_selected = [
    "facility_type", "building_class",
    "year_built", "floor_area", "energy_star_rating",
    "State_Factor", "building_ID", "Year_Factor",
]
target = "site_eui"

# Every remaining column (apart from the target and Year_Factor) is handed to
# the column transformer's "drop" step.
drop_features = list(
    set(train_df.columns).difference(features_selected, [target], ["Year_Factor"])
)

In [None]:
from sklearn.ensemble import ExtraTreesRegressor

# Extra-trees regressor used by IterativeImputer to estimate missing values.
impute_estimator = ExtraTreesRegressor(n_estimators=200, random_state=1)
feature_imputer = IterativeImputer(estimator=impute_estimator, random_state=1)

# Impute the selected features and discard every column listed in drop_features.
column_transformer = make_column_transformer(
    (feature_imputer, features_selected),
    ("drop", drop_features),
)

In [None]:
def root_mean_squared_error(y_true, y_pred):
    """Return the root mean squared error between `y_true` and `y_pred`."""
    return np.sqrt(mean_squared_error(y_true, y_pred))

# RMSE is an error (lower is better), so the scorer is built with
# greater_is_better=False: scikit-learn then negates the value, and tools that
# maximise a score (cross_validate, searches) prefer the smaller error.
scoring_metric = make_scorer(root_mean_squared_error, greater_is_better=False)
results = {}

# The three boosted-tree regressors combined by the voting ensemble.
# All of them are configured to run on GPU.
catboost_regressor = CatBoostRegressor(
    iterations=5000,
    learning_rate=0.03,
    depth=10,
    l2_leaf_reg=1,
    grow_policy='Lossguide',
    task_type='GPU',
    verbose=0,
)
xgboost_regressor = XGBRegressor(
    n_estimators=5000,
    eta=0.05,
    max_depth=10,
    reg_lambda=1,
    grow_policy='lossguide',
    tree_method='gpu_hist',
)
lightgbm_regressor = LGBMRegressor(
    n_estimators=10000,  # num_iterations
    learning_rate=0.03,
    reg_lambda=5,
    device="gpu",
)

model = VotingRegressor(
    [
        ('cat', catboost_regressor),
        ('xgb', xgboost_regressor),
        ('lgbm', lightgbm_regressor),
    ]
)

# Full pipeline: column selection/imputation followed by the ensemble.
model_pipe = make_pipeline(column_transformer, model)

# Using a manual StratifiedKFold loop (stratified on Year_Factor) instead of cross_validate

In [None]:
from sklearn.model_selection import StratifiedKFold

folds = 5
skf = StratifiedKFold(n_splits=folds)

# Model 1 keeps Year_Factor among the features and also uses it as the
# stratification label for the folds.
X_pre = train_df
year_factor = train_df["Year_Factor"]

y = X_pre["site_eui"]
X = X_pre.drop(columns="site_eui")
predictions = np.zeros(len(test_df))

In [None]:
import timeit

# Per-fold RMSE values; their sums feed the mean printed after the loop.
fold_train_scores = []
fold_valid_scores = []

for fold, (train_index, valid_index) in enumerate(skf.split(X_pre, year_factor), start=1):

    print(f"fold = {fold}, train set size: {len(train_index)}, valid set size: {len(valid_index)}")
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_valid, y_valid = X.iloc[valid_index], y.iloc[valid_index]

    # The timer covers the fit and both prediction passes.
    start = timeit.default_timer()
    model_pipe.fit(X_train, y_train)

    fold_train_score = root_mean_squared_error(y_train, model_pipe.predict(X_train))
    fold_valid_score = root_mean_squared_error(y_valid, model_pipe.predict(X_valid))
    print(f"Train Score: {fold_train_score}")
    print(f"Valid Score: {fold_valid_score}")
    fold_train_scores.append(fold_train_score)
    fold_valid_scores.append(fold_valid_score)

    end = timeit.default_timer()
    print(f"time taken = {round(end-start)} seconds.")
    print("\n")

train_score = sum(fold_train_scores)
valid_score = sum(fold_valid_scores)
print(f"mean train score = {train_score/folds}, mean valid score = {valid_score/folds}")

### Submission

In [None]:
# Model 1 is fitted with Year_Factor in features_selected, so the test frame
# must keep that column; dropping it leaves the column transformer without one
# of the columns it selects when predict is called.
X_test = test_num.copy()

In [None]:
# Refit the pipeline on the complete Model 1 training frame (X, y) before
# predicting the test rows; the trailing ';' suppresses the estimator repr.
model_pipe.fit(X, y);

In [None]:
# Predict site_eui for the Model 1 test rows and pair each value with its id.
model1_predictions = model_pipe.predict(X_test)
submission_df1 = X_test[["id"]].assign(site_eui=model1_predictions)

In [None]:
submission_df1

# Model 2: for test rows whose building ID was not found in the training set

In [None]:
# Reload the raw competition tables so Model 2 starts from unmodified data.
# NOTE(review): Model 1 reads train.csv/test.csv from ../data/ instead —
# confirm which location is intended.
train_df, test_df = map(
    pd.read_csv,
    (
        "/kaggle/input/widsdatathon2022/train.csv",
        "/kaggle/input/widsdatathon2022/test.csv",
    ),
)

In [None]:
# train_df = train_df.query("State_Factor != 'State_6'")

In [None]:
# Positions 8..43 of the training frame are the 36 monthly temperature columns
# (january_min_temp ... december_max_temp in the column listing near the top).
temp_cols = train_df.columns[8:44].tolist()

from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import KMeans

# The elbow estimator gets its own name: `model` is the VotingRegressor built
# in the Model 1 section and should not be rebound to a KMeans instance.
elbow_model = KMeans()
visualizer = KElbowVisualizer(elbow_model, k=(1, 15))

# Temperature profile + id for each split, tagged with the split it came from.
mod_df1 = train_df[temp_cols+["id"]].copy()
mod_df2 = test_df[temp_cols+["id"]].copy()
mod_df1['set'] = 'train'
mod_df2['set'] = 'test'

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# stacks the two frames the same way and keeps the original row labels.
mod_df = pd.concat([mod_df1, mod_df2])

visualizer.fit(mod_df[temp_cols])  # Fit the data to the visualizer
visualizer.show();

In [None]:
# Seed KMeans so the cluster labels (and the cluster mean encoding computed
# from them later) are the same after every kernel restart.
kmeans = KMeans(n_clusters=5, random_state=1)
kmeans.fit(mod_df[temp_cols])

# Attach the cluster label of each train/test row to a copy of the stacked frame.
clusters_df = mod_df.copy()
clusters_df["cluster"] = kmeans.predict(mod_df[temp_cols])
clusters_df

In [None]:
clusters_df.groupby(["set"]).cluster.value_counts(normalize=True)

In [None]:
# Keep only the row id, the train/test marker and the cluster label.
# Selecting the three columns by name gives the same frame as dropping the
# first 36 (temperature) columns, but unlike the positional in-place drop it
# can be re-run without removing the remaining columns.
clusters_df = clusters_df[["id", "set", "cluster"]]
clusters_df

In [None]:
# clusters_df stacks train and test rows. Merge each frame only against the
# rows of its own split so an id that happened to exist in both splits could
# neither duplicate rows nor attach the other split's cluster label.
train_df = train_df.merge(
    clusters_df.loc[clusters_df["set"] == "train"], how='left', on='id'
)
test_df = test_df.merge(
    clusters_df.loc[clusters_df["set"] == "test"], how='left', on='id'
)

In [None]:
# Regex pattern -> coarse building class. The mapping is applied to
# facility_type with Series.replace(regex=True); key order is kept as authored
# (prefix patterns first, then the single-name facility types).
replace = {
    "Education_.*": "Commercial_Education",
    ".*(f|F)ood_.*": "Commercial_Food",
    "Health_Care_.*": "Commercial_Health",
    "Lodging_.*": "Commercial_Hotel",
    "Office_.*": "Commercial_Office",
    "(Public_|Religious).*": "Commercial_Public",
    "Retail_.*": "Commercial_Retail",
    "Service_.*": "Commercial_Service",
    "Warehouse_.*": "Commercial_Warehouse",
    ".*_Unit_.*": "Residential_Unit",
    "Mixed_Use_.*": "Residential_Mixed",
    "Commercial_Unknown": "Commercial_Other",
    "Data_Center": "Commercial_DC",
    "Industrial": "Commercial_Industrial",
    "Laboratory": "Commercial_Lab",
    "Nursing_Home": "Commercial_Home",
    "Parking_Garage": "Commercial_Parking",
    "Multifamily_Uncategorized": "Residential_Multifamily",
}

# building_class is overwritten with the grouped facility type in both splits.
for frame in (train_df, test_df):
    frame["building_class"] = frame["facility_type"].replace(replace, regex=True)

In [None]:
# Keep rows built between 1845 and 2016 (both inclusive). A missing year_built
# is treated as 2016 for this check only, so those rows are kept as well.
year = train_df["year_built"].fillna(2016)
train_df = train_df.loc[year.between(1845, 2016)]

In [None]:
# Mean encoding for Model 2: each category is replaced by the mean site_eui of
# the training rows that share it.
#
# One loop replaces the copy-pasted blocks: no temporary *_encoded columns are
# written onto the filtered training slice, and Series.map does one lookup per
# row instead of a DataFrame.replace with one dict entry per category.
# Categories that do not occur in the training frame are left unchanged, so an
# unseen test category keeps its original label, as before.
# NOTE(review): the means are computed on the whole training frame before the
# cross-validation split below, so validation folds contribute to their own
# encodings — confirm this is acceptable.
# building_ID and month_ID are not encoded here: both are disabled for Model 2.
for encoded_column in ["State_Factor", "building_class", "facility_type", "cluster"]:
    category_means = train_df.groupby(encoded_column)["site_eui"].mean()

    encoded_frames = []
    for frame in (train_df, test_df):
        seen_in_train = frame[encoded_column].isin(category_means.index)
        encoded = frame[encoded_column].map(category_means)
        encoded_frames.append(
            frame.assign(**{encoded_column: encoded.where(seen_in_train, frame[encoded_column])})
        )
    train_df, test_df = encoded_frames

In [None]:
train_df

### ML Pipeline

In [None]:
# Columns passed to the imputer and the regressors for Model 2.
# building_ID, month_ID and cluster are deliberately left out of this list.
features_selected = [
    "facility_type", "building_class",
    "year_built", "floor_area", "energy_star_rating",
    "State_Factor",
]
target = "site_eui"

# Every remaining column (apart from the target and Year_Factor) is handed to
# the column transformer's "drop" step.
drop_features = list(
    set(train_df.columns).difference(features_selected, [target], ["Year_Factor"])
)

In [None]:
from sklearn.ensemble import ExtraTreesRegressor

# Extra-trees regressor used by IterativeImputer to estimate missing values.
impute_estimator = ExtraTreesRegressor(random_state=1, n_estimators=200)

# Impute the Model 2 features and discard every column listed in drop_features.
column_transformer = make_column_transformer(
    (IterativeImputer(random_state=1, estimator=impute_estimator), features_selected),
    ("drop", drop_features)
)

# Rebuild the pipeline around the Model 2 transformer. Without this step
# model_pipe keeps wrapping the transformer created for Model 1 (which selects
# building_ID and Year_Factor), and the feature list defined for Model 2 is
# never used. The ensemble is taken from the existing pipeline, so this does
# not depend on what the name `model` currently refers to.
model_pipe = make_pipeline(column_transformer, model_pipe.steps[-1][1])

In [None]:
from sklearn.model_selection import StratifiedKFold

folds = 5
skf = StratifiedKFold(n_splits=folds)

# For Model 2, Year_Factor is only the stratification label; it is removed
# from the frame the features are taken from.
year_factor = train_df["Year_Factor"]
X_pre = train_df.drop(columns="Year_Factor")

y = X_pre["site_eui"]
X = X_pre.drop(columns="site_eui")
predictions = np.zeros(len(test_df))

In [None]:
import timeit

# Per-fold RMSE values; their sums feed the mean printed after the loop.
fold_train_scores = []
fold_valid_scores = []

for fold, (train_index, valid_index) in enumerate(skf.split(X_pre, year_factor), start=1):

    print(f"fold = {fold}, train set size: {len(train_index)}, valid set size: {len(valid_index)}")
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_valid, y_valid = X.iloc[valid_index], y.iloc[valid_index]

    # The timer covers the fit and both prediction passes.
    start = timeit.default_timer()
    model_pipe.fit(X_train, y_train)

    fold_train_score = root_mean_squared_error(y_train, model_pipe.predict(X_train))
    fold_valid_score = root_mean_squared_error(y_valid, model_pipe.predict(X_valid))
    print(f"Train Score: {fold_train_score}")
    print(f"Valid Score: {fold_valid_score}")
    fold_train_scores.append(fold_train_score)
    fold_valid_scores.append(fold_valid_score)

    end = timeit.default_timer()
    print(f"time taken = {round(end-start)} seconds.")
    print("\n")

train_score = sum(fold_train_scores)
valid_score = sum(fold_valid_scores)
print(f"mean train score = {train_score/folds}, mean valid score = {valid_score/folds}")

#### Submission

In [None]:
# Model 2 must predict on rows that carry the Model 2 encodings. test_str was
# produced in the Model 1 section, where the category means come from a
# different training frame (State_6 rows removed), so its encoded columns do
# not match what this pipeline is trained on. test_str is therefore only used
# to pick the ids left for Model 2; the feature values come from the current
# test_df (which has no building_ID column to drop).
X_test = test_df.loc[test_df["id"].isin(test_str["id"])].drop(columns=["Year_Factor"])

In [None]:
# Refit the pipeline on the complete Model 2 training frame (X, y) before
# predicting the remaining test rows; the trailing ';' suppresses the repr.
model_pipe.fit(X, y);

In [None]:
# Predict site_eui for the Model 2 test rows and pair each value with its id.
model2_predictions = model_pipe.predict(X_test)
submission_df2 = X_test[["id"]].assign(site_eui=model2_predictions)

In [None]:
submission_df2

In [None]:
# Stack the Model 1 and Model 2 predictions and write the combined submission.
combined_submission = pd.concat([submission_df1, submission_df2])
combined_submission.to_csv("/kaggle/working/submission-rr-1702-2.csv", index=False)

# Ignore this section
## MLP Regressor - Work In Progress

In [None]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from scipy.stats import loguniform

In [None]:
# Work-in-progress: randomized hyper-parameter search for an MLPRegressor
# behind an impute -> scale -> feature-selection pipeline.
# NOTE(review): these keys are searched on `pipe`, not on the regressor itself;
# pipeline parameters normally need a step prefix (e.g. `mlpregressor__activation`)
# — confirm before running.
params = {
    "activation": ["logistic", "relu"],
    "solver": ["lbfgs", "sgd", "adam"],
    "learning_rate_init": loguniform(1e-2, 2e3),
    "batch_size": loguniform(150, 512),  # NOTE(review): samples floats — confirm batch_size accepts them
    "momentum":   loguniform(0.85, 0.975)
}


score_results = []
# NOTE(review): random_state is given without shuffle=True — confirm the
# installed scikit-learn accepts this combination.
kfold = KFold(n_splits=10, random_state=1)
# Reuses the extra-trees `impute_estimator` defined in an earlier cell.
imputer = IterativeImputer(random_state=1, estimator=impute_estimator)

scaler = StandardScaler()

# Keep features whose Lasso importance is at least half the median importance.
select = SelectFromModel(LassoCV(cv=kfold, random_state=1), threshold='0.5*median')

regressor = MLPRegressor(random_state=1,
                         activation='relu',
                         solver='sgd',
                         learning_rate='adaptive',
                         learning_rate_init=0.013000000000000001,
                         early_stopping=True,
                         hidden_layer_sizes=(140, 140),
                         max_iter=10000,
                         momentum=0.9697272727272728
                         )

pipe = make_pipeline(imputer, scaler, select, regressor)
# Failed parameter draws are scored as -1000 instead of raising.
my_model = RandomizedSearchCV(pipe,
                        params,
                        cv=kfold,
                        scoring='neg_mean_squared_log_error',
                        verbose=0,
                        n_jobs=-1,
                        error_score=-1000.)

# NOTE(review): `train_set` is not defined in any visible cell of this notebook.
my_model.fit(train_set, y)
print(-1 * my_model.score(train_set, y))
print(my_model.best_params_)

train_pred = my_model.predict(train_set)


# NOTE(review): the second tuple holds only a transformer and no column
# selection — this call looks unfinished.
column_transformer = make_column_transformer(
    (IterativeImputer(random_state=1, estimator=impute_estimator), features_selected),
    (StandardScaler(), ),
)

# Appendix

## Encoding the building_ID column created (does not give decent results)

In [None]:
# from sklearn.preprocessing import LabelEncoder

# encoder = LabelEncoder()

# # using entire df for encoding
# data=train_df['building_ID'].append(test_df['building_ID'])
# encoder.fit(data.values)

# train_df['building_ID'] = encoder.transform(train_df['building_ID'])
# test_df['building_ID'] = encoder.transform(test_df['building_ID'])

# using entire df for encoding
# data=train_df['month_ID'].append(test_df['month_ID'])
# encoder.fit(data.values)

# train_df['month_ID'] = encoder.transform(train_df['month_ID'])
# test_df['month_ID'] = encoder.transform(test_df['month_ID'])


# building_df = pd.concat([train_df, test_df]).building_ID.value_counts().reset_index()
# building_df.columns = ['building_ID', 'count']
# building_df['building_counter'] = np.where(building_df['count'] > 1, 0, 1)
# building_df = building_df.drop("count", axis=1)
# building_df

# train_df = train_df.merge(building_df, how='left', on='building_ID')
# test_df = test_df.merge(building_df, how='left', on='building_ID')