In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import ast
import re

In [None]:
# Load the raw train and test feature tables from the Kaggle competition input directory.
df = pd.read_csv('/kaggle/input/hi-paris-2023/train/train_features_sent.csv')
test_df = pd.read_csv('/kaggle/input/hi-paris-2023/test/test_features_sent.csv')

# Cleaning up

In [None]:
def process_lowe_floor_thermal_conductivity(df):
    """Fill missing `lowe_floor_thermal_conductivity` values with 2.8.

    The column is overwritten on the frame that was passed in, and that same
    frame object is returned.
    """
    column = 'lowe_floor_thermal_conductivity'
    df[column] = df[column].fillna(2.8)
    return df

def process_lower_floor_insulation_type(df):
    """Replace `lower_floor_insulation_type` with two 0/1 indicator columns.

    `external_insulation` / `internal_insulation` are 1 when the original text
    contains "external" / "internal"; a missing value gives 0 in both. Rows
    whose value is exactly "insulated" do not say which side is insulated, so
    both indicators are drawn at random (0 or 1) for those rows.

    The draw uses a private generator seeded with 0. It produces the same
    numbers as seeding the global NumPy generator with 0 would, but it does not
    reset the global random state for the rest of the notebook.

    Returns a new DataFrame: the input without `lower_floor_insulation_type`,
    followed by the two indicator columns. The input frame is not modified.
    """
    source = df['lower_floor_insulation_type']
    rng = np.random.RandomState(0)

    insulation_df = pd.DataFrame({
        # na=False turns missing values directly into False, so no object-dtype
        # intermediate has to be patched up with fillna afterwards.
        'external_insulation': source.str.contains('external', na=False).astype(int),
        'internal_insulation': source.str.contains('internal', na=False).astype(int),
    })

    undetermined_insulation_mask = source == 'insulated'
    n_undetermined = int(undetermined_insulation_mask.sum())
    if n_undetermined:
        # One (external, internal) pair per undetermined row.
        draws = rng.randint(low=0, high=2, size=(n_undetermined, 2))
        insulation_df.loc[undetermined_insulation_mask, 'external_insulation'] = draws[:, 0]
        insulation_df.loc[undetermined_insulation_mask, 'internal_insulation'] = draws[:, 1]

    return pd.concat([df.drop(columns=['lower_floor_insulation_type']), insulation_df], axis=1)

def process_lower_floor_material(df):
    """Replace `lower_floor_material` with six 0/1 indicator columns.

    Three indicators are exact matches on the material text, two are
    case-insensitive substring matches ("wood", "metal"), and `other_floor` is
    1 for every row that matched none of the five -- which includes rows whose
    material is missing.

    Returns a new DataFrame: the input without `lower_floor_material`, followed
    by the indicator columns. The input frame is not modified.
    """
    material = df['lower_floor_material']

    floor_df = pd.DataFrame({
        'concrete_slab_floor': material == 'concrete slab',
        'heavy_floor': material == 'heavy floor, such as clay floor joists, concrete beams',
        'insulated_joist_floor': material == 'Insulated joist floor',
        # na=False keeps these columns strictly boolean; a missing material
        # then falls through to `other_floor` explicitly instead of relying on
        # NaN being skipped by `any` and patched by a later fillna.
        'wood_floor': material.str.contains('wood', case=False, na=False),
        'metal_floor': material.str.contains('metal', case=False, na=False),
    })

    floor_df['other_floor'] = ~floor_df.any(axis=1)
    floor_df = floor_df.astype(int)

    return pd.concat([df.drop(columns=['lower_floor_material']), floor_df], axis=1)

def process_lower_floor_adjacency_type(df):
    """Impute and one-hot encode `lower_floor_adjacency_type`.

    Missing values are replaced by categories sampled with the observed
    category frequencies of the same column. The encoded columns are prefixed
    with `lower_floor_adjacency_type` and the first category is dropped.

    Sampling uses a private generator seeded with 0 (same draws as seeding the
    global NumPy generator with 0, without resetting the global state). The
    imputation is done on a copy of the column through a single `.loc`
    assignment; the previous chained `df[col][mask] = ...` form is not
    guaranteed to write into the frame.

    Returns a new DataFrame: the input without the original column, followed by
    the dummy columns. The input frame is not modified.
    """
    column = 'lower_floor_adjacency_type'
    rng = np.random.RandomState(0)

    prob_series = df[column].value_counts(normalize=True)
    categories = prob_series.index.tolist()
    probabilities = prob_series.values.tolist()

    adjacency = df[column].copy()
    col_na_mask = adjacency.isna()
    n_missing = int(col_na_mask.sum())
    # With no observed category there is nothing to sample from; the missing
    # rows then simply produce all-zero dummies.
    if n_missing and categories:
        adjacency.loc[col_na_mask] = rng.choice(categories, size=n_missing, p=probabilities)

    one_hot_df = pd.get_dummies(adjacency, prefix=column, drop_first=True)

    return pd.concat([df.drop(columns=column), one_hot_df], axis=1)

def process_cat_pct(df, column, new_col_name=None):
    cat_data_pct = df[column].value_counts().cumsum()/len(df)
    keep_cats = cat_data_pct[cat_data_pct<0.95].index.tolist()
    
    if cat_data_pct.iloc[0] > 0.3:
        df[column] = df[column].fillna(cat_data_pct.index[0])
        
    if not new_col_name:
        new_col_name = column
    
    for category in keep_cats:
        df[f'{new_col_name}_{category}'] = (df[column] == category).astype(int)
        
    df.drop(columns=[column], inplace = True)
    
    return df

def create_one_hot_ColumnSplitBySymbol(df, col_name, symbol): #symbol = ' + '
    """Multi-hot encode a column whose cells hold several tokens joined by `symbol`.

    Missing and empty cells are treated as the single token 'none'. Every cell
    is split on `symbol`; for each distinct token a column named
    `{col_name}_{token}_merged` is produced that is 1 when the token occurs
    anywhere in the cell and 0 otherwise.

    Parameters
    ----------
    df : input DataFrame (not modified).
    col_name : name of the column to encode.
    symbol : literal separator between tokens, e.g. ' + '.

    Returns
    -------
    A new DataFrame: the input without `col_name`, followed by the token columns.
    """
    filled = df[col_name].fillna('none')
    filled = filled.mask(filled == '', 'none')

    # One column per token position. The frame reuses df's index so the final
    # concat lines rows up even when df does not carry a default RangeIndex.
    tokens = pd.DataFrame([value.split(symbol) for value in filled], index=df.index)
    dummies = pd.get_dummies(tokens)
    # get_dummies names columns "<position>_<token>"; strip only the position
    # so tokens that themselves contain underscores stay intact.
    dummies.columns = [str(name).split('_', 1)[-1] for name in dummies.columns]

    merged = {}
    for token in dummies.columns.unique():
        # The same token can show up at several positions; OR them together.
        same_token = dummies.loc[:, dummies.columns == token]
        merged[f'{col_name}_{token}_merged'] = same_token.any(axis=1).astype(int).to_numpy()
    merged_df = pd.DataFrame(merged, index=df.index)

    return pd.concat([df.drop(columns=[col_name]), merged_df], axis=1)

def create_one_hot_ColumnOfLists(df, col_name):
    """Multi-hot encode a column whose cells are bracketed, comma-separated lists.

    Missing cells are filled with the most frequent cell value, and the empty
    list '[]' is treated as the single token 'empty'. The surrounding brackets
    are removed, the remainder is split on ',', and for each distinct token a
    column named `{col_name}_{token}_merged` is produced that is 1 when the
    token occurs anywhere in the cell and 0 otherwise.

    Tokens are used exactly as they appear between the commas (no whitespace
    stripping).

    Returns a new DataFrame: the input without `col_name`, followed by the
    token columns. The input frame is not modified.
    """
    modes = df[col_name].mode()
    # A column with no observed value has no mode; treat every row as an empty list.
    most_frequent = modes.iloc[0] if not modes.empty else '[]'
    filled = df[col_name].fillna(most_frequent)
    filled = filled.mask(filled == '[]', '[empty]')

    # One column per list position. The frame reuses df's index so the final
    # concat lines rows up even when df does not carry a default RangeIndex.
    tokens = pd.DataFrame([value[1:-1].split(',') for value in filled], index=df.index)
    dummies = pd.get_dummies(tokens)
    # get_dummies names columns "<position>_<token>"; strip only the position
    # so tokens that themselves contain underscores stay intact.
    dummies.columns = [str(name).split('_', 1)[-1] for name in dummies.columns]

    merged = {}
    for token in dummies.columns.unique():
        # The same token can show up at several positions; OR them together.
        same_token = dummies.loc[:, dummies.columns == token]
        merged[f'{col_name}_{token}_merged'] = same_token.any(axis=1).astype(int).to_numpy()
    merged_df = pd.DataFrame(merged, index=df.index)

    return pd.concat([df.drop(columns=[col_name]), merged_df], axis=1)

def create_one_hot(df, col_name):
    """One-hot encode `col_name` after filling missing values with its mode.

    The dummy columns are named `{col_name}_{category}`. Naming them by the
    bare category value makes columns from different features collide whenever
    two features share a category label, and colliding columns are
    indistinguishable (and get discarded by any duplicate-column clean-up).

    Returns a new DataFrame: the input without `col_name`, followed by the
    dummy columns. The input frame is not modified.
    """
    values = df[col_name]
    modes = values.mode()
    # An all-missing column has no mode; it then simply yields no dummy columns.
    if not modes.empty:
        values = values.fillna(modes.iloc[0])

    dummies = pd.get_dummies(values, prefix=col_name)
    return pd.concat([df.drop(columns=[col_name]), dummies], axis=1)

def clean_1to24(data):
    """Clean the first group of feature columns and return a new frame.

    Works on a copy of `data`. In order, it:
      * encodes the two "additional" generator/heater list columns, pooling
        values seen fewer than 1000 times;
      * fills `altitude` with its mean and drops `balcony_depth`;
      * multi-hot encodes `bearing_wall_material` (split on " - "),
        `building_category` and `building_class`;
      * fills the height / area columns with their medians;
      * one-hot encodes `building_type` and `building_use_type_code`.
    """
    cleaned = data.copy()

    for list_column in ("additional_heat_generators", "additional_water_heaters"):
        cleaned = re_categorize_by_count(cleaned, list_column, 1000)

    cleaned["altitude"] = cleaned["altitude"].fillna(cleaned["altitude"].mean())

    cleaned = cleaned.drop(columns=["balcony_depth"])

    cleaned = create_one_hot_ColumnSplitBySymbol(cleaned, "bearing_wall_material", " - ")
    cleaned = create_one_hot_ColumnOfLists(cleaned, "building_category")
    cleaned = create_one_hot_ColumnOfLists(cleaned, "building_class")

    for numeric_column in ("building_height_ft", "building_total_area_sqft", "living_area_sqft"):
        cleaned[numeric_column] = cleaned[numeric_column].fillna(cleaned[numeric_column].median())

    for categorical_column in ("building_type", "building_use_type_code"):
        cleaned = create_one_hot(cleaned, categorical_column)

    return cleaned

def re_categorize_by_count(df, col_name, threshold):
    """Multi-hot encode a bracketed list column after pooling rare cell values.

    Missing cells are filled with the most frequent cell value and the empty
    list '[]' becomes '[empty]'. Any cell value that then occurs fewer than
    `threshold` times is replaced by '[other]'. The surrounding brackets are
    removed, the remainder is split on ',', and for each distinct token a
    column named `{col_name}_{token}_merged` is produced that is 1 when the
    token occurs anywhere in the cell and 0 otherwise.

    Parameters
    ----------
    df : input DataFrame (not modified).
    col_name : name of the list-valued column to encode.
    threshold : minimum number of occurrences for a cell value to be kept as is.

    Returns
    -------
    A new DataFrame: the input without `col_name`, followed by the token columns.
    """
    modes = df[col_name].mode()
    # A column with no observed value has no mode; treat every row as an empty list.
    most_frequent = modes.iloc[0] if not modes.empty else '[]'
    filled = df[col_name].fillna(most_frequent)
    filled = filled.mask(filled == '[]', '[empty]')

    # Pool whole cell values (not individual tokens) that are too rare.
    counts = filled.value_counts()
    is_rare = filled.map(counts) < threshold
    filled = filled.mask(is_rare, '[other]')

    # One column per list position. The frame reuses df's index so the final
    # concat lines rows up even when df does not carry a default RangeIndex.
    tokens = pd.DataFrame([value[1:-1].split(',') for value in filled], index=df.index)
    dummies = pd.get_dummies(tokens)
    # get_dummies names columns "<position>_<token>"; strip only the position
    # so tokens that themselves contain underscores stay intact.
    dummies.columns = [str(name).split('_', 1)[-1] for name in dummies.columns]

    merged = {}
    for token in dummies.columns.unique():
        # The same token can show up at several positions; OR them together.
        same_token = dummies.loc[:, dummies.columns == token]
        merged[f'{col_name}_{token}_merged'] = same_token.any(axis=1).astype(int).to_numpy()
    merged_df = pd.DataFrame(merged, index=df.index)

    return pd.concat([df.drop(columns=[col_name]), merged_df], axis=1)
    
def clean_features(orig_df):
    """Run the full cleaning / encoding pipeline on a raw feature table.

    Works on a copy of `orig_df`. Columns are imputed (median, mean, mode or a
    fixed value, depending on the column), encoded (ordinal maps, one-hot,
    multi-hot, substring flags) or dropped. Every imputation statistic is
    computed from the frame that is passed in.

    Returns the cleaned DataFrame; `orig_df` itself is left as it was.
    """
    df = orig_df.copy()
    
    df = clean_1to24(df)
    
    df.building_year = df.building_year.fillna(df.building_year.median())
    # Building age relative to the hard-coded reference year 2023.
    df['years_old'] = 2023 - df.building_year
    
    df = process_lowe_floor_thermal_conductivity(df)
    df = process_lower_floor_adjacency_type(df)
    df = process_lower_floor_insulation_type(df)
    df = process_lower_floor_material(df)
    df = process_cat_pct(df, 'main_heat_generators')
    df = process_cat_pct(df, 'main_water_heaters')
    
    # The following count / type columns are discarded; nb_dwellings is kept
    # but capped at 50.
    df.drop(columns='main_heating_type', inplace=True)
    df.drop(columns='main_water_heating_type', inplace=True)
    df.drop(columns='nb_commercial_units', inplace=True)
    df['nb_dwellings'] = df['nb_dwellings'].clip(upper=50)
    df.drop(columns='nb_gas_meters_commercial', inplace=True)
    df.drop(columns='nb_gas_meters_housing', inplace=True)
    df.drop(columns='nb_gas_meters_total', inplace=True)
    df.drop(columns='nb_housing_units', inplace=True)
    df.drop(columns='nb_meters', inplace=True)
    df.drop(columns='nb_parking_spaces', inplace=True)
    df.drop(columns='nb_power_meters_commercial', inplace=True)


    df['percentage_glazed_surfaced'] = df['percentage_glazed_surfaced'].fillna(df['percentage_glazed_surfaced'].mean())
    # Risk level: fill with the mode, then map the three labels to 0/1/2.
    # Labels outside the mapping are left unchanged by `replace`.
    df.radon_risk_level = df.radon_risk_level.fillna(df.radon_risk_level.mode().iloc[0])
    df['radon_risk_level'] = df['radon_risk_level'].replace({'low': 0, 'medium': 1, 'high': 2})

    # From nb_power_meters_housing
    df.drop(columns=['building_period'], inplace=True)
    df.drop(columns=['building_use_type_description'], inplace=True)
    df.drop(columns=['nb_power_meters_housing'], inplace=True)
    df.drop(columns=['nb_power_meters_total'], inplace=True)
    df.drop(columns=['nb_units_total'], inplace=True)

    df = create_one_hot(df, 'outer_wall_materials')
    df.outer_wall_thermal_conductivity = df.outer_wall_thermal_conductivity.fillna(df.outer_wall_thermal_conductivity.median())


    # Fill with the mode, then remove the literal ' et -' from each value and
    # convert to float. `re.sub` needs string input, so this assumes the
    # column holds strings -- TODO confirm against the raw data.
    df.outer_wall_thickness = df.outer_wall_thickness.fillna(df.outer_wall_thickness.mode().iloc[0])
    df.outer_wall_thickness = df.outer_wall_thickness.apply(lambda x: float(re.sub(' et -', '', x)))

    df.clay_risk_level = df.clay_risk_level.fillna(df.clay_risk_level.mode().iloc[0])
    df['clay_risk_level'] = df.clay_risk_level.replace({'low': 0, 'medium': 1, 'high': 2})


    df.drop(columns=['consumption_measurement_date'], inplace=True)
    df.has_balcony = df.has_balcony.fillna(df.has_balcony.mode().iloc[0])
    df = create_one_hot_ColumnOfLists(df, 'heat_generators')
    df = create_one_hot_ColumnSplitBySymbol(df, 'heating_energy_source', symbol= ' + ')
    df = create_one_hot(df, 'heating_type')
    df = create_one_hot(df, 'is_crossing_building')
    
    # Renewable sources: a missing value becomes the empty string, then each
    # flag records whether a given substring occurs in the text.
    df.renewable_energy_sources = df.renewable_energy_sources.fillna('')
    df['solar thermal (ecs)'] = df.renewable_energy_sources.apply(lambda x: 'ecs' in x)
    df['solar photovoltaic'] = df.renewable_energy_sources.apply(lambda x: 'solar photovoltaic' in x)
    df['solar thermal (heating)'] = df.renewable_energy_sources.apply(lambda x: 'heating' in x)
    df['solar thermal (hot water)'] = df.renewable_energy_sources.apply(lambda x: 'hot water' in x)
    df['solar thermal (DHW)'] = df.renewable_energy_sources.apply(lambda x: 'DHW' in x)
    df.drop(columns=['renewable_energy_sources'], inplace=True)
    
    df = create_one_hot(df, 'roof_material')
    
    # Ordinal encoding; unlike the risk-level columns, missing values are not
    # filled here and stay missing.
    df['thermal_inertia'] = df.thermal_inertia.replace({'low': 0, 'medium': 1, 'high': 2, 'very high': 3})
    
    df = create_one_hot(df, 'upper_floor_adjacency_type')
    
    # Missing upper-floor insulation is treated as 'INTERNAL'; the flags are
    # case-sensitive substring tests.
    df.upper_floor_insulation_type = df.upper_floor_insulation_type.fillna('INTERNAL')
    df['upper_floor_insulation_typeINTERNAL'] = df.upper_floor_insulation_type.apply(lambda x: 'INTERNAL' in x)
    df['upper_floor_insulation_typeUNINSULATED'] = df.upper_floor_insulation_type.apply(lambda x: 'UNINSULATED' in x)
    df['upper_floor_insulation_typeEXTERNAL'] = df.upper_floor_insulation_type.apply(lambda x: 'EXTERNAL' in x)
    df['upper_floor_insulation_typeREFLEXION'] = df.upper_floor_insulation_type.apply(lambda x: 'REFLEXION' in x)
    df.drop(columns=['upper_floor_insulation_type'], inplace=True)

    df = create_one_hot(df, 'upper_floor_material')
    
    df.upper_floor_thermal_conductivity = df.upper_floor_thermal_conductivity.fillna(df.upper_floor_thermal_conductivity.median())

    df = create_one_hot(df, 'ventilation_type')
    
    # Missing wall insulation is treated as 'internal'; the flags are
    # substring tests on the remaining text.
    df.wall_insulation_type = df.wall_insulation_type.fillna('internal')
    df['wall_insulation_type_internal'] = df.wall_insulation_type.apply(lambda x: 'internal' in x)
    df['wall_insulation_type non insulated'] = df.wall_insulation_type.apply(lambda x: 'non insulated' in x)
    df['wall_insulation_type external'] = df.wall_insulation_type.apply(lambda x: 'external' in x)
    df['wall_insulation_type reflection'] = df.wall_insulation_type.apply(lambda x: 'reflection' in x or 'reflexion' in x)
    # NOTE(review): 'insulated' is a substring of 'non insulated', so this flag
    # is also True for every row flagged as non insulated -- confirm that is intended.
    df['wall_insulation_type insulated'] = df.wall_insulation_type.apply(lambda x: 'insulated' in x)
    df.drop(columns=['wall_insulation_type'], inplace=True)
    
    
    df = create_one_hot_ColumnOfLists(df, 'water_heaters')
    df = create_one_hot_ColumnSplitBySymbol(df, 'water_heating_energy_source', symbol= ' + ')
    df = create_one_hot(df, 'water_heating_type')
    df = create_one_hot(df, 'window_filling_type')
    df = create_one_hot(df, 'window_frame_material')
    df = create_one_hot(df, 'window_glazing_type')
    df.window_heat_retention_factor = df.window_heat_retention_factor.fillna(df.window_heat_retention_factor.median())
    
    df = create_one_hot_ColumnOfLists(df, 'window_orientation')
    df.window_thermal_conductivity = df.window_thermal_conductivity.fillna(df.window_thermal_conductivity.median())
    
    return df
# Run the cleaning pipeline on each split separately, so every imputation
# statistic (median, mean, mode) comes from the split being cleaned.
# NOTE: `df` / `test_df` are rebound to the cleaned frames, so this cell is not
# idempotent -- re-running it without reloading the CSVs fails on columns that
# the first run already removed.
df = clean_features(df)
test_df = clean_features(test_df)

In [None]:
# When a column name occurs more than once, keep only its first occurrence.
train_first_occurrence = ~df.columns.duplicated()
df = df.loc[:, train_first_occurrence].copy()

test_first_occurrence = ~test_df.columns.duplicated()
test_df = test_df.loc[:, test_first_occurrence].copy()

In [None]:
# Give the test split exactly the training columns, in the same order: columns
# the test split lacks are created and filled with 0, and test-only columns
# are discarded.
test_df = test_df.reindex(columns =df.columns, fill_value=0)
# Cell output: the two shapes, to check that the column counts now agree.
df.shape, test_df.shape

# Feature Engineering

In [None]:
def feature_engineer(df):
    """Add the derived `volumn` column (building height times total area).

    The column is written onto the frame that was passed in, and that same
    frame object is returned.
    """
    height = df['building_height_ft']
    total_area = df['building_total_area_sqft']
    df['volumn'] = height * total_area
    return df
# Add the engineered column to both splits (feature_engineer writes onto the
# frame it receives and returns it).
df = feature_engineer(df)
test_df = feature_engineer(test_df)

# Train

In [None]:
from datetime import datetime
from pathlib import Path

import imblearn
import matplotlib.pyplot as plt
import scipy.stats as stats
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import gc
import tqdm
import re
from collections import defaultdict

from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression, RidgeClassifier, Ridge, LinearRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier, RandomForestRegressor
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import xgboost as xgb
from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
import lightgbm as lgb
from catboost import CatBoostClassifier, Pool, CatBoostRegressor

from sklearn.model_selection import KFold, RepeatedStratifiedKFold
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.ensemble import EasyEnsembleClassifier,BalancedRandomForestClassifier
from sklearn import (
    decomposition,
    discriminant_analysis,
    ensemble,
    linear_model,
    metrics,
    model_selection,
    naive_bayes,
    pipeline,
    preprocessing,
    svm,
)
from sklearn.metrics import explained_variance_score, mean_squared_error

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import sys

def str_to_class(classname):
    """Look up `classname` among the names defined in the current module.

    Returns whatever object is bound to that name; raises AttributeError when
    the module has no such name.
    """
    current_module = sys.modules[__name__]
    return getattr(current_module, classname)

In [None]:
def create_folds(df, target, n_s=5, n_grp=10000):
    """Build stratified K-fold splits for a continuous target.

    The target is cut into `n_grp` equal-width bins and the bin number is used
    as the stratification label.

    Parameters
    ----------
    df : accepted for call-site symmetry; not used by the function.
    target : continuous target values to bin.
    n_s : number of folds.
    n_grp : number of bins passed to `pd.cut`.

    Returns
    -------
    The generator produced by `StratifiedKFold.split`, yielding
    (train_indices, validation_indices) pairs.
    """
    binned_target = pd.cut(target, n_grp, labels=False)
    splitter = StratifiedKFold(n_splits=n_s)
    return splitter.split(binned_target, binned_target)

In [None]:
# Regression target: the `energy_consumption_per_annum` column of the training labels file.
# NOTE(review): later cells index the feature frame with masks built from this
# Series, which presumes the labels file has the same row order / index as the
# training features -- confirm.
train_labels = pd.read_csv('/kaggle/input/hi-paris-2023/train/train_labels_sent.csv').energy_consumption_per_annum

In [None]:
# MODEL_NAME = 'LGBMRegressor'
# NFOLDS = 3
# NESTIMATORS = 100

In [None]:
# X_train = df[(train_labels< 1200) & (train_labels >=0)]
# ys = train_labels[(train_labels< 1200) & (train_labels >=0)]
# splits = list(create_folds(X_train, ys, n_s=NFOLDS, n_grp=1000))
# X_test = test_df

# print(f"Training {MODEL_NAME}")
# y_preds = np.zeros((NFOLDS, X_test.shape[0]))
# y_oof = np.zeros(X_train.shape[0])

# for fold_n in range(NFOLDS):
#     train_index, valid_index = splits[fold_n]
#     print(f"Fold {fold_n}")
#     X_tr, X_val = X_train.iloc[train_index], X_train.iloc[valid_index]
#     y_tr, y_val = ys.iloc[train_index], ys.iloc[valid_index]  

#     dtrain = lgb.Dataset(X_tr, y_tr, free_raw_data=False)
#     lgb_params = {
#             'n_jobs': -1,
#             'verbosity': -1,
#             'n_estimators': NESTIMATORS,
#         }
#     clf = lgb.train(params=lgb_params, train_set=dtrain, num_boost_round=200)

#     y_oof[valid_index] = clf.predict(X_val) 
#     y_preds[fold_n, :] = clf.predict(X_test)

#     del X_tr, X_val, y_tr, y_val
#     gc.collect() 

In [None]:
# print('Cross-validation score:')
# explained_variance_score(ys, y_oof)

In [None]:
# sub = pd.read_csv('/kaggle/input/hi-paris-2023/sample_submission_sent.csv')
# sub.energy_consumption_per_annum = np.mean(np.mean(all_y_preds, axis=0),axis=0)
# sub.to_csv('submission_6.csv', index=False)

In [None]:
# sub.head(20)

# Stacking

In [None]:
# Base learners trained in the cross-validation loop; each name selects a branch of that loop.
MODEL_NAMES = ['XGBRegressor', 'LGBMRegressor', 'CatBoostRegressor']
# In the cells shown here, only the commented-out Ridge stacking cell refers to this.
STACKING_MODEL = 'Ridge'
# Number of cross-validation folds.
NFOLDS = 5
# Number of boosting iterations / trees requested from every base learner.
NESTIMATORS = 100
# In the cells shown here, only the commented-out Ridge stacking cell refers to this.
NESTIMATORS_STACK = 1000

In [None]:
# Train only on rows whose label lies in [0, 1200); rows outside that range
# are left out of both fitting and out-of-fold scoring.
label_in_range = (train_labels < 1200) & (train_labels >= 0)
X_train = df[label_in_range]
ys = train_labels[label_in_range]
# The same folds are reused for every model so their out-of-fold predictions
# are comparable and can be averaged row by row.
splits = list(create_folds(X_train, ys, n_s=NFOLDS, n_grp=1000))
X_test = test_df

all_y_preds = []   # per model: array (NFOLDS, n_test) of test predictions
all_y_oof = []     # per model: array (n_train,) of out-of-fold predictions
train_scores = []  # explained variance on the fitting rows, one per model and fold

for MODEL_NAME in MODEL_NAMES:
    print(f"Training {MODEL_NAME}")

    y_preds = np.zeros((NFOLDS, X_test.shape[0]))
    y_oof = np.zeros(X_train.shape[0])

    for fold_n in range(NFOLDS):
        train_index, valid_index = splits[fold_n]
        print(f"Fold {fold_n}")
        X_tr, X_val = X_train.iloc[train_index], X_train.iloc[valid_index]
        y_tr, y_val = ys.iloc[train_index], ys.iloc[valid_index]

        if MODEL_NAME == 'XGBRegressor':
            clf = str_to_class(MODEL_NAME)(tree_method="hist", enable_categorical=True, n_estimators=NESTIMATORS)
            clf.fit(X_tr, y_tr)
        elif MODEL_NAME == 'LGBMRegressor':
            dtrain = lgb.Dataset(X_tr, y_tr, free_raw_data=False)
            lgb_params = {
                'n_jobs': -1,
                'verbosity': -1,
            }
            # The iteration count is passed once, through num_boost_round.
            # Supplying `n_estimators` in params as well as a different
            # num_boost_round is contradictory: LightGBM uses the params alias
            # and ignores the argument, so NESTIMATORS was the effective value.
            clf = lgb.train(params=lgb_params, train_set=dtrain, num_boost_round=NESTIMATORS)
        elif MODEL_NAME == 'CatBoostRegressor':
            clf = str_to_class(MODEL_NAME)(n_estimators=NESTIMATORS)
            clf.fit(X_tr.to_numpy(), y_tr.to_numpy(), verbose=0)
        else:
            # Without this, an unknown name would silently reuse the previous
            # model's `clf` (or fail later with a NameError).
            raise ValueError(f"Unsupported model name: {MODEL_NAME}")

        y_preds_tr = clf.predict(X_tr)
        y_preds_val = clf.predict(X_val)
        y_oof[valid_index] = y_preds_val
        y_preds[fold_n, :] = clf.predict(X_test)
        val_score = explained_variance_score(y_val, y_preds_val)
        train_score = explained_variance_score(y_tr, y_preds_tr)
        print(f"Val score: {val_score:.3g}")
        print(f"Train score: {train_score:.3g}")

        train_scores.append(train_score)

        # Release the fold slices before the next fit.
        del X_tr, X_val, y_tr, y_val
        gc.collect()

    all_y_preds.append(y_preds)
    all_y_oof.append(y_oof)
# all_y_oof = np.concatenate(all_y_oof, axis=-1)
# all_y_preds = np.concatenate(all_y_preds, axis=-1)
# all_y_oof = np.array(all_y_oof).reshape(-1, len(MODEL_NAMES))
# all_y_preds = np.array(all_y_preds).reshape(NFOLDS, -1, len(MODEL_NAMES))

In [None]:
# Average of 3 models
score = explained_variance_score(ys, np.mean(all_y_oof, axis=0))  # scoring
print(f"Overall OOF score: {score:.3g}")

In [None]:
sub = pd.read_csv('/kaggle/input/hi-paris-2023/sample_submission_sent.csv')
# Average the test predictions across models first, then across folds.
mean_over_models = np.mean(all_y_preds, axis=0)
sub.energy_consumption_per_annum = np.mean(mean_over_models, axis=0)
sub.to_csv('submission_6.csv', index=False)
sub.head()

In [None]:
# # Ridge stacking
# y_stk_val = np.zeros(X_train.shape[0])
# y_stk_test = np.zeros(X_test.shape[0])
# for nestimators in tqdm.tqdm(range(NESTIMATORS_STACK)):
#     stk = str_to_class(STACKING_MODEL)(alpha=0.1)
#     stk.fit(all_y_oof, ys.values)
#     y_stk_val += stk.predict(all_y_oof) / NESTIMATORS_STACK
#     for all_y_pred in all_y_preds:
#         y_stk_test += stk.predict(all_y_pred) / (NESTIMATORS_STACK * NFOLDS)
# score = explained_variance_score(ys, y_stk_val)  # scoring
# print(f"Overall OOF score: {score:.3g}")

In [None]:
# score = explained_variance_score(ys, y_stk_val)  # scoring
# print(f"Overall OOF score: {score:.3g}")