In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.pipeline import make_pipeline
from tpot.builtins import StackingEstimator

# Input CSV locations, relative to the notebook's working directory.
fn_train = "train.csv"
fn_test = "test.csv"

# Only the training file maps -999.0 to NaN; the test file is read with
# pandas' default missing-value handling.
df_test = pd.read_csv(fn_test)
df_train = pd.read_csv(fn_train, na_values=-999.0)

In [2]:
# One training frame per target: keep the rows where that target is present
# and remove the other target's column.
df_tr_dtc = df_train.dropna(subset=["DTC"]).drop(columns=["DTS"])
df_tr_dts = df_train.dropna(subset=["DTS"]).drop(columns=["DTC"])

In [3]:
# Keep only the rows that have no missing value in any column, separately for
# the DTC and the DTS training frames.
df_trc_dtc, df_trc_dts = (frame.dropna() for frame in (df_tr_dtc, df_tr_dts))

In [4]:
# Feature engineering
def apply_fe_simple(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` extended with log10-transformed columns.

    For each of the ``HRD`` and ``HRM`` columns a new column named
    ``<source>_log10`` is added, holding ``np.log10`` of the source values.
    The input frame itself is not modified.

    Args:
        df: Frame containing at least the ``HRD`` and ``HRM`` columns.

    Returns:
        A new frame with the ``HRD_log10`` and ``HRM_log10`` columns added.
    """
    df_fe = df.copy()
    for source in ("HRD", "HRM"):
        df_fe[f"{source}_log10"] = np.log10(df_fe[source])
    return df_fe


# Add the engineered log10 columns to both training frames and the test frame.
df_dtc_fe = apply_fe_simple(df_trc_dtc)
df_dts_fe = apply_fe_simple(df_trc_dts)
df_test_fe = apply_fe_simple(df_test)

# Model inputs: every original test column plus BOTH engineered columns.
# The second entry previously repeated "HRD_log10", which fed that feature
# twice and left the computed "HRM_log10" unused.
# The targets are excluded explicitly because the inference cell writes
# DTC/DTS into df_test; without the filter, re-running this cell afterwards
# would turn the predictions into input features.
col_ins = [col for col in df_test.columns if col not in ("DTC", "DTS")]
col_ins += ["HRD_log10", "HRM_log10"]


In [5]:
# DTC regressor; the configuration was obtained from TPOT using the DTC data.
# Two pipeline steps: an ExtraTrees model wrapped in TPOT's StackingEstimator,
# followed by the final random forest.
rgs_dtc = make_pipeline(
    StackingEstimator(
        estimator=ExtraTreesRegressor(
            n_estimators=100,
            max_features=0.4,
            min_samples_split=10,
            min_samples_leaf=1,
            bootstrap=False,
            n_jobs=6,
            random_state=24,
        )
    ),
    RandomForestRegressor(
        n_estimators=100,
        max_features=0.25,
        min_samples_split=20,
        min_samples_leaf=2,
        bootstrap=True,
        n_jobs=6,
        random_state=24,
    ),
)


In [6]:
# DTS regressor; the configuration was obtained from TPOT using the DTS data.
rgs_dts = RandomForestRegressor(
    n_estimators=100,
    max_features=0.4,
    min_samples_split=6,
    min_samples_leaf=1,
    bootstrap=True,
    n_jobs=6,
    random_state=24,
)


In [7]:
# Model training: each regressor is fitted on the shared input columns of its
# own training frame, against its own target column.
X_dtc, y_dtc = df_dtc_fe[col_ins].to_numpy(), df_dtc_fe["DTC"].to_numpy()
X_dts, y_dts = df_dts_fe[col_ins].to_numpy(), df_dts_fe["DTS"].to_numpy()

rgs_dtc.fit(X_dtc, y_dtc)
# Kept as the final expression so the cell still displays the fitted estimator.
rgs_dts.fit(X_dts, y_dts)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features=0.4, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=6, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=6, oob_score=False,
                      random_state=24, verbose=0, warm_start=False)

In [8]:
# Inference: both models predict from the same test feature matrix.
X_test = df_test_fe[col_ins].to_numpy()

# Predictions are stored on df_test itself (this mutates the loaded frame).
df_test["DTC"] = rgs_dtc.predict(X_test)
df_test["DTS"] = rgs_dts.predict(X_test)

# The submission file holds only the two predicted columns, without the index.
df_test.to_csv("tf_submission_1.csv", columns=["DTC", "DTS"], index=False)