In [1]:
# !python -m pip install --upgrade pip
# !pip install numpy
# !pip install pandas
# (itertools is part of the Python standard library - no pip install is needed)
# !pip install catboost
# !pip install scikit-learn

In [2]:
import numpy as np
import pandas as pd
import joblib
import os
import operator
from itertools import combinations
from itertools import product
import gc
import catboost as cat
import multiprocessing
from joblib import Parallel, delayed
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin

# Directory that holds train.csv / test.csv / sample_submission.csv.
# Set the BNP_DATA_DIR environment variable to run on another machine without
# editing this cell; the original location remains the fallback.
DATA_DIR = os.environ.get("BNP_DATA_DIR", r"D:\MyDrive2\pythonprojects\class\ML\BNP\data")
os.chdir(DATA_DIR)

# Load Data

In [3]:
# train_file_path = r"D:\MyDrive\pythonprojects\class\ML\BNP\data\train.csv"
# test_file_path = r"D:\MyDrive\pythonprojects\class\ML\BNP\data\test.csv"
# submission_sample_file_path = r"D:\MyDrive\pythonprojects\class\ML\BNP\data\sample_submission.csv"
# submission_file_path = r"D:\MyDrive\pythonprojects\class\ML\BNP\data\submission.csv"
# good_feature_path = r"D:\MyDrive\pythonprojects\class\ML\BNP\assignment\featimp_20241012_1720_8gpu_num2cat_num999_longparam.csv"

# File names are relative to the working directory selected by os.chdir above.
train_file_path = r"train.csv"
test_file_path = r"test.csv"
submission_sample_file_path = r"sample_submission.csv"
submission_file_path = r"submission.csv"
# good_feature_path = r"D:\MyDrive\pythonprojects\class\ML\BNP\assignment\featimp_20241012_1720_8gpu_num2cat_num999_longparam.csv"


In [4]:
# Read the raw competition files.
train = pd.read_csv(train_file_path)
test = pd.read_csv(test_file_path)

# Keep the label aside before the feature frame is trimmed.
y = train['target']

# ID is useless as a feature; v74 and v3 are dropped as well
# (the original note calls them "univariate").
train = train.drop(columns=['target', 'ID', 'v74', 'v3'])
test = test.drop(columns=['ID', 'v74', 'v3'])

# train = train[:50]
# test = test[:50]
# y = y[:50]


## Convert Low Cardinality Numerical to Categorical

In [5]:
# Treat the numerical features v62, v129, v38 and v72 as categorical:
# cast to pandas' string dtype first, then to plain Python objects.
num_2_cat_list = ['v62', 'v129', 'v38', 'v72']
for frame in (train, test):
    for c in num_2_cat_list:
        frame[c] = frame[c].astype('string').astype('O')

In [6]:
## Calculate feature's correlation with target, remove useless features

In [7]:
def calc_rel(df: pd.DataFrame, target_col: pd.Series, list1: list) -> list:
    """Rank columns by the absolute value of their correlation with the target.

    Numeric columns are correlated as they are. Every other dtype (object,
    string, category, ...) is first mapped to integer category codes, so the
    resulting number depends on the code ordering and is only a rough signal.

    Args:
        df: frame holding the candidate columns.
        target_col: target series; ``Series.corr`` aligns it with each column
            on the index.
        list1: names of the columns in ``df`` to evaluate.

    Returns:
        A list of ``(column, abs_correlation)`` tuples sorted from the strongest
        to the weakest correlation. Columns whose correlation is NaN are left out.
    """
    correlations = {}
    for col in list1:
        values = df[col]
        # Check "is it numeric?" rather than "is it object?": string and
        # category columns are not object dtype but still need encoding.
        if not pd.api.types.is_numeric_dtype(values):
            values = values.astype('category').cat.codes
        corr = target_col.corr(values)
        if not np.isnan(corr):
            correlations[col] = abs(corr)
    return sorted(correlations.items(), key=lambda item: item[1], reverse=True)


In [8]:
# Correlation is computed directly on the numerical columns.
num_cols = train.select_dtypes(include=['number']).columns
# Keep the 30 numerical features most correlated with the target.
# NOTE(review): an earlier comment mentioned a 0.03 threshold, but the code
# has always selected by rank (top 30) - confirm which one is intended.
num_corr = pd.DataFrame(calc_rel(train, y, num_cols)[:30], columns=['feature', 'importance'])

In [9]:
# From here on num_cols is the list of retained numerical feature names.
num_cols = num_corr['feature'].to_list()

In [10]:
# Round every retained numerical feature to 2 decimals in both frames.
for frame in (train, test):
    for col in num_cols:
        frame[col] = np.round(frame[col], 2)

In [11]:
# Rank all non-numeric columns by correlation with the target; none are dropped
# here except those for which calc_rel could not produce a value.
non_numeric_cols = train.select_dtypes(exclude=['number']).columns
cat_corr = pd.DataFrame(calc_rel(train, y, non_numeric_cols), columns=['feature', 'correlation'])
cat_cols = cat_corr['feature'].to_list()

In [12]:
# Numerical features first, then categorical ones, each in ranked order.
sorted_num_list = list(num_cols)
sorted_cat_list = list(cat_cols)
keep_cols = [*sorted_num_list, *sorted_cat_list]

In [13]:
# Keep only the selected features in train and test.
# .copy() makes both frames independent objects: later cells assign into them
# column by column, which on a bare column selection can trigger pandas'
# chained-assignment (SettingWithCopy) warnings.
train = train[keep_cols].copy()
test = test[keep_cols].copy()

In [14]:
# Missing categorical values become the literal category 'NA'.
cat_fill = {col: 'NA' for col in sorted_cat_list}
train = train.fillna(cat_fill)
test = test.fillna(cat_fill)
    

In [15]:
# Missing numerical values are replaced by the sentinel -999.
num_fill = dict.fromkeys(sorted_num_list, -999)
train = train.fillna(num_fill)
test = test.fillna(num_fill)

# After both fill steps no missing value may remain.
assert (train.isnull().sum().sum() == 0)
assert (test.isnull().sum().sum() == 0)

In [16]:
#get 99% percent importance
def get_n_per_corr(org_df: pd.DataFrame, target_per: float = 0.99) -> pd.DataFrame:
    """Keep the strongest rows that together hold ``target_per`` of the total correlation.

    The frame is sorted by its ``correlation`` column in descending order (the
    caller does not have to pre-sort it), the column is normalised so that it
    sums to 1, and rows are kept from the top until the running sum first
    reaches ``target_per``. If the running sum never reaches it, every row is kept.

    Args:
        org_df: frame with a ``correlation`` column; it is not modified.
        target_per: share of the total correlation to retain.

    Returns:
        The leading rows of the sorted frame, with ``correlation`` holding the
        normalised values. An empty input yields an empty frame.

    Side effects:
        Prints the index of the last kept row, the cumulative share reached
        and the shape of the result (nothing is printed for an empty input).
    """
    df = org_df.sort_values(by='correlation', ascending=False)
    if df.empty:
        # Nothing to accumulate; the loop-based version failed here because
        # its index variable was never bound.
        return df
    # normalization
    df['correlation'] = df['correlation'] / np.sum(df['correlation'])
    cum_relation = df['correlation'].cumsum().to_numpy()
    reached = np.flatnonzero(cum_relation >= target_per)
    # Fall back to the last row when the target share is never reached.
    i = int(reached[0]) if reached.size else len(df) - 1
    new_df = df.iloc[0:i + 1]
    print(i, cum_relation[i], new_df.shape)
    return new_df

In [17]:
# 2-way combinations of categorical features: the two string values are
# concatenated (second column first) and the new column is named the same way.
# NOTE(review): no separator is used, so e.g. 'A'+'BC' and 'AB'+'C' produce the
# same combined value - confirm this is acceptable.
c2 = list(combinations(sorted_cat_list, 2))
names_cs = [second + first for first, second in c2]
train_c2 = pd.concat([train[second] + train[first] for first, second in c2], axis=1, keys=names_cs)
test_c2 = pd.concat([test[second] + test[first] for first, second in c2], axis=1, keys=names_cs)

In [18]:
# v22 combined with every pair of the other categorical features.
catcol_without_v22 = [name for name in sorted_cat_list if name != 'v22']

# Build the pairs that exclude v22 first, then attach v22 to each pair.
cc2_no_v22 = list(combinations(catcol_without_v22, 2))
v22_cc2 = list(product(['v22'], cc2_no_v22))
column_names = [f"{anchor}_{left}_{right}" for anchor, (left, right) in v22_cc2]

train_v22c2 = pd.concat(
    [train[anchor] + train[left] + train[right] for anchor, (left, right) in v22_cc2],
    axis=1, keys=column_names)

test_v22c2 = pd.concat(
    [test[anchor] + test[left] + test[right] for anchor, (left, right) in v22_cc2],
    axis=1, keys=column_names)
# Only the last expression of a cell is displayed, so preview the test frame here.
test_v22c2.head()

Unnamed: 0,v22_v110_v47,v22_v110_v31,v22_v110_v129,v22_v110_v113,v22_v110_v62,v22_v110_v72,v22_v110_v38,v22_v110_v66,v22_v110_v79,v22_v110_v56,...,v22_v91_v75,v22_v91_v71,v22_v91_v125,v22_v91_v52,v22_v75_v71,v22_v75_v125,v22_v75_v52,v22_v71_v125,v22_v71_v52,v22_v125_v52
0,AFPBAJ,AFPBAA,AFPBA0,AFPBANA,AFPBA1,AFPBA1,AFPBA0,AFPBAA,AFPBAQ,AFPBAAF,...,AFPBBD,AFPBBF,AFPBBAF,AFPBBC,AFPBDF,AFPBDAF,AFPBDC,AFPBFAF,AFPBFC,AFPBAFC
1,FOGBC,FOGBA,FOGB0,FOGBNA,FOGB1,FOGB5,FOGB4,FOGBC,FOGBE,FOGBDI,...,FOGCD,FOGCF,FOGCI,FOGCA,FOGDF,FOGDI,FOGDA,FOGFI,FOGFA,FOGIA
2,HXZAI,HXZAA,HXZA0,HXZAAJ,HXZA1,HXZA1,HXZA0,HXZAC,HXZAC,HXZADO,...,HXZCD,HXZCF,HXZCAV,HXZCA,HXZDF,HXZDAV,HXZDA,HXZFAV,HXZFA,HXZAVA
3,AGMUAI,AGMUAA,AGMUA0,AGMUAG,AGMUA1,AGMUA1,AGMUA0,AGMUAB,AGMUAC,AGMUACN,...,AGMUBD,AGMUBF,AGMUBB,AGMUBA,AGMUDF,AGMUDB,AGMUDA,AGMUFB,AGMUFA,AGMUBA
4,AWWAI,AWWAA,AWWA0,AWWAI,AWWA2,AWWA2,AWWA0,AWWAB,AWWAC,AWWANA,...,AWWBD,AWWBF,AWWBA,AWWBH,AWWDF,AWWDA,AWWDH,AWWFA,AWWFH,AWWAH


In [19]:
# Append the 2-way and v22-based 3-way categorical combinations as new columns.
train = pd.concat((train, train_c2, train_v22c2), axis='columns')
test = pd.concat((test, test_c2, test_v22c2), axis='columns')

In [20]:
# The combination frames are now part of train/test; release the extra copies.
del train_c2, train_v22c2, test_c2, test_v22c2
gc.collect()

10

# v22 + 3-way combinations of the other categorical features (V22+C3)

In [21]:
# Every triple of categorical features (v22 excluded), each paired with v22.
cc3_no_v22 = list(combinations(catcol_without_v22, 3))
v22_cc3 = [('v22', triple) for triple in cc3_no_v22]
len(v22_cc3)

1140

In [22]:
# Column name: v22 and the three partner names joined by underscores.
# Column value: the four string values concatenated in the same order.
column_names = ['_'.join((anchor, *triple)) for anchor, triple in v22_cc3]
train_cc3 = pd.concat(
    [train[anchor] + train[first] + train[second] + train[third]
     for anchor, (first, second, third) in v22_cc3],
    axis=1, keys=column_names)
train_cc3.shape

(114321, 1140)

In [23]:
# Correlation of every v22-based 4-way combination with the target.
corr_list = calc_rel(train_cc3, y, list(train_cc3.columns))
corr_df = pd.DataFrame(data=corr_list, columns=['feature', 'correlation'])

In [24]:
# Keep the combinations that together account for 99% of the summed correlation.
p99_corr_train_cc3 = get_n_per_corr(corr_df, target_per=0.99)
# p99_corr = corr_df

1123 0.9905946819599096 (1124, 2)


In [25]:
# Attach only the selected combinations to train, then drop the wide helper frame.
selected_cc3 = p99_corr_train_cc3['feature'].to_list()
train = pd.concat([train, train_cc3[selected_cc3]], axis=1)
del train_cc3
gc.collect()

0

In [26]:
# Build the same v22-based combinations for test and keep exactly the columns
# that were selected on train.
# The column keys are rebuilt from v22_cc3 here instead of reading the global
# `column_names`, which a later cell overwrites with the v50 names; that made
# this cell break when it was re-run after the numerical section.
cc3_names = ['_'.join((anchor, *triple)) for anchor, triple in v22_cc3]
test_cc3 = pd.concat(
    [test[anchor] + test[first] + test[second] + test[third]
     for anchor, (first, second, third) in v22_cc3],
    axis=1, keys=cc3_names)
test_cc3 = test_cc3[p99_corr_train_cc3['feature'].to_list()]
test = pd.concat([test, test_cc3], axis=1)
del test_cc3
gc.collect()

0

# Combinations of numerical features

In [27]:
# 2-way combinations of numerical features: the pair is summed and the new
# column is named by concatenating the two names (second column first).
# NOTE(review): missing values were filled with -999 earlier, so that sentinel
# takes part in these sums.
n2 = list(combinations(sorted_num_list, 2))
names_cs = [later + earlier for earlier, later in n2]
train_n2 = pd.concat([train[later] + train[earlier] for earlier, later in n2], axis=1, keys=names_cs)
test_n2 = pd.concat([test[later] + test[earlier] for earlier, later in n2], axis=1, keys=names_cs)

In [28]:
# v50 combined with every pair of the other numerical features.
numcol_without_v50 = [name for name in sorted_num_list if name != 'v50']

# Build the pairs that exclude v50 first, then attach v50 to each pair.
nc2_no_v50 = list(combinations(numcol_without_v50, 2))
v50_nc2 = list(product(['v50'], nc2_no_v50))
column_names = [f"{anchor}_{left}_{right}" for anchor, (left, right) in v50_nc2]

# Each new feature is the sum v50 + left + right.
train_v50n2 = pd.concat(
    [train[anchor] + train[left] + train[right] for anchor, (left, right) in v50_nc2],
    axis=1, keys=column_names)

test_v50n2 = pd.concat(
    [test[anchor] + test[left] + test[right] for anchor, (left, right) in v50_nc2],
    axis=1, keys=column_names)
# Only the last expression of a cell is displayed, so preview the test frame here.
test_v50n2.head()

Unnamed: 0,v50_v10_v14,v50_v10_v34,v50_v10_v114,v50_v10_v21,v50_v10_v4,v50_v10_v119,v50_v10_v123,v50_v10_v48,v50_v10_v12,v50_v10_v106,...,v50_v121_v88,v50_v121_v36,v50_v121_v111,v50_v121_v65,v50_v88_v36,v50_v88_v111,v50_v88_v65,v50_v36_v111,v50_v36_v65,v50_v111_v65
0,13.66,6.66,13.68,8.87,5.63,1.43,3.25,14.2,8.39,13.12,...,5.17,15.57,6.96,19.0,14.08,5.47,17.51,15.87,27.91,19.3
1,13.08,10.71,18.6,8.22,-996.68,-996.68,-996.68,-996.68,8.94,-996.68,...,-1996.97,-1996.97,-1996.97,-1996.97,-1996.97,-1996.97,-1996.97,-1996.97,-1996.97,-1996.97
2,15.73,10.38,18.04,10.67,6.96,2.42,4.39,15.19,9.31,14.17,...,9.05,15.54,6.46,19.12,18.75,9.67,22.33,16.16,28.82,19.74
3,12.93,8.75,16.23,8.07,4.19,20.77,10.69,19.32,9.39,8.49,...,5.53,18.84,9.03,17.47,15.12,5.31,13.75,18.62,27.06,17.25
4,13.89,8.02,15.94,9.28,7.18,3.2,3.87,13.12,8.81,17.21,...,5.4,13.98,5.0,20.66,14.53,5.55,21.21,14.13,29.79,20.81


In [29]:
# Correlation of every v50-based 3-way sum with the target.
corr_list = calc_rel(train_v50n2, y, list(train_v50n2.columns))
corr_df = pd.DataFrame(data=corr_list, columns=['feature', 'correlation'])

In [30]:
# Keep the v50-based sums that together account for 99% of the summed correlation.
p99_corr_v50n2 = get_n_per_corr(corr_df, target_per=0.99)
# p99_corr = corr_df

393 0.9900523716723958 (394, 2)


In [31]:
# Attach all 2-way sums plus the selected v50-based sums to train, then free them.
kept_v50n2 = p99_corr_v50n2['feature'].to_list()
train = pd.concat([train, train_n2, train_v50n2[kept_v50n2]], axis=1)
del train_n2, train_v50n2
gc.collect()

10

# v50+n2

In [32]:
# Mirror the train step: all 2-way sums plus the v50-based sums selected on train.
kept_v50n2_test = p99_corr_v50n2['feature'].to_list()
test = pd.concat([test, test_n2, test_v50n2[kept_v50n2_test]], axis=1)
# test.shape

In [33]:
# The numerical combination frames are now part of test; release them.
del test_n2, test_v50n2
gc.collect()

0

In [34]:
# Train and test must expose the same feature columns before modelling.
# The earlier per-column loop asserted a constant True and could never fail;
# the checks below actually report any column present on one side only.
assert train.shape[1] == test.shape[1]
missing_in_train = [col for col in test.columns if col not in train.columns]
missing_in_test = [col for col in train.columns if col not in test.columns]
assert not missing_in_train, f"columns only in test: {missing_in_train[:10]}"
assert not missing_in_test, f"columns only in train: {missing_in_test[:10]}"

In [35]:
# Final column inventory: non-numeric names (passed to CatBoost as categorical
# features), numeric names, and the full ordered column list.
non_numeric = train.select_dtypes(exclude=['number'])
numeric = train.select_dtypes(include=['number'])
cat_col = non_numeric.columns.values
num_col = numeric.columns.values
cols = list(train.columns)

# Optional: restrict the columns to a saved list of important features

In [36]:
# featimp_df = pd.read_csv("featimp_20241012_1720_8gpu_num2cat_num999_longparam.csv")
# featimp_df = featimp_df.sort_values(by=['importance'],ascending=False)
# important_features = featimp_df['feature'].to_list()
# cols = [i for i in train.columns if i in important_features]
# cols_cat = [i for i in cat_col if i in important_features]
# cols = [i for i in train.columns if i in important_features]
# cat_col = [i for i in cat_col if i in important_features]


In [37]:

# train = train[cols] #not necessary
# test = test[cols] #not necessary
# train_data = cat.Pool(train, label=y,cat_features=cat_col)
# test_data = cat.Pool(test,cat_features=cat_col)




In [38]:
# CatBoost settings used by both the standalone fit and the grid-search pipeline.
# Disabled options kept for reference: l2_leaf_reg=3, random_seed=432013,
# subsample=0.66, od_type="Iter", rsm=0.2, border_count=128.
params = dict(
    loss_function="Logloss",
    eval_metric="Logloss",
    learning_rate=0.02,
    iterations=2800,
    depth=6,
    task_type="GPU",
    cat_features=cat_col,
    verbose=100,
)

In [39]:
# Train one CatBoost classifier on the full training frame with the settings in
# `params`, logging every 100 iterations. No validation set is passed here.
model = cat.CatBoostClassifier(**params)
fit_model = model.fit(train,y,verbose=100)

0:	learn: 0.6836150	total: 2.04s	remaining: 1h 35m 10s
100:	learn: 0.4705688	total: 2m 34s	remaining: 1h 8m 58s
200:	learn: 0.4549213	total: 4m 57s	remaining: 1h 4m 12s
300:	learn: 0.4494758	total: 7m 21s	remaining: 1h 1m 8s
400:	learn: 0.4462807	total: 9m 47s	remaining: 58m 34s
500:	learn: 0.4439359	total: 12m 16s	remaining: 56m 20s
600:	learn: 0.4422005	total: 14m 42s	remaining: 53m 49s
700:	learn: 0.4406861	total: 17m 8s	remaining: 51m 18s
800:	learn: 0.4392553	total: 19m 34s	remaining: 48m 52s
900:	learn: 0.4378327	total: 21m 59s	remaining: 46m 20s
1000:	learn: 0.4365857	total: 24m 19s	remaining: 43m 43s
1100:	learn: 0.4352196	total: 26m 49s	remaining: 41m 23s
1200:	learn: 0.4339789	total: 29m 17s	remaining: 38m 59s
1300:	learn: 0.4327748	total: 31m 45s	remaining: 36m 35s
1400:	learn: 0.4315866	total: 34m 19s	remaining: 34m 16s
1500:	learn: 0.4304853	total: 36m 40s	remaining: 31m 44s
1600:	learn: 0.4294153	total: 39m 3s	remaining: 29m 14s
1700:	learn: 0.4283003	total: 41m 24s	remai

In [40]:
# Module-level counter of transform() calls, shared by all preprocessor instances.
ite_round = 0


class CustomDataPreprocessor(BaseEstimator, TransformerMixin):
    """Pass-through pipeline step that reports how many times it has been invoked.

    The data is never modified; the step only increments the global
    ``ite_round`` counter and prints it each time ``transform`` runs.
    """

    def __init__(self, **kwargs):
        # Nothing is stored: keyword arguments are accepted and ignored.
        pass

    def fit(self, X, y=None, **kwargs):
        """Learn nothing and return ``self`` so the step can be chained."""
        return self

    def transform(self, X, y=None, **kwargs):
        """Increment and print the global call counter, then return ``X`` unchanged."""
        global ite_round
        ite_round = ite_round + 1
        print(f"This is the {ite_round=:}")
        return X



In [41]:
from sklearn.pipeline import Pipeline, make_pipeline

# Search grid for the pipeline's CatBoost step (keys use the
# "<step name>__<parameter>" form). Each range currently yields one value
# (iterations=2800, depth=6), so the search evaluates a single candidate.
param_grid_1 = {
    "catboostclassifier__iterations": np.arange(2800,2900,100),
    # "catboostclassifier__learning_rate": np.arange(0.01,0.05,0.01),
    "catboostclassifier__depth": np.arange(6,7,1)
    }

def pipeforcat():
    """Build a fresh pipeline: the pass-through preprocessor followed by CatBoost.

    The classifier is configured from the notebook-level ``params`` dict.
    """
    steps = (CustomDataPreprocessor(), cat.CatBoostClassifier(**params))
    return make_pipeline(*steps)

In [42]:
# Instantiate the pipeline and display it.
piped = pipeforcat()
piped

In [43]:
# List the parameter names that a grid for this pipeline can address.
piped.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'customdatapreprocessor', 'catboostclassifier', 'catboostclassifier__iterations', 'catboostclassifier__learning_rate', 'catboostclassifier__depth', 'catboostclassifier__loss_function', 'catboostclassifier__verbose', 'catboostclassifier__eval_metric', 'catboostclassifier__task_type', 'catboostclassifier__cat_features'])

In [44]:
# piped.fit(train,y)

In [45]:
from sklearn.model_selection import GridSearchCV


# grid = GridSearchCV(cat.CatBoostClassifier(**params), param_grid_1, cv=2)
# 3-fold grid search over the pipeline; any failing fit raises immediately.
# NOTE(review): no `scoring` argument is given, so candidates are ranked by the
# estimator's default score rather than by Logloss - confirm this is intended.
grid = GridSearchCV(piped, param_grid_1, cv=3, error_score='raise', verbose=3)

In [46]:
# Run the search on the full training frame (long-running: see the log below).
grid_model = grid.fit(train,y)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
This is the ite_round=1
0:	learn: 0.6836823	total: 2.11s	remaining: 1h 38m 15s
100:	learn: 0.4728697	total: 2m 42s	remaining: 1h 12m 28s
200:	learn: 0.4586552	total: 5m 11s	remaining: 1h 7m 11s
300:	learn: 0.4531660	total: 7m 40s	remaining: 1h 3m 40s
400:	learn: 0.4497021	total: 10m 15s	remaining: 1h 1m 20s
500:	learn: 0.4470320	total: 12m 44s	remaining: 58m 25s
600:	learn: 0.4446674	total: 15m 11s	remaining: 55m 33s
700:	learn: 0.4425868	total: 17m 39s	remaining: 52m 52s
800:	learn: 0.4405637	total: 20m 7s	remaining: 50m 13s
900:	learn: 0.4387153	total: 22m 33s	remaining: 47m 32s
1000:	learn: 0.4369661	total: 24m 58s	remaining: 44m 53s
1100:	learn: 0.4353279	total: 27m 27s	remaining: 42m 22s
1200:	learn: 0.4337258	total: 29m 54s	remaining: 39m 48s
1300:	learn: 0.4322088	total: 32m 19s	remaining: 37m 15s
1400:	learn: 0.4306207	total: 34m 44s	remaining: 34m 41s
1500:	learn: 0.4289699	total: 37m 14s	remaining: 32m 13s
1600:	learn

In [47]:
# Parameter combination selected by the search.
grid.best_params_

{'catboostclassifier__depth': 6, 'catboostclassifier__iterations': 2800}

In [48]:
# Per-candidate timing and per-fold scores of the search.
grid.cv_results_

{'mean_fit_time': array([4590.27944557]),
 'std_fit_time': array([139.78377339]),
 'mean_score_time': array([41.59855223]),
 'std_score_time': array([4.12812302]),
 'param_catboostclassifier__depth': masked_array(data=[6],
              mask=[False],
        fill_value=999999),
 'param_catboostclassifier__iterations': masked_array(data=[2800],
              mask=[False],
        fill_value=999999),
 'params': [{'catboostclassifier__depth': 6,
   'catboostclassifier__iterations': 2800}],
 'split0_test_score': array([0.79439473]),
 'split1_test_score': array([0.79439473]),
 'split2_test_score': array([0.79678274]),
 'mean_test_score': array([0.79519073]),
 'std_test_score': array([0.00112572]),
 'rank_test_score': array([1])}

In [49]:
# y_pred = grid.best_estimator_[1].predict_proba(test_data)


In [50]:
# Class probabilities for the test set.
# NOTE(review): at this point `fit_model` is the standalone model trained before
# the grid search; the name is rebound to the grid's best estimator only in a
# later cell. Confirm that this is the model meant for the submission.
y_pred = fit_model.predict_proba(test)

In [51]:
# One row per test sample, one column per class.
y_pred.shape

(114393, 2)

In [52]:
# Start from the sample submission so the row order and ID column match.
# Use the path constant defined in the configuration cell instead of repeating
# the file name as a literal.
submission = pd.read_csv(submission_sample_file_path)
# Column 1 of predict_proba holds the probability of the positive class.
submission['PredictedProb'] = y_pred[:,1]
submission.to_csv('submission_20241013_1900_grid_.csv', index=False)

In [53]:
# NOTE(review): neither fit call in this notebook passes a validation set -
# confirm that a "best iteration" is meaningful for this model.
fit_model.get_best_iteration()

In [54]:
# Persist the cross-validation results table next to the data files.
pd.DataFrame(grid.cv_results_).to_csv("cv_results_.csv",index=True)

In [55]:
# Save the whole fitted GridSearchCV object (it can be restored with joblib.load).
joblib.dump(grid, 'grid_search_model.pkl')
# Load the model
# loaded_model = joblib.load('grid_search_model.pkl')

['grid_search_model.pkl']

In [56]:
# Save the feature importances of the grid search's best CatBoost model.
# Step index 1 of the best pipeline is the classifier (index 0 is the preprocessor).
# Note that `fit_model` is rebound here to that classifier.
fit_model = grid_model.best_estimator_[1]
features = fit_model.feature_names_
feature_importances = fit_model.get_feature_importance()
featimp_df = pd.DataFrame({'importance': feature_importances, 'feature': features})
featimp_df = featimp_df.sort_values(by='importance', ascending=False)
featimp_df.to_csv('featimp_20241012_2100_8gpu_runpod_grid_.csv', index=True)

In [57]:
# Load the model back
# fit_model = cat.CatBoostClassifier()
# fit_model.load_model("model_20241012_1720_8gpu_num2cat_num999_longparam.cbm")
# best_estimator = fit_model.best_estimator_[1]
# features = best_estimator.feature_names_
# feature_importances = best_estimator.get_feature_importance()
# featimp_df = pd.DataFrame({'importance':feature_importances, 'feature':features}).sort_values(by=['importance'],ascending=False)
# featimp_df.to_csv('featimp_20241012_2100_8gpu_runpod_grid_2100_6.csv', index=True)
# important_features = featimp_df['feature'].to_list()

In [58]:
# Save whichever model `fit_model` currently refers to.
joblib.dump(fit_model, 'fit_model.pkl')

['fit_model.pkl']

In [59]:
# Export the importances of the current `fit_model`, most important feature first.
features = fit_model.feature_names_
feature_importances = fit_model.get_feature_importance()
importance_table = pd.DataFrame({'importance': feature_importances, 'feature': features})
featimp_df = importance_table.sort_values(by='importance', ascending=False)
featimp_df.to_csv('featimp.csv', index=True)