In [None]:
import gc
import os
import pickle
import sys

import catboost
import joblib
import numpy as np
import pandas as pd

# Load the list of categorical column names saved by the pre-processing step.
# SECURITY: pickle.load can execute arbitrary code; only load files you created yourself.
# The context manager closes the handle even if unpickling raises.
with open('cat_cols.txt', 'rb') as file:
    cat_cols = pickle.load(file)


# helper functions: categorical encoding, dtype conversion and memory reporting
def encode_cat(x):
    """Encode a single categorical value.

    Parameters
    ----------
    x : any
        One cell value of a categorical column.

    Returns
    -------
    str or the original value
        ``str(x)`` when ``x`` is a Python ``float`` (this includes NaN, which
        becomes ``'nan'``); every other value is returned unchanged.
    """
    if isinstance(x, float):
        return str(x)
    # Previously the function fell through and returned None here, which would
    # erase existing str/int categories when mapped over a column.
    return x
def dt_converter(dtype):
    """Return the name of the compact dtype to use instead of ``dtype``.

    ``float64`` maps to ``'float16'``, ``int64`` maps to ``'int16'`` and
    anything else maps to ``'object'``.
    """
    # Equality checks (rather than a dict lookup) are used on purpose: the
    # caller passes numpy dtype objects, which compare equal to their string
    # names but would not be found as dict keys.
    downcasts = (('float64', 'float16'), ('int64', 'int16'))
    for wide, narrow in downcasts:
        if dtype == wide:
            return narrow
    return 'object'

def mem_usage_gb(df):
    """Return the deep memory usage of ``df`` in GiB, rounded to 2 decimals."""
    bytes_per_gb = 1024 ** 3
    total_bytes = df.memory_usage(deep=True).sum()
    return round(total_bytes / bytes_per_gb, 2)

def file_mem_usage_gb(file_name):
    """Return the on-disk size of ``file_name`` in GiB, rounded to 2 decimals."""
    bytes_per_gb = 1024 ** 3
    size_bytes = os.path.getsize(file_name)
    return round(size_bytes / bytes_per_gb, 2)

I pre-processed the data in a separate file and saved it to CSV, so here I only have to load it.
However, there is an issue with dtypes:
previously I had a `dtypes_dict` to convert float64 to float16 and reduce memory usage,
but the columns are now named differently, so the conversion has to be redone.

In [None]:
# First pass: load with pandas' default dtypes to get a memory baseline.
train_data = pd.read_csv('prep_catboost_train.csv')
display(train_data.head())

# Compare the in-memory footprint with the size of the CSV on disk.
in_memory_gb = mem_usage_gb(train_data)
on_disk_gb = file_mem_usage_gb('prep_catboost_train.csv')
print('DataFrame memory usage:', in_memory_gb, 'GB    ', 'File size:', on_disk_gb, 'GB')

The dataset is too big: it should be around 1 GB, but it takes 3.23 GB in memory.
This is because the default dtypes are float64 and int64; let's deal with that.

In [None]:
# Collect the dtype of every column; a later cell reads `dt` to build the dtype mapping.
dt = train_data.dtypes
# Show the distinct dtypes present in the frame.
dt.unique()

There are only 3 dtypes in the DataFrame, so converting should be easy.
Create a dict {col_name: dtype} to apply the conversion while reading from the file.

In [None]:
# Map every column name to its compact dtype, ready to be passed to read_csv(dtype=...).
dtype_dict = {
    col_name: dt_converter(col_dtype)
    for col_name, col_dtype in zip(dt.keys().tolist(), dt.values.tolist())
}
# The dtype Series is no longer needed; drop it and run a collection pass.
del dt
gc.collect()
dtype_dict

Now load the DataFrame again, using dtype_dict to convert the dtypes,
and compare the memory usage.

In [None]:
# Second pass: reload with the compact dtypes and report the new footprint.
train_data = pd.read_csv('prep_catboost_train.csv', dtype=dtype_dict)
in_memory_gb = mem_usage_gb(train_data)
on_disk_gb = file_mem_usage_gb('prep_catboost_train.csv')
print('DataFrame memory usage:', in_memory_gb, 'GB    ', 'File size:', on_disk_gb, 'GB')

In [None]:
# Read the labels, discard the ID column and flatten what is left into a 1-D array.
train_labels = np.ravel(
    pd.read_csv('train_labels.csv', dtype={'target': 'int8'})
    .drop(columns=['customer_ID'])
)

Categorical features for CatBoost should be str or int, yet some are float.
Let's change that.

In [None]:
# Per categorical column: number of missing values and current dtype.
nulls = train_data[cat_cols].isna().sum()
types = train_data[cat_cols].dtypes
cat_stats = (
    pd.concat([nulls, types], axis=1)
    .rename(columns={0: "NaN_count", 1: "type"})
)
cat_stats

In [None]:
# A categorical column needs re-encoding if it has missing values or is stored as float16.
has_missing = cat_stats['NaN_count'] != 0
is_float16 = cat_stats['type'] == 'float16'
cols_to_encode = cat_stats.loc[has_missing | is_float16].index.tolist()
cols_to_encode

In [None]:
# Convert the selected categorical columns to strings.
# Note: str() is applied to missing values too, so NaN ends up as the literal
# string 'nan'. The test-set cell encodes missing values the same way; if the
# placeholder is ever changed, change it in both cells together.
for col in cols_to_encode:
    train_data[col] = train_data[col].astype('object').map(str)

# Assign the result back. The earlier `train_data[cat_cols].fillna(..., inplace=True)`
# filled a temporary copy returned by `train_data[cat_cols]` and never changed train_data.
# After the loop above there should be no NaN left, so this is only a safety net.
train_data[cat_cols] = train_data[cat_cols].fillna('no_data')

# errors='ignore' keeps the cell re-runnable once customer_ID has already been dropped.
train_data.drop(columns=['customer_ID'], inplace=True, errors='ignore')
train_data[cat_cols].dtypes

In [None]:
# Re-compute the per-column missing counts and dtypes to verify the encoding step.
nulls = train_data[cat_cols].isna().sum()
types = train_data[cat_cols].dtypes
cat_stats = (
    pd.concat([nulls, types], axis=1)
    .rename(columns={0: "NaN_count", 1: "type"})
)
cat_stats

In [None]:
# Hyper-parameters that are actually passed to the model.
# The previous `params` dict held keys such as 'boosting_type' and 'num_leaves'
# that were never given to CatBoost, yet the dict is written to the
# `<model>_params.txt` file when the model is saved, so the saved file did not
# describe the trained model. The model configuration itself is unchanged.
# 'task_type': 'GPU' was listed there too but never applied; add it here if
# GPU training is intended.
params = {
    'bagging_temperature': 0.2,
    'od_type': 'Iter',
    'metric_period': 50,
    'od_wait': 20,
    'iterations': 3000,
}

# NOTE(review): the target is loaded as int8 and looks binary, but a regressor
# is trained, so predictions are raw regression outputs rather than
# probabilities - confirm this is intended (CatBoostClassifier + predict_proba
# is the usual alternative).
# NOTE(review): no eval_set is passed to fit(), so the od_type/od_wait
# overfitting detector presumably has nothing to monitor - confirm against the
# CatBoost documentation.
model = catboost.CatBoostRegressor(**params)
model.fit(train_data, train_labels, cat_features=cat_cols)

In [None]:
# Load the test set with the same compact dtypes that were used for training.
test_data = pd.read_csv('prep_catboost_test.csv', dtype=dtype_dict)

# Find the categorical columns that need re-encoding (missing values or float16 storage).
nulls = test_data[cat_cols].isna().sum()
types = test_data[cat_cols].dtypes
cat_stats = pd.concat([nulls, types], axis=1).rename(columns={0: "NaN_count", 1: "type"})
cols_to_encode = cat_stats.loc[(cat_stats['NaN_count'] != 0) | (cat_stats['type'] == 'float16')].index.tolist()

# Convert them to strings. str() is applied to missing values too, so NaN ends
# up as the literal string 'nan', matching how the training cell encodes them.
# If the placeholder is ever changed, change it in both cells together.
for col in cols_to_encode:
    test_data[col] = test_data[col].astype('object').map(str)

# Assign the result back. The earlier `test_data[cat_cols].fillna(..., inplace=True)`
# filled a temporary copy returned by `test_data[cat_cols]` and never changed test_data.
# After the loop above there should be no NaN left, so this is only a safety net.
test_data[cat_cols] = test_data[cat_cols].fillna('no_data')

# errors='ignore' keeps the cell re-runnable once customer_ID has already been dropped.
test_data.drop(columns=['customer_ID'], inplace=True, errors='ignore')
test_data[cat_cols].dtypes

In [None]:
# Predict on the test set and pair each prediction with the customer_ID taken
# from the sample submission.
# NOTE(review): this assumes sample_submission.csv lists customers in the same
# row order as prep_catboost_test.csv - confirm.
sample = pd.read_csv('sample_submission.csv')
predictions = model.predict(test_data)
output_file = pd.DataFrame({'customer_ID': sample.customer_ID, 'prediction': predictions})

# Ask for every name up front so that nothing is written to disk if the user
# aborts at a prompt. input() already returns str.
dir_name = input('Specify directory name: ')
model_name = input('Specify model name: ')
sub_name = 'submission_' + input("Specify sub name: ") + '.csv'

full_model_name = model_name + '.joblib'
directory = 'total_output_' + dir_name
# NOTE(review): machine-specific absolute path; consider moving it to a config cell.
parent_dir = 'C:/Users/boomb/DataspellProjects/dsProject_1/'
path = os.path.join(parent_dir, directory)
# os.mkdir raises FileExistsError if the run directory already exists, which
# protects earlier outputs from being overwritten.
os.mkdir(path)

# Write the artifacts directly. Pre-creating empty files with open(..., 'x')
# was redundant in a freshly created directory and left empty files behind
# whenever a later write failed.
with open(os.path.join(path, (model_name + '_params.txt')), 'w') as fp:
    fp.write(str(params))
output_file.to_csv(os.path.join(path, sub_name), index=False)
joblib.dump(model, os.path.join(path, full_model_name))

In [None]:
# Sanity check: display the (rows, columns) shape of the submission frame.
output_file.shape