In [None]:
import pandas as pd
import os
import gc
import sys
import pickle


# Load the column -> dtype mapping that is later passed to pd.read_csv(dtype=...).
# The context manager closes the handle even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code; only run this against a
# 'dtype_dict.txt' produced by a trusted step of this project.
with open('dtype_dict.txt', 'rb') as dtype_file:
    dtype_dict = pickle.load(dtype_file)

# Names skipped by the per-variable memory report printed in later cells
# (interactive-shell globals plus this list itself).
ipython_vars = [
    "In",
    "Out",
    "exit",
    "quit",
    "get_ipython",
    "ipython_vars",
]

# Columns aggregated with count/last/nunique instead of numeric statistics.
# Order matters: it determines the column order of the categorical aggregate.
categorical = [
    'B_30', 'B_38',
    'D_114', 'D_116', 'D_117', 'D_120', 'D_126',
    'D_63', 'D_64', 'D_66', 'D_68',
]
def mem_usage_gb(df):
    """Return the in-memory size of ``df`` in GiB, rounded to two decimals.

    Uses ``DataFrame.memory_usage(deep=True)`` so the contents of object
    columns are counted, and sums the per-column byte counts.
    """
    total_bytes = df.memory_usage(deep=True).sum()
    bytes_per_gib = 1024 ** 3
    return round(total_bytes / bytes_per_gib, 2)
def file_mem_usage_gb(file):
    """Return the on-disk size of the file at path ``file`` in GiB, rounded to two decimals.

    Raises ``OSError`` (e.g. ``FileNotFoundError``) if the path cannot be stat'ed.
    """
    size_bytes = os.path.getsize(file)
    bytes_per_gib = 1024 ** 3
    return round(size_bytes / bytes_per_gib, 2)

In [None]:
# Read the raw training rows, applying the dtype mapping loaded earlier.
train_data = pd.read_csv('train_data.csv', dtype=dtype_dict)

# Numeric feature names = every column that is neither categorical nor one of
# the two excluded columns. Built from the column labels alone, so no copy of
# the (large) frame is made just to list its columns; original order is kept.
numerical = [
    col
    for col in train_data.columns
    if col not in categorical and col not in ('customer_ID', 'S_2')
]

# One row per customer: five summary statistics for each numeric column.
train_num_agg = train_data.groupby("customer_ID")[numerical].agg(['mean', 'std', 'min', 'max', 'last'])
# Flatten the (column, statistic) MultiIndex into names like '<column>_mean'.
train_num_agg.columns = ['_'.join(x) for x in train_num_agg.columns]
# Turn customer_ID from the index back into a regular column.
train_num_agg = train_num_agg.reset_index()
train_num_agg.head(10)

In [None]:
# One row per customer: count, last value and number of distinct values for
# each categorical column.
cat_stats = ['count', 'last', 'nunique']
train_cat_agg = train_data.groupby("customer_ID")[categorical].agg(cat_stats)
# Flatten the (column, statistic) MultiIndex into names like '<column>_last'.
train_cat_agg.columns = ['_'.join(pair) for pair in train_cat_agg.columns]
# Discard the customer_ID index entirely and fall back to a positional index;
# the ID column is carried by the numeric aggregate instead.
train_cat_agg = train_cat_agg.reset_index(drop=True)
# Remember the aggregated categorical column names for the final cell.
agg_cat_cols = train_cat_agg.columns
train_cat_agg.head(10)

In [None]:
# Join the numeric and categorical aggregates side by side. Both frames carry a
# positional index after their reset_index calls, so rows are matched by position.
# NOTE: this rebinds `train_data` from the raw rows to the aggregated frame.
train_data = pd.concat([train_num_agg, train_cat_agg], axis= 1)

# Write the file before reporting on it. With the write disabled, the messages
# below were either false or crashed with FileNotFoundError on a fresh run.
train_data.to_csv('prep_catboost_train.csv', index=False)
print('Saved preprocessed train data')
print(f'Size of pandas DataFrame in GB: {mem_usage_gb(train_data)}')
print(f'Size of saved CSV in GB: {file_mem_usage_gb("prep_catboost_train.csv")}')

# Snapshot of sys.getsizeof for every public, non-module, non-IPython global,
# largest first, followed by the total in MB (1e6 bytes).
mem = dict(
    sorted(
        [
            (name, sys.getsizeof(globals().get(name)))
            for name in dir()
            if not name.startswith("_") and name not in sys.modules and name not in ipython_vars
        ],
        key=lambda item: item[1],
        reverse=True,
    )
)
print(mem, round(sum(mem.values()) / 1e6, 2))

In [None]:
# Drop every reference to the training frames and force a collection pass so
# their memory can be reclaimed before the test data is loaded.
del train_data, train_cat_agg, train_num_agg
gc.collect()
print('Delete train dataset and do gc')

# Snapshot of sys.getsizeof for every public, non-module, non-IPython global,
# largest first, followed by the total in MB (1e6 bytes).
mem = dict(
    sorted(
        [
            (name, sys.getsizeof(globals().get(name)))
            for name in dir()
            if not name.startswith("_") and name not in sys.modules and name not in ipython_vars
        ],
        key=lambda item: item[1],
        reverse=True,
    )
)
print(mem, round(sum(mem.values()) / 1e6, 2))

In [None]:
# Read the raw test rows with the same dtype mapping used for training.
test_data = pd.read_csv('test_data.csv', dtype=dtype_dict)
per_customer = test_data.groupby("customer_ID")

# Numeric features: five summary statistics per column, flattened to
# '<column>_<statistic>' names, with customer_ID restored as a column.
test_agg_num = per_customer[numerical].agg(['mean', 'std', 'min', 'max', 'last'])
test_agg_num.columns = ['_'.join(pair) for pair in test_agg_num.columns]
test_agg_num = test_agg_num.reset_index()

# Categorical features: count / last / nunique per column, flattened the same
# way; the customer_ID index is discarded in favour of a positional index.
test_agg_cat = per_customer[categorical].agg(['count', 'last', 'nunique'])
test_agg_cat.columns = ['_'.join(pair) for pair in test_agg_cat.columns]
test_agg_cat = test_agg_cat.reset_index(drop=True)

In [None]:
# Join the numeric and categorical aggregates side by side. Both frames carry a
# positional index after their reset_index calls, so rows are matched by position.
# NOTE: this rebinds `test_data` from the raw rows to the aggregated frame.
test_data = pd.concat([test_agg_num, test_agg_cat], axis= 1)

# Write the file before reporting on it. With the write disabled, the messages
# below were either false or crashed with FileNotFoundError on a fresh run.
test_data.to_csv('prep_catboost_test.csv', index=False)
print('Saved preprocessed test data')
print(f'Size of pandas DataFrame in GB: {mem_usage_gb(test_data)}')
print(f'Size of saved CSV in GB: {file_mem_usage_gb("prep_catboost_test.csv")}')

# Snapshot of sys.getsizeof for every public, non-module, non-IPython global,
# largest first, followed by the total in MB (1e6 bytes).
mem = dict(
    sorted(
        [
            (name, sys.getsizeof(globals().get(name)))
            for name in dir()
            if not name.startswith("_") and name not in sys.modules and name not in ipython_vars
        ],
        key=lambda item: item[1],
        reverse=True,
    )
)
print(mem, round(sum(mem.values()) / 1e6, 2))

In [None]:
# Drop every reference to the test frames and force a collection pass so their
# memory can be reclaimed.
del test_data
del test_agg_cat
del test_agg_num
gc.collect()
# The message names the test dataset: this cell deletes the test frames, not
# the training ones.
print('Delete test dataset and do gc')

# Snapshot of sys.getsizeof for every public, non-module, non-IPython global,
# largest first, followed by the total in MB (1e6 bytes).
mem = dict(
    sorted(
        [
            (name, sys.getsizeof(globals().get(name)))
            for name in dir()
            if not name.startswith("_") and name not in sys.modules and name not in ipython_vars
        ],
        key=lambda item: item[1],
        reverse=True,
    )
)
print(mem, round(sum(mem.values()) / 1e6, 2))

In [None]:
# Persist the aggregated categorical column names (captured when the training
# categorical aggregate was built) as a pickled list.
cat_cols = agg_cat_cols.tolist()
# The context manager flushes and closes the file even if pickling raises.
with open('cat_cols.txt', 'wb') as cat_cols_file:
    pickle.dump(cat_cols, cat_cols_file)