In [1]:
import gc
import warnings
warnings.filterwarnings('ignore')
import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
import itertools

In [2]:
chunksize = 1000000  # number of CSV rows parsed per chunk

# Stream the training file chunk by chunk; every column of the file is kept.
train_df_iter = pd.read_csv("train_data.csv", chunksize=chunksize)

# Collect the chunks and concatenate them once at the end. Re-concatenating the
# growing frame inside the loop copies every previously loaded row again on
# each iteration, so the total work grows quadratically with the chunk count.
train_chunks = []
n_rows_loaded = 0
# i_chunk is not used in the loop body, but a later cell deletes it by name,
# so it has to stay defined.
for i_chunk, chunk in enumerate(train_df_iter):
    train_chunks.append(chunk)
    n_rows_loaded += len(chunk)
    print((n_rows_loaded, chunk.shape[1]))  # cumulative (rows, columns) loaded so far

# Fall back to an empty frame when the reader produced no chunks at all,
# which is what the incremental version ended up with in that case.
train_df = pd.concat(train_chunks) if train_chunks else pd.DataFrame()
del train_chunks, n_rows_loaded  # temporary helpers; keep the namespace as it was

(1000000, 190)
(2000000, 190)
(3000000, 190)
(4000000, 190)
(5000000, 190)
(5531451, 190)


In [3]:
# Feature columns: every column of train_df except 'customer_ID' and 'S_2'.
# The labels are dropped from the column Index rather than from the DataFrame:
# the resulting names (and the KeyError raised for a missing label) are the
# same, but no second multi-million-row DataFrame has to be built just to
# read its column names.
features = train_df.columns.drop(['customer_ID', 'S_2']).to_list()

In [4]:
# Columns treated as categorical features, grouped by name prefix.
# The order is the same as before; later cells aggregate these columns
# differently from the numeric ones.
cat_features = [
    "B_30", "B_38",
    "D_114", "D_116", "D_117", "D_120", "D_126",
    "D_63", "D_64", "D_66", "D_68",
]

In [5]:
# Numeric feature names: whatever remains of `features` once the categorical
# columns are removed. The original column order is preserved.
num_features = list(filter(lambda name: name not in cat_features, features))

In [6]:
# Run a full garbage-collection pass. The value displayed under the cell is
# gc.collect()'s return value (the number of unreachable objects it found).
# NOTE(review): train_df is still referenced at this point - it is only
# deleted in the following cell.
gc.collect()

0

In [7]:
# Remove the names created while loading the training CSV, one per statement
# and in the same order as before.
del train_df_iter
del train_df
del chunksize
del i_chunk
del chunk

### Now we will load in the test data

In [8]:
chunksize = 1000000  # number of CSV rows parsed per chunk

# Stream the test file chunk by chunk; every column of the file is kept.
test_df_iter = pd.read_csv("test_data.csv", chunksize=chunksize)

# Collect the chunks and concatenate them once at the end. Re-concatenating the
# growing frame inside the loop copies every previously loaded row again on
# each iteration, so the total work grows quadratically with the chunk count.
test_chunks = []
n_rows_loaded = 0
# i_chunk is not used in the loop body, but a later cell deletes it by name,
# so it has to stay defined.
for i_chunk, chunk in enumerate(test_df_iter):
    test_chunks.append(chunk)
    n_rows_loaded += len(chunk)
    print((n_rows_loaded, chunk.shape[1]))  # cumulative (rows, columns) loaded so far

# Fall back to an empty frame when the reader produced no chunks at all,
# which is what the incremental version ended up with in that case.
test_df = pd.concat(test_chunks) if test_chunks else pd.DataFrame()
del test_chunks, n_rows_loaded  # temporary helpers; keep the namespace as it was

(1000000, 190)
(2000000, 190)
(3000000, 190)
(4000000, 190)
(5000000, 190)
(6000000, 190)
(7000000, 190)
(8000000, 190)
(9000000, 190)
(10000000, 190)
(11000000, 190)
(11363762, 190)


In [9]:
# Remove the helper names left over from loading the test CSV (test_df itself
# is kept), one per statement and in the same order as before.
del chunksize
del i_chunk
del chunk
del test_df_iter

In [10]:
# Run a full garbage-collection pass after the test-loading helper names were
# deleted. The value displayed under the cell is gc.collect()'s return value
# (the number of unreachable objects it found).
gc.collect()

0

In [11]:
def _aggregate_per_customer(frame, columns, stats):
    """Aggregate `columns` of `frame` per customer_ID with every statistic in `stats`.

    The (column, statistic) header pairs produced by `agg` are flattened into
    "<column>_<statistic>" names, and customer_ID is moved from the index back
    into an ordinary column so the result can be merged on it.
    """
    per_customer = frame.groupby("customer_ID")[columns].agg(stats)
    per_customer.columns = ['_'.join(pair) for pair in per_customer.columns]
    # Done in place, exactly as the original cell did.
    per_customer.reset_index(inplace = True)
    return per_customer


print('Starting test feature engineer...')
# NOTE(review): 'last' follows the row order of the loaded CSV; presumably the
# rows of each customer are already in chronological order - confirm.
test_num_agg = _aggregate_per_customer(test_df, num_features, ['mean', 'std', 'min', 'max', 'last'])
test_cat_agg = _aggregate_per_customer(test_df, cat_features, ['count', 'last', 'nunique'])
# From here on test_df names the per-customer feature table, not the raw rows.
test_df = test_num_agg.merge(test_cat_agg, how = 'inner', on = 'customer_ID')
print("Finished test feature engineer")

Starting test feature engineer...
Finished test feature engineer


In [12]:
# The two intermediate aggregate tables have been merged into test_df;
# remove their names, in the same order as before.
del test_num_agg
del test_cat_agg

In [13]:
# Run a full garbage-collection pass after the intermediate aggregate tables
# were deleted. The value displayed under the cell is gc.collect()'s return
# value (the number of unreachable objects it found).
gc.collect()

0

In [14]:
# Write the per-customer test feature table to a parquet file in the working directory.
test_df.to_parquet(path="test_df_final.parquet")