In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
# Notebook-wide plotting defaults for every seaborn/matplotlib figure below.
sns.set(
    context="notebook",
    palette="Spectral",
    style="darkgrid",
    font_scale=1.5,
    color_codes=True,
)
import os
import matplotlib.pyplot as plt

# Number of transaction rows handled by a single process_batch call.
n_rows_to_load = 1000000

# Fixed category vocabulary per categorical column. process_batch converts
# these columns to categoricals with exactly these categories before
# one-hot encoding them.
unique_vals = {
    'category_1': ['N', 'Y'],
    'category_3': ['A', 'B', 'C', -1],
    'category_2': [1.0, -1, 3.0, 5.0, 2.0, 4.0],
    'state_id': [16,9,-1,11,15,17,5,24,19,23,3,8,18,7,4,22,13,1,10,21,20,14,2,12,6],
    'category_4': ['N', 'Y'],
    'most_recent_purchases_range': ['E', 'D', 'C', 'B', 'A'],
    'most_recent_sales_range': ['E', 'D', 'C', 'B', 'A']
}

# Dataset location. Set the ELO_DATA_DIR environment variable to run the
# notebook on another machine; the default is the original author's path.
DATA_DIR = os.environ.get("ELO_DATA_DIR", "/home/nikolaevra/datasets/elo")

# Transactions are indexed by merchant_id so they can be joined against the
# merchant table, which is indexed the same way below.
hist_df = pd.read_csv(
    os.path.join(DATA_DIR, "historical_transactions.csv"),
    low_memory=True,
    index_col='merchant_id'
)

# Keep only the first row per merchant_id so the merchant index is unique and
# the join in process_batch cannot multiply transaction rows.
merchants_df = (
    pd.read_csv(os.path.join(DATA_DIR, "merchants.csv"))
    .drop_duplicates(subset='merchant_id', keep='first')
    .set_index('merchant_id')
)

In [None]:
def _join_merchants(hist_df_batch, merchants_df):
    """Attach merchant attributes to a slice of transactions (inner join on merchant_id)."""
    # Columns present on both sides are taken from the transactions only, so
    # the join never has to disambiguate clashing column names.
    cols_to_use = merchants_df.columns.difference(hist_df_batch.columns)
    return hist_df_batch.join(
        merchants_df[cols_to_use],
        how='inner',
        on='merchant_id',
    ).reset_index()


def _add_row_features(transactions):
    """Add per-transaction derived columns and fill missing values with -1."""
    transactions['purchase_date'] = pd.to_datetime(transactions['purchase_date'])

    # Whole days since the same card's previous purchase. The first purchase
    # of a card has no predecessor and gets 0. The result is indexed like
    # `transactions`, so the assignment aligns rows despite the sort.
    ordered = transactions.sort_values(['card_id', 'purchase_date'])
    gaps = ordered.groupby('card_id')['purchase_date'].diff().dt.days
    transactions['days_between_purch'] = gaps.fillna(0)

    for col in ('category_2', 'avg_sales_lag12', 'avg_sales_lag6', 'avg_sales_lag3'):
        transactions[col] = transactions[col].fillna(-1)
    for col in ('category_3', 'category_4'):
        # Cast to object first so the integer -1 sentinel can sit next to the
        # string labels whatever string dtype the column was loaded with.
        transactions[col] = transactions[col].astype(object).fillna(-1)

    transactions['authorized_flag_binary'] = (
        transactions['authorized_flag'] == 'Y'
    ).astype(int)
    return transactions


def _one_hot_encode(transactions, categories):
    """Append one-hot columns for the categorical features.

    Returns ``(frame, dummy_cols)`` where ``dummy_cols`` lists the added
    column names. Every category listed in ``categories`` gets a column even
    when it does not occur in this slice, so all batches share one schema.
    Values outside the listed categories become missing and therefore count
    towards no dummy column.
    """
    to_process_cols = [
        'category_1', 'category_2', 'category_3', 'category_4', 'state_id',
        'most_recent_purchases_range', 'most_recent_sales_range'
    ]
    dummy_frames = []
    for cat in to_process_cols:
        cat_dtype = pd.api.types.CategoricalDtype(categories=categories[cat])
        transactions[cat] = transactions[cat].astype(cat_dtype)
        dummy_frames.append(pd.get_dummies(transactions[cat], prefix=cat))

    dummies = pd.concat(dummy_frames, axis=1)
    return pd.concat([transactions, dummies], axis=1), list(dummies.columns)


def _aggregate_per_card(transactions, dummy_cols):
    """Collapse transactions to one row of summary statistics per card_id."""
    # Output column -> (source column, aggregation). Insertion order is the
    # column order of the result.
    aggregations = {
        'avg_month_lag': ('month_lag', 'mean'),
        'avg_installments': ('installments', 'mean'),
        'avg_days_between_purch': ('days_between_purch', 'mean'),
        'std_days_between_purch': ('days_between_purch', 'std'),
        'num_authorized': ('authorized_flag_binary', 'sum'),
        # The flag is 0/1 with no missing values, so its mean equals
        # authorized count / transaction count.
        'authorized_unauthorized_ratio': ('authorized_flag_binary', 'mean'),
        'avg_purchase_amount': ('purchase_amount', 'mean'),
        'std_purchase_amount': ('purchase_amount', 'std'),
        'min_purchase_amount': ('purchase_amount', 'min'),
        'max_purchase_amount': ('purchase_amount', 'max'),
        'num_purchases': ('purchase_amount', 'count'),
        'avg_numerical_1': ('numerical_1', 'mean'),
        'std_numerical_1': ('numerical_1', 'std'),
        'min_numerical_1': ('numerical_1', 'min'),
        'max_numerical_1': ('numerical_1', 'max'),
        'avg_numerical_2': ('numerical_2', 'mean'),
        'std_numerical_2': ('numerical_2', 'std'),
        'min_numerical_2': ('numerical_2', 'min'),
        'max_numerical_2': ('numerical_2', 'max'),
        'avg_active_months_lag12': ('active_months_lag12', 'mean'),
        'avg_purchases_lag12': ('avg_purchases_lag12', 'mean'),
    }
    # Each one-hot column is summed: how many of the card's transactions fell
    # into that category.
    for col in dummy_cols:
        aggregations[col] = (col, 'sum')
    return transactions.groupby(by='card_id').agg(**aggregations)


def process_batch(hist_df, merchants_df, i, final_set=False,
                  batch_size=None, categories=None, output_dir=None):
    """Build per-card features for one slice of the transaction history.

    The slice is joined with the merchant table, row-level features are
    derived, categorical columns are one-hot encoded, and everything is
    aggregated to one row per ``card_id``. The result is written to
    ``df_<i>.csv``.

    Batches are cut by row position, so a card whose transactions span several
    batches gets one row in each batch file; combining them is left to the
    caller.

    Parameters
    ----------
    hist_df : pandas.DataFrame
        Transaction history with ``merchant_id`` as index (or index level).
    merchants_df : pandas.DataFrame
        Merchant attributes indexed by ``merchant_id``.
    i : int
        Zero-based batch number; selects rows ``[i*batch_size, (i+1)*batch_size)``.
    final_set : bool, default False
        When true, the slice runs to the end of ``hist_df``.
    batch_size : int, optional
        Rows per batch. Defaults to the module-level ``n_rows_to_load``.
    categories : dict, optional
        Category list per categorical column. Defaults to the module-level
        ``unique_vals``.
    output_dir : str, optional
        Directory for the CSV. Defaults to the current working directory.

    Returns
    -------
    pandas.DataFrame
        The aggregated features, indexed by ``card_id``.
    """
    if batch_size is None:
        batch_size = n_rows_to_load
    if categories is None:
        categories = unique_vals

    start = i * batch_size
    stop = None if final_set else start + batch_size
    hist_df_batch = hist_df.iloc[start:stop, :]
    print("Processing batch: ", i)
    print("Batch size: ", hist_df_batch.shape)

    hist_transactions = _join_merchants(hist_df_batch, merchants_df)
    hist_transactions = _add_row_features(hist_transactions)
    hist_transactions, vect_category_cols = _one_hot_encode(hist_transactions, categories)
    final_df = _aggregate_per_card(hist_transactions, vect_category_cols)

    print('Saving processed DF:', final_df.shape)
    print("===============================")

    # `i` is an int, so it is formatted into the name rather than concatenated.
    file_name = 'df_{}.csv'.format(i)
    out_path = file_name if output_dir is None else os.path.join(output_dir, file_name)
    final_df.to_csv(out_path, sep=',')
    return final_df

In [None]:
# Ceiling division, so a trailing partial batch still gets its own slice
# instead of being dropped.
total_rows = hist_df.shape[0]
num_slices = (total_rows + n_rows_to_load - 1) // n_rows_to_load

for i in range(num_slices):
    # The last batch is flagged so it takes every remaining row.
    process_batch(hist_df, merchants_df, i, final_set=(i == num_slices - 1))