In [None]:
import numpy as np
import pandas as pd
import datetime
import gc

In [None]:
def load_df_main(path, **kwargs):
    """Read a main (train/test) CSV and normalise its column dtypes.

    Parameters
    ----------
    path : str or file-like
        Handed straight to ``pd.read_csv``.
    **kwargs
        Additional keyword arguments forwarded to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with ``first_active_month`` parsed as datetimes and
        ``feature_1``, ``feature_2``, ``feature_3`` stored as categoricals.
    """
    frame = pd.read_csv(path, **kwargs)
    frame["first_active_month"] = pd.to_datetime(frame["first_active_month"])

    # Cast every categorical column in one pass; column order is unchanged.
    categorical_dtypes = {
        name: "category" for name in ("feature_1", "feature_2", "feature_3")
    }
    return frame.astype(categorical_dtypes)

In [None]:
# Load the training file; load_df_main parses first_active_month as datetimes
# and stores feature_1..feature_3 as categoricals.
train = load_df_main("../input/train.csv")

In [None]:
# Load the test file through the same loader so its dtypes match the
# training frame.
test = load_df_main("../input/test.csv")

In [None]:
def load_df_transactions(path, **kwargs):
    """Read a transactions CSV, fill missing values and compact the dtypes.

    Missing values are filled *before* any dtype conversion and every result
    is assigned back to the frame. Filling after the casts is unreliable:
    casting ``merchant_id`` to ``"str"`` can turn NaN into the literal string
    ``"nan"`` (which ``fillna`` no longer sees), filling a categorical
    ``category_3`` requires ``"A"`` to already be one of its categories, and
    ``df[col].fillna(..., inplace=True)`` does not write back to ``df`` when
    pandas Copy-on-Write is active.

    Parameters
    ----------
    path : str or file-like
        Handed straight to ``pd.read_csv``.
    **kwargs
        Additional keyword arguments forwarded to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with ``purchase_date`` parsed as datetimes, the id
        and flag columns stored as categoricals, ``card_id``/``merchant_id``
        as strings, ``month_lag``/``installments`` as ``int16`` and
        ``purchase_amount`` as ``float32``. ``category_2`` is reduced to the
        first character of its string form (e.g. ``3.0`` -> ``"3"``).
    """
    df = pd.read_csv(path, **kwargs)

    df["purchase_date"] = pd.to_datetime(df["purchase_date"])

    # --- Fill missing values first, while NaN is still recognisable. ---
    # NOTE(review): the default merchant id is presumably the most frequent
    # merchant in the data -- confirm before relying on it.
    df["merchant_id"] = df["merchant_id"].fillna("M_ID_00a6ca8a8a")
    df["category_3"] = df["category_3"].fillna("A")
    # Missing category_2 becomes 1.0; the string form is then cut to its
    # first character so that "1.0" and "1" both end up as "1".
    df["category_2"] = (
        df["category_2"].fillna(1.0).astype("str").str[:1].astype("category")
    )

    # --- Compact dtypes. ---
    categories = ["authorized_flag", "city_id", "category_1", "category_3", "merchant_category_id", "state_id", "subsector_id"]
    for feature in categories:
        df[feature] = df[feature].astype("category")

    strings = ["card_id", "merchant_id"]
    for feature in strings:
        df[feature] = df[feature].astype("str")

    ints = ["month_lag", "installments"]
    for feature in ints:
        df[feature] = df[feature].astype("int16")

    floats = ["purchase_amount"]
    for feature in floats:
        df[feature] = df[feature].astype("float32")

    return df

## Feature engineering

In [None]:
def add_features_transactions(df, reference_date=None):
    """Add calendar, flag and recency features to a transactions frame.

    The derived columns are written onto ``df`` itself; the returned frame is
    the result of one-hot encoding ``category_2`` and ``category_3`` (those
    two columns are replaced by ``category_2_<value>`` / ``category_3_<value>``
    indicator columns).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``purchase_date`` (datetime), ``authorized_flag`` and
        ``category_1`` (``"Y"``/``"N"``), ``month_lag``, ``category_2`` and
        ``category_3``.
    reference_date : datetime-like, optional
        The "now" that ``month_diff`` is measured against. Defaults to the
        current date and time, which makes the feature depend on when the
        notebook is run; pass a fixed date for reproducible output.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the added columns and the one-hot encoded categories.
    """
    if reference_date is None:
        reference_date = datetime.datetime.today()
    reference_date = pd.Timestamp(reference_date)

    purchase_date = df["purchase_date"]
    df["purchase_year"] = purchase_date.dt.year
    # Series.dt.weekofyear was removed in pandas 2.0; isocalendar().week is
    # its replacement and returns the same ISO week number.
    df["purchase_weekofyear"] = purchase_date.dt.isocalendar().week
    df["purchase_month"] = purchase_date.dt.month
    df["purchase_dayofweek"] = purchase_date.dt.dayofweek
    df["purchase_weekend"] = (purchase_date.dt.weekday >= 5).astype(int)
    df["purchase_hour"] = purchase_date.dt.hour

    # Mapping a categorical column yields another categorical, which cannot be
    # summed or averaged later on. Going through object dtype gives plain
    # numeric 0/1 values (NaN where the flag is missing).
    yes_no = {"Y": 1, "N": 0}
    df["authorized_flag"] = df["authorized_flag"].astype(object).map(yes_no)
    df["category_1"] = df["category_1"].astype(object).map(yes_no)

    # Whole 30-day periods between the purchase and the reference date,
    # shifted by the transaction's month_lag.
    df["month_diff"] = (reference_date - purchase_date).dt.days // 30
    df["month_diff"] += df["month_lag"]

    # Pin the indicator dtype so the columns are numeric on every pandas
    # version (newer versions default to bool).
    df = pd.get_dummies(df, columns=["category_2", "category_3"], dtype="uint8")

    return df

In [None]:
def aggregate_transactions(df, prefix):
    """Collapse a transactions frame to one row per ``card_id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Transactions carrying every column named in the aggregation spec
        below, including the ``category_2_1``..``category_2_5`` and
        ``category_3_A``..``category_3_C`` indicator columns.
    prefix : str
        Prepended to every aggregated column name.

    Returns
    -------
    pandas.DataFrame
        ``card_id`` as the first column, followed by one column per
        (source column, statistic) pair named ``<prefix>_<column>_<stat>``.
    """
    flag_columns = (
        "authorized_flag", "purchase_weekend", "category_1",
        "category_2_1", "category_2_2", "category_2_3", "category_2_4", "category_2_5",
        "category_3_A", "category_3_B", "category_3_C",
    )
    distinct_columns = (
        "purchase_year", "purchase_weekofyear", "purchase_month", "purchase_dayofweek",
        "purchase_hour", "subsector_id", "merchant_id", "merchant_category_id",
    )

    # Insertion order of the spec fixes the order of the output columns.
    spec = {
        "purchase_date": ["max", "min"],
        "month_diff": ["mean"],
        "card_id": ["size"],  # number of transactions per card
    }
    for name in ("purchase_amount", "installments", "month_lag"):
        spec[name] = ["sum", "max", "min", "mean", "var"]
    spec.update((name, ["sum", "mean"]) for name in flag_columns)
    spec.update((name, ["nunique"]) for name in distinct_columns)

    per_card = df.groupby("card_id").agg(spec)

    # Flatten the (column, statistic) MultiIndex into single prefixed names.
    per_card.columns = [
        "_".join((prefix, column, stat)) for column, stat in per_card.columns
    ]
    return per_card.reset_index()

In [None]:
# Load the historical transactions with missing values filled and compact
# dtypes (see load_df_transactions).
hist = load_df_transactions("../input/historical_transactions.csv")

In [None]:
# Add the per-transaction calendar, flag and month_diff columns and one-hot
# encode category_2 / category_3; the returned frame replaces `hist`.
hist = add_features_transactions(hist)

In [None]:
# Summarise the historical transactions per card, left-join the summary onto
# both main frames (cards without history keep NaN), then drop the large
# intermediates and ask the garbage collector to reclaim their memory.
aggregated_transactions = aggregate_transactions(hist, "hist")
train = train.merge(aggregated_transactions, on="card_id", how="left")
test = test.merge(aggregated_transactions, on="card_id", how="left")
del aggregated_transactions, hist
gc.collect()

In [None]:
# Load the new-merchant transactions through the same loader used for the
# historical ones, so both frames share dtypes and fill rules.
new = load_df_transactions("../input/new_merchant_transactions.csv")

In [None]:
# Add the same per-transaction features as for `hist`; the returned frame
# (with one-hot encoded category_2 / category_3) replaces `new`.
new = add_features_transactions(new)

In [None]:
# Summarise the new-merchant transactions per card, left-join the summary
# onto both main frames (cards without such transactions keep NaN), then drop
# the large intermediates and ask the garbage collector to reclaim memory.
aggregated_transactions = aggregate_transactions(new, "new")
train = train.merge(aggregated_transactions, on="card_id", how="left")
test = test.merge(aggregated_transactions, on="card_id", how="left")
del aggregated_transactions, new
gc.collect()

In [None]:
def add_features_main(df, reference_date=None):
    """Add calendar, recency and combined-activity features to ``df`` in place.

    ``df`` is expected to already hold the ``hist_*`` and ``new_*`` aggregate
    columns used below. The four ``*_purchase_date_max/min`` columns are
    overwritten with seconds since 1970-01-01, so the function must not be run
    twice on the same frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Main frame with ``first_active_month`` (datetime) plus the aggregated
        purchase date, ``card_id_size`` and ``purchase_amount_sum`` columns
        for both the ``hist`` and ``new`` prefixes. Modified in place.
    reference_date : datetime-like, optional
        The "now" that ``hist_purchase_date_uptonow`` is measured against.
        Defaults to the current date and time, which makes the feature depend
        on when the notebook is run; pass a fixed date for reproducible
        output.

    Returns
    -------
    None
    """
    now = pd.Timestamp(
        datetime.datetime.today() if reference_date is None else reference_date
    )
    first_active = df["first_active_month"]

    df["year"] = first_active.dt.year.astype("category")
    # Series.dt.weekofyear was removed in pandas 2.0; isocalendar().week is
    # its replacement and returns the same ISO week number.
    df["weekofyear"] = first_active.dt.isocalendar().week.astype("category")
    df["month"] = first_active.dt.month.astype("category")
    # Whole days from the activation date up to 2018-02-01. Done with
    # Timestamp arithmetic (after dropping any time-of-day) rather than with
    # Python date objects, which would produce an object-dtype Series.
    # NOTE(review): 2018-02-01 is presumably the dataset's cut-off date --
    # confirm.
    df["elapsed_time"] = (pd.Timestamp(2018, 2, 1) - first_active.dt.normalize()).dt.days

    df["hist_purchase_date_diff"] = (df["hist_purchase_date_max"] - df["hist_purchase_date_min"]).dt.days
    df["hist_purchase_date_average"] = df["hist_purchase_date_diff"] / df["hist_card_id_size"]
    df["hist_purchase_date_uptonow"] = (now - df["hist_purchase_date_max"]).dt.days

    df["hist_first_buy"] = (df["hist_purchase_date_min"] - first_active).dt.days
    df["new_first_buy"] = (df["new_purchase_date_min"] - first_active).dt.days

    # Convert the raw dates to seconds since the Unix epoch. Dividing a
    # timedelta by one second is independent of the column's time resolution
    # and keeps missing dates as NaN; casting to int64 would turn NaT into the
    # smallest int64 and yield a bogus value of about -9.2e9.
    epoch = pd.Timestamp("1970-01-01")
    one_second = pd.Timedelta(seconds=1)
    for col in ["hist_purchase_date_max", "hist_purchase_date_min", "new_purchase_date_max", "new_purchase_date_min"]:
        df[col] = (df[col] - epoch) / one_second

    df["card_id_total"] = df["new_card_id_size"] + df["hist_card_id_size"]
    df["purchase_amount_total"] = df["new_purchase_amount_sum"] + df["hist_purchase_amount_sum"]

In [None]:
# add_features_main modifies each frame in place and returns nothing, so the
# results are not reassigned. It overwrites the raw purchase-date columns, so
# this cell must run only once per kernel session.
add_features_main(train)
add_features_main(test)

In [None]:
# Preview only the first rows; the full frame is too large to be useful as
# cell output.
train.head()

## Merchants

In [None]:
# merchants = pd.read_csv('../input/merchants.csv',
#                        dtype={"merchant_group_id": "category",
#                               "merchant_category_id": "category",
#                               "subsector_id": "category",
#                               "category_1": "category",
#                               "most_recent_sales_range": "category",
#                               "most_recent_purchases_range": "category",
#                               "category_4": "category",
#                               "city_id": "category",
#                               "state_id": "category",
#                               "category_2": "category"})

# data_dictionary_merchant = pd.read_excel('../input/Data_Dictionary.xlsx', sheet_name='merchant')
# data_dictionary_merchant

# creating dataframe

In [None]:
# Write the engineered frames to the current working directory, without the
# row index. CSV stores plain text, so the category dtypes set above are not
# preserved on reload.
train.to_csv("train.csv", index=False)
test.to_csv("test.csv", index=False)