In [36]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [5]:
def read_train_test():
    X_train = pd.read_csv(
        "../data/processed/two_models/X_train.csv", index_col="client_id"
    )
    y_train = pd.read_csv(
        "../data/processed/two_models/y_train.csv",
        header=None,
        names=["client_id", "target"],
        index_col="client_id"
    )["target"]
    train_is_treatment = pd.read_csv(
        "../data/processed/two_models/X_train_is_treatment.csv",
        header=None,
        names=["client_id", "is_treatment"],
        index_col="client_id"
    )["is_treatment"]

    X_valid = pd.read_csv("../data/processed/two_models/X_valid.csv", index_col="client_id")
    y_valid = pd.read_csv(
        "../data/processed/two_models/y_valid.csv",
        header=None,
        names=["client_id", "target"],
        index_col="client_id"
    )["target"]
    valid_is_treatment = pd.read_csv(
        "../data/processed/two_models/X_valid_is_treatment.csv",
        header=None,
        names=["client_id", "is_treatment"],
        index_col="client_id"
    )["is_treatment"]

    X_test = pd.read_csv("../data/processed/two_models/X_test.csv", index_col="client_id")

    return X_train, y_train, train_is_treatment, X_valid, y_valid, valid_is_treatment, X_test


def join_train_validation(X_train, X_valid, y_train, y_valid):
    X_train = pd.concat([X_train, X_valid], ignore_index=False)
    y_train = pd.concat([y_train, y_valid], ignore_index=False)
    return X_train, y_train


def split_control_treatment(X, y, is_treatment):
    X_control = X[is_treatment == 0]
    X_treatment = X[is_treatment == 1]
    y_control = y[is_treatment == 0]
    y_treatment = y[is_treatment == 1]
    return X_control, X_treatment, y_control, y_treatment

In [34]:
X_train, y_train, train_is_treatment, X_valid, y_valid, valid_is_treatment, X_test = read_train_test()

# Test randomness of control / treatment split

In [8]:
X_train.sample(15)

Unnamed: 0_level_0,age,n_alchohol_products,avg_alchohol_products_in_purchase,pct_alcohol_products,n_own_trademark_products,pct_onw_trademark_in_purchase,pct_own_trademark_products,sum_sum_netto,avg_sum_netto,stddev_sum_netto,...,first_issue_weekday,first_issue_dayofmonth,first_issue_year,first_issue_month,first_issue_weekofyear,first_issue_week,first_issue_quarter,diff,avg_transaction_hour,last_month_avg_transaction_hour
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
f255feda7c,27,0,0.0,0.0,8,1.0,0.135593,30.994,3.87425,2.519674,...,1,31,2017,10,44,44,4,13721700.0,13,12
95857d9245,19,0,0.0,0.0,22,0.323529,0.130178,83.823,1.232691,1.443138,...,3,22,2018,11,47,47,4,2269294.0,14,14
207841e3b3,42,2,0.055556,0.006849,41,1.138889,0.140411,767.367,21.31575,99.623569,...,0,15,2017,5,20,20,2,7593775.0,13,12
0db0050cfb,47,1,0.033333,0.004132,20,0.666667,0.082645,155.887,5.196233,3.615217,...,1,27,2017,6,26,26,2,11587890.0,13,13
ae8f416d7e,44,4,0.444444,0.137931,8,0.888889,0.275862,14.895,1.655,0.962286,...,5,10,2017,6,23,23,2,9750572.0,9,15
c683730e43,57,11,0.25,0.041199,50,1.136364,0.187266,131.769,2.99475,3.865767,...,3,8,2018,11,45,45,4,1308398.0,10,8
59a8b0abd5,81,0,0.0,0.0,2,0.666667,0.222222,3.989,1.329667,1.393162,...,2,4,2018,4,14,14,2,29105530.0,7,6
561a7fa266,49,0,0.0,0.0,40,1.081081,0.21164,129.564,3.50173,2.667215,...,5,7,2017,10,40,40,4,7315815.0,10,7
762fc244f7,26,0,0.0,0.0,9,0.9,0.152542,37.983,3.7983,4.153976,...,5,13,2017,5,19,19,2,28425560.0,11,7
190014ec69,39,2,0.166667,0.013986,15,1.25,0.104895,51.528,4.294,2.716773,...,2,13,2017,9,37,37,3,977504.0,14,15


In [23]:
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train.fillna(-999), train_is_treatment)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [28]:
clf.score(X_valid.fillna(-999), valid_is_treatment), roc_auc_score(valid_is_treatment, clf.predict_proba(X_valid.fillna(-999))[:, 1])

(0.5058738252349531, 0.506240470945706)

## OK

# Test randomness of train/test split

In [35]:
X_train = pd.concat([X_train, X_valid], ignore_index=False)
y = pd.DataFrame({"is_test": [0]*len(X_train)}, index=X_train.index)
X = pd.concat([X_train, X_test], ignore_index=False)
y = pd.concat([y, pd.DataFrame({"is_test": [1]*len(X_test)}, index=X_test.index)], ignore_index=False)
assert X.shape[0] == y.shape[0]
print(X.shape)

(400162, 105)


In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train.fillna(-999), y_train)
clf.score(X_test.fillna(-999), y_test), roc_auc_score(y_test, clf.predict_proba(X_test.fillna(-999))[:, 1])

  This is separate from the ipykernel package so we can avoid doing imports until


(0.5001817438320687, 0.5020740492675879)

## OK