In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, confusion_matrix

In [2]:
train = pd.read_csv('../data/new_train.csv')
test = pd.read_csv('../data/new_test.csv')
mcf = pd.read_csv('../data/monthly_card_features.csv')

In [3]:
train = train.drop(columns=['Unnamed: 0', 'first_active_month']).set_index('card_id')
test = test.drop(columns=['Unnamed: 0', 'first_active_month']).set_index('card_id')

In [4]:
train = train.join(mcf.groupby('card_id').agg({
    'amt_total': [np.min, np.mean, np.max],
    'NDR':       [np.min, np.mean, np.max],
    'n_new_merchants':   np.mean,
    'n_total_merchants': np.max
}))
test = test.join(mcf.groupby('card_id').agg({
    'amt_total': [np.min, np.mean, np.max],
    'NDR':       [np.min, np.mean, np.max],
    'n_new_merchants':   np.mean,
    'n_total_merchants': np.max
}))



In [5]:
train.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in train.columns.values]
test.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in test.columns.values]

In [6]:
X = train.drop(columns='target')
X.columns

Index(['feature_1', 'feature_2', 'feature_3', 'tof', 'recency', 'frequency',
       'log_freq', 'amt', 'avg_amt', 'charge_per_day', 'log_charge_per_day',
       'max_amt', 'n_declines', 'log_n_declines', 'prop_new', 'merch_cat_1_Y',
       'merch_cat_2_1', 'merch_cat_2_2', 'merch_cat_2_3', 'merch_cat_2_4',
       'merch_cat_2_5', 'amt_total_amin', 'amt_total_mean', 'amt_total_amax',
       'NDR_amin', 'NDR_mean', 'NDR_amax', 'n_new_merchants_mean',
       'n_total_merchants_amax'],
      dtype='object')

In [7]:
Y = train['target']

In [8]:
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.3)
X_val.shape

(60576, 29)

In [9]:
rfr = RandomForestRegressor(max_depth=5, random_state=0, n_estimators=100)

In [10]:
rfr.fit(X_train, Y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [None]:
print("Root Mean squared error: %.2f"
      % np.sqrt(mean_squared_error(Y_val, rfr.predict(X_val))))

## What if we bring in our chump-classifier?

In [None]:
train['chump'] = train['target'] < -20

In [None]:
chumps = train[train.chump]
nonchumps = train[~train.chump]

In [None]:
tr_chumps, tt_chumps = train_test_split(chumps, test_size=0.3)
tr_nonchumps, tt_nonchumps = train_test_split(nonchumps, test_size=0.3)

In [None]:
tr = pd.concat([tr_chumps, tr_nonchumps])
tt = pd.concat([tt_chumps, tt_nonchumps])
print('Training chumps:', sum(tr.chump)/len(tr))
print('Test chumps:    ', sum(tt.chump)/len(tt))

In [None]:
balanced = pd.concat([
    tr[tr.chump],
    tr[~tr.chump].sample(n=9 * len(tr[tr.chump]))
])
# Train on balanced data (10% chumps)
X = balanced.drop(columns=['chump', 'target'])
Y = balanced['chump']

In [None]:
print('Balanced chumps: ', sum(balanced.chump) / len(balanced))

In [None]:
clf = RandomForestClassifier(n_estimators=100, max_depth=7)
clf.fit(X, Y)

### Now retrain the regressor on only "non-chumps"

In [None]:
X = tr[~tr.chump].drop(columns=['chump', 'target'])
Y = tr[~tr.chump]['target']

In [None]:
Y.hist()

In [None]:
rfr = RandomForestRegressor(max_depth=5, random_state=0, n_estimators=100)
rfr.fit(X, Y)

## Apply trained models to full train set

In [None]:
X = tt.drop(columns=['chump', 'target'])
Y = tt['target']

In [None]:
X['pred_chump'] = clf.predict(X)

In [None]:
chumps = pd.DataFrame(X[X.pred_chump])
nonchumps = pd.DataFrame(X[~X.pred_chump])
chumps.head()

In [None]:
chumps['pred_target'] = -33.219281
chumps.head()

In [None]:
# X.loc[X.pred_chump, 'pred_target'] = -33.219281

In [None]:
X.head()

In [None]:
nonchumps['pred_target'] = rfr.predict(nonchumps.drop(columns='pred_chump'))
nonchumps.head()

In [None]:
# X.loc[~X.pred_chump, 'pred_target'] = rfr.predict(X[~X.pred_chump].drop(columns=['pred_chump', 'pred_target']))

In [None]:
X.head()

In [None]:
X = pd.concat([chumps, nonchumps])
X = X.join(Y)

In [None]:
np.sqrt(mean_squared_error(X.target, X.pred_target))

In [None]:
len(X[X.pred_chump])

In [None]:
sum(X.target < -20)

In [None]:
sum(X.pred_chump != (X.target < -20))

In [None]:
# Split dataset into tr for training and tt for final test
# tr and tt will maintain the same chump/nonchump ratio
train['chump'] = train['target'] < -20
chumps = train[train.chump]
nonchumps = train[~train.chump]
tr_chumps, tt_chumps = train_test_split(chumps, test_size=0.3)
tr_nonchumps, tt_nonchumps = train_test_split(nonchumps, test_size=0.3)
tr = pd.concat([tr_chumps, tr_nonchumps])
tt = pd.concat([tt_chumps, tt_nonchumps])

ranges = [1, 3, 5, 7, 10, 15, None]
models = [
    {'rmd': rmd,
     'rmf': rmf,
     'cmd': cmd,
     'cmf': cmf,
     'score': np.nan} 
    for rmd in ranges
    for rmf in ranges
    for cmd in ranges
    for cmf in ranges
]

for i, m in enumerate(models):
    # Train Classifier (rebalanced tr set)
    balanced = pd.concat([
        tr[tr.chump],
        tr[~tr.chump].sample(n=9 * len(tr[tr.chump]))
    ])
    # Train on balanced data (10% chumps)
    X = balanced.drop(columns=['chump', 'target'])
    Y = balanced['chump']
    clf = RandomForestClassifier(
        n_estimators=  100,
        max_depth=     m['cmd'],
        max_features=  m['cmf'],
        random_state=  0)
    clf.fit(X, Y)
    
    # Train Regressor (full tr set)
    X = tr[~tr.chump].drop(columns=['chump', 'target'])
    Y = tr[~tr.chump]['target']
    rfr = RandomForestRegressor(
        n_estimators=  100,
        max_depth=     m['rmd'],
        max_features=  m['rmf'],
        random_state=  0)
    rfr.fit(X, Y)
    
    # Apply models to tt holdout set
    X = tt.drop(columns=['chump', 'target'])
    Y = tt['target']
    X['pred_chump'] = clf.predict(X)
    chumps = pd.DataFrame(X[X.pred_chump])
    nonchumps = pd.DataFrame(X[~X.pred_chump])
    chumps['pred_target'] = -33.219281
    nonchumps['pred_target'] = rfr.predict(nonchumps.drop(columns='pred_chump'))
    X = pd.concat([chumps, nonchumps])
    X = X.join(Y)
    m['score'] = np.sqrt(mean_squared_error(X.target, X.pred_target))
    print(f'== {np.round((i + 1) / len(models) * 100, 2)}% ==\r', end='')

In [None]:
sorted(models, key = lambda x: x['score'])[:15]t

In [23]:
train['chump'] = train['target'] < -20
# Train Classifier (rebalanced tr set)
balanced = pd.concat([
    train[train.chump],
    train[~train.chump].sample(n=9 * len(train[train.chump]))
])
# Train on balanced data (10% chumps)
X = balanced.drop(columns=['chump', 'target'])
Y = balanced['chump']
clf = RandomForestClassifier(
    n_estimators=  1000,
    max_depth=     3,
    max_features=  3,
    random_state=  0)
clf.fit(X, Y)

# Train Regressor (full tr set)
X = train[~train.chump].drop(columns=['chump', 'target'])
Y = train[~train.chump]['target']
rfr = RandomForestRegressor(
    n_estimators=  1000,
    max_depth=     10,
    max_features=  15,
    random_state=  0)
rfr.fit(X, Y)

# Apply models to tt holdout set
test['pred_chump'] = clf.predict(test)
chumps = pd.DataFrame(test[test.pred_chump])
nonchumps = pd.DataFrame(test[~test.pred_chump])
chumps['pred_target'] = -33.219281
nonchumps['pred_target'] = rfr.predict(nonchumps.drop(columns='pred_chump'))
test = pd.concat([chumps, nonchumps])
predictions = pd.DataFrame(test['pred_target'], index=test.index)
predictions.head()

Unnamed: 0_level_0,target
card_id,Unnamed: 1_level_1
C_ID_0ab67a22ab,
C_ID_130fd0cbdd,
C_ID_b709037bc5,
C_ID_d27d835a9f,
C_ID_2b5e3df5c2,


In [30]:
predictions = pd.DataFrame(test['pred_target'], index=test.index)
predictions.columns = ['target']
predictions.head()

Unnamed: 0_level_0,target
card_id,Unnamed: 1_level_1
C_ID_0ab67a22ab,-0.272179
C_ID_130fd0cbdd,-0.450876
C_ID_b709037bc5,0.002175
C_ID_d27d835a9f,-0.293506
C_ID_2b5e3df5c2,-0.209106


In [31]:
predictions.to_csv('../submission_stacked_rf.csv')

# Forget the stacked model - tune RFR to full training data

In [13]:
train['chump'] = train['target'] < -20
chumps = train[train.chump]
nonchumps = train[~train.chump]
tr_chumps, tt_chumps = train_test_split(chumps, test_size=0.3)
tr_nonchumps, tt_nonchumps = train_test_split(nonchumps, test_size=0.3)
tr = pd.concat([tr_chumps, tr_nonchumps]).drop(columns='chump')
tt = pd.concat([tt_chumps, tt_nonchumps]).drop(columns='chump')

ranges = [1, 3, 5, 7, 10, 15, None]
models = [
    {'rmd': rmd,
     'rmf': rmf,
     'score': np.nan} 
    for rmd in ranges
    for rmf in ranges
]

for i, m in enumerate(models):
    X = tr.drop(columns='target')
    Y = tr['target']
    rfr = RandomForestRegressor(
        n_estimators=  100,
        max_depth=     m['rmd'],
        max_features=  m['rmf'],
        random_state=  0)
    rfr.fit(X, Y)
    
    # Apply models to tt holdout set
    X = tt.drop(columns='target')
    Y = tt['target']
    X['pred_target'] = rfr.predict(X)
    X = X.join(Y)
    m['score'] = np.sqrt(mean_squared_error(X.target, X.pred_target))
    print(f'== {np.round((i + 1) / len(models) * 100, 2)}% ==\r', end='')

== 100.0% ==

In [14]:
sorted(models, key = lambda x: x['score'])[:15]

[{'rmd': 10, 'rmf': 15, 'score': 3.7395660454228663},
 {'rmd': 10, 'rmf': 10, 'score': 3.7422055659302345},
 {'rmd': 7, 'rmf': 15, 'score': 3.7426121888584363},
 {'rmd': 7, 'rmf': None, 'score': 3.744888422483514},
 {'rmd': 10, 'rmf': None, 'score': 3.744935306206704},
 {'rmd': 10, 'rmf': 7, 'score': 3.746716717502648},
 {'rmd': 15, 'rmf': 7, 'score': 3.747300911105606},
 {'rmd': 7, 'rmf': 10, 'score': 3.7477582995558754},
 {'rmd': 15, 'rmf': 5, 'score': 3.7505094016742904},
 {'rmd': 15, 'rmf': 10, 'score': 3.752290507996586},
 {'rmd': 5, 'rmf': None, 'score': 3.7522945150526223},
 {'rmd': 5, 'rmf': 15, 'score': 3.7543497787056563},
 {'rmd': 15, 'rmf': 15, 'score': 3.7547762994379665},
 {'rmd': 7, 'rmf': 7, 'score': 3.7562905575751566},
 {'rmd': 10, 'rmf': 5, 'score': 3.7565375557534546}]

In [15]:
test.columns

Index(['feature_1', 'feature_2', 'feature_3', 'tof', 'recency', 'frequency',
       'log_freq', 'amt', 'avg_amt', 'charge_per_day', 'log_charge_per_day',
       'max_amt', 'n_declines', 'log_n_declines', 'prop_new', 'merch_cat_1_Y',
       'merch_cat_2_1', 'merch_cat_2_2', 'merch_cat_2_3', 'merch_cat_2_4',
       'merch_cat_2_5', 'amt_total_amin', 'amt_total_mean', 'amt_total_amax',
       'NDR_amin', 'NDR_mean', 'NDR_amax', 'n_new_merchants_mean',
       'n_total_merchants_amax'],
      dtype='object')

### Best model has `max_depth=10` and `max_features=15`

In [16]:
X = train.drop(columns=['target', 'chump'])
Y = train['target']
rfr = RandomForestRegressor(
        n_estimators=  1000,
        max_depth=     10,
        max_features=  15,
        random_state=  0)
rfr.fit(X, Y)
pd.DataFrame(rfr.predict(test), index=test.index, columns='target').head()

TypeError: Index(...) must be called with a collection of some kind, 'target' was passed

In [18]:
predictions = pd.DataFrame(rfr.predict(test), index=test.index, columns=['target'])
predictions.head()

Unnamed: 0_level_0,target
card_id,Unnamed: 1_level_1
C_ID_0ab67a22ab,-0.788704
C_ID_130fd0cbdd,-0.453762
C_ID_b709037bc5,0.004535
C_ID_d27d835a9f,-0.286328
C_ID_2b5e3df5c2,-0.293103


In [19]:
predictions.to_csv('../submission.csv')