In [1]:
import os
import gc
import time
import pickle
from collections import Counter

import pandas as pd
import numpy as np

# import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedShuffleSplit

import xgboost as xgb

In [2]:
PATH_TO_DATA = 'data'

## CV Folds

In [3]:
def get_20_cv_splits(PATH_TO_DATA):
    #stratify_classes = y
    train = pd.read_csv(os.path.join(PATH_TO_DATA, 'train.csv'), usecols=['target'])
    stratify_classes =  train.target.apply(lambda x: int(np.log10(x)))
    splits = {}
    for random_state in range(20):
        column = np.zeros(train.shape[0])
        sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=random_state)
        for i, (_, test_index) in enumerate(sss.split(train, stratify_classes)):
            column[test_index] = i

        splits["split{}".format(random_state)] = column

    pd.DataFrame(splits, index=train.index).to_csv(os.path.join(PATH_TO_DATA, 'folds/cv_splits_cleandata_stat_bin_red.csv'))

In [4]:
 # function to generate 100 folds from create_folds_from_cv_splits func
def create_folds_from_cv_splits(in_path):
    
    cv_splits = pd.read_csv(os.path.join(PATH_TO_DATA, in_path))
    folds_list = []
    for ind, i in enumerate(cv_splits.columns[1:]):
        folds = list(set(cv_splits[i].values))
        folds_list.append([])
        for m in folds:
            val_idx = list(cv_splits[cv_splits[i]==m].index)
            train_idx = list(set(list(cv_splits.index)) - set(val_idx))
            folds_list[ind].append((train_idx, val_idx))
    with open(os.path.join(PATH_TO_DATA, 'folds/custom_cv.pkl'), 'wb') as f:
        pickle.dump(folds_list, f)
    return folds_list

In [6]:
LOAD_CV = False

if LOAD_CV:
    with open(os.path.join(PATH_TO_DATA, 'folds/custom_cv.pkl'), 'rb') as f:
        cv_folds = pickle.load(f)
else:
    get_20_cv_splits(PATH_TO_DATA)
    cv_folds = create_folds_from_cv_splits(in_path='folds/cv_splits_cleandata_stat_bin_red.csv')

## XGBOOST

In [7]:
def run_xgb(train_X, train_y, val_X, val_y, test_X):
    params = {'n_estimators':200,
             'max_depth':4, 
             'lambda':10000,
             'eta': 0.3, 
            "colsample_bytree":0.8,
    #          "tweedie_variance_power":1.5,
             'objective': 'reg:linear', 
             'eval_metric':'rmse',
            'gamma':0.1}
    
    start_time = time.time()
    xgb_train = xgb.DMatrix(train_X, label=train_y)
    xgb_val = xgb.DMatrix(val_X, label=val_y)
    model = xgb.train(params, xgb_train, 200, 
                      evals=[(xgb_train, 'train'), (xgb_val, 'val')], 
                      early_stopping_rounds=30, 
                      verbose_eval=30)
    print('Model training done in {} seconds.'.format(time.time() - start_time))
    
    pred_test_y = np.expm1(model.predict(test_X, ntree_limit=model.best_iteration))
    pred_oof_log = model.predict(val_X, ntree_limit=model.best_iteration)
    return pred_test_y, pred_oof_log, model

In [8]:


def run_xgb(train_X, train_y, val_X, val_y, test_X, params):
    
    
    
        
    
    start_time = time.time()
    xgb_train = xgb.DMatrix(train_X, train_y)
    xgb_val = xgb.DMatrix(val_X, val_y)
    model = xgb.train(params, xgb_train, 1200, 
                      evals=[(xgb_train, 'train'), (xgb_val, 'val')], 
                      early_stopping_rounds=30, 
                      verbose_eval=30)
    print('Model training done in {} seconds.'.format(time.time() - start_time))
    xgb_test = xgb.DMatrix(test_X)
    pred_test_y = np.expm1(model.predict(xgb_test, ntree_limit=model.best_iteration))
    pred_oof_log = model.predict(xgb_val, ntree_limit=model.best_iteration)
    return pred_test_y, pred_oof_log, model

In [56]:
feature_groups = pickle.load(open('features_groups.pkl', 'rb'))

# def preprocess(train_X, val_X):
#     train_X_stats = pd.DataFrame(index=train_X.index)
#     val_X_stats = pd.DataFrame(index=val_X.index)
    
    
#     for j, cols in enumerate([cols0] + feature_groups[:10]):
#         train_X_stats['ts_last_mean_{}'.format(j)] = train_X[cols].apply(np.mean, axis=1)
#         train_X_stats['ts_last_median_{}'.format(j)] = train_X[cols].apply(np.median, axis=1)
#         train_X_stats['ts_last_sum_{}'.format(j)] = train_X[cols].apply(np.sum, axis=1)
#         train_X_stats['ts_last_nonzero_{}'.format(j)] = train_X[cols].apply(lambda x: np.mean([1 if v > 0 else 0 for v in x]), axis=1)
#         train_X_stats['ts_last_nonzero_mean_{}'.format(j)] = train_X[cols].apply(lambda x: np.mean([v for v in x if v > 0]) if x.any() else 0, axis=1)
#         train_X_stats['ts_last_nonzero_median_{}'.format(j)] = train_X[cols].apply(lambda x: np.median([v for v in x if v > 0]) if x.any() else 0, axis=1)
        
#         val_X_stats['ts_last_mean_{}'.format(j)] = val_X[cols].apply(np.mean, axis=1)
#         val_X_stats['ts_last_median_{}'.format(j)] = val_X[cols].apply(np.median, axis=1)
#         val_X_stats['ts_last_sum_{}'.format(j)] = val_X[cols].apply(np.sum, axis=1)
#         val_X_stats['ts_last_nonzero_{}'.format(j)] = val_X[cols].apply(lambda x: np.mean([1 if v > 0 else 0 for v in x]), axis=1)
#         val_X_stats['ts_last_nonzero_mean_{}'.format(j)] = val_X[cols].apply(lambda x: np.mean([v for v in x if v > 0]) if x.any() else 0, axis=1)
#         val_X_stats['ts_last_nonzero_median_{}'.format(j)] = val_X[cols].apply(lambda x: np.median([v for v in x if v > 0]) if x.any() else 0, axis=1)
        

#     return train_X_stats, val_X_stats
def garb_out(x):
    c = Counter(x)
    clean = [j if c[j] <= 2 else 0 for j in x]
    return clean

def preprocess(test_X):
    test_X_stats = pd.DataFrame(index=test_X.index)
    
    for j, cols in enumerate([cols0] + feature_groups):
        test_X_col = test_X[cols].apply(garb_out, axis=1)
        test_X_stats['ts_last_mean_{}'.format(j)] = test_X_col.apply(np.mean, axis=1)
        test_X_stats['ts_last_median_{}'.format(j)] = test_X_col.apply(np.median, axis=1)
        test_X_stats['ts_last_sum_{}'.format(j)] = test_X_col.apply(np.sum, axis=1)
        test_X_stats['ts_last_nonzero_{}'.format(j)] = test_X_col.apply(lambda x: np.mean([1 if v > 0 else 0 for v in x]), axis=1)
        test_X_stats['ts_last_nonzero_mean_{}'.format(j)] = test_X_col.apply(lambda x: np.mean([v for v in x if v > 0]) if x.any() else 0, axis=1)
        test_X_stats['ts_last_nonzero_max_{}'.format(j)] = test_X_col.apply(lambda x: np.max([v for v in x if v > 0]) if x.any() else 0, axis=1)
        test_X_stats['ts_last_nonzero_min_{}'.format(j)] = test_X_col.apply(lambda x: np.min([v for v in x if v > 0]) if x.any() else 0, axis=1)
        test_X_stats['ts_last_nonzero_median_{}'.format(j)] = test_X_col.apply(lambda x: np.median([v for v in x if v > 0]) if x.any() else 0, axis=1)
        test_X_stats['slope'] = test_X_col.apply(lambda x: linregress(list(range(len(cols))), x.values)[0], axis=1)
    return test_X_stats

In [10]:
def run_calculations(X, test, big_cv_folds, func_name, params):
    if not func_name:
        return print('The function to run is not defined')
    else:
        y_oof_20_preds = []
        fold_errors_20_preds =[]
        avg_test_pred_20_preds = []
        
        
        
        for ind, cv_folds in enumerate(big_cv_folds):
            print('Fitting big fold', ind+1, 'out of', len(big_cv_folds))
            y_oof = np.zeros((y.shape[0]))
            fold_errors =[]
            pred_test_list = []
            
            for i, (train_index, val_index) in enumerate(cv_folds):
                print('Preprocessing fold ', i+1, 'out of ', len(cv_folds))
                X_train, X_val  = X.iloc[train_index], X.iloc[val_index]
                y_train, y_val = y[train_index], y[val_index]
                
                print('Fitting sub fold ', i+1, 'out of ', len(cv_folds))
                # part to include additional functions
                pred_test_y, pred_oof_log, clf = func_name(X_train, y_train, X_val, y_val, test, params)


                y_oof[val_index] = pred_oof_log
                curr_fe = np.sqrt(mean_squared_error(y_val, pred_oof_log))
                print('Fold error ', curr_fe)
                fold_errors.append(curr_fe)
                pred_test_list.append(list(pred_test_y))

            print('Total error', np.sqrt(mean_squared_error(y, y_oof)))
            total_fe_std = round(np.std(fold_errors), 5)
            print('Total std ',  total_fe_std)
            avg_test_pred = np.mean(pred_test_list, axis=0)
            
            avg_test_pred_20_preds.append(avg_test_pred)
            fold_errors_20_preds.append(fold_errors)
            y_oof_20_preds.append(avg_test_pred)
            
        return y_oof_20_preds, avg_test_pred_20_preds, fold_errors_20_preds

In [11]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')
y = train_df.target

In [30]:
y = np.log1p(train_df.target)

In [None]:
train =  preprocess(train_df.drop(columns=['ID', 'target']))
test = preprocess(test_df.drop(columns=['ID']))

In [23]:
good_columns = list(pickle.load(open('good_columns.pkl', 'rb')))

In [24]:
j = 'all_good'
cols = good_columns

train['ts_last_mean_{}'.format(j)] = train_df[cols].apply(np.mean, axis=1)
train['ts_last_median_{}'.format(j)] = train_df[cols].apply(np.median, axis=1)
train['ts_last_sum_{}'.format(j)] = train_df[cols].apply(np.sum, axis=1)
train['ts_last_nonzero_{}'.format(j)] = train_df[cols].apply(lambda x: np.mean([1 if v > 0 else 0 for v in x]), axis=1)
train['ts_last_nonzero_mean_{}'.format(j)] = train_df[cols].apply(lambda x: np.mean([v for v in x if v > 0]) if x.any() else 0, axis=1)
train['ts_last_nonzero_max_{}'.format(j)] = train_df[cols].apply(lambda x: np.max([v for v in x if v > 0]) if x.any() else 0, axis=1)
train['ts_last_nonzero_min_{}'.format(j)] = train_df[cols].apply(lambda x: np.min([v for v in x if v > 0]) if x.any() else 0, axis=1)
train['ts_last_nonzero_median_{}'.format(j)] = train_df[cols].apply(lambda x: np.median([v for v in x if v > 0]) if x.any() else 0, axis=1)


test['ts_last_mean_{}'.format(j)] = test_df[cols].apply(np.mean, axis=1)
test['ts_last_median_{}'.format(j)] = test_df[cols].apply(np.median, axis=1)
test['ts_last_sum_{}'.format(j)] = test_df[cols].apply(np.sum, axis=1)
test['ts_last_nonzero_{}'.format(j)] = test_df[cols].apply(lambda x: np.mean([1 if v > 0 else 0 for v in x]), axis=1)
test['ts_last_nonzero_mean_{}'.format(j)] = test_df[cols].apply(lambda x: np.mean([v for v in x if v > 0]) if x.any() else 0, axis=1)
test['ts_last_nonzero_max_{}'.format(j)] = test_df[cols].apply(lambda x: np.max([v for v in x if v > 0]) if x.any() else 0, axis=1)
test['ts_last_nonzero_min_{}'.format(j)] = test_df[cols].apply(lambda x: np.min([v for v in x if v > 0]) if x.any() else 0, axis=1)
test['ts_last_nonzero_median_{}'.format(j)] = test_df[cols].apply(lambda x: np.median([v for v in x if v > 0]) if x.any() else 0, axis=1)


In [25]:
def get_most_common(x, k):
    c = Counter([j for j in x if j>0])
    if c:
        return c.most_common(1)[0][k]
    else:
        return 0

In [26]:
train['mos_common_v'.format(j)] = train_df.drop(columns=['ID', 'target']).apply(lambda x: get_most_common(x, 0), axis=1)
train['mos_common_c'.format(j)] = train_df.drop(columns=['ID', 'target']).apply(lambda x: get_most_common(x, 1), axis=1)

test['mos_common_v'.format(j)] = test_df.drop(columns=['ID']).apply(lambda x: get_most_common(x, 0), axis=1)
test['mos_common_c'.format(j)] = test_df.drop(columns=['ID']).apply(lambda x: get_most_common(x, 1), axis=1)

In [15]:
from scipy.stats import linregress

In [133]:
for j, cols in enumerate([cols0] + feature_groups[:50]):
    train['unique_{}'.format(j)] = train_df[cols].apply(lambda x: len(set(x)), axis=1)
    test['unique_{}'.format(j)] = test_df[cols].apply(lambda x: len(set(x)), axis=1)

In [27]:
gc.collect()

1155

In [49]:
train[good_columns] =  train_df[good_columns]
test[good_columns] =  test_df[good_columns]

In [54]:
train.drop(columns=good_columns, inplace=True)
test.drop(columns=good_columns, inplace=True)

In [92]:
good_columns = list(pickle.load(open('good_columns.pkl', 'rb')))

In [149]:
gc.collect()

143

In [55]:
%%time
params = {'n_estimators':2000,
             'max_depth':4, 
             'lambda':10000,
             'eta': 0.3,
            'min_child_weight':12,
            "colsample_bytree":0.9,
    #          "tweedie_variance_power":1.5,
             'objective': 'reg:linear', 
             'eval_metric':'rmse',
             'gamma':0.2}
y_oof_lgb, pred_test_list_lgb, fold_errors = run_calculations(train, test, cv_folds, run_xgb, params)

Fitting big fold 1 out of 20
Preprocessing fold  1 out of  5
Fitting sub fold  1 out of  5
[0]	train-rmse:13.2159	val-rmse:13.2441
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 30 rounds.
[30]	train-rmse:2.55048	val-rmse:2.56924
[60]	train-rmse:1.55223	val-rmse:1.56974
[90]	train-rmse:1.42366	val-rmse:1.45213
[120]	train-rmse:1.37386	val-rmse:1.41259
[150]	train-rmse:1.34426	val-rmse:1.39273
[180]	train-rmse:1.32356	val-rmse:1.37951
[210]	train-rmse:1.30763	val-rmse:1.37161
[240]	train-rmse:1.29462	val-rmse:1.36569
[270]	train-rmse:1.28331	val-rmse:1.36035
[300]	train-rmse:1.2733	val-rmse:1.35673
[330]	train-rmse:1.26445	val-rmse:1.35433
[360]	train-rmse:1.25626	val-rmse:1.35142
[390]	train-rmse:1.24901	val-rmse:1.34934
[420]	train-rmse:1.24223	val-rmse:1.34806
[450]	train-rmse:1.23651	val-rmse:1.34662
[480]	train-rmse:1.23051	val-rmse:1.34526
[510]	train-rmse:1.22461	val-rmse:1.34425
[540]	train-rmse:1

ValueError: DataFrame.dtypes for data must be int, float or bool.
Did not expect the data types in fields ID

In [44]:
print('Length of test predictions:', len(pred_test_list_lgb))
avg_pred_test_list_lgb = np.mean(pred_test_list_lgb, axis=0)
print('Length of avg test predictions:', len(avg_pred_test_list_lgb))

Length of test predictions: 20
Length of avg test predictions: 49342


In [46]:
# ERRORS
errors = pd.DataFrame(fold_errors)
errors.to_csv(os.path.join(PATH_TO_DATA, 'output/20_fold_errors_xgb_cv1348_std0025.csv'), index=False, header=False)

# TRAIN TARGET OOF
with open(os.path.join(PATH_TO_DATA, 'output/out_of_20_folds_xgb_cv1348_std0025.csv'), 'wb') as f:
    pickle.dump(y_oof_lgb, f)

# SUBMIT
test = pd.read_csv(os.path.join(PATH_TO_DATA, 'test.csv'), usecols=['ID'])
lgb = pd.DataFrame({'ID': test['ID'].values,
                    'target': avg_pred_test_list_lgb})
lgb.to_csv(os.path.join(PATH_TO_DATA, 'output/nefedov_xgb_cv1348_std0025.csv'), index=False)