In [148]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import gc
import os
import logging
import datetime
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold
warnings.filterwarnings('ignore')
import time
import random
from tqdm import tqdm_notebook

# Fastai
from fastai import *
from fastai.tabular import *

In [149]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
os.getcwd()

# Any results you write to the current directory are saved as output.
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score

import lightgbm as lgb

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')

plt.style.use('seaborn')
sns.set(font_scale=1)

In [3]:
random_state = 17865302
np.random.seed(random_state)
tr = pd.read_csv('/storage/santander_comp/train.csv')
te = pd.read_csv('/storage/santander_comp/test.csv')
tr_poly = pd.read_csv('/storage/santander_comp/train_poly.csv')
te_poly = pd.read_csv('/storage/santander_comp/test_poly.csv')

In [6]:
# Sample datasets
tr_poly_small = tr_poly.sample(frac=0.2)
te_poly_small = te_poly.sample(frac=0.2)

In [7]:
print("% of target in small train == 1: {:01.2f}%".format(len(tr_poly_small.loc[tr_poly_small['target'] == 1]) / len(tr_poly_small) * 100))
print("% of target in train == 1: {:01.2f}%".format(len(tr_poly.loc[tr_poly['target'] == 1]) / len(tr_poly) * 100))

% of target in small train == 1: 10.05%
% of target in train == 1: 10.05%


Evenly distributed sample

In [8]:
features = [feature for feature in tr_poly.columns if feature not in ['ID_code', 'target']]

In [9]:
# Helper functions: normalize and augment
def normalize(df, features):
    result = df.copy()
    for feature in features:
        max_val = df[feature].max()
        min_val = df[feature].min()
        result[feature] = (df[feature] - min_val) / (max_val - min_val)
    return result

def z_norm(df, features):
    result = df.copy()
    for feature in features:
        std = np.std(df[feature])
        mean = np.mean(df[feature])
        result[feature] = (df[feature] - mean) / std
    return result

def augment(x,y,t=2):
    xs,xn = [],[]
    for i in range(t):
        mask = y>0
        x1 = x[mask].copy()
        ids = np.arange(x1.shape[0])
        for c in range(x1.shape[1]):
            np.random.shuffle(ids)
            x1[:,c] = x1[ids][:,c]
        xs.append(x1)

    for i in range(t//2):
        mask = y==0
        x1 = x[mask].copy()
        ids = np.arange(x1.shape[0])
        for c in range(x1.shape[1]):
            np.random.shuffle(ids)
            x1[:,c] = x1[ids][:,c]
        xn.append(x1)

    xs = np.vstack(xs)
    xn = np.vstack(xn)
    ys = np.ones(xs.shape[0])
    yn = np.zeros(xn.shape[0])
    x = np.vstack([x,xs,xn])
    y = np.concatenate([y,ys,yn])
    return x,y

In [None]:
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state)
oof = tr_poly[['ID_code', 'target']]
oof['predict'] = 0
predictions = tr_poly[['ID_code']]
val_aucs = []
feature_importance_df = pd.DataFrame()

In [146]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, roc_curve

In [147]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(tr_poly[features],
                                                  tr_poly['target'],
                                                  test_size = 0.2,
                                                  random_state = random_state)

#### Small No Augments

In [None]:
trn_idx = X_train.index
val_idx = X_val.index
feature_importance_df = pd.DataFrame()

In [None]:
p_val, yp = 0, 0
trn_data = lgb.Dataset(X_train, label = y_train)
val_data = lgb.Dataset(X_val, label = y_val)
evals_result = {}
lgb_clf = lgb.train(lgb_params,
                        trn_data,
                        100000,
                        valid_sets = [trn_data, val_data],
                        early_stopping_rounds=2000,
                        verbose_eval = 1000,
                        evals_result=evals_result
                       )

p_val += lgb_clf.predict(X_val)
feature_importance_df["feature"] = features
feature_importance_df["importance"] = lgb_clf.feature_importance()
oof['predict'][val_idx] = p_val
val_score = roc_auc_score(y_val, p_val)
val_aucs.append(val_score)
predictions = yp

In [None]:
cols = (feature_importance_df[["feature", "importance"]]
        .groupby("feature")
        .mean()
        .sort_values(by="importance", ascending=False)[150:].index)
best_features = feature_importance_df.loc[feature_importance_df.feature.isin(cols)]

plt.figure(figsize=(14,28))
sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance",ascending=False))
plt.title('Features importance (averaged/folds)')
plt.tight_layout()
plt.savefig('FI.png')

#### Small Augments

In [None]:
p_val, yp = 0, 0

# Augments
X_t, y_t = augment(X_train.values, y_train.values)
X_t = pd.DataFrame(X_t)
X_t = X_t.add_prefix('var_')

trn_data = lgb.Dataset(X_t, label = y_t)
val_data = lgb.Dataset(X_val, label = y_val)
evals_result = {}
lgb_clf = lgb.train(lgb_params,
                        trn_data,
                        100000,
                        valid_sets = [trn_data, val_data],
                        early_stopping_rounds=2000,
                        verbose_eval = 1000,
                        evals_result=evals_result
                       )

p_val += lgb_clf.predict(X_val)
feature_importance_df["feature"] = features
feature_importance_df["importance"] = lgb_clf.feature_importance()
oof['predict'][val_idx] = p_val
val_score = roc_auc_score(y_val, p_val)
val_aucs.append(val_score)
predictions = yp

In [None]:
cols = (feature_importance_df[["feature", "importance"]]
        .groupby("feature")
        .mean()
        .sort_values(by="importance", ascending=False)[150:].index)
best_features = feature_importance_df.loc[feature_importance_df.feature.isin(cols)]

plt.figure(figsize=(14,28))
sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance",ascending=False))
plt.title('Features importance (averaged/folds)')
plt.tight_layout()
plt.savefig('FI.png')

#### Small Grid Search No Augments

In [None]:
param_grid = {
    "num_leaves" : [13, 11],
    "learning_rate" : [0.0085, 0.01],
    "bagging_freq": [3, 5],
    "bagging_fraction" : [0.3, 0.5],
    "feature_fraction" : [0.04, 0.05],
    "min_data_in_leaf": [60, 80],
}

In [None]:
params = {
    "objective" : "binary",
    "metric" : "auc",
    "boosting_type": 'gbdt',
    "max_depth" : -1,
    "num_leaves" : 13,
    "num_threads": 8,
    "learning_rate" : 0.0085,
    "bagging_freq": 5,
    "bagging_fraction" : 0.4,
    "feature_fraction" : 0.04,
    "min_data_in_leaf": 80,
    "min_sum_heassian_in_leaf": 10,
    "tree_learner": "serial",
    "boost_from_average": "false",
    #"lambda_l1" : 5,
    #"lambda_l2" : 5,
    "bagging_seed" : random_state,
    "verbosity" : -1,
    "seed": random_state
}

In [None]:
mdl = lgb.LGBMClassifier(boosting_type= 'gbdt',
          objective = 'binary',
          metric = 'auc',
          max_depth = params['max_depth'],
          num_leaves = params['num_leaves'],
          num_threads = params['num_threads'],
          learning_rate = params['learning_rate'],               
          bagging_freq = params['bagging_freq'],
          bagging_fraction = params['bagging_fraction'],
          feature_fraction = params['feature_fraction'],
          min_data_in_leaf = params['min_data_in_leaf'],
          min_sum_heassian_in_leaf = params['min_sum_heassian_in_leaf'],
          tree_learner = params['tree_learner'],
          boost_from_average = params['boost_from_average'],
          bagging_seed = params['bagging_seed'],
          verbosity = params['verbosity'],
          seed = params['seed'])

mdl.get_params().keys()

In [None]:
grid = GridSearchCV(mdl, param_grid, verbose = 10)
grid.fit(X_train, y_train)

In [None]:
param_grid = {
    "bagging_fraction" : [0.3, 0.25],
#    "bagging_freq": [2,3],
    "feature_fraction" : [0.03, 0.035],
    "learning_rate" : [0.0075, 0.008],
    "min_data_in_leaf": [60, 65],
    "num_leaves" : [15, 14],
}

In [None]:
print(grid.best_params_)

In [None]:
# Current best params

# bagging_fraction: 0.3 [down from 0.4]
# bagging_freq = 2 [down from 3]
# feature_fraction = 0.03 [down from 0.04]
# learning_rate = 0.0075 [down from 0.0085]
# min_data_in_leaf = 60
# num_leaves = 15 .  [up from 13]

### Remove samples with duplicate values

In [10]:
params = {
    "objective" : "binary",
    "metric" : "auc",
    "boosting_type": 'gbdt',
    "max_depth" : -1,
    "num_leaves" : 15,
    "num_threads": 8,
    "learning_rate" : 0.0075,
    "bagging_freq": 2,
    "bagging_fraction" : 0.3,
    "feature_fraction" : 0.03,
    "min_data_in_leaf": 60,
    "min_sum_heassian_in_leaf": 10,
    "tree_learner": "serial",
    "boost_from_average": "false",
    #"lambda_l1" : 5,
    #"lambda_l2" : 5,
    "bagging_seed" : random_state,
    "verbosity" : -1,
    "seed": random_state
}

In [14]:
tr_poly.shape

(200000, 247)

In [27]:
def max_dups(df):
    '''
    count and return max duplicated values from feature columns
    '''
    unique_max_train = []
    for feature in features:
        values = df[feature].value_counts()
        unique_max_train.append([feature, values.max(), values.idxmax()])
    return unique_max_train

In [28]:
max_dups = max_dups(tr_poly)

In [29]:
dups = np.transpose((pd.DataFrame(max_dups, columns=['Feature', 'Max duplicates', 'Value'])).
            sort_values(by = 'Max duplicates', ascending=False))

In [50]:
dups = np.transpose(dups)

In [179]:
dups.head(15)

Unnamed: 0,Feature,Max duplicates,Value
68,var_68,1084,5.0214
108,var_108,313,14.1999
126,var_126,305,11.5356
12,var_12,203,13.5545
91,var_91,66,6.9785
103,var_103,61,1.6662
148,var_148,59,4.0456
71,var_71,54,0.7031
161,var_161,52,5.7688
25,var_25,41,13.6723


In [185]:
ten = dups.loc[dups['Max duplicates'] > 20]

In [186]:
ten_vars = ten['Feature'].values

In [187]:
len(ten_vars)

35

In [158]:
temp = tr_poly.copy()
temp.shape

(200000, 247)

In [139]:
for feature in ten_vars:
    idx = temp.loc[temp[feature] == temp[feature].value_counts().idxmax()].index
    temp = temp.drop(idx)

In [None]:
idx = temp.loc[temp[feature] == temp[feature].value_counts().idxmax()].index

In [140]:
temp.shape

(195272, 247)

In [142]:
tr_temp = temp

### Training dataset with samples removed that contain the most duplicated value from each feature column, with and without augments

With Augments

In [150]:
X_train, X_val, y_train, y_val = train_test_split(tr_temp[features],
                                                  tr_temp['target'],
                                                  test_size = 0.25,
                                                  random_state = random_state)

In [152]:
lgb_params = {
    "objective" : "binary",
    "metric" : "auc",
    "boosting_type": 'gbdt',
    "max_depth" : -1,
    "num_leaves" : 15,
    "num_threads": 8,
    "learning_rate" : 0.0075,
    "bagging_freq": 2,
    "bagging_fraction" : 0.3,
    "feature_fraction" : 0.03,
    "min_data_in_leaf": 60,
    "min_sum_heassian_in_leaf": 10,
    "tree_learner": "serial",
    "boost_from_average": "false",
    #"lambda_l1" : 5,
    #"lambda_l2" : 5,
    "bagging_seed" : random_state,
    "verbosity" : -1,
    "seed": random_state
}

In [None]:
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state)
oof = tr_poly[['ID_code', 'target']]
oof['predict'] = 0
predictions = tr_poly[['ID_code']]
val_aucs = []
trn_idx = X_train.index
val_idx = X_val.index
feature_importance_df = pd.DataFrame()

In [155]:
p_val, yp = 0, 0
trn_data = lgb.Dataset(X_train, label = y_train)
val_data = lgb.Dataset(X_val, label = y_val)
evals_result = {}
lgb_clf = lgb.train(lgb_params,
                        trn_data,
                        100000,
                        valid_sets = [trn_data, val_data],
                        early_stopping_rounds=2000,
                        verbose_eval = 1000,
                        evals_result=evals_result
                       )

p_val += lgb_clf.predict(X_val)
feature_importance_df["feature"] = features
feature_importance_df["importance"] = lgb_clf.feature_importance()
oof['predict'][val_idx] = p_val
val_score = roc_auc_score(y_val, p_val)
val_aucs.append(val_score)
predictions = yp

Training until validation scores don't improve for 2000 rounds.
[1000]	training's auc: 0.913713	valid_1's auc: 0.887185
[2000]	training's auc: 0.922219	valid_1's auc: 0.893121
[3000]	training's auc: 0.927351	valid_1's auc: 0.894827
[4000]	training's auc: 0.931733	valid_1's auc: 0.896066
[5000]	training's auc: 0.935822	valid_1's auc: 0.896666
[6000]	training's auc: 0.939464	valid_1's auc: 0.897252
[7000]	training's auc: 0.94309	valid_1's auc: 0.897865
[8000]	training's auc: 0.946476	valid_1's auc: 0.897976
[9000]	training's auc: 0.949757	valid_1's auc: 0.898245
[10000]	training's auc: 0.952958	valid_1's auc: 0.89818
[11000]	training's auc: 0.955987	valid_1's auc: 0.898336
[12000]	training's auc: 0.958878	valid_1's auc: 0.8982
Early stopping, best iteration is:
[10997]	training's auc: 0.95598	valid_1's auc: 0.898346


NameError: name 'oof' is not defined

In [164]:
# Augment DF and re-run
def augment_df(df):
    for feature in features:
        df[f'sq_{feature}'] = df[feature]**2
        df[f'repo_{feature}'] = df[feature].apply(lambda x: 0 if x==0 else 1/x)
        df[f'repo_sq_{feature}'] = df[f'sq_{feature}'].apply(lambda x: 0 if x==0 else 1/x)
    
    df['min'] = df[features].min(axis=1)
    df['mean'] = df[features].mean(axis=1)
    df['max'] = df[features].max(axis=1)
    df['median'] = df[features].median(axis=1)
    df['std'] = df[features].std(axis=1)
    df['var'] = df[features].var(axis=1)
    df['abs_mean'] = df[features].abs().mean(axis=1)
    df['abs_median'] = df[features].abs().median(axis=1)
    df['abs_std'] = df[features].abs().std(axis=1)
    df['skew'] = df[features].skew(axis=1)
    df['kurt'] = df[features].kurt(axis=1)
    
    df['sq_kurt'] = df[[f'sq_{feature}' for feature in features]].kurt(axis=1)
    

In [188]:
cols = ten_vars

In [189]:
cols

array(['var_68', 'var_108', 'var_126', 'var_12', 'var_91', 'var_103', 'var_148', 'var_71', 'var_161', 'var_25',
       'var_125', 'var_43', 'var_133', 'var_166', 'var_169', 'var_15', 'var_131', 'var_93', 'var_23', 'var_34',
       'var_98', 'var_95', 'var_53', 'var_50', 'var_28', 'var_105', 'var_42', 'var_197', 'var_6', 'var_181', 'var_57',
       'var_130', 'var_156', 'var_144', 'var_59'], dtype=object)

In [190]:
tr.shape

(200000, 202)

In [193]:
features = [feature for feature in tr.columns if feature not in ['ID_code', 'target']]

In [194]:
len(features)

200

In [202]:
dups.shape

(245, 3)

In [218]:
values = dict(tr['var_0'].value_counts())

In [225]:
tr_temp = tr.copy()

In [223]:
def add_freq_vars(df):
    for feature in features:
        val_counts = dict(df[feature].value_counts())
        df['{}_freq'.format(feature)] = df[feature].apply(lambda x: val_counts[x])        

In [228]:
add_freq_vars(tr_temp)

In [229]:
tr_temp.shape

(200000, 402)

In [230]:
tr_temp.head()

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_190_freq,var_191_freq,var_192_freq,var_193_freq,var_194_freq,var_195_freq,var_196_freq,var_197_freq,var_198_freq,var_199_freq
0,train_0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,...,3,6,7,3,4,4,3,13,5,2
1,train_1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,...,5,4,6,1,1,2,2,13,2,1
2,train_2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,...,3,4,3,1,2,2,3,8,2,2
3,train_3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,...,1,2,4,4,3,7,4,4,2,2
4,train_4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,...,3,4,1,1,1,5,3,6,2,2


In [231]:
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state)
oof = tr_poly[['ID_code', 'target']]
oof['predict'] = 0
predictions = tr_poly[['ID_code']]
val_aucs = []
feature_importance_df = pd.DataFrame()

In [232]:
features = [feature for feature in tr_temp.columns if feature not in ['ID_code', 'target']]

In [233]:
len(features)

400

In [234]:
X_train, X_val, y_train, y_val = train_test_split(tr_temp[features],
                                                  tr_temp['target'],
                                                  test_size = 0.2,
                                                  random_state = 37)

In [237]:
print("% of target in small train == 1: {:01.2f}%".format(len(y_train.loc[y_train == 1]) / len(y_train) * 100))
print("% of target in train == 1: {:01.2f}%".format(len(y_val.loc[y_val == 1]) / len(y_val) * 100))

% of target in small train == 1: 10.06%
% of target in train == 1: 9.99%


In [238]:
p_val, yp = 0, 0
trn_data = lgb.Dataset(X_train, label = y_train)
val_data = lgb.Dataset(X_val, label = y_val)
evals_result = {}
lgb_clf = lgb.train(lgb_params,
                        trn_data,
                        100000,
                        valid_sets = [trn_data, val_data],
                        early_stopping_rounds=2000,
                        verbose_eval = 1000,
                        evals_result=evals_result
                       )

p_val += lgb_clf.predict(X_val)
feature_importance_df["feature"] = features
feature_importance_df["importance"] = lgb_clf.feature_importance()
oof['predict'][val_idx] = p_val
val_score = roc_auc_score(y_val, p_val)
val_aucs.append(val_score)
predictions = yp

Training until validation scores don't improve for 2000 rounds.
[1000]	training's auc: 0.905604	valid_1's auc: 0.884988
[2000]	training's auc: 0.916891	valid_1's auc: 0.893087
[3000]	training's auc: 0.923189	valid_1's auc: 0.896038
[4000]	training's auc: 0.928145	valid_1's auc: 0.898019
[5000]	training's auc: 0.932453	valid_1's auc: 0.899001
[6000]	training's auc: 0.936465	valid_1's auc: 0.899872
[7000]	training's auc: 0.940169	valid_1's auc: 0.900484
[8000]	training's auc: 0.943704	valid_1's auc: 0.900617
[9000]	training's auc: 0.947126	valid_1's auc: 0.900931
[10000]	training's auc: 0.950402	valid_1's auc: 0.901087
[11000]	training's auc: 0.953527	valid_1's auc: 0.901129
[12000]	training's auc: 0.956508	valid_1's auc: 0.901144
[13000]	training's auc: 0.959282	valid_1's auc: 0.901095
Early stopping, best iteration is:
[11606]	training's auc: 0.955317	valid_1's auc: 0.901189


ValueError: cannot set using a list-like indexer with a different length than the value

In [239]:
tr_temp.head()

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_190_freq,var_191_freq,var_192_freq,var_193_freq,var_194_freq,var_195_freq,var_196_freq,var_197_freq,var_198_freq,var_199_freq
0,train_0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,...,3,6,7,3,4,4,3,13,5,2
1,train_1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,...,5,4,6,1,1,2,2,13,2,1
2,train_2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,...,3,4,3,1,2,2,3,8,2,2
3,train_3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,...,1,2,4,4,3,7,4,4,2,2
4,train_4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,...,3,4,1,1,1,5,3,6,2,2


In [240]:
add_freq_vars(te)

In [241]:
te.shape

(200000, 601)

In [242]:
te.head()

Unnamed: 0,ID_code,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,...,var_190_freq_freq,var_191_freq_freq,var_192_freq_freq,var_193_freq_freq,var_194_freq_freq,var_195_freq_freq,var_196_freq_freq,var_197_freq_freq,var_198_freq_freq,var_199_freq_freq
0,test_0,11.0656,7.7798,12.9536,9.4292,11.4327,-2.3805,5.8493,18.2675,2.1337,...,32604,34844,18040,3834,32668,23020,13182,16820,25065,31348
1,test_1,8.5304,1.2543,11.3047,5.1858,9.1974,-4.0117,6.0196,18.6316,-4.4131,...,42900,36864,1568,32216,36112,1995,45510,15138,36486,4991
2,test_2,5.4827,-10.3581,10.1407,7.0479,10.2628,9.8052,4.895,20.2537,1.5233,...,22465,32072,15648,32216,36112,20958,45510,16820,18564,57026
3,test_3,8.5374,-1.3222,12.022,6.5749,8.8458,3.1744,4.9397,20.566,3.3755,...,32604,36864,18515,32216,36112,23020,22185,3538,18564,49512
4,test_4,11.7058,-0.1327,14.1295,7.7506,9.1035,-8.5848,6.8595,10.6048,2.989,...,22465,25215,18515,32216,36112,901,47860,17184,25065,16965


In [253]:
te.drop(te.columns[401:], axis = 1, inplace = True)

In [255]:
te.head()

Unnamed: 0,ID_code,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,...,var_190_freq,var_191_freq,var_192_freq,var_193_freq,var_194_freq,var_195_freq,var_196_freq,var_197_freq,var_198_freq,var_199_freq
0,test_0,11.0656,7.7798,12.9536,9.4292,11.4327,-2.3805,5.8493,18.2675,2.1337,...,4,2,2,9,4,4,6,5,5,4
1,test_1,8.5304,1.2543,11.3047,5.1858,9.1974,-4.0117,6.0196,18.6316,-4.4131,...,3,3,16,4,2,15,3,9,3,7
2,test_2,5.4827,-10.3581,10.1407,7.0479,10.2628,9.8052,4.895,20.2537,1.5233,...,5,4,8,4,2,6,3,5,6,2
3,test_3,8.5374,-1.3222,12.022,6.5749,8.8458,3.1744,4.9397,20.566,3.3755,...,4,3,7,4,2,4,1,1,6,3
4,test_4,11.7058,-0.1327,14.1295,7.7506,9.1035,-8.5848,6.8595,10.6048,2.989,...,5,5,7,4,2,17,2,6,5,5


In [256]:
tr_temp.head()

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_190_freq,var_191_freq,var_192_freq,var_193_freq,var_194_freq,var_195_freq,var_196_freq,var_197_freq,var_198_freq,var_199_freq
0,train_0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,...,3,6,7,3,4,4,3,13,5,2
1,train_1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,...,5,4,6,1,1,2,2,13,2,1
2,train_2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,...,3,4,3,1,2,2,3,8,2,2
3,train_3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,...,1,2,4,4,3,7,4,4,2,2
4,train_4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,...,3,4,1,1,1,5,3,6,2,2


In [257]:
tr_temp.to_csv('train_freq.csv', index = False)
te.to_csv('test_freq.csv', index = False)