# **031:** Train-Test Split for **all** features

## 1. Input: 

```
./data/341_features_txn.csv
./data/003_train.csv
./data/004_test.csv
./data/410_y_train.csv
./data/410_y_test.csv
```

## 2. Output:
```
./data/441_X_submit.csv
./data/441_X_train.csv
./data/441_X_test.csv
```

The labels are not written again: `./data/410_y_train.csv` and `./data/410_y_test.csv` from the earlier split are loaded and reused unchanged.

In [1]:
from collections import Counter
from IPython.core.interactiveshell import InteractiveShell
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Show the value of every bare expression in a cell, not only the last one
# (later cells rely on this to display several frames / shapes at once).
InteractiveShell.ast_node_interactivity = 'all'

# Feature table plus the two id tables it is joined onto further down.
# df_train presumably carries `id` and `label`, df_test only `id` - confirm
# against the upstream notebooks that write these files.
df_features, df_train, df_test = (
    pd.read_csv('./data/341_features_txn.csv'),
    pd.read_csv('./data/003_train.csv'),
    pd.read_csv('./data/004_test.csv'),
)

In [2]:
# Preview the first rows of the feature table as loaded from CSV
# (one `id` column followed by the feature columns).
df_features.head()

Unnamed: 0,id,c1_c2_crossed_1.092,c1_c2_crossed_1.093,c1_c2_crossed_1.094,c1_c2_crossed_1.095,c1_c2_crossed_1.096,c1_c2_crossed_1.097,c1_c2_crossed_1.098,c1_c2_crossed_1.099,c1_c2_crossed_10.095,...,"n1_n2_crossed_(44000.0, 89000.0](-1.0, 4000.0]","n1_n2_crossed_(44000.0, 89000.0](175000.0, 7500000.0]","n1_n2_crossed_(44000.0, 89000.0](25000.0, 60000.0]","n1_n2_crossed_(44000.0, 89000.0](4000.0, 25000.0]","n1_n2_crossed_(44000.0, 89000.0](60000.0, 175000.0]","n1_n2_crossed_(89000.0, 20367000.0](-1.0, 4000.0]","n1_n2_crossed_(89000.0, 20367000.0](175000.0, 7500000.0]","n1_n2_crossed_(89000.0, 20367000.0](25000.0, 60000.0]","n1_n2_crossed_(89000.0, 20367000.0](4000.0, 25000.0]","n1_n2_crossed_(89000.0, 20367000.0](60000.0, 175000.0]"
0,76371,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338
1,44326,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338
2,33717,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,5.199338
3,96078,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,5.199338,-5.199338,-5.199338,-5.199338,-5.199338
4,13591,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338


In [3]:
# Attach the feature columns to every labelled id and every submission id.
# A left join keeps each row of the left table even when df_features has no
# matching id (those rows get NaN features).
train_set = pd.merge(df_train, df_features, how='left', on='id')
X_submit  = pd.merge(df_test, df_features, how='left', on='id')

# A left join emits one output row per matching right-hand row, so an id that
# appears more than once in df_features would silently duplicate samples and
# make the id-based .loc lookups further down return extra rows. Fail early.
assert len(train_set) == len(df_train), \
    'merge changed the number of training rows: duplicate ids in df_features?'
assert len(X_submit) == len(df_test), \
    'merge changed the number of submission rows: duplicate ids in df_features?'

# Index both frames by id so rows can be selected by id later on. Plain
# reassignment instead of inplace=True keeps the cell easy to re-run.
train_set = train_set.set_index('id')
X_submit = X_submit.set_index('id')

# Split the labelled frame into the feature matrix and a one-column label frame.
X = train_set.drop('label', axis=1)
y = train_set[['label']]

X.head()

Unnamed: 0_level_0,c1_c2_crossed_1.092,c1_c2_crossed_1.093,c1_c2_crossed_1.094,c1_c2_crossed_1.095,c1_c2_crossed_1.096,c1_c2_crossed_1.097,c1_c2_crossed_1.098,c1_c2_crossed_1.099,c1_c2_crossed_10.095,c1_c2_crossed_10.096,...,"n1_n2_crossed_(44000.0, 89000.0](-1.0, 4000.0]","n1_n2_crossed_(44000.0, 89000.0](175000.0, 7500000.0]","n1_n2_crossed_(44000.0, 89000.0](25000.0, 60000.0]","n1_n2_crossed_(44000.0, 89000.0](4000.0, 25000.0]","n1_n2_crossed_(44000.0, 89000.0](60000.0, 175000.0]","n1_n2_crossed_(89000.0, 20367000.0](-1.0, 4000.0]","n1_n2_crossed_(89000.0, 20367000.0](175000.0, 7500000.0]","n1_n2_crossed_(89000.0, 20367000.0](25000.0, 60000.0]","n1_n2_crossed_(89000.0, 20367000.0](4000.0, 25000.0]","n1_n2_crossed_(89000.0, 20367000.0](60000.0, 175000.0]"
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
84531,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338
25895,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338
71109,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338
64666,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,-5.199338,5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338
91813,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338


## Load previous train-test splits to maintain indexes

In [4]:
# Re-use the label files of an existing train/test split instead of drawing a
# new one; their `id` column decides which rows of X go to which side.
y_train, y_test = (
    pd.read_csv('./data/410_y_train.csv'),
    pd.read_csv('./data/410_y_test.csv'),
)

In [5]:
# Select the feature rows belonging to the saved training split, in the order
# the ids appear in y_train, as a copy independent of X.
train_ids = y_train['id'].values
X_train = X.loc[train_ids].copy()

# Display features and labels one after the other to eyeball that ids line up.
X_train.head()
y_train.head()

Unnamed: 0_level_0,c1_c2_crossed_1.092,c1_c2_crossed_1.093,c1_c2_crossed_1.094,c1_c2_crossed_1.095,c1_c2_crossed_1.096,c1_c2_crossed_1.097,c1_c2_crossed_1.098,c1_c2_crossed_1.099,c1_c2_crossed_10.095,c1_c2_crossed_10.096,...,"n1_n2_crossed_(44000.0, 89000.0](-1.0, 4000.0]","n1_n2_crossed_(44000.0, 89000.0](175000.0, 7500000.0]","n1_n2_crossed_(44000.0, 89000.0](25000.0, 60000.0]","n1_n2_crossed_(44000.0, 89000.0](4000.0, 25000.0]","n1_n2_crossed_(44000.0, 89000.0](60000.0, 175000.0]","n1_n2_crossed_(89000.0, 20367000.0](-1.0, 4000.0]","n1_n2_crossed_(89000.0, 20367000.0](175000.0, 7500000.0]","n1_n2_crossed_(89000.0, 20367000.0](25000.0, 60000.0]","n1_n2_crossed_(89000.0, 20367000.0](4000.0, 25000.0]","n1_n2_crossed_(89000.0, 20367000.0](60000.0, 175000.0]"
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
19849,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338
34583,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338
86660,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,-5.199338,-5.199338,-5.199338,5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338
73039,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338
38589,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338


Unnamed: 0,id,label
0,19849,9
1,34583,2
2,86660,3
3,73039,12
4,38589,4


In [6]:
# Select the feature rows belonging to the saved test split, in the order the
# ids appear in y_test, as a copy independent of X.
test_ids = y_test['id'].values
X_test = X.loc[test_ids].copy()

# Display features and labels one after the other to eyeball that ids line up.
X_test.head()
y_test.head()

Unnamed: 0_level_0,c1_c2_crossed_1.092,c1_c2_crossed_1.093,c1_c2_crossed_1.094,c1_c2_crossed_1.095,c1_c2_crossed_1.096,c1_c2_crossed_1.097,c1_c2_crossed_1.098,c1_c2_crossed_1.099,c1_c2_crossed_10.095,c1_c2_crossed_10.096,...,"n1_n2_crossed_(44000.0, 89000.0](-1.0, 4000.0]","n1_n2_crossed_(44000.0, 89000.0](175000.0, 7500000.0]","n1_n2_crossed_(44000.0, 89000.0](25000.0, 60000.0]","n1_n2_crossed_(44000.0, 89000.0](4000.0, 25000.0]","n1_n2_crossed_(44000.0, 89000.0](60000.0, 175000.0]","n1_n2_crossed_(89000.0, 20367000.0](-1.0, 4000.0]","n1_n2_crossed_(89000.0, 20367000.0](175000.0, 7500000.0]","n1_n2_crossed_(89000.0, 20367000.0](25000.0, 60000.0]","n1_n2_crossed_(89000.0, 20367000.0](4000.0, 25000.0]","n1_n2_crossed_(89000.0, 20367000.0](60000.0, 175000.0]"
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11020,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,5.199338
78932,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,5.199338
66518,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,-5.199338,5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338
61221,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338
48750,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338


Unnamed: 0,id,label
0,11020,12
1,78932,4
2,66518,3
3,61221,4
4,48750,2


In [7]:
# Sanity check on sizes: (rows, columns) of the full labelled matrix, the
# submission matrix, and the train / test parts selected from X.
# Each line is shown separately because ast_node_interactivity is 'all'.
X.shape
X_submit.shape
X_train.shape
X_test.shape

(100000, 124)

(53240, 124)

(80000, 124)

(20000, 124)

In [8]:
# Preview the submission feature matrix (indexed by id) before it is written out.
X_submit.head()

Unnamed: 0_level_0,c1_c2_crossed_1.092,c1_c2_crossed_1.093,c1_c2_crossed_1.094,c1_c2_crossed_1.095,c1_c2_crossed_1.096,c1_c2_crossed_1.097,c1_c2_crossed_1.098,c1_c2_crossed_1.099,c1_c2_crossed_10.095,c1_c2_crossed_10.096,...,"n1_n2_crossed_(44000.0, 89000.0](-1.0, 4000.0]","n1_n2_crossed_(44000.0, 89000.0](175000.0, 7500000.0]","n1_n2_crossed_(44000.0, 89000.0](25000.0, 60000.0]","n1_n2_crossed_(44000.0, 89000.0](4000.0, 25000.0]","n1_n2_crossed_(44000.0, 89000.0](60000.0, 175000.0]","n1_n2_crossed_(89000.0, 20367000.0](-1.0, 4000.0]","n1_n2_crossed_(89000.0, 20367000.0](175000.0, 7500000.0]","n1_n2_crossed_(89000.0, 20367000.0](25000.0, 60000.0]","n1_n2_crossed_(89000.0, 20367000.0](4000.0, 25000.0]","n1_n2_crossed_(89000.0, 20367000.0](60000.0, 175000.0]"
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
151807,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338
118131,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338
110921,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338
105149,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338
143868,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,5.199338


In [9]:
# Persist the three feature matrices; index=True writes the `id` index as the
# first CSV column so rows stay identifiable when the files are read back.
for frame, csv_path in (
    (X_submit, './data/441_X_submit.csv'),
    (X_train, './data/441_X_train.csv'),
    (X_test, './data/441_X_test.csv'),
):
    frame.to_csv(csv_path, index=True)