# **031:** Train-Test Split for **all** features

## 1. Input: 

```
./data/330_features_txn.csv
./data/003_train.csv
./data/004_test.csv
./data/410_y_train.csv
./data/410_y_test.csv
```

## 2. Output:
```
./data/430_X_submit.csv
./data/430_X_train.csv
./data/430_X_test.csv
```

Labels are not rewritten here: `y_train` and `y_test` are reused as-is from `./data/410_y_train.csv` and `./data/410_y_test.csv`.

In [3]:
from collections import Counter
from IPython.core.interactiveshell import InteractiveShell
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Echo the value of every bare top-level expression in a cell, not just the last one.
InteractiveShell.ast_node_interactivity = 'all'

# Read the feature table and the train / test tables; all three are joined on `id` below.
df_features, df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/330_features_txn.csv',
        './data/003_train.csv',
        './data/004_test.csv',
    )
)

In [4]:
# Attach the feature columns to the train rows and to the test (submission) rows.
# validate='many_to_one' makes pandas raise MergeError if an `id` repeats in
# df_features, instead of silently duplicating rows in the left join.
train_set = (
    pd.merge(df_train, df_features, how='left', on='id', validate='many_to_one')
    .set_index('id')
)
X_submit = (
    pd.merge(df_test, df_features, how='left', on='id', validate='many_to_one')
    .set_index('id')
)

# Separate the feature matrix from the label; `y` stays a one-column DataFrame.
X = train_set.drop('label', axis=1)
y = train_set[['label']]

X.head()

Unnamed: 0_level_0,c7_n7,c5__0,c5__1,c5__2,c5__3,c5__4,c5__5,c5__6,c5__7,c5__8,...,n6_min,n7_min,n4_median,n5_median,n6_median,n7_median,n4_std,n5_std,n6_std,n7_std
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
84531,1.27032,0.76471,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,0.386972,0.311322,0.919136,0.170351,0.165083,0.416452,1.260004,-0.103069,-5.199338
25895,-0.219151,1.255413,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,0.11346,-0.430727,-1.22064,-0.13971,-0.119209,-0.627521,-0.246973,0.076762,-5.199338
71109,0.437538,1.224655,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,0.544594,-0.76471,-1.22064,-0.128001,0.322287,-0.122,0.058722,-0.542718,-5.199338
64666,0.766535,-0.589456,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,0.425127,0.13971,-0.13971,0.953617,0.202001,-0.135418,1.124756,-0.081915,-5.199338
91813,0.175748,0.0,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,-0.911522,0.793734,0.535083,0.011605,-1.287592,1.22563,0.828241,-0.421037,-5.199338


## Load previous train-test splits to maintain indexes

In [5]:
# Reload the label frames of the earlier split; their `id` column drives the row
# selection for X_train / X_test in the following cells.
y_train, y_test = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/410_y_train.csv', './data/410_y_test.csv')
)

In [6]:
# Pick the feature rows whose index matches the saved training ids, in the saved order.
# .copy() detaches the result from X.
X_train = X.loc[y_train['id'].values, :].copy()

# Show features and labels together to eyeball that the ids line up.
X_train.head()
y_train.head()

Unnamed: 0_level_0,c7_n7,c5__0,c5__1,c5__2,c5__3,c5__4,c5__5,c5__6,c5__7,c5__8,...,n6_min,n7_min,n4_median,n5_median,n6_median,n7_median,n4_std,n5_std,n6_std,n7_std
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
19849,-0.080881,0.13971,1.236227,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,-1.731664,-0.76471,0.13971,0.005802,-0.686252,-0.425349,-0.6876,1.097375,0.570668
34583,-0.373794,0.799083,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,-1.118311,-0.430727,0.13971,-5.199338,-1.468851,-0.9385,-0.848506,-1.293418,1.31293
86660,0.400765,1.168949,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,1.233387,-0.430727,-1.382994,0.320232,1.16403,-0.08954,0.074726,1.189175,-5.199338
73039,0.883244,0.703922,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,-0.923867,-0.508488,1.225627,-0.499851,-1.298211,-0.818417,1.48686,0.660232,-5.199338
38589,0.514574,0.76471,1.236227,-5.199338,-5.199338,-5.199338,1.238343,-5.199338,-5.199338,-5.199338,...,-5.199338,-1.187193,0.589456,0.430727,0.695743,-1.530862,0.916116,1.002813,0.265945,0.928437


Unnamed: 0,id,label
0,19849,9
1,34583,2
2,86660,3
3,73039,12
4,38589,4


In [7]:
# Pick the feature rows whose index matches the saved test ids, in the saved order.
# .copy() detaches the result from X.
X_test = X.loc[y_test['id'].values, :].copy()

# Show features and labels together to eyeball that the ids line up.
X_test.head()
y_test.head()

Unnamed: 0_level_0,c7_n7,c5__0,c5__1,c5__2,c5__3,c5__4,c5__5,c5__6,c5__7,c5__8,...,n6_min,n7_min,n4_median,n5_median,n6_median,n7_median,n4_std,n5_std,n6_std,n7_std
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11020,-0.439758,-0.13971,-5.199338,-5.199338,-5.199338,-5.199338,1.22064,-5.199338,-5.199338,-5.199338,...,-5.199338,-1.369793,-0.76471,-0.330873,-0.13971,-1.697873,0.95919,-0.500311,-0.242148,-5.199338
78932,0.383094,0.908458,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,-1.009516,0.823442,-0.330873,0.113389,0.456516,0.352517,-0.607897,-0.383644,1.248601
66518,1.392204,1.245029,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,-1.404632,-1.347629,0.80685,0.092972,0.720235,0.470974,1.244705,-0.700137,1.263732
61221,0.927966,0.703922,-5.199338,-5.199338,-5.199338,-5.199338,1.22064,-5.199338,-5.199338,-5.199338,...,-5.199338,-2.097518,-1.22064,-0.330873,0.76471,-0.681898,0.042589,-0.324048,0.384235,0.732251
48750,-0.293778,-0.13971,-5.199338,-5.199338,-5.199338,-5.199338,1.22064,-5.199338,-5.199338,-5.199338,...,-5.199338,0.833391,-1.22064,-0.234219,-5.199338,0.629719,0.412459,-0.72234,-0.633366,-5.199338


Unnamed: 0,id,label
0,11020,12
1,78932,4
2,66518,3
3,61221,4
4,48750,2


In [8]:
# Echo the (rows, columns) of each frame.
X.shape
X_submit.shape
X_train.shape
X_test.shape

# The split is reloaded from disk rather than recomputed, so confirm it still
# partitions X: no id appears in both halves, and together they account for every row.
assert not X_train.index.isin(X_test.index).any(), 'train and test share ids'
assert len(X_train) + len(X_test) == len(X), 'train + test do not cover X'

(100000, 311)

(53240, 311)

(80000, 311)

(20000, 311)

In [9]:
# Write the three feature matrices; index=True keeps `id` (the index) as the first CSV column.
for _frame, _csv_path in (
    (X_submit, './data/430_X_submit.csv'),
    (X_train, './data/430_X_train.csv'),
    (X_test, './data/430_X_test.csv'),
):
    _frame.to_csv(_csv_path, index=True)