# **031:** Train-Test Split for **all** features

## 1. Input:

```
./data/320_features_txn.csv
./data/003_train.csv
./data/004_test.csv

./data/410_y_train.csv
./data/410_y_test.csv
```

## 2. Output:
```
./data/420_X_submit.csv
./data/420_X_train.csv
./data/420_X_test.csv
```

The labels are not written again: `./data/410_y_train.csv` and `./data/410_y_test.csv` from the previous split are reused unchanged.

In [1]:
from collections import Counter
from IPython.core.interactiveshell import InteractiveShell
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Echo the value of every bare expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'

# Read the feature table and the train / test tables in a single pass.
# The unpacking order matches the path order: features, train, test.
df_features, df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/320_features_txn.csv',
        './data/003_train.csv',
        './data/004_test.csv',
    )
)

In [2]:
# Attach the feature columns to the train and submission ids.
# A left join keeps every row of the left table. `validate='many_to_one'`
# makes pandas raise a MergeError if `df_features` contains duplicate ids,
# which would otherwise silently multiply rows in the result.
# `set_index` is chained instead of using `inplace=True`, so each frame is
# built in a single expression and never mutated afterwards.
train_set = pd.merge(
    df_train, df_features, how='left', on='id', validate='many_to_one'
).set_index('id')
X_submit = pd.merge(
    df_test, df_features, how='left', on='id', validate='many_to_one'
).set_index('id')

# Separate the feature matrix from the target.
# `y` is selected with double brackets so it stays a one-column DataFrame
# that shares the id index with `X`.
X = train_set.drop(columns='label')
y = train_set[['label']]

X.head()

Unnamed: 0_level_0,c7_n7,c5__0,c5__1,c5__2,c5__3,c5__4,c5__5,c5__6,c5__7,c5__8,...,n6_min,n7_min,n4_median,n5_median,n6_median,n7_median,n4_std,n5_std,n6_std,n7_std
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
84531,1.27032,0.76471,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,0.378629,0.282216,0.919136,0.158486,0.162641,0.419431,1.25887,-0.099218,-5.199338
25895,-0.219151,1.255413,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,0.10833,-0.430727,-1.22064,-0.13971,-0.121257,-0.621876,-0.241215,0.080266,-5.199338
71109,0.437538,1.224655,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,0.533944,-0.76471,-1.22064,-0.12824,0.316766,-0.117262,0.064802,-0.540548,-5.199338
64666,0.766535,-0.589456,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,0.41622,0.13971,-0.13971,0.949427,0.198854,-0.130903,1.143911,-0.078111,-5.199338
91813,0.175748,0.0,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,-0.910147,0.793734,0.535083,0.008526,-1.285378,1.225625,0.839776,-0.417353,-5.199338


## Load previous train-test splits to maintain indexes

In [3]:
# Reload the label files of the previous train-test split; the following
# cells use their `id` column to select the matching rows of X.
y_train, y_test = map(
    pd.read_csv,
    ('./data/410_y_train.csv', './data/410_y_test.csv'),
)

In [4]:
# Select the training rows of X by the ids listed in y_train, in the order
# they appear there. `.copy()` gives X_train its own data, independent of X.
train_ids = y_train['id'].values
X_train = X.loc[train_ids].copy()

# Show features and labels together to eyeball that the ids line up.
X_train.head()
y_train.head()

Unnamed: 0_level_0,c7_n7,c5__0,c5__1,c5__2,c5__3,c5__4,c5__5,c5__6,c5__7,c5__8,...,n6_min,n7_min,n4_median,n5_median,n6_median,n7_median,n4_std,n5_std,n6_std,n7_std
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
19849,-0.080881,0.13971,1.236227,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,-1.73338,-0.76471,0.13971,0.002842,-0.687031,-0.420312,-0.685877,1.096063,0.566021
34583,-0.373794,0.799083,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,-1.11916,-0.430727,0.13971,-5.199338,-1.467294,-0.932398,-0.848604,-1.293898,1.266845
86660,0.400765,1.168949,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,1.232854,-0.430727,-1.382994,0.311322,1.163788,-0.084263,0.080849,1.187023,-5.199338
73039,0.883244,0.703922,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,-0.922613,-0.508488,1.225627,-0.511474,-1.296033,-0.812419,1.458942,0.662654,-5.199338
38589,0.514574,0.76471,1.236227,-5.199338,-5.199338,-5.199338,1.238343,-5.199338,-5.199338,-5.199338,...,-5.199338,-1.188905,0.556634,0.430727,0.694563,-1.529558,0.915289,1.018384,0.269928,0.925305


Unnamed: 0,id,label
0,19849,9
1,34583,2
2,86660,3
3,73039,12
4,38589,4


In [5]:
# Select the held-out rows of X by the ids listed in y_test, in the order
# they appear there. `.copy()` gives X_test its own data, independent of X.
test_ids = y_test['id'].values
X_test = X.loc[test_ids].copy()

# Show features and labels together to eyeball that the ids line up.
X_test.head()
y_test.head()

Unnamed: 0_level_0,c7_n7,c5__0,c5__1,c5__2,c5__3,c5__4,c5__5,c5__6,c5__7,c5__8,...,n6_min,n7_min,n4_median,n5_median,n6_median,n7_median,n4_std,n5_std,n6_std,n7_std
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11020,-0.439758,-0.13971,-5.199338,-5.199338,-5.199338,-5.199338,1.22064,-5.199338,-5.199338,-5.199338,...,-5.199338,-1.371799,-0.76471,-0.330873,-0.13971,-1.697363,0.957972,-0.495832,-0.23867,-5.199338
78932,0.383094,0.908458,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,-1.009138,0.823442,-0.330873,0.108206,0.448946,0.356175,-0.604977,-0.380013,1.234934
66518,1.392204,1.245029,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,-1.406606,-1.278543,0.80685,0.088216,0.716835,0.472421,1.24512,-0.698835,1.242561
61221,0.927966,0.703922,-5.199338,-5.199338,-5.199338,-5.199338,1.22064,-5.199338,-5.199338,-5.199338,...,-5.199338,-2.09901,-1.22064,-0.330873,0.76471,-0.682592,0.050121,-0.317771,0.388783,0.721652
48750,-0.293778,-0.13971,-5.199338,-5.199338,-5.199338,-5.199338,1.22064,-5.199338,-5.199338,-5.199338,...,-5.199338,0.823801,-1.22064,-0.234219,-5.199338,0.624855,0.415481,-0.721156,-0.63169,-5.199338


Unnamed: 0,id,label
0,11020,12
1,78932,4
2,66518,3
3,61221,4
4,48750,2


In [6]:
# Shapes of the full training matrix, the submission matrix and both splits.
X.shape
X_submit.shape
X_train.shape
X_test.shape

# Fail here, before anything is written to disk, if the reloaded split does
# not account for every training row or if the submission matrix has a
# different number of feature columns than the training matrix.
assert X_train.shape[0] + X_test.shape[0] == X.shape[0], \
    'train and test rows do not add up to the full training set'
assert X_submit.shape[1] == X.shape[1], \
    'submission and training matrices have different column counts'

(100000, 310)

(53240, 310)

(80000, 310)

(20000, 310)

In [7]:
# Write the three feature matrices to disk. `index=True` keeps the `id`
# index as the first CSV column.
frames_and_paths = (
    (X_submit, './data/420_X_submit.csv'),
    (X_train, './data/420_X_train.csv'),
    (X_test, './data/420_X_test.csv'),
)
for frame, csv_path in frames_and_paths:
    frame.to_csv(csv_path, index=True)