# **031:** Train-Test Split for **all** features

## 1. Input: 

```
./data/100_cross.csv
./data/003_train.csv
./data/004_test.csv
./data/410_y_train.csv
./data/410_y_test.csv
```

## 2. Output:
```
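./data/440_X_submit.csv
./data/440_X_train.csv
./data/440_X_test.csv
```

The label splits are not rewritten here: `./data/410_y_train.csv` and `./data/410_y_test.csv` are loaded as-is and reused so that the rows of the new feature splits match the earlier train-test split.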

In [4]:
from collections import Counter
from IPython.core.interactiveshell import InteractiveShell
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'

# Read the feature table and the train / test tables, in that order.
df_features, df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/100_cross.csv',
        './data/003_train.csv',
        './data/004_test.csv',
    )
)

In [5]:
# Preview the first rows of the loaded feature table.
df_features.head()

Unnamed: 0,id,c1,c2,c1_c2_crossed,bin_n1,bin_n2,n1_n2_crossed
0,76371,7.0,97,7.097,"(-1.0, 1000.0]","(25000.0, 60000.0]","(-1.0, 1000.0](25000.0, 60000.0]"
1,44326,4.0,97,4.097,"(1000.0, 21000.0]","(60000.0, 175000.0]","(1000.0, 21000.0](60000.0, 175000.0]"
2,33717,3.0,97,3.097,"(89000.0, 20367000.0]","(60000.0, 175000.0]","(89000.0, 20367000.0](60000.0, 175000.0]"
3,96078,4.0,98,4.098,"(89000.0, 20367000.0]","(-1.0, 4000.0]","(89000.0, 20367000.0](-1.0, 4000.0]"
4,13591,3.0,97,3.097,"(-1.0, 1000.0]","(25000.0, 60000.0]","(-1.0, 1000.0](25000.0, 60000.0]"


In [6]:
# Attach the feature columns to the labelled rows and to the submission rows.
# validate='many_to_one' makes pandas raise MergeError when an id occurs more
# than once in df_features, instead of silently duplicating rows in the left
# join (which would also duplicate rows in the later id-based selections).
train_set = pd.merge(
    df_train, df_features, how='left', on='id', validate='many_to_one'
).set_index('id')
X_submit = pd.merge(
    df_test, df_features, how='left', on='id', validate='many_to_one'
).set_index('id')

# Separate the labelled frame into the feature matrix and the target column.
X = train_set.drop(columns='label')
y = train_set[['label']]  # double brackets keep y as a one-column DataFrame

X.head()

Unnamed: 0_level_0,c1,c2,c1_c2_crossed,bin_n1,bin_n2,n1_n2_crossed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
84531,3.0,97,3.097,"(-1.0, 1000.0]","(25000.0, 60000.0]","(-1.0, 1000.0](25000.0, 60000.0]"
25895,3.0,98,3.098,"(21000.0, 44000.0]","(4000.0, 25000.0]","(21000.0, 44000.0](4000.0, 25000.0]"
71109,3.0,98,3.098,"(-1.0, 1000.0]","(25000.0, 60000.0]","(-1.0, 1000.0](25000.0, 60000.0]"
64666,3.0,97,3.097,"(44000.0, 89000.0]","(25000.0, 60000.0]","(44000.0, 89000.0](25000.0, 60000.0]"
91813,4.0,95,4.095,"(-1.0, 1000.0]","(175000.0, 7500000.0]","(-1.0, 1000.0](175000.0, 7500000.0]"


## Load previous train-test splits to maintain indexes

In [7]:
# Reuse the stored label splits so train/test membership matches the earlier split.
y_train, y_test = (
    pd.read_csv(split_path)
    for split_path in ('./data/410_y_train.csv', './data/410_y_test.csv')
)

In [8]:
# Select the feature rows whose ids appear in the stored training labels,
# in the order of those labels; .copy() detaches the result from X.
train_ids = y_train['id'].values
X_train = X.loc[train_ids].copy()
X_train.head()
y_train.head()

Unnamed: 0_level_0,c1,c2,c1_c2_crossed,bin_n1,bin_n2,n1_n2_crossed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
19849,2.0,96,2.096,"(-1.0, 1000.0]","(60000.0, 175000.0]","(-1.0, 1000.0](60000.0, 175000.0]"
34583,3.0,97,3.097,"(-1.0, 1000.0]","(4000.0, 25000.0]","(-1.0, 1000.0](4000.0, 25000.0]"
86660,3.0,98,3.098,"(44000.0, 89000.0]","(60000.0, 175000.0]","(44000.0, 89000.0](60000.0, 175000.0]"
73039,12.0,97,12.097,"(-1.0, 1000.0]","(-1.0, 4000.0]","(-1.0, 1000.0](-1.0, 4000.0]"
38589,3.0,98,3.098,"(-1.0, 1000.0]","(60000.0, 175000.0]","(-1.0, 1000.0](60000.0, 175000.0]"


Unnamed: 0,id,label
0,19849,9
1,34583,2
2,86660,3
3,73039,12
4,38589,4


In [9]:
# Select the feature rows whose ids appear in the stored test labels,
# in the order of those labels; .copy() detaches the result from X.
test_ids = y_test['id'].values
X_test = X.loc[test_ids].copy()
X_test.head()
y_test.head()

Unnamed: 0_level_0,c1,c2,c1_c2_crossed,bin_n1,bin_n2,n1_n2_crossed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
11020,3.0,98,3.098,"(89000.0, 20367000.0]","(60000.0, 175000.0]","(89000.0, 20367000.0](60000.0, 175000.0]"
78932,9.0,97,9.097,"(89000.0, 20367000.0]","(60000.0, 175000.0]","(89000.0, 20367000.0](60000.0, 175000.0]"
66518,3.0,97,3.097,"(44000.0, 89000.0]","(25000.0, 60000.0]","(44000.0, 89000.0](25000.0, 60000.0]"
61221,3.0,99,3.099,"(-1.0, 1000.0]","(4000.0, 25000.0]","(-1.0, 1000.0](4000.0, 25000.0]"
48750,2.0,97,2.097,"(-1.0, 1000.0]","(25000.0, 60000.0]","(-1.0, 1000.0](25000.0, 60000.0]"


Unnamed: 0,id,label
0,11020,12
1,78932,4
2,66518,3
3,61221,4
4,48750,2


In [10]:
# Show (rows, columns) for the full labelled features, the submission features,
# and the train / test feature splits; each bare expression is displayed in turn.
X.shape
X_submit.shape
X_train.shape
X_test.shape

(100000, 6)

(53240, 6)

(80000, 6)

(20000, 6)

In [15]:
# Preview the first rows of the submission feature frame.
X_submit.head()

Unnamed: 0_level_0,c1,c2,c1_c2_crossed,bin_n1,bin_n2,n1_n2_crossed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
151807,7.0,96,7.096,"(-1.0, 1000.0]","(60000.0, 175000.0]","(-1.0, 1000.0](60000.0, 175000.0]"
118131,1.0,97,1.097,"(-1.0, 1000.0]","(4000.0, 25000.0]","(-1.0, 1000.0](4000.0, 25000.0]"
110921,4.0,98,4.098,"(-1.0, 1000.0]","(25000.0, 60000.0]","(-1.0, 1000.0](25000.0, 60000.0]"
105149,3.0,96,3.096,"(21000.0, 44000.0]","(25000.0, 60000.0]","(21000.0, 44000.0](25000.0, 60000.0]"
143868,3.0,97,3.097,"(89000.0, 20367000.0]","(60000.0, 175000.0]","(89000.0, 20367000.0](60000.0, 175000.0]"


In [11]:
# Persist the three feature frames; index=True writes the id index as a column.
for frame, csv_path in (
    (X_submit, './data/440_X_submit.csv'),
    (X_train, './data/440_X_train.csv'),
    (X_test, './data/440_X_test.csv'),
):
    frame.to_csv(csv_path, index=True)