# **031:** Train-Test Split for **all** features

## 1. Input: 

```
./data/301_features_txn_ns_cmj_clboh.csv
./data/003_train.csv
./data/004_test.csv
```

## 2. Output:
```
./data/401_X_submit.csv
./data/401_X_train.csv
./data/401_X_test.csv

./data/401_y_train.csv
./data/401_y_test.csv
```

In [1]:
from collections import Counter
from IPython.core.interactiveshell import InteractiveShell
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Echo every bare expression in a cell, not just the last one.
InteractiveShell.ast_node_interactivity = 'all'

# Load the three inputs in one pass: the feature table, then the two id tables
# that are later joined to it on `id`.
df_features, df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/301_features_txn_ns_cmj_clboh.csv',
        './data/003_train.csv',
        './data/004_test.csv',
    )
)

In [2]:
# Attach the features to each id table. A left join keeps every row of
# df_train / df_test; `id` then becomes the index of the result.
train_set = df_train.merge(df_features, how='left', on='id').set_index('id')
X_submit = df_test.merge(df_features, how='left', on='id').set_index('id')

# Separate the target from the predictors; y stays a one-column DataFrame.
y = train_set[['label']]
X = train_set.drop(columns='label')

X.head()

Unnamed: 0_level_0,n4_mean,n5_mean,n6_mean,n7_mean,n4_max,n5_max,n6_max,n7_max,n4_min,n5_min,...,old_cc_label_9,old_cc_label_10,old_cc_label_11,old_cc_label_12,c5_mj,c6_mj,c7_mj,c56_mj,c57_mj,c67_mj
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
84531,0.195438,1.139846,0.3046,0.202061,0.532983,0.861634,0.330873,0.051889,-0.282216,0.589456,...,-5.199338,-5.199338,5.199338,-5.199338,14,-10,48,14-10,1448,-1048
25895,-1.049161,-0.302699,-0.260304,-0.098846,-0.228994,-1.067571,0.492703,-0.253041,0.282216,-5.199338,...,-5.199338,5.199338,-5.199338,-5.199338,0,-10,0,0-10,0,-100
71109,-0.758637,-0.479887,-0.181057,0.366419,0.141637,0.35549,0.330873,0.214803,-0.13971,-5.199338,...,-5.199338,5.199338,-5.199338,-5.199338,0,-10,0,0-10,0,-100
64666,-0.234318,0.292557,0.695447,0.240538,0.149349,1.290649,-0.313866,0.091069,-0.35549,0.589456,...,-5.199338,5.199338,-5.199338,-5.199338,11,-10,57,11-10,1157,-1057
91813,1.227251,0.497574,0.019522,-1.448033,1.225417,1.290649,0.13971,-1.540556,-0.602249,-5.199338,...,-5.199338,-5.199338,-5.199338,5.199338,11,-10,54,11-10,1154,-1054


In [3]:
# Hold out 20% of the rows, stratified on the label, with a fixed seed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=100,
    stratify=y,
)

# Number of distinct label values present in the hold-out split.
len(Counter(y_test.values.ravel()))

13

In [4]:
# Print dtype and non-null count per column of the full feature matrix.
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 84531 to 35548
Data columns (total 39 columns):
n4_mean            100000 non-null float64
n5_mean            100000 non-null float64
n6_mean            100000 non-null float64
n7_mean            100000 non-null float64
n4_max             100000 non-null float64
n5_max             100000 non-null float64
n6_max             100000 non-null float64
n7_max             100000 non-null float64
n4_min             100000 non-null float64
n5_min             100000 non-null float64
n6_min             100000 non-null float64
n7_min             100000 non-null float64
n4_median          100000 non-null float64
n5_median          100000 non-null float64
n6_median          100000 non-null float64
n7_median          100000 non-null float64
n4_std             100000 non-null float64
n5_std             100000 non-null float64
n6_std             100000 non-null float64
n7_std             100000 non-null float64
old_cc_label_0     100000 

In [5]:
# (rows, columns) of each frame. All four values are echoed because
# InteractiveShell.ast_node_interactivity is set to 'all' in the first cell.
X.shape
X_submit.shape
X_train.shape
X_test.shape

(100000, 39)

(53240, 39)

(80000, 39)

(20000, 39)

In [6]:
# Write the feature matrices to disk; index=True keeps the `id` index as the
# first CSV column.
for frame, csv_path in (
    (X_submit, './data/401_X_submit.csv'),
    (X_train, './data/401_X_train.csv'),
    (X_test, './data/401_X_test.csv'),
):
    frame.to_csv(csv_path, index=True)

In [7]:
# Write the label frames to disk; index=True keeps the `id` index as the
# first CSV column.
for frame, csv_path in (
    (y_train, './data/401_y_train.csv'),
    (y_test, './data/401_y_test.csv'),
):
    frame.to_csv(csv_path, index=True)