In [1]:
import pandas as pd

from collections import Counter
from IPython.core.interactiveshell import InteractiveShell
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import make_scorer
from sklearn.preprocessing import QuantileTransformer
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn import ensemble, linear_model
from xgboost.sklearn import XGBRegressor

from utils import metrics

InteractiveShell.ast_node_interactivity = 'all'

df_features = pd.read_csv('./data/101_agg_ns_cmj_clboh.csv')
df_train = pd.read_csv('./data/003_train.csv')
df_test  = pd.read_csv('./data/004_test.csv')

df_features = df_features.set_index('id')
df_features.head()

Unnamed: 0_level_0,n4_mean,n5_mean,n6_mean,n7_mean,n4_max,n5_max,n6_max,n7_max,n4_min,n5_min,...,old_cc_label_6,old_cc_label_7,old_cc_label_8,old_cc_label_9,old_cc_label_10,old_cc_label_11,old_cc_label_12,c5_mj,c6_mj,c7_mj
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,851.522436,101.511485,488.38515,258.409989,114300,151,997,268,-1000,93,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11,-10,73
2,34404.334244,103.557595,187.894572,3530.382273,621000,192,996,3562,-10600,93,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,11,-10,45
3,2254.716981,101.961461,62.503412,1595.431955,63600,189,997,1877,-400,93,...,0.0,0.0,0.0,0.0,0.457246,0.0,0.0,11,-10,73
4,33061.520631,101.491887,427.385257,2825.0,1616400,114,997,2825,-327800,93,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,11,-10,93
5,10487.065164,102.423322,355.121999,4353.0,540400,161,998,4353,-598100,93,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,11,-10,93


In [3]:
df_features.shape

(153240, 36)

### Exclude categorical features

In [4]:
columns_scale = [col for col in df_features.columns if ('_mj' not in col)]
columns_scale

['n4_mean',
 'n5_mean',
 'n6_mean',
 'n7_mean',
 'n4_max',
 'n5_max',
 'n6_max',
 'n7_max',
 'n4_min',
 'n5_min',
 'n6_min',
 'n7_min',
 'n4_median',
 'n5_median',
 'n6_median',
 'n7_median',
 'n4_std',
 'n5_std',
 'n6_std',
 'n7_std',
 'old_cc_label_0',
 'old_cc_label_1',
 'old_cc_label_2',
 'old_cc_label_3',
 'old_cc_label_4',
 'old_cc_label_5',
 'old_cc_label_6',
 'old_cc_label_7',
 'old_cc_label_8',
 'old_cc_label_9',
 'old_cc_label_10',
 'old_cc_label_11',
 'old_cc_label_12']

In [5]:
qt = QuantileTransformer(n_quantiles=10, random_state=100, output_distribution='normal')

df_features[columns_scale] = qt.fit_transform(df_features[columns_scale])
df_features.head()

Unnamed: 0_level_0,n4_mean,n5_mean,n6_mean,n7_mean,n4_max,n5_max,n6_max,n7_max,n4_min,n5_min,...,old_cc_label_6,old_cc_label_7,old_cc_label_8,old_cc_label_9,old_cc_label_10,old_cc_label_11,old_cc_label_12,c5_mj,c6_mj,c7_mj
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-1.222313,-1.510595,1.287874,-1.742949,0.938839,-1.271953,1.701288,-1.805353,-0.551638,-5.199338,...,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,11,-10,73
2,1.228235,-1.392355,-1.239499,0.954444,1.225369,0.76471,1.501086,0.773187,-1.220722,-5.199338,...,-5.199338,-5.199338,-5.199338,-5.199338,5.199338,-5.199338,-5.199338,11,-10,45
3,-0.064405,-1.482783,-1.801821,-0.440887,0.571145,0.556634,1.701288,-0.392031,-0.35549,-5.199338,...,-5.199338,-5.199338,-5.199338,-5.199338,-0.564283,-5.199338,-5.199338,11,-10,73
4,1.227836,-1.511833,1.043858,0.453349,1.23563,-1.85197,1.701288,0.299707,-1.249924,-5.199338,...,-5.199338,-5.199338,-5.199338,-5.199338,5.199338,-5.199338,-5.199338,11,-10,93
5,1.221151,-1.455381,0.284868,1.250574,1.224543,-0.876143,2.009875,1.234766,-1.275677,-5.199338,...,-5.199338,5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,11,-10,93


In [6]:
cat_cols = [col for col in df_features.columns if ('_mj' in col)]
cat_cols

['c5_mj', 'c6_mj', 'c7_mj']

In [7]:
df_features['c56_mj'] = df_features['c5_mj'].astype(str) + df_features['c6_mj'].astype(str)
df_features['c57_mj'] = df_features['c5_mj'].astype(str) + df_features['c7_mj'].astype(str)
df_features['c67_mj'] = df_features['c6_mj'].astype(str) + df_features['c7_mj'].astype(str)

In [9]:
df_features.head()

Unnamed: 0_level_0,n4_mean,n5_mean,n6_mean,n7_mean,n4_max,n5_max,n6_max,n7_max,n4_min,n5_min,...,old_cc_label_9,old_cc_label_10,old_cc_label_11,old_cc_label_12,c5_mj,c6_mj,c7_mj,c56_mj,c57_mj,c67_mj
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-1.222313,-1.510595,1.287874,-1.742949,0.938839,-1.271953,1.701288,-1.805353,-0.551638,-5.199338,...,-5.199338,-5.199338,-5.199338,-5.199338,11,-10,73,11-10,1173,-1073
2,1.228235,-1.392355,-1.239499,0.954444,1.225369,0.76471,1.501086,0.773187,-1.220722,-5.199338,...,-5.199338,5.199338,-5.199338,-5.199338,11,-10,45,11-10,1145,-1045
3,-0.064405,-1.482783,-1.801821,-0.440887,0.571145,0.556634,1.701288,-0.392031,-0.35549,-5.199338,...,-5.199338,-0.564283,-5.199338,-5.199338,11,-10,73,11-10,1173,-1073
4,1.227836,-1.511833,1.043858,0.453349,1.23563,-1.85197,1.701288,0.299707,-1.249924,-5.199338,...,-5.199338,5.199338,-5.199338,-5.199338,11,-10,93,11-10,1193,-1093
5,1.221151,-1.455381,0.284868,1.250574,1.224543,-0.876143,2.009875,1.234766,-1.275677,-5.199338,...,-5.199338,-5.199338,-5.199338,-5.199338,11,-10,93,11-10,1193,-1093


In [11]:
df_features[['c5_mj', 'c6_mj', 'c7_mj', 'c56_mj', 'c57_mj', 'c67_mj']] = df_features[['c5_mj', 'c6_mj', 'c7_mj', 'c56_mj', 'c57_mj', 'c67_mj']].astype(str)
df_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 153240 entries, 1 to 153240
Data columns (total 39 columns):
n4_mean            153240 non-null float64
n5_mean            153240 non-null float64
n6_mean            153240 non-null float64
n7_mean            153240 non-null float64
n4_max             153240 non-null float64
n5_max             153240 non-null float64
n6_max             153240 non-null float64
n7_max             153240 non-null float64
n4_min             153240 non-null float64
n5_min             153240 non-null float64
n6_min             153240 non-null float64
n7_min             153240 non-null float64
n4_median          153240 non-null float64
n5_median          153240 non-null float64
n6_median          153240 non-null float64
n7_median          153240 non-null float64
n4_std             153240 non-null float64
n5_std             153240 non-null float64
n6_std             153240 non-null float64
n7_std             153240 non-null float64
old_cc_label_0     153240 non

In [12]:
df_features.columns.values[-6:]

array(['c5_mj', 'c6_mj', 'c7_mj', 'c56_mj', 'c57_mj', 'c67_mj'],
      dtype=object)

In [13]:
df_features.to_csv('./data/301_features_txn_ns_cmj_clboh.csv')

In [14]:
joblib.dump(qt, 'M_336/301_qt_normal.joblib')

['M_336/301_qt_normal.joblib']