Install & import packages

In [None]:
!pip install --quiet tqdm

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from scipy.stats import skewtest, boxcox
from sklearn.base import TransformerMixin, BaseEstimator
from tqdm import tqdm

Load the data

In [2]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

In [3]:
print('Train dataset has {} rows and {} columns'.format(train.shape[0], train.shape[1]))
print('Test dataset has {} rows and {} columns'.format(test.shape[0], test.shape[1]))

Train dataset has 595212 rows and 59 columns
Test dataset has 892816 rows and 58 columns


Create a combined dataset to deskew, then split out again

In [4]:
target = train['target']
train.drop('target', axis=1, inplace=True)
train['dataset'] = 'train'
test['dataset'] = 'test'
combined = pd.concat([train, test], axis=0)

In [5]:
combined.shape

(1488028, 59)

In [6]:
combined.columns

Index(['id', 'ps_ind_01', 'ps_ind_02_cat', 'ps_ind_03', 'ps_ind_04_cat',
       'ps_ind_05_cat', 'ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin',
       'ps_ind_09_bin', 'ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin',
       'ps_ind_13_bin', 'ps_ind_14', 'ps_ind_15', 'ps_ind_16_bin',
       'ps_ind_17_bin', 'ps_ind_18_bin', 'ps_reg_01', 'ps_reg_02', 'ps_reg_03',
       'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat',
       'ps_car_05_cat', 'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_08_cat',
       'ps_car_09_cat', 'ps_car_10_cat', 'ps_car_11_cat', 'ps_car_11',
       'ps_car_12', 'ps_car_13', 'ps_car_14', 'ps_car_15', 'ps_calc_01',
       'ps_calc_02', 'ps_calc_03', 'ps_calc_04', 'ps_calc_05', 'ps_calc_06',
       'ps_calc_07', 'ps_calc_08', 'ps_calc_09', 'ps_calc_10', 'ps_calc_11',
       'ps_calc_12', 'ps_calc_13', 'ps_calc_14', 'ps_calc_15_bin',
       'ps_calc_16_bin', 'ps_calc_17_bin', 'ps_calc_18_bin', 'ps_calc_19_bin',
       'ps_calc_20_bin', 'dataset'],

In [7]:
categorical_columns = train.columns[train.columns.str.contains('_cat')]
binary_columns = train.columns[train.columns.str.contains('_bin')]
numeric_cols = [i for i in train.columns 
                if i not in list(categorical_columns) + list(binary_columns) + ['id','target','dataset']
               ]

In [8]:
print('There are {} categorical Columns'.format(len(categorical_columns)))
print('There are {} binary Columns'.format(len(binary_columns)))
print('There are {} numeric Columns'.format(len(numeric_cols)))

There are 14 categorical Columns
There are 17 binary Columns
There are 26 numeric Columns


In [None]:
# fig = plt.figure(figsize=(12,10))
# for i, col in enumerate(numeric_cols):
#     fig.add_subplot(6,5,1+i)
#     combined.sample(frac=.05, random_state=42)[col].plot.hist(bins=20)
#     plt.xlabel(col)
# fig.tight_layout()

In [9]:
skew_results = []

for col in numeric_cols:
    _, pval = skewtest(combined[col])
    skew_results.append({
        'column':col,
        'P-Value':pval,
        'Skewed':pval<.05
    })
    
skew_results_df = pd.DataFrame(skew_results)
cols_to_deskew = list(np.array(numeric_cols)[skew_results_df['Skewed'].values])
skew_results_df

Unnamed: 0,P-Value,Skewed,column
0,0.0,True,ps_ind_01
1,0.0,True,ps_ind_03
2,0.0,True,ps_ind_14
3,0.0,True,ps_ind_15
4,0.0,True,ps_reg_01
5,0.0,True,ps_reg_02
6,0.0,True,ps_reg_03
7,0.0,True,ps_car_11
8,0.0,True,ps_car_12
9,0.0,True,ps_car_13


In [10]:
numeric_df = combined[numeric_cols]

In [11]:
from scipy.stats import boxcox
class BoxcoxTransformer(BaseEstimator, TransformerMixin):
    
    '''
    Takes in a DataFrame (numeric only) and boxcox-deskews all the skewed data.
    '''
    
    def __init__(self):
        pass
        self.lambdas = {}
        self.col_mins = {}
        
    def fit(self, X, y=None):
        for col in tqdm(X.columns):
            if skewtest(X[col]).pvalue < .05:
                self.col_mins[col] =-1* X[col].min() + 1E-4
                _, lbda = boxcox(X[col] + self.col_mins[col])
                self.lambdas[col]=lbda
        return self
    
    def transform(self, X, y=None):
        X_copy = X.copy()
        for col, lbda in self.lambdas.items():
            X_copy[col] = boxcox(X[col] + self.col_mins[col], self.lambdas[col])
        return X_copy

In [12]:
bct = BoxcoxTransformer()

In [13]:
combined_deskewed_numeric = bct.fit_transform(numeric_df)

100%|██████████| 26/26 [00:48<00:00,  1.87s/it]


In [None]:
# combined_deskewed_numeric.hist(figsize=(12,12), bins=20)
# plt.tight_layout()
# plt.show()

In [14]:
categorical_columns.shape

(14,)

Plot categorical features against target

In [15]:
# fig = plt.figure(figsize=(12,8))
# for i, col in tqdm(enumerate(categorical_columns)):
#     fig.add_subplot(4,4,1+i)
#     pd.concat([train[col], target], axis=1).groupby('target').plot.barh()
# fig.tight_layout()
# fig.savefig('../assets/categorical_cols_target_bar.png')

To do:
* get_dummies for the categorical columns
* re-split the train and test sets

In [16]:
categorical_columns

Index(['ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat', 'ps_car_01_cat',
       'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat', 'ps_car_05_cat',
       'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_08_cat', 'ps_car_09_cat',
       'ps_car_10_cat', 'ps_car_11_cat'],
      dtype='object')

In [17]:
binary_columns

Index(['ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin', 'ps_ind_09_bin',
       'ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin', 'ps_ind_13_bin',
       'ps_ind_16_bin', 'ps_ind_17_bin', 'ps_ind_18_bin', 'ps_calc_15_bin',
       'ps_calc_16_bin', 'ps_calc_17_bin', 'ps_calc_18_bin', 'ps_calc_19_bin',
       'ps_calc_20_bin'],
      dtype='object')

In [18]:
cols_for_dummies = list(categorical_columns) + list(binary_columns)

In [19]:
cat_df = combined[cols_for_dummies]

In [20]:
for col in cat_df.columns:
    cat_df[col] = cat_df[col].astype('category')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [22]:
combined_dummies = pd.get_dummies(cat_df)

In [25]:
combined_final = pd.concat([numeric_df, combined_dummies], axis=1)

In [28]:
train_final = combined_final[combined['dataset'] == 'train']
test_final = combined_final[combined['dataset'] == 'test']

In [29]:
train_final.to_pickle('../data/train_final.pkl')
test_final.to_pickle('../data/test_final.pkl')