In [1]:
import pandas as pd
import numpy as np
from scipy.stats import skew, boxcox
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
import itertools

In [2]:
# Categorical columns that are removed from both the train and test frames.
drop_list = [
    'cat15', 'cat18', 'cat20', 'cat21', 'cat22',
    'cat48', 'cat55', 'cat56', 'cat58', 'cat59',
    'cat60', 'cat62', 'cat63', 'cat64', 'cat65',
    'cat68', 'cat69', 'cat77', 'cat78', 'cat85',
]

In [3]:
def mungeskewed(train, test, numeric_feats):
    """Stack train and test and Box-Cox transform the skewed numeric columns.

    Parameters
    ----------
    train : pandas.DataFrame
        Training rows; must contain every column in ``numeric_feats``.
    test : pandas.DataFrame
        Test rows. A placeholder ``loss`` column filled with 0 is added to a
        copy of this frame before stacking; the caller's frame is left
        untouched.
    numeric_feats : list of str
        Names of the numeric columns to check for skewness.

    Returns
    -------
    train_test : pandas.DataFrame
        ``train`` followed by ``test`` with a fresh 0..n-1 index (the original
        index is discarded). Columns whose skewness, measured on ``train``
        only, exceeds 0.25 are shifted by +1 and Box-Cox transformed using all
        stacked rows.
    ntrain : int
        Number of rows in ``train``, i.e. the position where ``test`` starts
        inside ``train_test``.
    """
    ntrain = train.shape[0]

    # Add the placeholder target on a copy so the caller's frame is not mutated.
    test = test.assign(loss=0)
    train_test = pd.concat((train, test)).reset_index(drop=True)

    # Skewness is estimated on the training rows only, ignoring missing values.
    skewed_feats = [
        feat for feat in numeric_feats
        if skew(train[feat].dropna()) > 0.25
    ]

    for feat in skewed_feats:
        # Shift by 1 before Box-Cox (scipy's boxcox requires strictly positive
        # data); the fitted lambda is not needed afterwards.
        train_test[feat], _ = boxcox(train_test[feat] + 1)
    return train_test, ntrain

In [4]:
# Load both raw files, using the first CSV column as the row index.
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("train.csv", "test.csv")
)

In [5]:
# Remove the columns listed in drop_list from both frames.
train = train.drop(labels=drop_list, axis="columns")
test = test.drop(labels=drop_list, axis="columns")

In [6]:
# Discretise cont1 and cont2 into intervals: cont1 uses edges 0.2 apart,
# cont2 uses edges 0.05 apart, both starting at 0.
# NOTE(review): with pd.cut's defaults the intervals are right-closed and the
# lowest edge is excluded, so a value of exactly 0 would become NaN — confirm
# that this cannot occur in the data.
bins1 = list(np.arange(0, 1.2, 0.2))
bins2 = list(np.arange(0, 1.05, 0.05))
for frame in (train, test):
    frame["cont1"] = pd.cut(frame["cont1"], bins1)
    frame["cont2"] = pd.cut(frame["cont2"], bins2)

In [7]:
# Give the two binned columns 'cat'-style names so that the name-based
# 'cont' / 'cat' split below treats them as categorical.
binned_names = {"cont1": "cat001", "cont2": "cat002"}
for frame in (train, test):
    frame.rename(columns=binned_names, inplace=True)

In [8]:
# Split the feature columns by name. The last column is skipped
# (presumably the `loss` target — confirm against the CSV layout).
feature_columns = train.columns[:-1]
numeric_feats = [name for name in feature_columns if 'cont' in name]
categorical_feats = [name for name in feature_columns if 'cat' in name]

In [9]:
# Stack train and test into one frame and de-skew the numeric columns;
# ntrain is the row count of train, used later to split the frames again.
train_test, ntrain = mungeskewed(
    train=train,
    test=test,
    numeric_feats=numeric_feats,
)

In [10]:
# DISABLED: this cell is entirely commented out and does not run.
# If enabled, for every object-dtype column whose number of distinct values
# differs between train and test, it would replace the values that occur in
# only one of the two frames with NaN in train_test.
# for column in list(train.select_dtypes(include=['object']).columns):
#         if train[column].nunique() != test[column].nunique():
#             set_train = set(train[column].unique())
#             set_test = set(test[column].unique())
#             remove_train = set_train - set_test
#             remove_test = set_test - set_train

#             remove = remove_train.union(remove_test)


#             def filter_cat(x):
#                 if x in remove:
#                     return np.nan
#                 return x


#             train_test[column] = train_test[column].apply(lambda x: filter_cat(x), 1)

In [10]:
# Per-column reshaping of the stacked train+test frame. Only train_test is
# touched here: the standalone `test` frame was already copied into train_test
# and is rebuilt from it further down, so transforming `test` directly would
# have no effect on the output.

# Min-max scale each of these columns, then take the square root.
for column in ("cont4", "cont5", "cont8", "cont10", "cont11", "cont12"):
    train_test[column] = np.sqrt(preprocessing.minmax_scale(train_test[column]))

# Min-max scale each of these columns, then apply log(1 + x).
for column in ("cont6", "cont7", "cont9", "cont13"):
    train_test[column] = np.log1p(preprocessing.minmax_scale(train_test[column]))

# Disabled alternative transform for cont14, kept for reference:
#train_test["cont14"] = (np.maximum(train_test["cont14"] - 0.179722, 0) / 0.665122) ** 0.25

In [11]:
# Standardise every numeric feature; the scaler is fitted on the stacked
# train+test rows.
ss = StandardScaler()
numeric_values = train_test[numeric_feats].values
train_test[numeric_feats] = ss.fit_transform(numeric_values)

In [12]:
# Undo the stacking: the first ntrain rows are train, the remainder is test.
train_rows, test_rows = slice(None, ntrain), slice(ntrain, None)
train = train_test.iloc[train_rows].copy()
test = train_test.iloc[test_rows].copy()


In [13]:
# Store the training target as log(1 + loss) and remove the placeholder
# `loss` column from the test rows.
# NOTE: re-running this cell would apply the log transform a second time.
train["loss"] = np.log1p(train["loss"])
test = test.drop(labels="loss", axis="columns")

In [14]:
# Persist the processed frames next to the notebook.
for frame, out_path in ((train, 'self_train.csv'), (test, 'self_test.csv')):
    frame.to_csv(out_path)