In [1]:
import os

import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
def change_dtypes(df):
    """
    Downcast column dtypes to reduce the memory size of a dataframe.

    * object columns with at least one repeated value become ``category``
    * float64 columns become float32 (values keep float32 precision only)
    * int64 columns become int32, but only when every value fits into int32;
      otherwise the column is left untouched so that no value wraps around

    The frame is modified in place and also returned. Memory usage before and
    after the conversion is printed.

    :param df: dataframe
    :return df: dataframe (the same object that was passed in)
    """
    memory = df.memory_usage().sum() / 10**6
    print("Memory usage before changing types %0.2f MB" % memory)

    int32_info = np.iinfo(np.int32)
    for col in df.columns:
        series = df[col]
        dtype = series.dtype

        if (dtype == "object") and (series.nunique() < df.shape[0]):
            df[col] = series.astype("category")

        # Explicit 64-bit dtypes: the builtin ``float``/``int`` aliases map to
        # platform dependent widths for integers.
        elif dtype == np.float64:
            df[col] = series.astype(np.float32)

        elif dtype == np.int64:
            # astype(int32) silently wraps out-of-range values, so only
            # downcast when the whole column fits.
            fits = series.empty or (
                series.min() >= int32_info.min and series.max() <= int32_info.max
            )
            if fits:
                df[col] = series.astype(np.int32)

    memory = df.memory_usage().sum() / 10 ** 6
    print("Memory usage after changing types %0.2f MB" % memory)
    return df


def load_csv(filename):
    """
    Read a CSV file into a dataframe and shrink its column dtypes.

    :param filename: path handed to ``pd.read_csv``
    :return: dataframe returned by ``change_dtypes``
    """
    return change_dtypes(pd.read_csv(filename))

In [3]:
class CollinearColumnRemover(BaseEstimator, TransformerMixin):
    """
    Drop numeric/bool columns that are strongly correlated with an earlier column.

    Constructor arguments are stored unchanged under their own names, which is
    what ``BaseEstimator.get_params`` (and therefore ``repr`` and ``clone``)
    looks up.
    """

    def __init__(self, threshold, col_regex=None, exclude_cols=None):
        """
        :param threshold: float in [0, 1], if two columns have correlation greater than threshold
                          one of them will be removed
        :param col_regex: str, regular expression to select columns
        :param exclude_cols: iterable of column names that are never compared
                             nor dropped; None means no exclusions
        """
        self.threshold = threshold
        self.col_regex = col_regex
        self.exclude_cols = exclude_cols

    def _collinear_columns(self, df, threshold):
        """
        Return the columns to drop, in the order in which they were detected.

        Columns are scanned left to right; a column is dropped when its absolute
        correlation with an earlier, still-kept column exceeds ``threshold``.
        The absolute correlation matrix is kept in ``self._corr_mat``.
        """
        if self.col_regex is None:
            df_sel = df
        else:
            df_sel = df.filter(regex=self.col_regex)
        df_sel = df_sel.select_dtypes(["number", "bool"])

        excluded = set() if self.exclude_cols is None else set(self.exclude_cols)
        all_cols = [col for col in df_sel.columns.to_list() if col not in excluded]
        # Cast only the columns that are actually compared.
        df_sel = df_sel[all_cols].astype("float32")
        ncols = len(all_cols)

        corr_mat = df_sel.corr().abs()
        self._corr_mat = corr_mat
        corr_values = corr_mat.to_numpy()

        collin_cols = []          # ordered result
        dropped_pos = set()       # O(1) membership test for the double loop
        for i in range(ncols - 1):
            if i in dropped_pos:
                continue

            for j in range(i + 1, ncols):
                if j in dropped_pos:
                    continue

                # NaN correlations (e.g. constant columns) compare False here.
                if corr_values[i, j] > threshold:
                    dropped_pos.add(j)
                    collin_cols.append(all_cols[j])

        return collin_cols

    def fit(self, df, y=None):
        """
        Determine the collinear columns of ``df``.

        :param df: dataframe
        :param y: ignored; accepted so the transformer can be called as fit(X, y)
        :return: self
        """
        self._collin_cols = self._collinear_columns(df, self.threshold)
        return self

    def transform(self, df):
        """
        Return ``df`` without the columns found in ``fit``.

        Columns that were found in ``fit`` but are absent from ``df`` are
        reported with a warning and skipped.
        """
        present = set(df.columns.to_list())
        nonexist_cols = [col for col in self._collin_cols if col not in present]
        if len(nonexist_cols) > 0:
            print("WARNING: These collinear cols to be droped do not exist in df:", nonexist_cols)

        droped_col = [col for col in self._collin_cols if col in present]
        print("Number of columns droped due to collinearity:", len(droped_col))
        return df.drop(droped_col, axis="columns")


class SameCatColsRemover(BaseEstimator, TransformerMixin):
    """
    Drop category/object columns that (almost) duplicate an earlier column.

    Two columns count as the same when the fraction of rows in which they agree
    is greater than ``threshold``. Rows where both columns are missing count as
    agreeing, so duplicated columns that contain NaN are detected as well.
    """

    def __init__(self, threshold=0.99):
        """
        :param threshold: float, minimum (exclusive) fraction of equal rows
        """
        # Stored under its own name so BaseEstimator.get_params / repr / clone work.
        self.threshold = threshold

    def fit(self, df_train, y=None):
        """
        Find the duplicated categorical columns of ``df_train``.

        :param df_train: dataframe
        :param y: ignored; accepted so the transformer can be called as fit(X, y)
        :return: self
        """
        cols = df_train.select_dtypes(["category", "object"]).columns.to_list()
        ncols = len(cols)

        self._cols_to_drop = []
        dropped = set()
        for i in range(ncols - 1):
            col_i = cols[i]
            if col_i in dropped:
                continue

            # Converted once per outer iteration instead of once per pair.
            ser_i = df_train[col_i].astype("object")
            na_i = ser_i.isna()

            for j in range(i + 1, ncols):
                col_j = cols[j]
                if col_j in dropped:
                    continue

                ser_j = df_train[col_j].astype("object")
                # ``==`` is False for NaN vs NaN, so add the both-missing rows.
                same = (ser_i == ser_j) | (na_i & ser_j.isna())
                if same.mean() > self.threshold:
                    dropped.add(col_j)
                    self._cols_to_drop.append(col_j)

        return self

    def transform(self, df):
        """Return ``df`` without the columns found in ``fit``."""
        print("Number of same columns droped:", len(self._cols_to_drop))
        return df.drop(self._cols_to_drop, axis="columns")


In [4]:
class NumColsQCuter(BaseEstimator, TransformerMixin):
    """
    Encode numeric columns by the quantile bin each value falls into.

    Bin edges are the training quantiles, with the outer edges opened to
    -inf/+inf. Columns whose edges are not all distinct are skipped.
    Constructor arguments are stored unchanged under their own names so that
    ``BaseEstimator.get_params`` / ``repr`` / ``clone`` work.
    """

    def __init__(self, labels=None, exclude_cols=None,
                 nunique_min=1000, keep_old=True,
                 output_type=np.int32):
        """
        :param labels: unique bin labels, one per bin; None means [1, 2, 3, 4]
        :param exclude_cols: extra columns to skip ("ID_code" and "target" are always skipped)
        :param nunique_min: only columns with at least this many distinct values are binned
        :param keep_old: if True the input columns are returned next to the new ones
        :param output_type: dtype the bin labels are cast to
        """
        self.labels = labels
        self.exclude_cols = exclude_cols
        self.nunique_min = nunique_min
        self.keep_old = keep_old
        self.output_type = output_type

        assert self._isunique(self._resolve_labels()), "labels must be unique"

    def _resolve_labels(self):
        """Return the bin labels as a list, applying the default."""
        if self.labels is None:
            return [1, 2, 3, 4]
        return list(self.labels)

    def _isunique(self, x):
        return len(np.unique(x)) == len(x)

    def fit(self, df_train, y=None):
        """
        Compute the quantile bin edges of every eligible numeric column.

        :param df_train: dataframe
        :param y: ignored; accepted so the transformer can be called as fit(X, y)
        :return: self
        """
        self._labels = self._resolve_labels()
        self._nbins = len(self._labels)
        self._suffix = "_%dQCUT" % self._nbins

        self._exclude_cols = ["ID_code", "target"]
        if self.exclude_cols is not None:
            self._exclude_cols = self._exclude_cols + list(self.exclude_cols)

        # Interior quantiles only; the outer edges are replaced by +-inf below.
        quantiles = np.linspace(0, 1, self._nbins + 1)[1: -1]

        sel_cols = df_train.select_dtypes(["number"]).columns.to_list()
        sel_cols = [col for col in sel_cols if col not in self._exclude_cols]
        sel_cols = [col for col in sel_cols if df_train[col].nunique() >= self.nunique_min]

        self._bins = {}
        for col in sel_cols:
            bins = df_train[col].quantile(quantiles).values
            bins = np.array([-np.inf] + list(bins) + [np.inf])

            # pd.cut needs strictly distinct edges; skip the column otherwise.
            if self._isunique(bins):
                self._bins[col] = bins

        return self

    def transform(self, df):
        """
        Add one ``<col><suffix>`` column per fitted column.

        NOTE(review): missing values in a fitted column presumably make the cast
        to an integer ``output_type`` fail -- confirm before using on data with NaN.

        :return: input columns plus new columns if ``keep_old``, else only the new columns
        """
        new_data = {}
        for col, bins in self._bins.items():
            binned = pd.cut(df[col], bins, labels=self._labels)
            new_data[col + self._suffix] = binned.astype(self.output_type)

        # Build all new columns at once instead of inserting them one by one,
        # which fragments the frame when there are many columns.
        new_df = pd.DataFrame(new_data, index=df.index)
        if not self.keep_old:
            return new_df
        # A pre-existing column with the same name is replaced by the new one.
        kept = df.drop(columns=[col for col in new_df.columns if col in df.columns])
        return pd.concat([kept, new_df], axis=1)
        

class CatValueCounter(BaseEstimator, TransformerMixin):
    """
    Encode category/object columns by the training frequency of each value.

    Constructor arguments are stored unchanged under their own names so that
    ``BaseEstimator.get_params`` / ``repr`` / ``clone`` work.
    """

    def __init__(self, keep_old=True, exclude_cols=None, suffix="_VALCOUNT"):
        """
        :param keep_old: if True the input columns are returned next to the new ones
        :param exclude_cols: extra columns to skip ("ID_code" and "target" are always skipped)
        :param suffix: appended to a column name to name its frequency column
        """
        self.keep_old = keep_old
        self.exclude_cols = exclude_cols
        self.suffix = suffix

    def fit(self, df_train, y=None):
        """
        Store the normalized value counts of every eligible column.

        :param df_train: dataframe
        :param y: ignored; accepted so the transformer can be called as fit(X, y)
        :return: self
        """
        self._exclude_cols = ["ID_code", "target"]
        if self.exclude_cols is not None:
            self._exclude_cols = self._exclude_cols + list(self.exclude_cols)

        sel_cols = df_train.select_dtypes(["category", "object"]).columns.to_list()
        sel_cols = [col for col in sel_cols if col not in self._exclude_cols]

        self._val_counts = {
            col: df_train[col].value_counts(normalize=True) for col in sel_cols
        }
        return self

    def transform(self, df):
        """
        Map every fitted column to its training frequency; unseen or missing
        values get 0.

        :return: input columns plus new columns if ``keep_old``, else only the new columns
        """
        new_data = {}
        for col, val_count in self._val_counts.items():
            freq = df[col].map(val_count)
            # Mapping a categorical column can yield a categorical result, on
            # which fillna(0) is rejected; cast to float first.
            new_data[col + self.suffix] = freq.astype(np.float64).fillna(0)

        # Build all new columns at once instead of inserting them one by one.
        new_df = pd.DataFrame(new_data, index=df.index)
        if not self.keep_old:
            return new_df
        # A pre-existing column with the same name is replaced by the new one.
        kept = df.drop(columns=[col for col in new_df.columns if col in df.columns])
        return pd.concat([kept, new_df], axis=1)


class NumValueCounter(BaseEstimator, TransformerMixin):
    """
    Encode numeric columns by the training frequency of their equal-width bin.

    Constructor arguments are stored unchanged under their own names so that
    ``BaseEstimator.get_params`` / ``repr`` / ``clone`` work.
    """

    def __init__(self, nunique_min=1000, bins=100, keep_old=True, exclude_cols=None, suffix="_VALCOUNT"):
        """
        :param nunique_min: only columns with at least this many distinct values are encoded
        :param bins: number of equal-width bins
        :param keep_old: if True the input columns are returned next to the new ones
        :param exclude_cols: extra columns to skip ("ID_code" and "target" are always skipped)
        :param suffix: appended to a column name to name its frequency column
        """
        self.nunique_min = nunique_min
        self.bins = bins
        self.keep_old = keep_old
        self.exclude_cols = exclude_cols
        self.suffix = suffix

    def _isunique(self, x):
        return len(np.unique(x)) == len(x)

    def _cal_bin_edges(self, ser):
        """Equal-width edges between min and max, outer edges opened to +-inf."""
        edges = np.linspace(ser.min(), ser.max(), self.bins + 1)
        edges[0] = -np.inf
        edges[-1] = np.inf
        return edges

    def fit(self, df_train, y=None):
        """
        Compute bin edges and per-bin frequencies of every eligible column.

        Columns whose edges are not all distinct (e.g. a constant column) are
        skipped, as the other binning transformers in this notebook do.

        :param df_train: dataframe
        :param y: ignored; accepted so the transformer can be called as fit(X, y)
        :return: self
        """
        self._exclude_cols = ["ID_code", "target"]
        if self.exclude_cols is not None:
            self._exclude_cols = self._exclude_cols + list(self.exclude_cols)

        sel_cols = df_train.select_dtypes(["number"]).columns.to_list()
        sel_cols = [col for col in sel_cols if col not in self._exclude_cols]
        sel_cols = [col for col in sel_cols if df_train[col].nunique() >= self.nunique_min]

        self._val_counts = {}
        self._bin_edges = {}
        bin_labels = list(range(self.bins))

        for col in sel_cols:
            edges = self._cal_bin_edges(df_train[col])
            if not self._isunique(edges):
                continue
            self._bin_edges[col] = edges

            discrete_ser = pd.cut(df_train[col], edges, labels=bin_labels)
            self._val_counts[col] = discrete_ser.value_counts(normalize=True)

        return self

    def transform(self, df):
        """
        Map every fitted column to the training frequency of its bin; missing
        values get 0.

        :return: input columns plus new columns if ``keep_old``, else only the new columns
        """
        new_data = {}
        bin_labels = list(range(self.bins))

        for col, edges in self._bin_edges.items():
            discrete_ser = pd.cut(df[col], edges, labels=bin_labels)
            freq = discrete_ser.map(self._val_counts[col])
            # The mapped categorical may itself be categorical, on which
            # fillna(0) is rejected; cast to float first.
            new_data[col + self.suffix] = freq.astype(np.float64).fillna(0)

        # Build all new columns at once instead of inserting them one by one.
        new_df = pd.DataFrame(new_data, index=df.index)
        if not self.keep_old:
            return new_df
        # A pre-existing column with the same name is replaced by the new one.
        kept = df.drop(columns=[col for col in new_df.columns if col in df.columns])
        return pd.concat([kept, new_df], axis=1)


In [5]:
class TargetMeanFromNumCols(BaseEstimator, TransformerMixin):
    """
    Encode numeric columns by the mean training target of their quantile bin.

    Constructor arguments are stored unchanged under their own names so that
    ``BaseEstimator.get_params`` / ``repr`` / ``clone`` work.
    """

    def __init__(self, nunique_min=1000, bins=10, keep_old=True, exclude_cols=None, suffix="_TARGETMEANNUM"):
        """
        :param nunique_min: only columns with at least this many distinct values are encoded
        :param bins: number of quantile bins
        :param keep_old: if True the input columns are returned next to the new ones
        :param exclude_cols: extra columns to skip ("ID_code" and "target" are always skipped)
        :param suffix: appended to a column name to name its encoded column
        """
        self.nunique_min = nunique_min
        self.bins = bins
        self.keep_old = keep_old
        self.exclude_cols = exclude_cols
        self.suffix = suffix

    def _cal_bin_edges(self, ser):
        """Interior quantiles of ``ser`` framed by -inf and +inf."""
        quantiles = np.linspace(0, 1, self.bins + 1)[1: -1]
        edges = ser.quantile(quantiles).values
        return np.array([-np.inf] + list(edges) + [np.inf])

    def _isunique(self, x):
        return len(np.unique(x)) == len(x)

    def fit(self, df_train, y_train):
        """
        Compute bin edges and the per-bin target mean of every eligible column.

        :param df_train: dataframe
        :param y_train: pd.Series with the same index as ``df_train``
        :return: self
        """
        assert isinstance(y_train, pd.Series)
        assert df_train.shape[0] == y_train.shape[0]
        assert (df_train.index == y_train.index).all()

        # Fallback for empty bins and for values that cannot be binned.
        self._y_mean = y_train.mean()

        self._exclude_cols = ["ID_code", "target"]
        if self.exclude_cols is not None:
            self._exclude_cols = self._exclude_cols + list(self.exclude_cols)

        sel_cols = df_train.select_dtypes(["number"]).columns.to_list()
        sel_cols = [col for col in sel_cols if col not in self._exclude_cols]
        sel_cols = [col for col in sel_cols if df_train[col].nunique() >= self.nunique_min]

        self._target_means = {}
        self._bin_edges = {}
        bin_labels = list(range(self.bins))

        for col in sel_cols:
            edges = self._cal_bin_edges(df_train[col])

            # pd.cut needs strictly distinct edges; skip the column otherwise.
            if not self._isunique(edges):
                continue
            self._bin_edges[col] = edges

            bin_label = pd.cut(df_train[col], edges, labels=bin_labels)
            tmp_df = pd.DataFrame({"bin_label": bin_label, "target": y_train})

            # observed=False keeps empty bins in the result (as NaN) regardless
            # of the pandas default for categorical groupers.
            means = tmp_df.groupby(["bin_label"], observed=False)["target"].mean()
            self._target_means[col] = means.fillna(self._y_mean)
        return self

    def transform(self, df):
        """
        Map every fitted column to the target mean of its bin; missing values
        get the overall training target mean.

        :return: input columns plus new columns if ``keep_old``, else only the new columns
        """
        new_data = {}
        bin_labels = list(range(self.bins))

        for col, edges in self._bin_edges.items():
            bin_label = pd.cut(df[col], edges, labels=bin_labels)
            encoded = bin_label.map(self._target_means[col]).astype(np.float32)
            new_data[col + self.suffix] = encoded.fillna(self._y_mean)

        # Build all new columns at once instead of inserting them one by one.
        new_df = pd.DataFrame(new_data, index=df.index)
        if not self.keep_old:
            return new_df
        # A pre-existing column with the same name is replaced by the new one.
        kept = df.drop(columns=[col for col in new_df.columns if col in df.columns])
        return pd.concat([kept, new_df], axis=1)


class WeightOfEvidenceNum(BaseEstimator, TransformerMixin):
    """
    Encode numeric columns by a per-bin log-odds score of the training target.

    For every quantile bin the score is ``100 * log((1 - m) / m)`` where ``m``
    is the mean target inside the bin (assumes a 0/1 target -- TODO confirm).
    Constructor arguments are stored unchanged under their own names so that
    ``BaseEstimator.get_params`` / ``repr`` / ``clone`` work.
    """

    def __init__(self, nunique_min=1000, bins=10, keep_old=True, exclude_cols=None, suffix="_WOENUM"):
        """
        :param nunique_min: only columns with at least this many distinct values are encoded
        :param bins: number of quantile bins
        :param keep_old: if True the input columns are returned next to the new ones
        :param exclude_cols: extra columns to skip ("ID_code" and "target" are always skipped)
        :param suffix: appended to a column name to name its encoded column
        """
        self.nunique_min = nunique_min
        self.bins = bins
        self.keep_old = keep_old
        self.exclude_cols = exclude_cols
        self.suffix = suffix

    def _cal_bin_edges(self, ser):
        """Interior quantiles of ``ser`` framed by -inf and +inf."""
        quantiles = np.linspace(0, 1, self.bins + 1)[1: -1]
        edges = ser.quantile(quantiles).values
        return np.array([-np.inf] + list(edges) + [np.inf])

    def _isunique(self, x):
        return len(np.unique(x)) == len(x)

    def fit(self, df_train, y_train):
        """
        Compute bin edges and the per-bin score of every eligible column.

        :param df_train: dataframe
        :param y_train: pd.Series with the same index as ``df_train``
        :return: self
        """
        assert isinstance(y_train, pd.Series)
        assert df_train.shape[0] == y_train.shape[0]
        assert (df_train.index == y_train.index).all()

        self._exclude_cols = ["ID_code", "target"]
        if self.exclude_cols is not None:
            self._exclude_cols = self._exclude_cols + list(self.exclude_cols)

        sel_cols = df_train.select_dtypes(["number"]).columns.to_list()
        sel_cols = [col for col in sel_cols if col not in self._exclude_cols]
        sel_cols = [col for col in sel_cols if df_train[col].nunique() >= self.nunique_min]

        self._woes = {}
        self._bin_edges = {}
        bin_labels = list(range(self.bins))

        for col in sel_cols:
            edges = self._cal_bin_edges(df_train[col])

            # pd.cut needs strictly distinct edges; skip the column otherwise.
            if not self._isunique(edges):
                continue
            self._bin_edges[col] = edges

            bin_label = pd.cut(df_train[col], edges, labels=bin_labels)
            tmp_df = pd.DataFrame({"bin_label": bin_label, "target": y_train})

            # observed=False keeps empty bins in the result (as NaN) regardless
            # of the pandas default for categorical groupers.
            bad_dist = tmp_df.groupby(["bin_label"], observed=False)["target"].mean()
            good_dist = 1. - bad_dist

            # Bins whose mean target is exactly 0 or 1 give +inf / -inf here.
            with np.errstate(divide="ignore"):
                woe = np.log(good_dist / bad_dist) * 100
            woe = woe.fillna(0.)

            # Pull infinite scores back to the most extreme finite score on the
            # same side, so a pure bin never ranks as less extreme than a mixed
            # one. With no finite score at all, fall back to 0.
            finite = woe[np.isfinite(woe)]
            if len(finite) > 0:
                woe = woe.clip(lower=finite.min(), upper=finite.max())
            else:
                woe = woe.replace([-np.inf, np.inf], 0.)
            self._woes[col] = woe
        return self

    def transform(self, df):
        """
        Map every fitted column to the score of its bin; missing values get 0.

        :return: input columns plus new columns if ``keep_old``, else only the new columns
        """
        new_data = {}
        bin_labels = list(range(self.bins))

        for col, edges in self._bin_edges.items():
            bin_label = pd.cut(df[col], edges, labels=bin_labels)
            encoded = bin_label.map(self._woes[col]).astype(np.float32)
            new_data[col + self.suffix] = encoded.fillna(0.)

        # Build all new columns at once instead of inserting them one by one.
        new_df = pd.DataFrame(new_data, index=df.index)
        if not self.keep_old:
            return new_df
        # A pre-existing column with the same name is replaced by the new one.
        kept = df.drop(columns=[col for col in new_df.columns if col in df.columns])
        return pd.concat([kept, new_df], axis=1)


In [6]:
def two_random_disjoint_subsets(x, subset_size, randomstate=None):
    """
    Draw two disjoint random subsets of equal size from ``x``.

    A shuffled copy of ``x`` is made (the caller's sequence is left untouched);
    the first subset is its first ``subset_size`` items, the second its last
    ``subset_size`` items.

    :param x: sequence to draw from
    :param subset_size: size of each subset, at most ``len(x) // 2``
    :param randomstate: seed passed to ``np.random.RandomState``; None is unseeded
    :return: tuple (first_subset, second_subset)
    """
    assert subset_size <= len(x) // 2
    rndstate = np.random.RandomState(randomstate)

    # Shuffle a copy so the caller's data keeps its order.
    shuffled = x.copy() if isinstance(x, np.ndarray) else list(x)
    rndstate.shuffle(shuffled)

    # ``shuffled[-0:]`` would be the whole sequence, so slice from an explicit
    # start position to get an empty tail for subset_size == 0.
    tail_start = len(shuffled) - subset_size
    return shuffled[:subset_size], shuffled[tail_start:]


class RandomInteractColsExtractor(BaseEstimator, TransformerMixin):
    """
    Create pairwise product columns between two random disjoint column subsets.

    Constructor arguments are stored unchanged under their own names so that
    ``BaseEstimator.get_params`` / ``repr`` / ``clone`` work.
    """

    def __init__(self, subset_size=15, keep_old=True, randomstate=None):
        """
        :param subset_size: number of columns in each of the two subsets
        :param keep_old: if True the input columns are returned next to the new ones
        :param randomstate: seed for the column draw; None gives a different draw on every fit
        """
        self.subset_size = subset_size
        self.keep_old = keep_old
        self.randomstate = randomstate

    def fit(self, df_train, y=None):
        """
        Draw the two column subsets from the columns of ``df_train``.

        :param df_train: dataframe
        :param y: ignored; accepted so the transformer can be called as fit(X, y)
        :return: self
        """
        cols = df_train.columns.to_list()
        self._sel_cols = two_random_disjoint_subsets(
            cols, self.subset_size, randomstate=self.randomstate
        )
        return self

    def transform(self, df):
        """
        Add one ``<c1>_<c2>_INTERACT`` product column per pair (c1 from the
        first subset, c2 from the second).

        :return: input columns plus new columns if ``keep_old``, else only the new columns
        """
        cols_1, cols_2 = self._sel_cols
        new_data = {
            c1 + "_" + c2 + "_INTERACT": df[c1] * df[c2]
            for c1 in cols_1
            for c2 in cols_2
        }

        # subset_size**2 columns are produced; build them in one go instead of
        # inserting them one by one, which fragments the frame.
        new_df = pd.DataFrame(new_data, index=df.index)
        if not self.keep_old:
            return new_df
        # A pre-existing column with the same name is replaced by the new one.
        kept = df.drop(columns=[col for col in new_df.columns if col in df.columns])
        return pd.concat([kept, new_df], axis=1)

In [7]:
# Directory the raw train.csv / test.csv files are read from.
INP_DIR = "data/download"
# Directory every CSV produced by this notebook is written to.
OUT_DIR = "data/data_"

In [8]:
# Load train first, then test; load_csv also downcasts the column dtypes.
df_train, df_test = (
    load_csv(os.path.join(INP_DIR, csv_name)) for csv_name in ("train.csv", "test.csv")
)

print("df_train.shape", df_train.shape)
print("df_test.shape", df_test.shape)

Memory usage before changing types 323.20 MB
Memory usage after changing types 162.40 MB
Memory usage before changing types 321.60 MB
Memory usage after changing types 161.60 MB
df_train.shape (200000, 202)
df_test.shape (200000, 201)


In [9]:
# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,train_0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,...,4.4354,3.9642,3.1364,1.691,18.522699,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,train_1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356001,1.9518
2,train_2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,train_3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,train_4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104


In [9]:
# to_csv does not create missing directories, so make sure the output
# directory exists before the first write.
os.makedirs(OUT_DIR, exist_ok=True)

# Training split: features without the id/target columns, target kept as a
# one-column frame.
X_org_train = df_train.drop(["ID_code", "target"], axis="columns")
y_train = df_train[["target"]]

X_org_train.to_csv(os.path.join(OUT_DIR, "X_org_train.csv"), index=False)
y_train.to_csv(os.path.join(OUT_DIR, "y_train.csv"), index=False)


# Test split: features without the id column, ids kept separately.
X_org_test = df_test.drop(["ID_code"], axis="columns")
id_code_test = df_test[["ID_code"]]

X_org_test.to_csv(os.path.join(OUT_DIR, "X_org_test.csv"), index=False)
id_code_test.to_csv(os.path.join(OUT_DIR, "id_code_test.csv"), index=False)

# Generate quantile (decile) columns by binning

In [11]:
# Ten integer labels (1..10), one per quantile bin.
qs = list(range(1, 11))

# Fit the binner on the training features, then encode train and test with it.
numcolsqcuter = NumColsQCuter(labels=qs, keep_old=False, output_type=np.int32)
numcolsqcuter.fit(X_org_train)
X_q10_train, X_q10_test = (
    numcolsqcuter.transform(frame) for frame in (X_org_train, X_org_test)
)

print("X_q10_train.shape", X_q10_train.shape)
print("X_q10_test.shape", X_q10_test.shape)


# Drop encoded columns that are collinear on the training set from both frames.
remover = CollinearColumnRemover(0.95)
remover.fit(X_q10_train)
X_q10_train, X_q10_test = (
    remover.transform(frame) for frame in (X_q10_train, X_q10_test)
)
print("X_q10_train.shape", X_q10_train.shape)
print("X_q10_test.shape", X_q10_test.shape)

X_q10_train.shape (200000, 199)
X_q10_test.shape (200000, 199)
Number of columns droped due to collinearity: 0
Number of columns droped due to collinearity: 0
X_q10_train.shape (200000, 199)
X_q10_test.shape (200000, 199)


In [12]:
# Preview the first rows of the binned training features.
X_q10_train.head()

Unnamed: 0,var_0_10QCUT,var_1_10QCUT,var_2_10QCUT,var_3_10QCUT,var_4_10QCUT,var_5_10QCUT,var_6_10QCUT,var_7_10QCUT,var_8_10QCUT,var_9_10QCUT,...,var_190_10QCUT,var_191_10QCUT,var_192_10QCUT,var_193_10QCUT,var_194_10QCUT,var_195_10QCUT,var_196_10QCUT,var_197_10QCUT,var_198_10QCUT,var_199_10QCUT
0,4,2,7,3,6,4,4,8,1,1,...,7,2,8,4,6,1,9,4,2,6
1,7,3,9,3,8,10,6,6,8,7,...,9,6,7,10,3,10,9,5,8,7
2,3,5,7,7,4,4,10,4,1,2,...,5,8,5,4,9,10,1,3,4,7
3,6,5,3,6,9,7,7,4,1,7,...,7,3,3,4,10,3,3,10,8,4
4,5,6,8,5,8,9,8,8,10,6,...,2,8,1,10,1,2,6,8,8,4


In [13]:
# Preview the first rows of the binned test features.
X_q10_test.head()

Unnamed: 0,var_0_10QCUT,var_1_10QCUT,var_2_10QCUT,var_3_10QCUT,var_4_10QCUT,var_5_10QCUT,var_6_10QCUT,var_7_10QCUT,var_8_10QCUT,var_9_10QCUT,...,var_190_10QCUT,var_191_10QCUT,var_192_10QCUT,var_193_10QCUT,var_194_10QCUT,var_195_10QCUT,var_196_10QCUT,var_197_10QCUT,var_198_10QCUT,var_199_10QCUT
0,6,10,8,9,6,7,7,7,7,9,...,2,10,1,5,1,10,7,10,5,4
1,3,8,6,3,2,6,8,8,1,2,...,10,7,3,10,3,7,3,9,9,1
2,1,1,5,6,4,10,3,9,7,7,...,3,9,6,4,1,10,1,1,10,1
3,3,6,7,5,1,9,4,9,8,5,...,10,8,5,6,3,10,6,7,2,5
4,7,7,9,7,2,4,10,1,8,4,...,6,8,4,5,7,5,1,1,3,4


In [14]:
# Persist the binned features (without the index) to OUT_DIR.
X_q10_train.to_csv(os.path.join(OUT_DIR, "X_q10_train.csv"), index=False)
X_q10_test.to_csv(os.path.join(OUT_DIR, "X_q10_test.csv"), index=False)

# Generate value count columns

In [15]:
# Fit the frequency encoder on the training features (fit returns the encoder).
value_counter = NumValueCounter(keep_old=False).fit(X_org_train)

X_valcount_train, X_valcount_test = (
    value_counter.transform(frame) for frame in (X_org_train, X_org_test)
)

print("X_valcount_train.shape", X_valcount_train.shape)
print("X_valcount_test.shape", X_valcount_test.shape)
# Sanity check: the encoded frames should contain no missing values.
print("X_valcount_train.isnull().sum().sum()", X_valcount_train.isnull().sum().sum())
print("X_valcount_test.isnull().sum().sum()", X_valcount_test.isnull().sum().sum())


# Drop encoded columns that are collinear on the training set from both frames.
remover = CollinearColumnRemover(0.95).fit(X_valcount_train)
X_valcount_train, X_valcount_test = (
    remover.transform(frame) for frame in (X_valcount_train, X_valcount_test)
)
print("X_valcount_train.shape", X_valcount_train.shape)
print("X_valcount_test.shape", X_valcount_test.shape)

X_valcount_train.shape (200000, 199)
X_valcount_test.shape (200000, 199)
X_valcount_train.isnull().sum().sum() 0
X_valcount_test.isnull().sum().sum() 0
Number of columns droped due to collinearity: 0
Number of columns droped due to collinearity: 0
X_valcount_train.shape (200000, 199)
X_valcount_test.shape (200000, 199)


In [16]:
# Preview the first rows of the frequency-encoded training features.
X_valcount_train.head()

Unnamed: 0,var_0_VALCOUNT,var_1_VALCOUNT,var_2_VALCOUNT,var_3_VALCOUNT,var_4_VALCOUNT,var_5_VALCOUNT,var_6_VALCOUNT,var_7_VALCOUNT,var_8_VALCOUNT,var_9_VALCOUNT,...,var_190_VALCOUNT,var_191_VALCOUNT,var_192_VALCOUNT,var_193_VALCOUNT,var_194_VALCOUNT,var_195_VALCOUNT,var_196_VALCOUNT,var_197_VALCOUNT,var_198_VALCOUNT,var_199_VALCOUNT
0,0.02318,0.01442,0.02018,0.018425,0.02553,0.01847,0.02502,0.020355,0.010525,0.00934,...,0.027125,0.01731,0.023575,0.02611,0.020655,0.00892,0.01803,0.02501,0.014545,0.0214
1,0.022535,0.018725,0.011875,0.019795,0.02262,0.007615,0.023985,0.021825,0.02077,0.016915,...,0.01823,0.022005,0.027595,0.004885,0.020115,0.00779,0.01722,0.023465,0.020685,0.02316
2,0.023055,0.02124,0.02071,0.02234,0.024285,0.019355,0.005935,0.02251,0.010525,0.011855,...,0.02661,0.01838,0.03231,0.02611,0.015245,0.005835,0.009335,0.02207,0.02433,0.022425
3,0.023185,0.02186,0.02313,0.02253,0.0202,0.021585,0.02492,0.022655,0.00575,0.017435,...,0.02624,0.02019,0.024865,0.025925,0.00934,0.02032,0.01641,0.010905,0.0203,0.019065
4,0.025325,0.02179,0.01782,0.0218,0.02262,0.01995,0.02422,0.018585,0.005105,0.02236,...,0.01844,0.02115,0.012755,0.0121,0.00854,0.019105,0.01931,0.019985,0.0203,0.019065


In [17]:
# Preview the first rows of the frequency-encoded test features.
X_valcount_test.head()

Unnamed: 0,var_0_VALCOUNT,var_1_VALCOUNT,var_2_VALCOUNT,var_3_VALCOUNT,var_4_VALCOUNT,var_5_VALCOUNT,var_6_VALCOUNT,var_7_VALCOUNT,var_8_VALCOUNT,var_9_VALCOUNT,...,var_190_VALCOUNT,var_191_VALCOUNT,var_192_VALCOUNT,var_193_VALCOUNT,var_194_VALCOUNT,var_195_VALCOUNT,var_196_VALCOUNT,var_197_VALCOUNT,var_198_VALCOUNT,var_199_VALCOUNT
0,0.023185,0.002985,0.01782,0.01328,0.02587,0.02111,0.02492,0.02117,0.02107,0.019705,...,0.015905,0.009195,0.00174,0.0275,0.011735,0.004765,0.019855,0.004135,0.02347,0.019065
1,0.02258,0.0197,0.02007,0.019175,0.016155,0.020885,0.02129,0.020355,0.01238,0.011855,...,0.008525,0.02373,0.02548,0.00802,0.020115,0.024435,0.016955,0.01714,0.01703,0.008395
2,0.006135,0.002035,0.023045,0.02201,0.02212,0.003415,0.024355,0.01639,0.020705,0.01853,...,0.02134,0.012415,0.031905,0.027055,0.007435,0.00664,0.006805,0.00404,0.01307,0.005505
3,0.02258,0.02179,0.02071,0.0218,0.013615,0.01898,0.024275,0.01489,0.01923,0.017765,...,0.011155,0.023625,0.03231,0.02771,0.018975,0.005835,0.01931,0.021705,0.01613,0.021035
4,0.02217,0.021175,0.011185,0.022225,0.015815,0.01914,0.00703,0.007215,0.020765,0.01797,...,0.027125,0.02243,0.029515,0.02807,0.01965,0.02464,0.01242,0.00487,0.02091,0.019065


In [18]:
# Persist the frequency-encoded features (without the index) to OUT_DIR,
# then drop the frames from the kernel namespace.
X_valcount_train.to_csv(os.path.join(OUT_DIR, "X_valcount_train.csv"), index=False)
X_valcount_test.to_csv(os.path.join(OUT_DIR, "X_valcount_test.csv"), index=False)
del X_valcount_train, X_valcount_test

# Generate mean target columns

In [19]:
# Fit the target-mean encoder on the training features and the target series,
# then encode train and test with it.
target_mean = TargetMeanFromNumCols(keep_old=False)
target_mean.fit(X_org_train, y_train["target"])
X_target_mean_train = target_mean.transform(X_org_train)
X_target_mean_test = target_mean.transform(X_org_test)

print("X_target_mean_train.shape", X_target_mean_train.shape)
print("X_target_mean_test.shape", X_target_mean_test.shape)
# Sanity check: the encoded frames should contain no missing values.
print("X_target_mean_train.isnull().sum().sum()", X_target_mean_train.isnull().sum().sum())
# The label now names the frame that is actually inspected.
print("X_target_mean_test.isnull().sum().sum()", X_target_mean_test.isnull().sum().sum())


# Drop encoded columns that are collinear on the training set from both frames.
remover = CollinearColumnRemover(0.95)
remover.fit(X_target_mean_train)
X_target_mean_train = remover.transform(X_target_mean_train)
X_target_mean_test = remover.transform(X_target_mean_test)
print("X_target_mean_train.shape", X_target_mean_train.shape)
print("X_target_mean_test.shape", X_target_mean_test.shape)

X_target_mean_train.shape (200000, 199)
X_target_mean_test.shape (200000, 199)
X_target_mean_train.isnull().sum().sum() 0
X_test_mean_train.isnull().sum().sum() 0
Number of columns droped due to collinearity: 0
Number of columns droped due to collinearity: 0
X_target_mean_train.shape (200000, 199)
X_target_mean_test.shape (200000, 199)


In [20]:
# Persist the target-mean features (without the index) to OUT_DIR,
# then drop the frames from the kernel namespace.
X_target_mean_train.to_csv(os.path.join(OUT_DIR, "X_target_mean_train.csv"), index=False)
X_target_mean_test.to_csv(os.path.join(OUT_DIR, "X_target_mean_test.csv"), index=False)
del X_target_mean_train, X_target_mean_test

# Generate Weight of Evidence columns

In [21]:
# Fit the weight-of-evidence encoder on the training features and the target
# series (fit returns the encoder), then encode train and test with it.
woe = WeightOfEvidenceNum(keep_old=False).fit(X_org_train, y_train["target"])
X_woe_train, X_woe_test = (
    woe.transform(frame) for frame in (X_org_train, X_org_test)
)

print("X_woe_train.shape", X_woe_train.shape)
print("X_woe_test.shape", X_woe_test.shape)
# Sanity check: the encoded frames should contain no missing values.
print("X_woe_train.isnull().sum().sum()", X_woe_train.isnull().sum().sum())
print("X_woe_test.isnull().sum().sum()", X_woe_test.isnull().sum().sum())


# Drop encoded columns that are collinear on the training set from both frames.
remover = CollinearColumnRemover(0.95).fit(X_woe_train)
X_woe_train, X_woe_test = (
    remover.transform(frame) for frame in (X_woe_train, X_woe_test)
)
print("X_woe_train.shape", X_woe_train.shape)
print("X_woe_test.shape", X_woe_test.shape)

X_woe_train.shape (200000, 199)
X_woe_test.shape (200000, 199)
X_woe_train.isnull().sum().sum() 0
X_woe_test.isnull().sum().sum() 0
Number of columns droped due to collinearity: 0
Number of columns droped due to collinearity: 0
X_woe_train.shape (200000, 199)
X_woe_test.shape (200000, 199)


In [22]:
# Persist the weight-of-evidence features (without the index) to OUT_DIR,
# then drop the frames from the kernel namespace.
X_woe_train.to_csv(os.path.join(OUT_DIR, "X_woe_train.csv"), index=False)
X_woe_test.to_csv(os.path.join(OUT_DIR, "X_woe_test.csv"), index=False)
del X_woe_train, X_woe_test

# Extract interaction columns

In [13]:
# Number of independent random interaction feature sets to generate.
repeats = 10

for i in range(repeats):
    print(i)
    # Seed each draw with the loop index: every repeat picks a different pair
    # of column subsets, yet re-running the notebook regenerates the same files.
    inter_cols = RandomInteractColsExtractor(subset_size=25, keep_old=False, randomstate=i)
    inter_cols.fit(X_org_train)
    
    X_interact_train = inter_cols.transform(X_org_train)
    X_interact_test = inter_cols.transform(X_org_test)
    
    print("X_interact_train.shape:", X_interact_train.shape)
    print("X_interact_test.shape:", X_interact_test.shape)
    # Sanity check: the product columns should contain no missing values.
    print("X_interact_train.isnull().sum().sum():", X_interact_train.isnull().sum().sum())
    print("X_interact_test.isnull().sum().sum():", X_interact_test.isnull().sum().sum())
    
    out_train = os.path.join(OUT_DIR, "X_interact_%d_train.csv"%i)
    out_test = os.path.join(OUT_DIR, "X_interact_%d_test.csv"%i)
    print("Saving to: " + out_train + " and " + out_test)
    
    X_interact_train.to_csv(out_train, index=False)
    X_interact_test.to_csv(out_test, index=False)
    
    print("")

0
X_interact_train.shape: (200000, 625)
X_interact_test.shape: (200000, 625)
X_interact_train.isnull().sum().sum(): 0
X_interact_test.isnull().sum().sum(): 0
Saving to: data/data_/X_interact_0_train.csv and data/data_/X_interact_0_test.csv

1
X_interact_train.shape: (200000, 625)
X_interact_test.shape: (200000, 625)
X_interact_train.isnull().sum().sum(): 0
X_interact_test.isnull().sum().sum(): 0
Saving to: data/data_/X_interact_1_train.csv and data/data_/X_interact_1_test.csv

2
X_interact_train.shape: (200000, 625)
X_interact_test.shape: (200000, 625)
X_interact_train.isnull().sum().sum(): 0
X_interact_test.isnull().sum().sum(): 0
Saving to: data/data_/X_interact_2_train.csv and data/data_/X_interact_2_test.csv

3
X_interact_train.shape: (200000, 625)
X_interact_test.shape: (200000, 625)
X_interact_train.isnull().sum().sum(): 0
X_interact_test.isnull().sum().sum(): 0
Saving to: data/data_/X_interact_3_train.csv and data/data_/X_interact_3_test.csv

4
X_interact_train.shape: (200000, 6