In [1]:
import pandas as pd
import numpy as np
# Load the training and test tables from CSV files in the working directory.
train = pd.read_csv(filepath_or_buffer='train.csv')
test = pd.read_csv(filepath_or_buffer='test.csv')
# The sample-submission file is not needed here; the load is kept disabled.
#sample_submission=pd.read_csv('../input/sample_submission.csv')

In [2]:
# Function-call form of print: identical output on Python 2 for a single
# argument, and valid syntax on Python 3 (the statement form is not).
print(train.shape)
print(train.head(5))

(595212, 59)
   id  target  ps_ind_01  ps_ind_02_cat  ps_ind_03  ps_ind_04_cat  \
0   7       0          2              2          5              1   
1   9       0          1              1          7              0   
2  13       0          5              4          9              1   
3  16       0          0              1          2              0   
4  17       0          0              2          0              1   

   ps_ind_05_cat  ps_ind_06_bin  ps_ind_07_bin  ps_ind_08_bin       ...        \
0              0              0              1              0       ...         
1              0              0              0              1       ...         
2              0              0              0              1       ...         
3              0              1              0              0       ...         
4              0              1              0              0       ...         

   ps_calc_11  ps_calc_12  ps_calc_13  ps_calc_14  ps_calc_15_bin  \
0           9   

In [3]:
def freq_encoding(cols, train_df, test_df):
    """Frequency-encode columns using level counts taken from the training set.

    For each column name in ``cols`` a ``<col>_freq`` column is produced whose
    value is the number of times the row's level occurs in ``train_df[col]``.

    Parameters
    ----------
    cols : iterable of str
        Names of the columns to encode; each must exist in both frames.
    train_df : pandas.DataFrame
        Frame the level counts are computed from. It is not modified.
    test_df : pandas.DataFrame
        Frame encoded with the training counts. It is not modified.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Train and test frames holding only the ``<col>_freq`` columns, in the
        order of ``cols``. Each result carries the index of its input frame so
        that a column-wise ``pd.concat`` with the input lines up row for row.
        Test levels never seen in training get frequency 0, and the test
        columns are ``int32``. Two empty frames are returned when ``cols`` is
        empty.
    """
    train_parts = []
    test_parts = []

    for col in cols:
        col_freq = col + '_freq'

        # Level -> number of occurrences in the training data
        # (value_counts leaves out NaN, so NaN levels stay NaN after mapping).
        counts = train_df[col].value_counts()

        train_parts.append(train_df[col].map(counts).rename(col_freq))

        # Levels that were not observed in training have no count: use 0.
        test_freq = test_df[col].map(counts).fillna(0).astype(np.int32)
        test_parts.append(test_freq.rename(col_freq))

    if not train_parts:
        return pd.DataFrame(), pd.DataFrame()

    # Assemble once at the end instead of concatenating inside the loop.
    return pd.concat(train_parts, axis=1), pd.concat(test_parts, axis=1)

In [4]:
# Categorical features to encode, one name per line.
cat_cols = [
    "ps_car_08_cat",
    "ps_ind_04_cat",
    "ps_car_02_cat",
    "ps_car_03_cat",
    "ps_car_05_cat",
    "ps_car_07_cat",
    "ps_car_10_cat",
    "ps_ind_02_cat",
    "ps_car_09_cat",
    "ps_ind_05_cat",
    "ps_car_04_cat",
    "ps_car_01_cat",
    "ps_car_06_cat",
]

# Build the '<col>_freq' feature frames for the train and test datasets.
train_freq, test_freq = freq_encoding(cat_cols, train, test)

# Append the frequency features column-wise to the original datasets.
train = pd.concat([train, train_freq], axis=1)
test = pd.concat([test, test_freq], axis=1)

In [5]:
# Function-call form of print: identical output on Python 2 for a single
# argument, and valid syntax on Python 3 (the statement form is not).
print(train.head(5))
print(train.shape)

   id  target  ps_ind_01  ps_ind_02_cat  ps_ind_03  ps_ind_04_cat  \
0   7       0          2              2          5              1   
1   9       0          1              1          7              0   
2  13       0          5              4          9              1   
3  16       0          0              1          2              0   
4  17       0          0              2          0              1   

   ps_ind_05_cat  ps_ind_06_bin  ps_ind_07_bin  ps_ind_08_bin  \
0              0              0              1              0   
1              0              0              0              1   
2              0              0              0              1   
3              0              1              0              0   
4              0              1              0              0   

          ...          ps_car_03_cat_freq  ps_car_05_cat_freq  \
0         ...                      411231              172667   
1         ...                      411231              266551   

In [6]:
def binary_encoding(train_df, test_df, feat):
    """Binary-encode an integer-coded categorical feature in train and test.

    The feature is assumed to be already encoded as non-negative integers
    (0 .. n-1 for n levels), with -1 marking a missing value. Missing values
    are recoded as ``max + 1`` (``max`` taken over both frames), and every
    level is then expanded into the digits of its binary representation.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training data containing column ``feat``. It is not modified.
    test_df : pandas.DataFrame
        Test data containing column ``feat``. It is not modified.
    feat : column label
        Name of the feature to encode.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        New train and test frames in which ``feat`` has -1 replaced by the
        missing-value code and ``<feat>_bin_0`` .. ``<feat>_bin_{k-1}`` columns
        (``uint8``, most significant bit first) are appended, where ``k`` is
        the number of binary digits of the largest code. Both frames get the
        same ``k`` columns. Because the columns are attached with a left merge,
        the returned frames have a fresh 0..n-1 index.
    """
    # -1 marks a missing value; recode it as one past the largest level seen
    # in either frame.
    missing_code = max(train_df[feat].max(), test_df[feat].max()) + 1

    # Work on copies so the frames passed in by the caller are left untouched.
    train_out = train_df.copy()
    test_out = test_df.copy()
    train_out[feat] = train_df[feat].replace(-1, missing_code)
    test_out[feat] = test_df[feat].replace(-1, missing_code)

    # Sorted union of every code present in either frame.
    union_val = np.union1d(train_out[feat].unique(), test_out[feat].unique())

    # Number of binary digits needed to represent the largest code.
    max_bin_len = len("{0:b}".format(int(union_val.max())))

    # Column j holds the bit at position (max_bin_len - 1 - j), so column 0 is
    # the most significant bit, matching a zero-padded binary string.
    shifts = np.arange(max_bin_len - 1, -1, -1)
    bits = (union_val.astype(np.int64)[:, np.newaxis] >> shifts) & 1

    # Lookup table: one row per code, the code itself plus its bit columns.
    bin_df = pd.DataFrame(
        bits.astype(np.uint8),
        columns=[feat + '_bin_' + str(pos) for pos in range(max_bin_len)]
    )
    bin_df.insert(0, feat, union_val)

    # Attach the bit columns to every row by joining on the code.
    train_out = pd.merge(train_out, bin_df, how='left', on=[feat])
    test_out = pd.merge(test_out, bin_df, how='left', on=[feat])
    return train_out, test_out

In [7]:
# Binary-encode every feature listed in cat_cols, one at a time. Each call
# returns new train/test frames, which replace the previous ones.
for i in cat_cols:
    train, test=binary_encoding(train, test, i)

# The triple-quoted string below is not executed: it is an earlier variant that
# encoded only five features. As the last expression of the cell it is echoed
# back as the cell's output.
"""cat_cols=['ps_ind_02_cat','ps_car_04_cat', 'ps_car_09_cat',
          'ps_ind_05_cat', 'ps_car_01_cat']

train, test=binary_encoding(train, test, 'ps_ind_02_cat')
train, test=binary_encoding(train, test, 'ps_car_04_cat')
train, test=binary_encoding(train, test, 'ps_car_09_cat')
train, test=binary_encoding(train, test, 'ps_ind_05_cat')
train, test=binary_encoding(train, test, 'ps_car_01_cat')"""

"cat_cols=['ps_ind_02_cat','ps_car_04_cat', 'ps_car_09_cat',\n          'ps_ind_05_cat', 'ps_car_01_cat']\n\ntrain, test=binary_encoding(train, test, 'ps_ind_02_cat')\ntrain, test=binary_encoding(train, test, 'ps_car_04_cat')\ntrain, test=binary_encoding(train, test, 'ps_car_09_cat')\ntrain, test=binary_encoding(train, test, 'ps_ind_05_cat')\ntrain, test=binary_encoding(train, test, 'ps_car_01_cat')"

In [8]:
# Function-call form of print: identical output on Python 2 for a single
# argument, and valid syntax on Python 3 (the statement form is not).
print(train.head(5))
print(train.shape)

   id  target  ps_ind_01  ps_ind_02_cat  ps_ind_03  ps_ind_04_cat  \
0   7       0          2              2          5              1   
1   9       0          1              1          7              0   
2  13       0          5              4          9              1   
3  16       0          0              1          2              0   
4  17       0          0              2          0              1   

   ps_ind_05_cat  ps_ind_06_bin  ps_ind_07_bin  ps_ind_08_bin  \
0              0              0              1              0   
1              0              0              0              1   
2              0              0              0              1   
3              0              1              0              0   
4              0              1              0              0   

          ...           ps_car_04_cat_bin_3  ps_car_01_cat_bin_0  \
0         ...                             0                    1   
1         ...                             0               

In [9]:
# Names of all training columns whose label begins with 'ps_calc_'.
col_to_drop = train.columns[train.columns.str.match('ps_calc_')]

# Remove those columns from both datasets, modifying the frames in place.
train.drop(labels=col_to_drop, axis='columns', inplace=True)
test.drop(labels=col_to_drop, axis='columns', inplace=True)

In [10]:
# Function-call form of print: identical output on Python 2 for a single
# argument, and valid syntax on Python 3 (the statement form is not).
print(train.head(5))
print(train.shape)

   id  target  ps_ind_01  ps_ind_02_cat  ps_ind_03  ps_ind_04_cat  \
0   7       0          2              2          5              1   
1   9       0          1              1          7              0   
2  13       0          5              4          9              1   
3  16       0          0              1          2              0   
4  17       0          0              2          0              1   

   ps_ind_05_cat  ps_ind_06_bin  ps_ind_07_bin  ps_ind_08_bin  \
0              0              0              1              0   
1              0              0              0              1   
2              0              0              0              1   
3              0              1              0              0   
4              0              1              0              0   

          ...           ps_car_04_cat_bin_3  ps_car_01_cat_bin_0  \
0         ...                             0                    1   
1         ...                             0               

In [11]:
# Persist the engineered training set; the row index is not written.
train.to_csv(path_or_buf="train_without_smote_mj_allcat.csv", index=False)

In [12]:
# Persist the engineered test set; the row index is not written.
test.to_csv(path_or_buf="test_without_smote_mj_allcat.csv", index=False)

In [1]:
# Show the first five training rows.
# NOTE(review): the recorded run of this cell is numbered In [1] and ended in
# a NameError because `train` was not defined; run the cells above first.
train.head(n=5)

NameError: name 'train' is not defined

In [15]:
# Show the first five test rows.
test.head(n=5)

Unnamed: 0,id,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,...,ps_car_09_cat_bin_0,ps_car_09_cat_bin_1,ps_car_09_cat_bin_2,ps_ind_05_cat_bin_0,ps_ind_05_cat_bin_1,ps_ind_05_cat_bin_2,ps_car_01_cat_bin_0,ps_car_01_cat_bin_1,ps_car_01_cat_bin_2,ps_car_01_cat_bin_3
0,0,0,1,8,1,0,0,1,0,0,...,0,1,0,0,0,0,0,1,1,1
1,1,4,2,5,1,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
2,2,5,1,3,0,0,0,0,0,1,...,0,1,0,0,0,0,1,0,1,1
3,3,0,1,6,0,0,1,0,0,0,...,0,1,0,0,0,0,0,1,1,1
4,4,5,1,7,0,0,0,0,0,1,...,0,1,0,0,0,0,1,0,1,1
