In [1]:
# import libraries
from itertools import permutations

import category_encoders as ce
import numpy as np
import pandas as pd
from scipy import sparse
from scipy.sparse import hstack
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import OneHotEncoder
from tqdm import tqdm

# import warnings
# warnings.filterwarnings('ignore')

# Amazon Employee Access Challenge

In [2]:
# Load the training and test tables from the local data folder.
train, test = (pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv'))

In [3]:
# Dimensions of the training table as (rows, columns).
train.shape

(32769, 10)

In [4]:
# Dimensions of the test table as (rows, columns).
test.shape

(58921, 10)

## One Hot Encoding

In [5]:
# One-hot encode RESOURCE.
# Categories are learned from the training column only; values that appear only
# in test are ignored (handle_unknown='ignore') instead of raising an error.
ohe = OneHotEncoder(handle_unknown='ignore')

# The encoder expects a 2-D input, hence the single-column selection.
train_resource_ohe = ohe.fit_transform(train[['RESOURCE']].to_numpy())
test_resource_ohe = ohe.transform(test[['RESOURCE']].to_numpy())

print(train_resource_ohe.shape, test_resource_ohe.shape)

(32769, 7518) (58921, 7518)


In [6]:
# One-hot encode MGR_ID.
# Categories are learned from the training column only; values that appear only
# in test are ignored (handle_unknown='ignore') instead of raising an error.
ohe = OneHotEncoder(handle_unknown='ignore')

# The encoder expects a 2-D input, hence the single-column selection.
train_mgr_id_ohe = ohe.fit_transform(train[['MGR_ID']].to_numpy())
test_mgr_id_ohe = ohe.transform(test[['MGR_ID']].to_numpy())

print(train_mgr_id_ohe.shape, test_mgr_id_ohe.shape)

(32769, 4243) (58921, 4243)


In [7]:
# One-hot encode ROLE_ROLLUP_1.
# Categories are learned from the training column only; values that appear only
# in test are ignored (handle_unknown='ignore') instead of raising an error.
ohe = OneHotEncoder(handle_unknown='ignore')

# The encoder expects a 2-D input, hence the single-column selection.
train_role_rollup_1_ohe = ohe.fit_transform(train[['ROLE_ROLLUP_1']].to_numpy())
test_role_rollup_1_ohe = ohe.transform(test[['ROLE_ROLLUP_1']].to_numpy())

print(train_role_rollup_1_ohe.shape, test_role_rollup_1_ohe.shape)

(32769, 128) (58921, 128)


In [8]:
# One-hot encode ROLE_ROLLUP_2.
# Categories are learned from the training column only; values that appear only
# in test are ignored (handle_unknown='ignore') instead of raising an error.
ohe = OneHotEncoder(handle_unknown='ignore')

# The encoder expects a 2-D input, hence the single-column selection.
train_role_rollup_2_ohe = ohe.fit_transform(train[['ROLE_ROLLUP_2']].to_numpy())
test_role_rollup_2_ohe = ohe.transform(test[['ROLE_ROLLUP_2']].to_numpy())

print(train_role_rollup_2_ohe.shape, test_role_rollup_2_ohe.shape)

(32769, 177) (58921, 177)


In [9]:
# One-hot encode ROLE_DEPTNAME.
# Categories are learned from the training column only; values that appear only
# in test are ignored (handle_unknown='ignore') instead of raising an error.
ohe = OneHotEncoder(handle_unknown='ignore')

# The encoder expects a 2-D input, hence the single-column selection.
train_role_deptname_ohe = ohe.fit_transform(train[['ROLE_DEPTNAME']].to_numpy())
test_role_deptname_ohe = ohe.transform(test[['ROLE_DEPTNAME']].to_numpy())

print(train_role_deptname_ohe.shape, test_role_deptname_ohe.shape)

(32769, 449) (58921, 449)


In [10]:
# One-hot encode ROLE_TITLE.
# Categories are learned from the training column only; values that appear only
# in test are ignored (handle_unknown='ignore') instead of raising an error.
ohe = OneHotEncoder(handle_unknown='ignore')

# The encoder expects a 2-D input, hence the single-column selection.
train_role_title_ohe = ohe.fit_transform(train[['ROLE_TITLE']].to_numpy())
test_role_title_ohe = ohe.transform(test[['ROLE_TITLE']].to_numpy())

print(train_role_title_ohe.shape, test_role_title_ohe.shape)

(32769, 343) (58921, 343)


In [11]:
# One-hot encode ROLE_FAMILY_DESC.
# Categories are learned from the training column only; values that appear only
# in test are ignored (handle_unknown='ignore') instead of raising an error.
ohe = OneHotEncoder(handle_unknown='ignore')

# The encoder expects a 2-D input, hence the single-column selection.
train_role_family_desc_ohe = ohe.fit_transform(train[['ROLE_FAMILY_DESC']].to_numpy())
test_role_family_desc_ohe = ohe.transform(test[['ROLE_FAMILY_DESC']].to_numpy())

print(train_role_family_desc_ohe.shape, test_role_family_desc_ohe.shape)

(32769, 2358) (58921, 2358)


In [12]:
# One-hot encode ROLE_FAMILY.
# Categories are learned from the training column only; values that appear only
# in test are ignored (handle_unknown='ignore') instead of raising an error.
ohe = OneHotEncoder(handle_unknown='ignore')

# The encoder expects a 2-D input, hence the single-column selection.
train_role_family_ohe = ohe.fit_transform(train[['ROLE_FAMILY']].to_numpy())
test_role_family_ohe = ohe.transform(test[['ROLE_FAMILY']].to_numpy())

print(train_role_family_ohe.shape, test_role_family_ohe.shape)

(32769, 67) (58921, 67)


In [13]:
# One-hot encode ROLE_CODE.
# Categories are learned from the training column only; values that appear only
# in test are ignored (handle_unknown='ignore') instead of raising an error.
ohe = OneHotEncoder(handle_unknown='ignore')

# The encoder expects a 2-D input, hence the single-column selection.
train_role_code_ohe = ohe.fit_transform(train[['ROLE_CODE']].to_numpy())
test_role_code_ohe = ohe.transform(test[['ROLE_CODE']].to_numpy())

print(train_role_code_ohe.shape, test_role_code_ohe.shape)

(32769, 343) (58921, 343)


In [14]:
# Join the nine per-feature one-hot blocks column-wise into one sparse training matrix.
train_ohe = hstack([
    train_resource_ohe,
    train_mgr_id_ohe,
    train_role_rollup_1_ohe,
    train_role_rollup_2_ohe,
    train_role_deptname_ohe,
    train_role_title_ohe,
    train_role_family_desc_ohe,
    train_role_family_ohe,
    train_role_code_ohe,
])

In [15]:
# Join the nine per-feature one-hot blocks column-wise into one sparse test matrix,
# in the same feature order as the training matrix.
test_ohe = hstack([
    test_resource_ohe,
    test_mgr_id_ohe,
    test_role_rollup_1_ohe,
    test_role_rollup_2_ohe,
    test_role_deptname_ohe,
    test_role_title_ohe,
    test_role_family_desc_ohe,
    test_role_family_ohe,
    test_role_code_ohe,
])

In [16]:
# Target column paired with the one-hot training matrix.
y_train_ohe = train.ACTION

In [17]:
# Shapes of the stacked one-hot matrices and of the target; train and test should
# have the same number of columns.
train_ohe.shape, test_ohe.shape, y_train_ohe.shape

((32769, 15626), (58921, 15626), (32769,))

## Frequency Encoding

https://python-data-science.readthedocs.io/en/latest/preprocess.html

In [18]:
### FREQUENCY ENCODING

# size of each category
# encoding = titanic.groupby('Embarked').size()
# get frequency of each category
# encoding = encoding/len(titanic)
# titanic['enc'] = titanic.Embarked.map(encoding)

In [19]:
### FREQUENCY ENCODING RESOURCE

# Fraction of training rows that carry each RESOURCE value.
encoding = train.groupby('RESOURCE').size().div(len(train))

# Replace every value by its training frequency; values missing from the
# training table have no entry in the lookup and come back as NaN.
train_resource_fc = train['RESOURCE'].map(encoding)
test_resource_fc = test['RESOURCE'].map(encoding)

# Shapes and NaN counts before filling ...
print(
    train_resource_fc.shape,
    test_resource_fc.shape,
    train_resource_fc.isna().sum(),
    test_resource_fc.isna().sum(),
)
# ... and after encoding the unmatched test values as frequency 0.
test_resource_fc = test_resource_fc.fillna(0)
print(
    train_resource_fc.shape,
    test_resource_fc.shape,
    train_resource_fc.isna().sum(),
    test_resource_fc.isna().sum(),
)

(32769,) (58921,) 0 0
(32769,) (58921,) 0 0


In [20]:
### FREQUENCY ENCODING MGR_ID

# Fraction of training rows that carry each MGR_ID value.
encoding = train.groupby('MGR_ID').size().div(len(train))

# Replace every value by its training frequency; values missing from the
# training table have no entry in the lookup and come back as NaN.
train_mgr_id_fc = train['MGR_ID'].map(encoding)
test_mgr_id_fc = test['MGR_ID'].map(encoding)

# Shapes and NaN counts before filling ...
print(
    train_mgr_id_fc.shape,
    test_mgr_id_fc.shape,
    train_mgr_id_fc.isna().sum(),
    test_mgr_id_fc.isna().sum(),
)
# ... and after encoding the unmatched test values as frequency 0.
test_mgr_id_fc = test_mgr_id_fc.fillna(0)
print(
    train_mgr_id_fc.shape,
    test_mgr_id_fc.shape,
    train_mgr_id_fc.isna().sum(),
    test_mgr_id_fc.isna().sum(),
)

(32769,) (58921,) 0 1627
(32769,) (58921,) 0 0


In [21]:
### FREQUENCY ENCODING ROLE_ROLLUP_1

# Fraction of training rows that carry each ROLE_ROLLUP_1 value.
encoding = train.groupby('ROLE_ROLLUP_1').size().div(len(train))

# Replace every value by its training frequency; values missing from the
# training table have no entry in the lookup and come back as NaN.
train_rollup_1_fc = train['ROLE_ROLLUP_1'].map(encoding)
test_rollup_1_fc = test['ROLE_ROLLUP_1'].map(encoding)

# Shapes and NaN counts before filling ...
print(
    train_rollup_1_fc.shape,
    test_rollup_1_fc.shape,
    train_rollup_1_fc.isna().sum(),
    test_rollup_1_fc.isna().sum(),
)
# ... and after encoding the unmatched test values as frequency 0.
test_rollup_1_fc = test_rollup_1_fc.fillna(0)
print(
    train_rollup_1_fc.shape,
    test_rollup_1_fc.shape,
    train_rollup_1_fc.isna().sum(),
    test_rollup_1_fc.isna().sum(),
)

(32769,) (58921,) 0 4
(32769,) (58921,) 0 0


In [22]:
### FREQUENCY ENCODING ROLE_ROLLUP_2

# Fraction of training rows that carry each ROLE_ROLLUP_2 value.
encoding = train.groupby('ROLE_ROLLUP_2').size().div(len(train))

# Replace every value by its training frequency; values missing from the
# training table have no entry in the lookup and come back as NaN.
train_rollup_2_fc = train['ROLE_ROLLUP_2'].map(encoding)
test_rollup_2_fc = test['ROLE_ROLLUP_2'].map(encoding)

# Shapes and NaN counts before filling ...
print(
    train_rollup_2_fc.shape,
    test_rollup_2_fc.shape,
    train_rollup_2_fc.isna().sum(),
    test_rollup_2_fc.isna().sum(),
)
# ... and after encoding the unmatched test values as frequency 0.
test_rollup_2_fc = test_rollup_2_fc.fillna(0)
print(
    train_rollup_2_fc.shape,
    test_rollup_2_fc.shape,
    train_rollup_2_fc.isna().sum(),
    test_rollup_2_fc.isna().sum(),
)

(32769,) (58921,) 0 12
(32769,) (58921,) 0 0


In [23]:
### FREQUENCY ENCODING ROLE_DEPTNAME

# Fraction of training rows that carry each ROLE_DEPTNAME value.
encoding = train.groupby('ROLE_DEPTNAME').size().div(len(train))

# Replace every value by its training frequency; values missing from the
# training table have no entry in the lookup and come back as NaN.
train_role_deptname_fc = train['ROLE_DEPTNAME'].map(encoding)
test_role_deptname_fc = test['ROLE_DEPTNAME'].map(encoding)

# Shapes and NaN counts before filling ...
print(
    train_role_deptname_fc.shape,
    test_role_deptname_fc.shape,
    train_role_deptname_fc.isna().sum(),
    test_role_deptname_fc.isna().sum(),
)
# ... and after encoding the unmatched test values as frequency 0.
test_role_deptname_fc = test_role_deptname_fc.fillna(0)
print(
    train_role_deptname_fc.shape,
    test_role_deptname_fc.shape,
    train_role_deptname_fc.isna().sum(),
    test_role_deptname_fc.isna().sum(),
)

(32769,) (58921,) 0 62
(32769,) (58921,) 0 0


In [24]:
### FREQUENCY ENCODING ROLE_TITLE

# Fraction of training rows that carry each ROLE_TITLE value.
encoding = train.groupby('ROLE_TITLE').size().div(len(train))

# Replace every value by its training frequency; values missing from the
# training table have no entry in the lookup and come back as NaN.
train_role_title_fc = train['ROLE_TITLE'].map(encoding)
test_role_title_fc = test['ROLE_TITLE'].map(encoding)

# Shapes and NaN counts before filling ...
print(
    train_role_title_fc.shape,
    test_role_title_fc.shape,
    train_role_title_fc.isna().sum(),
    test_role_title_fc.isna().sum(),
)
# ... and after encoding the unmatched test values as frequency 0.
test_role_title_fc = test_role_title_fc.fillna(0)
print(
    train_role_title_fc.shape,
    test_role_title_fc.shape,
    train_role_title_fc.isna().sum(),
    test_role_title_fc.isna().sum(),
)

(32769,) (58921,) 0 30
(32769,) (58921,) 0 0


In [25]:
### FREQUENCY ENCODING ROLE_FAMILY_DESC

# Fraction of training rows that carry each ROLE_FAMILY_DESC value.
encoding = train.groupby('ROLE_FAMILY_DESC').size().div(len(train))

# Replace every value by its training frequency; values missing from the
# training table have no entry in the lookup and come back as NaN.
train_role_family_desc_fc = train['ROLE_FAMILY_DESC'].map(encoding)
test_role_family_desc_fc = test['ROLE_FAMILY_DESC'].map(encoding)

# Shapes and NaN counts before filling ...
print(
    train_role_family_desc_fc.shape,
    test_role_family_desc_fc.shape,
    train_role_family_desc_fc.isna().sum(),
    test_role_family_desc_fc.isna().sum(),
)
# ... and after encoding the unmatched test values as frequency 0.
test_role_family_desc_fc = test_role_family_desc_fc.fillna(0)
print(
    train_role_family_desc_fc.shape,
    test_role_family_desc_fc.shape,
    train_role_family_desc_fc.isna().sum(),
    test_role_family_desc_fc.isna().sum(),
)

(32769,) (58921,) 0 1249
(32769,) (58921,) 0 0


In [26]:
### FREQUENCY ENCODING ROLE_FAMILY

# Fraction of training rows that carry each ROLE_FAMILY value.
encoding = train.groupby('ROLE_FAMILY').size().div(len(train))

# Replace every value by its training frequency; values missing from the
# training table have no entry in the lookup and come back as NaN.
train_role_family_fc = train['ROLE_FAMILY'].map(encoding)
test_role_family_fc = test['ROLE_FAMILY'].map(encoding)

# Shapes and NaN counts before filling ...
print(
    train_role_family_fc.shape,
    test_role_family_fc.shape,
    train_role_family_fc.isna().sum(),
    test_role_family_fc.isna().sum(),
)
# ... and after encoding the unmatched test values as frequency 0.
test_role_family_fc = test_role_family_fc.fillna(0)
print(
    train_role_family_fc.shape,
    test_role_family_fc.shape,
    train_role_family_fc.isna().sum(),
    test_role_family_fc.isna().sum(),
)

(32769,) (58921,) 0 1
(32769,) (58921,) 0 0


In [27]:
### FREQUENCY ENCODING ROLE_CODE

# Fraction of training rows that carry each ROLE_CODE value.
encoding = train.groupby('ROLE_CODE').size().div(len(train))

# Replace every value by its training frequency; values missing from the
# training table have no entry in the lookup and come back as NaN.
train_role_code_fc = train['ROLE_CODE'].map(encoding)
test_role_code_fc = test['ROLE_CODE'].map(encoding)

# Shapes and NaN counts before filling ...
print(
    train_role_code_fc.shape,
    test_role_code_fc.shape,
    train_role_code_fc.isna().sum(),
    test_role_code_fc.isna().sum(),
)
# ... and after encoding the unmatched test values as frequency 0.
test_role_code_fc = test_role_code_fc.fillna(0)
print(
    train_role_code_fc.shape,
    test_role_code_fc.shape,
    train_role_code_fc.isna().sum(),
    test_role_code_fc.isna().sum(),
)

(32769,) (58921,) 0 30
(32769,) (58921,) 0 0


In [28]:
# Sanity check: report the Python type of a slice of the frequency-encoded test column.
type(test_role_code_fc[0:10])

pandas.core.series.Series

In [29]:
# Collect the nine frequency-encoded training columns into one frame.
train_df_fc = pd.DataFrame(dict(
    resource_fc=train_resource_fc,
    mgr_id_fc=train_mgr_id_fc,
    rollup_1_fc=train_rollup_1_fc,
    rollup_2_fc=train_rollup_2_fc,
    role_deptname_fc=train_role_deptname_fc,
    role_title_fc=train_role_title_fc,
    role_family_desc_fc=train_role_family_desc_fc,
    role_family_fc=train_role_family_fc,
    role_code_fc=train_role_code_fc,
))

In [30]:
# Collect the nine frequency-encoded test columns into one frame, using the
# same column names and order as the training frame.
test_df_fc = pd.DataFrame(dict(
    resource_fc=test_resource_fc,
    mgr_id_fc=test_mgr_id_fc,
    rollup_1_fc=test_rollup_1_fc,
    rollup_2_fc=test_rollup_2_fc,
    role_deptname_fc=test_role_deptname_fc,
    role_title_fc=test_role_title_fc,
    role_family_desc_fc=test_role_family_desc_fc,
    role_family_fc=test_role_family_fc,
    role_code_fc=test_role_code_fc,
))

In [31]:
# Dimensions of the frequency-encoded training frame.
train_df_fc.shape

(32769, 9)

In [32]:
# Dimensions of the frequency-encoded test frame.
test_df_fc.shape

(58921, 9)

In [33]:
# Target values as a NumPy array, paired with the frequency-encoded training frame.
train_y_fc = train['ACTION'].to_numpy()

In [34]:
# Length of the target array; it should match the number of training rows.
train_y_fc.shape

(32769,)

## Response Encoding

https://medium.com/analytics-vidhya/types-of-categorical-data-encoding-schemes-a5bbeb4ba02b

In [35]:
# Toy example showing what category_encoders' TargetEncoder produces
# for a small categorical column.
data = pd.DataFrame({
    'color': ['Blue', 'Black', 'Black', 'Blue', 'Blue'],
    'outcome': [1, 2, 1, 1, 2],
})
# Feature to encode and the target it is encoded against.
X, Y = data['color'], data['outcome']
# Build the encoder and learn the per-category values from (X, Y).
ce_TE = ce.TargetEncoder(cols=['color']).fit(X, Y)
# Encoded column, displayed as the cell output.
ce_TE.transform(X)

Unnamed: 0,color
0,1.34128
1,1.473106
2,1.473106
3,1.34128
4,1.34128


In [36]:
### RESPONSE ENCODING RESOURCE

# Feature to encode and the target it is encoded against (training rows only).
X, Y = train['RESOURCE'], train['ACTION']
# Learn the per-category encoding with category_encoders' TargetEncoder.
ce_TE = ce.TargetEncoder(cols=['RESOURCE']).fit(X, Y)

# NOTE(review): the training rows are transformed by an encoder that was fit on
# their own ACTION values, so each row's target contributes to its own feature.
# Consider out-of-fold encoding before validating a model on train_resource_rc.
train_resource_rc = ce_TE.transform(X)
test_resource_rc = ce_TE.transform(test['RESOURCE'])

print(train_resource_rc.shape, test_resource_rc.shape)

(32769, 1) (58921, 1)


In [37]:
# Preview the first ten response-encoded RESOURCE values.
train_resource_rc[:10]

Unnamed: 0,RESOURCE
0,0.993099
1,0.966667
2,0.984431
3,0.94211
4,0.999947
5,0.802556
6,0.953545
7,1.0
8,0.997255
9,1.0


In [38]:
### RESPONSE ENCODING MGR_ID

# Feature to encode and the target it is encoded against (training rows only).
X, Y = train['MGR_ID'], train['ACTION']
# Learn the per-category encoding with category_encoders' TargetEncoder.
ce_TE = ce.TargetEncoder(cols=['MGR_ID']).fit(X, Y)

# NOTE(review): the training rows are transformed by an encoder that was fit on
# their own ACTION values, so each row's target contributes to its own feature.
# Consider out-of-fold encoding before validating a model on train_mgr_id_rc.
train_mgr_id_rc = ce_TE.transform(X)
test_mgr_id_rc = ce_TE.transform(test['MGR_ID'])

print(train_mgr_id_rc.shape, test_mgr_id_rc.shape)

(32769, 1) (58921, 1)


In [39]:
### RESPONSE ENCODING ROLE_ROLLUP_1

# Feature to encode and the target it is encoded against (training rows only).
X, Y = train['ROLE_ROLLUP_1'], train['ACTION']
# Learn the per-category encoding with category_encoders' TargetEncoder.
ce_TE = ce.TargetEncoder(cols=['ROLE_ROLLUP_1']).fit(X, Y)

# NOTE(review): the training rows are transformed by an encoder that was fit on
# their own ACTION values, so each row's target contributes to its own feature.
# Consider out-of-fold encoding before validating a model on train_rollup_1_rc.
train_rollup_1_rc = ce_TE.transform(X)
test_rollup_1_rc = ce_TE.transform(test['ROLE_ROLLUP_1'])

print(train_rollup_1_rc.shape, test_rollup_1_rc.shape)

(32769, 1) (58921, 1)


In [40]:
### RESPONSE ENCODING ROLE_ROLLUP_2

# Feature to encode and the target it is encoded against (training rows only).
X, Y = train['ROLE_ROLLUP_2'], train['ACTION']
# Learn the per-category encoding with category_encoders' TargetEncoder.
ce_TE = ce.TargetEncoder(cols=['ROLE_ROLLUP_2']).fit(X, Y)

# NOTE(review): the training rows are transformed by an encoder that was fit on
# their own ACTION values, so each row's target contributes to its own feature.
# Consider out-of-fold encoding before validating a model on train_rollup_2_rc.
train_rollup_2_rc = ce_TE.transform(X)
test_rollup_2_rc = ce_TE.transform(test['ROLE_ROLLUP_2'])

print(train_rollup_2_rc.shape, test_rollup_2_rc.shape)

(32769, 1) (58921, 1)


In [41]:
### RESPONSE ENCODING ROLE_DEPTNAME

# Feature to encode and the target it is encoded against (training rows only).
X, Y = train['ROLE_DEPTNAME'], train['ACTION']
# Learn the per-category encoding with category_encoders' TargetEncoder.
ce_TE = ce.TargetEncoder(cols=['ROLE_DEPTNAME']).fit(X, Y)

# NOTE(review): the training rows are transformed by an encoder that was fit on
# their own ACTION values, so each row's target contributes to its own feature.
# Consider out-of-fold encoding before validating a model on train_role_deptname_rc.
train_role_deptname_rc = ce_TE.transform(X)
test_role_deptname_rc = ce_TE.transform(test['ROLE_DEPTNAME'])

print(train_role_deptname_rc.shape, test_role_deptname_rc.shape)

(32769, 1) (58921, 1)


In [42]:
### RESPONSE ENCODING ROLE_TITLE

# Feature to encode and the target it is encoded against (training rows only).
X, Y = train['ROLE_TITLE'], train['ACTION']
# Learn the per-category encoding with category_encoders' TargetEncoder.
ce_TE = ce.TargetEncoder(cols=['ROLE_TITLE']).fit(X, Y)

# NOTE(review): the training rows are transformed by an encoder that was fit on
# their own ACTION values, so each row's target contributes to its own feature.
# Consider out-of-fold encoding before validating a model on train_role_title_rc.
train_role_title_rc = ce_TE.transform(X)
test_role_title_rc = ce_TE.transform(test['ROLE_TITLE'])

print(train_role_title_rc.shape, test_role_title_rc.shape)

(32769, 1) (58921, 1)


In [43]:
### RESPONSE ENCODING ROLE_FAMILY_DESC

# Feature to encode and the target it is encoded against (training rows only).
X, Y = train['ROLE_FAMILY_DESC'], train['ACTION']
# Learn the per-category encoding with category_encoders' TargetEncoder.
ce_TE = ce.TargetEncoder(cols=['ROLE_FAMILY_DESC']).fit(X, Y)

# NOTE(review): the training rows are transformed by an encoder that was fit on
# their own ACTION values, so each row's target contributes to its own feature.
# Consider out-of-fold encoding before validating a model on train_role_family_desc_rc.
train_role_family_desc_rc = ce_TE.transform(X)
test_role_family_desc_rc = ce_TE.transform(test['ROLE_FAMILY_DESC'])

print(train_role_family_desc_rc.shape, test_role_family_desc_rc.shape)

(32769, 1) (58921, 1)


In [44]:
### RESPONSE ENCODING ROLE_FAMILY

# Feature to encode and the target it is encoded against (training rows only).
X, Y = train['ROLE_FAMILY'], train['ACTION']
# Learn the per-category encoding with category_encoders' TargetEncoder.
ce_TE = ce.TargetEncoder(cols=['ROLE_FAMILY']).fit(X, Y)

# NOTE(review): the training rows are transformed by an encoder that was fit on
# their own ACTION values, so each row's target contributes to its own feature.
# Consider out-of-fold encoding before validating a model on train_role_family_rc.
train_role_family_rc = ce_TE.transform(X)
test_role_family_rc = ce_TE.transform(test['ROLE_FAMILY'])

print(train_role_family_rc.shape, test_role_family_rc.shape)

(32769, 1) (58921, 1)


In [45]:
### RESPONSE ENCODING ROLE_CODE

# Feature to encode and the target it is encoded against (training rows only).
X, Y = train['ROLE_CODE'], train['ACTION']
# Learn the per-category encoding with category_encoders' TargetEncoder.
ce_TE = ce.TargetEncoder(cols=['ROLE_CODE']).fit(X, Y)

# NOTE(review): the training rows are transformed by an encoder that was fit on
# their own ACTION values, so each row's target contributes to its own feature.
# Consider out-of-fold encoding before validating a model on train_role_code_rc.
train_role_code_rc = ce_TE.transform(X)
test_role_code_rc = ce_TE.transform(test['ROLE_CODE'])

print(train_role_code_rc.shape, test_role_code_rc.shape)

(32769, 1) (58921, 1)


In [46]:
# Collect the nine response-encoded training columns into one frame. Each
# encoder output is a one-column frame, so the column is pulled out by name.
train_df_rc = pd.DataFrame(dict(
    resource_rc=train_resource_rc['RESOURCE'],
    mgr_id_rc=train_mgr_id_rc['MGR_ID'],
    rollup_1_rc=train_rollup_1_rc['ROLE_ROLLUP_1'],
    rollup_2_rc=train_rollup_2_rc['ROLE_ROLLUP_2'],
    role_deptname_rc=train_role_deptname_rc['ROLE_DEPTNAME'],
    role_title_rc=train_role_title_rc['ROLE_TITLE'],
    role_family_desc_rc=train_role_family_desc_rc['ROLE_FAMILY_DESC'],
    role_family_rc=train_role_family_rc['ROLE_FAMILY'],
    role_code_rc=train_role_code_rc['ROLE_CODE'],
))

In [47]:
# Collect the nine response-encoded test columns into one frame, using the
# same column names and order as the training frame.
test_df_rc = pd.DataFrame(dict(
    resource_rc=test_resource_rc['RESOURCE'],
    mgr_id_rc=test_mgr_id_rc['MGR_ID'],
    rollup_1_rc=test_rollup_1_rc['ROLE_ROLLUP_1'],
    rollup_2_rc=test_rollup_2_rc['ROLE_ROLLUP_2'],
    role_deptname_rc=test_role_deptname_rc['ROLE_DEPTNAME'],
    role_title_rc=test_role_title_rc['ROLE_TITLE'],
    role_family_desc_rc=test_role_family_desc_rc['ROLE_FAMILY_DESC'],
    role_family_rc=test_role_family_rc['ROLE_FAMILY'],
    role_code_rc=test_role_code_rc['ROLE_CODE'],
))

In [48]:
# Display the assembled response-encoded training frame.
train_df_rc

Unnamed: 0,resource_rc,mgr_id_rc,rollup_1_rc,rollup_2_rc,role_deptname_rc,role_title_rc,role_family_desc_rc,role_family_rc,role_code_rc
0,0.993099,1.000000,0.949222,0.956148,0.958333,0.967625,0.933440,0.942350,0.967625
1,0.966667,0.999993,0.949222,0.969075,0.893082,0.962963,0.999999,0.947941,0.962963
2,0.984431,0.993099,0.918478,0.918478,0.923077,0.889331,0.939394,0.907815,0.889331
3,0.942110,1.000000,0.949222,0.969075,0.989474,0.920413,0.955788,0.942350,0.920413
4,0.999947,0.999981,0.931159,0.876812,0.755556,0.866667,0.947368,0.837017,0.866667
...,...,...,...,...,...,...,...,...,...
32764,0.901961,0.965517,0.949222,0.956148,0.989474,0.920413,0.955788,0.942350,0.920413
32765,0.984431,0.999981,0.963939,0.963939,1.000000,1.000000,0.998959,0.977679,1.000000
32766,0.962733,0.998959,0.949222,0.954563,1.000000,0.993099,0.984431,0.974359,0.993099
32767,0.999857,0.687500,0.734545,0.719844,0.864947,0.913706,0.860656,0.862671,0.913706


In [49]:
# Display the assembled response-encoded test frame.
test_df_rc

Unnamed: 0,resource_rc,mgr_id_rc,rollup_1_rc,rollup_2_rc,role_deptname_rc,role_title_rc,role_family_desc_rc,role_family_rc,role_code_rc
0,1.000000,0.802556,0.809955,0.809955,0.937445,0.889331,0.719298,0.907815,0.889331
1,0.618902,1.000000,0.949222,0.954563,0.943820,0.991736,1.000000,0.952087,0.991736
2,0.999993,1.000000,0.949222,0.956148,1.000000,0.937500,0.948052,0.977679,0.937500
3,0.984431,0.866667,0.949222,0.957205,0.979323,0.913495,1.000000,0.941935,0.913495
4,0.941176,1.000000,0.949222,0.969075,0.977901,0.992021,1.000000,0.972491,0.992021
...,...,...,...,...,...,...,...,...,...
58916,0.990220,0.956522,0.949222,0.912584,0.992188,0.929458,0.933440,0.942350,0.929458
58917,0.946488,0.814815,0.949222,0.957205,0.891304,0.970284,0.980707,0.959167,0.970284
58918,0.961240,0.999613,0.949222,0.969075,0.884615,0.979592,0.984925,0.942350,0.979592
58919,0.882353,0.998959,0.949222,0.954563,0.884058,0.920413,0.933440,0.942350,0.920413


In [50]:
# Target values as a NumPy array, paired with the response-encoded training frame.
train_y_rc = train['ACTION'].to_numpy()

In [51]:
# Length of the target array; it should match the number of training rows.
train_y_rc.shape

(32769,)

# Feature Engineering

## Encoding with Singular Value Decomposition

Here I'll use singular value decomposition (SVD) to learn encodings from pairs of categorical features. SVD is one of the more complex encodings, but it can also be very effective. We'll construct a matrix of co-occurrences for each pair of categorical features. Each row corresponds to a value in feature A, while each column corresponds to a value in feature B. Each element is the number of rows in the training data where that value of A appears together with that value of B.

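You then use singular value decomposition to factor the count matrix into smaller matrices whose product approximates it. The low-dimensional factor for feature A's values (a single component in this notebook) is used as the encoding of A.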

In [52]:
#https://www.kaggle.com/dmitrylarko/kaggledays-sf-2-amazon-unsupervised-encoding#SVD-Encoding
#https://www.kaggle.com/matleonard/encoding-categorical-features-with-svd

In [53]:
# Feature-only view of the training table (target column removed).
train_data = train.drop(columns='ACTION')

In [54]:
# Dimensions of the training features after dropping ACTION.
train_data.shape

(32769, 9)

In [55]:
# Number of distinct values in each training feature column.
train_data.nunique()

RESOURCE            7518
MGR_ID              4243
ROLE_ROLLUP_1        128
ROLE_ROLLUP_2        177
ROLE_DEPTNAME        449
ROLE_TITLE           343
ROLE_FAMILY_DESC    2358
ROLE_FAMILY           67
ROLE_CODE            343
dtype: int64

In [56]:
# Feature-only view of the test table (row identifier removed).
test_data = test.drop(columns='id')

In [57]:
# Dimensions of the test features after dropping id.
test_data.shape

(58921, 9)

In [58]:
# Number of distinct values in each test feature column, for comparison with train.
test_data.nunique()

RESOURCE            4971
MGR_ID              4689
ROLE_ROLLUP_1        126
ROLE_ROLLUP_2        177
ROLE_DEPTNAME        466
ROLE_TITLE           351
ROLE_FAMILY_DESC    2749
ROLE_FAMILY           68
ROLE_CODE            351
dtype: int64

In [59]:
# Empty containers that the SVD loop below fills one column at a time.
train_svd, test_svd = pd.DataFrame(), pd.DataFrame()

In [60]:
# Example co-occurrence table: rows are ROLE_ROLLUP_1 values, columns are
# ROLE_ROLLUP_2 values, cells count the training rows with that pair
# (0 where the pair never occurs).
temp = (
    train_data
    .groupby(['ROLE_ROLLUP_1', 'ROLE_ROLLUP_2'])['ROLE_ROLLUP_1']
    .count()
    .unstack(fill_value=0)
)

In [61]:
# Inspect the ROLE_ROLLUP_1 x ROLE_ROLLUP_2 co-occurrence table built in the previous cell.
temp

ROLE_ROLLUP_2,23779,31010,32137,117877,117883,117891,117894,117903,117911,117917,...,141176,141222,143009,145248,147237,151110,159716,176316,185842,286791
ROLE_ROLLUP_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4292,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5110,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11146,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
91261,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
117876,0,0,0,171,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203209,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
209434,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
216705,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
247952,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [62]:
# Second example: RESOURCE x MGR_ID co-occurrence counts
# (rows are RESOURCE values, columns are MGR_ID values, 0 where a pair never occurs).
temp = (
    train_data
    .groupby(['RESOURCE', 'MGR_ID'])['MGR_ID']
    .count()
    .unstack(fill_value=0)
)

In [63]:
# Inspect the RESOURCE x MGR_ID co-occurrence table built in the previous cell.
temp

MGR_ID,25,27,30,32,33,36,43,46,47,55,...,311251,311338,311355,311433,311438,311597,311651,311682,311683,311696
RESOURCE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
38,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
136,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
138,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
153,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
312136,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
312139,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
312140,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
312152,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [64]:
# Feature columns that get paired up for the SVD encoding below.
train_data.columns

Index(['RESOURCE', 'MGR_ID', 'ROLE_ROLLUP_1', 'ROLE_ROLLUP_2', 'ROLE_DEPTNAME',
       'ROLE_TITLE', 'ROLE_FAMILY_DESC', 'ROLE_FAMILY', 'ROLE_CODE'],
      dtype='object')

In [65]:
# For every ordered pair of feature columns (col1, col2), learn a one-dimensional
# SVD encoding of col1's values from their co-occurrence with col2's values.
for col1, col2 in tqdm(permutations(train_data.columns, 2)):
    # Co-occurrence counts from the training data only:
    # one row per col1 value, one column per col2 value.
    pair_counts = train_data.groupby([col1, col2])[col2].count().unstack(fill_value=0)

    # Rank-1 truncated SVD of the count table; the seed is fixed for reproducibility.
    svd = TruncatedSVD(n_components=1, random_state=42).fit(pair_counts)
    # One scalar per col1 value, indexed by that value so it can serve as a lookup.
    col1_code = pd.Series(svd.transform(pair_counts)[:, 0], index=pair_counts.index)

    # col1 values missing from the lookup map to NaN here.
    feature_name = col1 + '_' + col2
    train_svd[feature_name] = train[col1].map(col1_code)
    test_svd[feature_name] = test[col1].map(col1_code)

72it [00:23,  3.06it/s]


In [66]:
# Dimensions of the SVD feature frames; one column per ordered pair of features.
train_svd.shape,test_svd.shape

((32769, 72), (58921, 72))

In [67]:
# Replace any NaNs in the SVD features with 0.
train_svd = train_svd.fillna(0)
test_svd = test_svd.fillna(0)
# Per-column NaN counts for train, then test, to confirm nothing is left unfilled.
for svd_frame in (train_svd, test_svd):
    print(svd_frame.isna().sum().values)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [68]:
# Preview the first rows of the SVD features.
train_svd.head()

Unnamed: 0,RESOURCE_MGR_ID,RESOURCE_ROLE_ROLLUP_1,RESOURCE_ROLE_ROLLUP_2,RESOURCE_ROLE_DEPTNAME,RESOURCE_ROLE_TITLE,RESOURCE_ROLE_FAMILY_DESC,RESOURCE_ROLE_FAMILY,RESOURCE_ROLE_CODE,MGR_ID_RESOURCE,MGR_ID_ROLE_ROLLUP_1,...,ROLE_FAMILY_ROLE_FAMILY_DESC,ROLE_FAMILY_ROLE_CODE,ROLE_CODE_RESOURCE,ROLE_CODE_MGR_ID,ROLE_CODE_ROLE_ROLLUP_1,ROLE_CODE_ROLE_ROLLUP_2,ROLE_CODE_ROLE_DEPTNAME,ROLE_CODE_ROLE_TITLE,ROLE_CODE_ROLE_FAMILY_DESC,ROLE_CODE_ROLE_FAMILY
0,0.088724,2.995769,1.810303,0.070125,1.59363,2.91956,2.934431,1.59363,1.416749,54.99993,...,6975.313331,6146.289,288.345856,164.386849,2930.072697,1055.973092,1.816763,6.276052e-13,2671.72,3583.0
1,0.559935,25.998514,13.24768,1.084496,4.285689,4.5347,8.583779,4.285689,0.560297,9.999987,...,0.101858,1.215882e-16,5.964018,0.288915,80.895295,25.034824,0.024511,-5.09522e-21,0.0007123211,1.6022760000000001e-18
2,0.000108,0.007828,0.022128,0.509533,0.049782,0.007275,0.0587,0.049782,0.000724,5.493129e-10,...,0.024495,3.378811e-12,10.862651,3.7e-05,0.255356,0.160767,617.679185,-1.968758e-13,0.0005953084,1.010753e-11
3,0.044904,0.99859,0.597128,0.018862,0.669129,0.164492,0.978144,0.669129,2.151085,61.99992,...,6975.313331,6146.289,292.012416,244.185412,4102.019163,1488.847285,2.455416,4649.0,3538.719,4649.0
4,0.05941,2.022416,0.320066,0.804351,0.689633,0.1752,2.03362,0.689633,0.013122,0.0001490441,...,2.2e-05,-7.200891e-18,0.267473,0.000104,0.010027,0.005174,8.969635,-9.548687000000001e-22,1.778875e-10,-8.786878e-20


### Normalizing the data

In [69]:
# Rescale the SVD features with sklearn's Normalizer.
# Normalizer works per sample: every ROW is scaled to unit L2 norm; it does not
# put the COLUMNS on a common scale.
# NOTE(review): the raw SVD columns differ by several orders of magnitude, so a
# few large columns dominate each row's norm. If per-feature scaling was the
# intent, a column-wise scaler would be the right tool — confirm.
columns = train_svd.columns
n = Normalizer()
# fit() learns nothing for Normalizer, but calling it keeps the usual
# fit-on-train / transform-both pattern.
train_svd = pd.DataFrame(n.fit_transform(train_svd[columns]), columns=columns)
test_svd = pd.DataFrame(n.transform(test_svd[columns]), columns=columns)

In [70]:
# Dimensions after normalization; they should be unchanged.
train_svd.shape,test_svd.shape

((32769, 72), (58921, 72))

In [71]:
# Preview the first rows of the normalized training SVD features.
train_svd.head()

Unnamed: 0,RESOURCE_MGR_ID,RESOURCE_ROLE_ROLLUP_1,RESOURCE_ROLE_ROLLUP_2,RESOURCE_ROLE_DEPTNAME,RESOURCE_ROLE_TITLE,RESOURCE_ROLE_FAMILY_DESC,RESOURCE_ROLE_FAMILY,RESOURCE_ROLE_CODE,MGR_ID_RESOURCE,MGR_ID_ROLE_ROLLUP_1,...,ROLE_FAMILY_ROLE_FAMILY_DESC,ROLE_FAMILY_ROLE_CODE,ROLE_CODE_RESOURCE,ROLE_CODE_MGR_ID,ROLE_CODE_ROLE_ROLLUP_1,ROLE_CODE_ROLE_ROLLUP_2,ROLE_CODE_ROLE_DEPTNAME,ROLE_CODE_ROLE_TITLE,ROLE_CODE_ROLE_FAMILY_DESC,ROLE_CODE_ROLE_FAMILY
0,3.338246e-06,0.000113,6.8e-05,2.638468e-06,6e-05,0.00011,0.00011,6e-05,5.330531e-05,0.002069377,...,0.2624468,0.2312547,0.010849,0.00618507,0.110244,0.039731,6.8e-05,2.3613700000000002e-17,0.1005237,0.1348107
1,3.290961e-05,0.001528,0.000779,6.374011e-05,0.000252,0.000267,0.000505,0.000252,3.293088e-05,0.0005877389,...,5.986619e-06,7.146218e-21,0.000351,1.69807e-05,0.004755,0.001471,1e-06,-2.9946630000000003e-25,4.186593e-08,9.417210000000001e-23
2,1.122108e-07,8e-06,2.3e-05,0.0005280569,5.2e-05,8e-06,6.1e-05,5.2e-05,7.499907e-07,5.692832e-13,...,2.538523e-05,3.501648e-15,0.011258,3.821987e-08,0.000265,0.000167,0.640135,-2.040332e-16,6.169509e-07,1.047499e-14
3,1.733916e-06,3.9e-05,2.3e-05,7.283213e-07,2.6e-05,6e-06,3.8e-05,2.6e-05,8.306181e-05,0.00239406,...,0.2693442,0.2373323,0.011276,0.009428956,0.158395,0.05749,9.5e-05,0.1795161,0.1366438,0.1795161
4,0.0004072207,0.013863,0.002194,0.005513384,0.004727,0.001201,0.013939,0.004727,8.994073e-05,1.021616e-06,...,1.516644e-07,-4.935814e-20,0.001833,7.149129e-07,6.9e-05,3.5e-05,0.061482,-6.5450989999999995e-24,1.219321e-12,-6.022921e-22


In [72]:
# Preview the first rows of the normalized test SVD features.
test_svd.head()

Unnamed: 0,RESOURCE_MGR_ID,RESOURCE_ROLE_ROLLUP_1,RESOURCE_ROLE_ROLLUP_2,RESOURCE_ROLE_DEPTNAME,RESOURCE_ROLE_TITLE,RESOURCE_ROLE_FAMILY_DESC,RESOURCE_ROLE_FAMILY,RESOURCE_ROLE_CODE,MGR_ID_RESOURCE,MGR_ID_ROLE_ROLLUP_1,...,ROLE_FAMILY_ROLE_FAMILY_DESC,ROLE_FAMILY_ROLE_CODE,ROLE_CODE_RESOURCE,ROLE_CODE_MGR_ID,ROLE_CODE_ROLE_ROLLUP_1,ROLE_CODE_ROLE_ROLLUP_2,ROLE_CODE_ROLE_DEPTNAME,ROLE_CODE_ROLE_TITLE,ROLE_CODE_ROLE_FAMILY_DESC,ROLE_CODE_ROLE_FAMILY
0,1.748205e-06,1.4e-05,3.3e-05,0.006349,0.000224,9.368598e-06,0.000464,0.000224,1e-06,5.576305999999999e-20,...,2.057914e-05,2.838695e-15,0.009126,3.098385e-08,0.000215,0.000135,0.5189408,-1.654044e-16,5.001461e-07,8.491806e-15
1,4.757212e-07,6.1e-05,1.6e-05,2e-06,4e-05,5.865678e-05,6.1e-05,4e-05,3e-06,0.001024593,...,1.792938e-05,-1.951387e-18,3e-05,2.341034e-07,0.002751,0.000949,4.818718e-05,1.195552e-22,1.502169e-09,-3.841632e-20
2,1.895173e-05,0.000584,0.000352,3e-05,1.4e-05,8.070228e-07,5.5e-05,1.4e-05,0.000193,0.00304247,...,9.018517e-10,7.429734e-24,0.000134,3.14775e-07,0.000996,0.00053,4.060167e-07,3.080197e-26,5.827259e-14,7.736424e-26
3,3.237126e-06,0.00012,3.2e-05,1.3e-05,8e-05,1.984018e-05,0.000118,8e-05,0.000123,0.002713827,...,7.910264e-08,4.375788e-17,0.000589,0.0003272294,0.015867,0.004973,5.197746e-06,-4.4639960000000004e-17,1.437485e-08,2.057506e-18
4,0.0003102218,0.008945,0.004305,0.000712,0.001083,0.001242129,0.002395,0.001083,7.5e-05,0.001109102,...,3.883922e-08,2.82792e-20,0.00055,9.799353e-07,0.013715,0.007055,3.5753e-06,2.960214e-16,-1.035395e-17,1.72956e-21


In [73]:
# Save data into csv files

In [74]:
# Write each encoded feature table to CSV without the row index:
# frequency encoding, response encoding, then SVD encoding.
for feature_frame, csv_path in (
    (train_df_fc, 'data/train_df_fc.csv'),
    (test_df_fc, 'data/test_df_fc.csv'),
    (train_df_rc, 'data/train_df_rc.csv'),
    (test_df_rc, 'data/test_df_rc.csv'),
    (train_svd, 'data/train_svd.csv'),
    (test_svd, 'data/test_svd.csv'),
):
    feature_frame.to_csv(csv_path, index=False)

In [75]:
# Feature selection for the one-hot encoding: first check the shapes of the
# full one-hot matrices and of the target.
train_ohe.shape, test_ohe.shape, y_train_ohe.shape

((32769, 15626), (58921, 15626), (32769,))

In [76]:
# Keep the one-hot columns with the highest chi-squared score against ACTION.
# The selector is fit on the training matrix only and then applied unchanged
# to the test matrix, so both keep the same columns.
N_OHE_FEATURES = 4500  # number of one-hot columns to retain
ktop = SelectKBest(chi2, k=N_OHE_FEATURES).fit(train_ohe, y_train_ohe)
train_ohe = ktop.transform(train_ohe)
test_ohe = ktop.transform(test_ohe)

In [77]:
# Shapes after feature selection; train and test should have the same reduced column count.
train_ohe.shape, test_ohe.shape, y_train_ohe.shape

((32769, 4500), (58921, 4500), (32769,))

In [78]:
# Store the reduced one-hot matrices in SciPy's .npz sparse format.
for npz_path, ohe_matrix in (
    ('data/train_ohe.npz', train_ohe),
    ('data/test_ohe.npz', test_ohe),
):
    sparse.save_npz(npz_path, ohe_matrix)