# this notebook tries entity embeddings

In [5]:
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from fastai.tabular.all import *

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split

In [6]:
df = pd.read_csv(r'data/adult.csv')
# df = pd.read_csv('adult.csv').sample(frac=0.3)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

df.shape

(48842, 15)

In [7]:
# params
n_estimators = 100
max_samples = 0.5
max_features = 0.5
min_samples_leaf = 10

# fastai nn
bs = 1024 # batch size
fc_1 = 250
fc_2 = 100
n_epochs = 10
lr = 3e-3

# train baseline rf

In [8]:
# rf baseline with dummy variables
# %%time

x = df.drop('income', axis=1)
x = pd.get_dummies(x)

y = df['income']
y = y.map({'<=50K': 0, '>50K': 1})

x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.2, shuffle=True, random_state=1)

rf = RandomForestClassifier(n_estimators=n_estimators,
                            max_samples=max_samples,
                            max_features=max_features,
                            min_samples_leaf=min_samples_leaf).fit(x_train, y_train)
                            
preds = rf.predict(x_valid)

roc_baseline = round(roc_auc_score(y_valid, preds), 4)

print(f'total non-embedded feats: {len(x.columns)}')
print(f'accuracy: {round(accuracy_score(y_valid, preds), 4)}')
print(f'roc auc: {roc_baseline}')

total non-embedded feats: 108
accuracy: 0.8641
roc auc: 0.7814


# fastai embedder

In [9]:
# fastai nn with embeddings
# %%time

# prepare data for embeddings
dep_var = 'income' # our target var

num, cat = cont_cat_split(df, max_card=50, dep_var=dep_var) # numerical vs categorical split based on cardinality

splits = RandomSplitter()(range_of(df)) # split data
procs = [Categorify, Normalize, FillMissing] # needed augs

to = TabularPandas(df, procs, cat, num, y_names=dep_var, splits=splits) # data block ~ torch dataset
dls = to.dataloaders(bs, device=device) # loader

learn = tabular_learner(dls, layers=[fc_1, fc_2], n_out=2) # model to embed our cat values. Can tune (nn) layers.
learn.fit_one_cycle(n_epochs, lr) # train the model for some epochs

# scores in fastai
preds, _ = learn.get_preds()

# acc_fast = round(accuracy_score(targs.flatten(), preds.argmax(1)), 4)
acc_fast = round(accuracy_score(to.valid.y, preds.argmax(1)), 4)
roc_fast = round(roc_auc_score(to.valid.y, preds.argmax(1)), 4)

print(f'total all embeded feats: {len(num)+len(cat)}')
print(f'accuracy: {acc_fast}')
print(f'roc auc: {roc_fast}')

In [10]:
to.train.y

40442    0
13156    0
5917     0
39622    1
1345     1
        ..
28027    0
24288    0
34180    0
19406    1
11399    0
Name: income, Length: 39074, dtype: int8

In [11]:
# to make it more readbale
x_train = to.train.xs
y_train = to.train.y

x_valid = to.valid.xs
y_valid = to.valid.y

In [12]:
# main function to embed features
def embed_features(learner, df):
  """
  learner: fastai Learner used to train the neural net
  df: DataFrame containing input variables. Categorical values are defined by their rank. 
 ::return:: copy of `df` with embeddings replacing each categorical variable
  """
  df = df.copy()
  for i, col in enumerate(learn.dls.cat_names): # names of all cat cols
    
    # get matrix containing each row's embedding vector
    emb = learn.model.embeds[i]
    emb_data = emb(tensor(df[col], dtype=torch.int64))
    emb_names = [f'{col}_{j}' for j in range(emb_data.shape[1])]
    
    # join the embedded category and drop the old feature column
    feat_df = pd.DataFrame(data=emb_data, index=df.index, columns=emb_names)
    df = df.drop(col, axis=1)
    df = df.join(feat_df)

  return df

In [13]:
# this is how df look before and after embeddings
# df.sample(1)
# df_train_emb.sample(1)

# train another rf with all the new embeded cols

In [14]:
# %%time

# embedded df
x_train_emb = embed_features(learn, x_train)
x_valid_emb = embed_features(learn, x_valid)

# a lot of new features. Could take some time to cycle through all of them (~30-40 sec)
rf_emb = RandomForestClassifier(n_estimators=20, # we will only need this model to find important features
                                max_samples=max_samples,
                                max_features=max_features,
                                min_samples_leaf=min_samples_leaf).fit(x_train_emb, y_train)

preds = rf_emb.predict(x_valid_emb)

roc_emb = round(roc_auc_score(y_valid, preds), 4)

print(f'total all embeded feats: {len(x_train_emb.columns)}')
print(f'accuracy: {round(accuracy_score(y_valid, preds), 4)}')
print(f'roc auc: {roc_emb}')

total all embeded feats: 604
accuracy: 0.8317
roc auc: 0.7727


In [15]:
# the result is much worse. There could be too many features

# let's try to filter out most important features and build rf again

In [16]:
# %%time

# using rf model trained with all the embeded features build df with importance values
fi = pd.DataFrame({'cols': x_train_emb.columns, 'imp': rf_emb.feature_importances_}).sort_values(by='imp', ascending=False) 

imp_cols = fi[fi['imp'] != 0].cols.tolist() # filter important cols. We can try different threshold
print(f'num of features not equal to 0: {len(imp_cols)}')

x_train_emb_filt = x_train_emb[imp_cols] # df with important features
x_valid_emb_filt = x_valid_emb[imp_cols]

rf_emb_filt = RandomForestClassifier(n_estimators=n_estimators,
                                     max_samples=max_samples,
                                     max_features=max_features,
                                     min_samples_leaf=min_samples_leaf).fit(x_train_emb_filt, y_train)

# preds
preds = rf_emb_filt.predict(x_valid_emb_filt)

roc_emb_filt = round(roc_auc_score(y_valid, preds), 4)

print(f'total filtered embeded feats: {len(x_train_emb_filt.columns)}')
print(f'accuracy: {round(accuracy_score(y_valid, preds), 4)}')
print(f'roc auc: {roc_emb_filt}')

num of features not equal to 0: 580
total filtered embeded feats: 580
accuracy: 0.8307
roc auc: 0.7726


# results

In [17]:
print(f'roc dummy: {roc_baseline}')
print(f'roc fast: {roc_fast}') 
print(f'roc emb: {roc_emb}')
print(f'roc emb filtered: {roc_emb_filt}')

roc dummy: 0.7814
roc fast: 0.7952
roc emb: 0.7727
roc emb filtered: 0.7726


In [18]:
# feature importance plot
# def plot_fi(fi):
#     return fi.plot('cols', 'imp', 'barh', figsize=(12,7), legend=False)

# plot_fi(fi[:20]);

# need to redo it to calculate score by adding more features every itteration
# instead of increasing importance and do a lot of similar operations

In [19]:
# %%time

# # using rf model trained with all the embeded features build df with importance values
# fi = pd.DataFrame({'cols': x_train_emb.columns, 'imp': rf_emb.feature_importances_}).sort_values(by='imp', ascending=False) 

# importance = []
# roc_score = []

# # when we cycle through importances in for loop later, and set max_imp too high, we will get an error because there wont be any features with importance that high. So we cant use linspace(0,1,20) for example because when we get to importance 0.2, for example, we wont have any features (all of them <0.1) and it will throw an error. That is why I find and set max importance from the df and deduct something from it to guarantee I have at least 1 feature when we reach that max importance
# max_imp = fi.imp.max() - 1e-6

# for i in np.linspace(0, max_imp, 40): # MAKE SURE YOU CHOOSE AT LEAST SOME FEATURES

#     t0 = time.time()

#     imp_cols = fi[fi['imp'] > i].cols.tolist() # filter important cols. We can try different threshold
#     # print(f'for {i:.3f}: {imp_cols[:5]}\n') # you can check the number of features returned here

#     x_train_emb_filt = x_train_emb[imp_cols] # df with important features
#     x_valid_emb_filt = x_valid_emb[imp_cols]

#     # print(f'total features: {x_train_emb_filt.shape[1]}')

#     rf_emb_filt = RandomForestClassifier(n_estimators=n_estimators,
#                                          max_samples=max_samples, 
#                                          max_features=max_features,
#                                          min_samples_leaf=min_samples_leaf).fit(x_train_emb_filt, y_train)

#     # preds
#     preds = rf_emb_filt.predict(x_valid_emb_filt)

#     roc_emb_filt = round(roc_auc_score(y_valid, preds), 4)

#     importance.append(i)
#     roc_score.append(roc_emb_filt)

In [20]:
# plt.xlabel('importance')
# plt.ylabel('roc auc score')
# # plt.scatter(importance, roc_score);
# print(f'best roc score: {np.max(roc_score)} is when imortantce threshold > {importance[np.argmax(roc_score)]:.4f}')
# plt.plot(importance, roc_score, '-o');