# Approaching Categorical Variables

This notebook records what I learnt from the AAAMLP book.

In [35]:
import pandas as pd
from sklearn import preprocessing

pd.set_option('display.max_columns', 100)

# read the data
df = pd.read_csv("../input/aaamlp/cat_train.csv")
df

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0.0,0.0,0.0,F,N,Red,Trapezoid,Hamster,Russia,Bassoon,de4c57ee2,a64bc7ddf,598080a91,0256c7a4b,02e7c8990,3.0,Contributor,Hot,c,U,Pw,6.0,3.0,0
1,1,1.0,1.0,0.0,F,Y,Red,Star,Axolotl,,Theremin,2bb3c3e5c,3a3a936e8,1dddb8473,52ead350c,f37df64af,3.0,Grandmaster,Warm,e,X,pE,7.0,7.0,0
2,2,0.0,1.0,0.0,F,N,Red,,Hamster,Canada,Bassoon,b574c9841,708248125,5ddc9a726,745b909d1,,3.0,,Freezing,n,P,eN,5.0,9.0,0
3,3,,0.0,0.0,F,N,Red,Circle,Hamster,Finland,Theremin,673bdf1f6,23edb8da3,3a33ef960,bdaa56dd1,f9d456e57,1.0,Novice,Lava Hot,a,C,,3.0,3.0,0
4,4,0.0,,0.0,T,N,Red,Triangle,Hamster,Costa Rica,,777d1ac2c,3a7975e46,bc9cc2a94,,c5361037c,3.0,Grandmaster,Cold,h,C,OZ,5.0,12.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599995,599995,0.0,1.0,0.0,T,N,Red,Polygon,Axolotl,India,Theremin,014770cf0,da5014b01,a7059911d,158183c63,015c63324,3.0,Novice,Freezing,a,R,GZ,5.0,,0
599996,599996,1.0,0.0,0.0,T,Y,Blue,Polygon,Dog,Costa Rica,Oboe,,2023ed4ed,83bdea3a5,e9fde8fa8,a02ae6a63,2.0,Novice,Boiling Hot,n,N,sf,,3.0,0
599997,599997,0.0,0.0,0.0,F,Y,Red,Circle,Axolotl,Russia,Theremin,c7dc5d460,5d7d341ac,114b1dbf3,cccbca824,40f9610c1,2.0,Contributor,Freezing,n,H,MV,7.0,5.0,0
599998,599998,1.0,1.0,0.0,F,Y,,Polygon,Axolotl,,Piano,4d7780407,209e1054e,fba315672,4164322bd,c1a8374a0,1.0,Master,Warm,m,X,Ey,1.0,5.0,0


# 1(a) Convert categories to numbers using dictionary map

In [36]:
df.ord_2.value_counts(dropna=False)

Freezing       142726
Warm           124239
Cold            97822
Boiling Hot     84790
Hot             67508
Lava Hot        64840
NaN             18075
Name: ord_2, dtype: int64

In [37]:
# convert categories to numbers using dictionary map.
mapping = {
    "Freezing": 0,
    "Warm": 1,
    "Cold": 2,
    "Boiling Hot": 3,
    "Hot": 4,
    "Lava Hot": 5
}

df.loc[:, "ord_2"] = df.ord_2.map(mapping)

In [38]:
df.ord_2.value_counts(dropna=False)

0.0    142726
1.0    124239
2.0     97822
3.0     84790
4.0     67508
5.0     64840
NaN     18075
Name: ord_2, dtype: int64

# 1(b) Convert categories to numbers using LabelEncoder

We can use this directly in many tree-based models:
- Decision trees
- Random forest
- Extra Trees
- Or any kind of boosted trees model (XGBoost, GBM, LightGBM)

**This type of encoding cannot be used in linear models, support vector machines or neural networks as they expect data to be normalized (or standardized).**

In [39]:
df = pd.read_csv("../input/aaamlp/cat_train.csv")
df.ord_2.value_counts(dropna=False)

Freezing       142726
Warm           124239
Cold            97822
Boiling Hot     84790
Hot             67508
Lava Hot        64840
NaN             18075
Name: ord_2, dtype: int64

In [40]:
df.loc[:, "ord_2"] = df.ord_2.fillna("NONE") # Fill NaN values in ord_2 column.
lbl_enc = preprocessing.LabelEncoder()       # LabelEncoder does not handle NaN values.

# fit label encoder and transform values on ord_2 column
# P.S: do not use this directly. fit first, then transform
df.loc[:, "ord_2"] = lbl_enc.fit_transform(df.ord_2.values)

In [41]:
df.ord_2.value_counts()

2    142726
6    124239
1     97822
0     84790
3     67508
4     64840
5     18075
Name: ord_2, dtype: int64

# 2. One-Hot Encoding

In [42]:
df = pd.read_csv("../input/aaamlp/cat_train.csv")
df.loc[:, "ord_2"] = df.ord_2.fillna("NONE") # Fill NaN values in ord_2 column.
ohe = preprocessing.OneHotEncoder(handle_unknown='ignore', sparse=False) # Dense array
pd.DataFrame(ohe.fit_transform(df["ord_2"].values.reshape(-1, 1)))

Unnamed: 0,0,1,2,3,4,5,6
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...
599995,0.0,0.0,1.0,0.0,0.0,0.0,0.0
599996,1.0,0.0,0.0,0.0,0.0,0.0,0.0
599997,0.0,0.0,1.0,0.0,0.0,0.0,0.0
599998,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [43]:
df = pd.read_csv("../input/aaamlp/cat_train.csv")
df.loc[:, "ord_2"] = df.ord_2.fillna("NONE") # Fill NaN values in ord_2 column.
ohe = preprocessing.OneHotEncoder(handle_unknown='ignore', sparse=True) # Sparse array
pd.DataFrame(ohe.fit_transform(df["ord_2"].values.reshape(-1, 1)))

Unnamed: 0,0
0,"(0, 3)\t1.0"
1,"(0, 6)\t1.0"
2,"(0, 2)\t1.0"
3,"(0, 4)\t1.0"
4,"(0, 1)\t1.0"
...,...
599995,"(0, 2)\t1.0"
599996,"(0, 0)\t1.0"
599997,"(0, 2)\t1.0"
599998,"(0, 6)\t1.0"


# 3. Converting categorical variables to numerical variables

In [44]:
df.groupby(["ord_2"])["id"].count()

ord_2
Boiling Hot     84790
Cold            97822
Freezing       142726
Hot             67508
Lava Hot        64840
NONE            18075
Warm           124239
Name: id, dtype: int64

In [45]:
df.groupby(["ord_2"])["id"].transform("count")

0          67508
1         124239
2         142726
3          64840
4          97822
           ...  
599995    142726
599996     84790
599997    142726
599998    124239
599999     84790
Name: id, Length: 600000, dtype: int64

In [46]:
df.groupby(["ord_1", "ord_2"])["id"].count().reset_index(name="count")

Unnamed: 0,ord_1,ord_2,count
0,Contributor,Boiling Hot,15634
1,Contributor,Cold,17734
2,Contributor,Freezing,26082
3,Contributor,Hot,12428
4,Contributor,Lava Hot,11919
5,Contributor,NONE,3250
6,Contributor,Warm,22774
7,Expert,Boiling Hot,19477
8,Expert,Cold,22956
9,Expert,Freezing,33249


# 4. Create new features from these categorical variables

In [47]:
df["new_feature"] = (df.ord_1.astype(str) + "_" + df.ord_2.astype(str))
df["new_feature"] 

0                 Contributor_Hot
1                Grandmaster_Warm
2                    nan_Freezing
3                 Novice_Lava Hot
4                Grandmaster_Cold
                   ...           
599995            Novice_Freezing
599996         Novice_Boiling Hot
599997       Contributor_Freezing
599998                Master_Warm
599999    Contributor_Boiling Hot
Name: new_feature, Length: 600000, dtype: object

# 5. If you have a fixed test set, you can add your test data to training to know about the categories in a given feature.

If you design your cross-validation in such a way that it replicates the prediction process when you run your model on test data, then it’s never going to overfit.

In [48]:
import pandas as pd
from sklearn import preprocessing

# Read training and test data.
train = pd.read_csv("../input/aaamlp/cat_train.csv")
test = pd.read_csv("../input/aaamlp/cat_test.csv")

# Create a fake target column for test data since this column doesn't exist.
test.loc[:, "target"] = -1

# Concatenate both training and test data.
data = pd.concat([train, test]).reset_index(drop=True)

# Make a list of features we are interested in (id and target is something we should not encode).
features = [x for x in train.columns if x not in ["id", "target"]]

# Loop over the features list.
for feat in features:
    # Create a new instance of LabelEncoder for each feature
    lbl_enc = preprocessing.LabelEncoder()    
    """
    Note the trick here. Since its categorical data, we fillna with a string
    and convert all the data to string type.
    So, no matter its int or float, its converted to string.
    int/float but categorical!!!
    """  
    temp_col = data[feat].fillna("NONE").astype(str).values
    # we can use fit_transform here as we do not have any extra test data that we need to
    # transform on separately.
    data.loc[:, feat] = lbl_enc.fit_transform(temp_col)
    
# split the training and test data again.
train = data[data.target != -1].reset_index(drop=True)
test = data[data.target == -1].reset_index(drop=True)

# 6. Rare categories

In [53]:
df["ord_4"] = df.ord_4.fillna("NONE")
df["ord_4"].value_counts()

N       39978
P       37890
Y       36657
A       36633
R       33045
U       32897
M       32504
X       32347
C       32112
H       31189
Q       30145
T       29723
O       25610
B       25212
E       21871
K       21676
I       19805
NONE    17930
D       17284
F       16721
W        8268
Z        5790
S        4595
G        3404
V        3107
J        1950
L        1657
Name: ord_4, dtype: int64

In [55]:
df.loc[
    df["ord_4"].value_counts()[df["ord_4"]].values < 2000, 
    "ord_4"
] = "RARE"

df.ord_4.value_counts()

N       39978
P       37890
Y       36657
A       36633
R       33045
U       32897
M       32504
X       32347
C       32112
H       31189
Q       30145
T       29723
O       25610
B       25212
E       21871
K       21676
I       19805
NONE    17930
D       17284
F       16721
W        8268
Z        5790
S        4595
RARE     3607
G        3404
V        3107
Name: ord_4, dtype: int64

# Logistic regression with OHE

In [60]:
import pandas as pd
from sklearn import linear_model
from sklearn import metrics
from sklearn import preprocessing

def run(fold):
    # load the full training data with folds.
    df = pd.read_csv("../input/aaamlp/cat_train_folds.csv")

    # all columns are features except id, target and kfold columns.
    features = [f for f in df.columns if f not in ("id", "target", "kfold")]
    
    # fill all NaN values with NONE.    
    for col in features:
        df.loc[:, col] = df[col].astype(str).fillna("NONE")
        
    # get training & validation data using folds.
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    
    # initialize OneHotEncoder from scikit-learn
    ohe = preprocessing.OneHotEncoder()
    
    # fit ohe on training + validation features
    full_data = pd.concat([df_train[features], df_valid[features]], axis=0)
    ohe.fit(full_data[features])
    
    # transform training data
    x_train = ohe.transform(df_train[features])
    
    # transform validation data
    x_valid = ohe.transform(df_valid[features])
    
    # initialize Logistic Regression model
    model = linear_model.LogisticRegression()

    # fit model on training data (ohe)
    model.fit(x_train, df_train.target.values)
    
    # predict on validation data.
    # we need the probability values as we are calculating AUC.
    # we will use the probability of 1s.
    valid_preds = model.predict_proba(x_valid)[:, 1]
    
    # get roc auc score
    auc = metrics.roc_auc_score(df_valid.target.values, valid_preds)    
    print(auc)

In [None]:
if __name__ == "__main__":
    run(0)

# Random Forest with Label Encoding

In [62]:
import pandas as pd
from sklearn import ensemble
from sklearn import metrics
from sklearn import preprocessing

def run(fold):
    # load the full training data with folds
    df = pd.read_csv("../input/aaamlp/cat_train_folds.csv")
    
    # all columns are features except id, target and kfold columns
    features = [f for f in df.columns if f not in ("id", "target", "kfold")]
    
    # fill all NaN values with NONE
    for col in features:
        df.loc[:, col] = df[col].astype(str).fillna("NONE")
        
    # now its time to label encode the features
    for col in features:        
        lbl = preprocessing.LabelEncoder()      # initialize LabelEncoder for each feature column.        
        lbl.fit(df[col])                        # fit label encoder on all data.        
        df.loc[:, col] = lbl.transform(df[col]) # transform all the data
        
    # get training & validation data using folds
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    
    # get training & validation data
    x_train = df_train[features].values
    x_valid = df_valid[features].values
    
    # initialize random forest model
    model = ensemble.RandomForestClassifier(n_jobs = -1)
    
    # fit model on training data (ohe)
    model.fit(x_train, df_train.target.values)
    
    # predict on validation data
    # we need the probability values as we are calculating AUC
    # we will use the probability of 1s
    valid_preds = model.predict_proba(x_valid)[:, 1]
    
    # get roc auc score
    auc = metrics.roc_auc_score(df_valid.target.values, valid_preds)
    print(f"Fold = {fold}, AUC = {auc}")

In [63]:
if __name__ == "__main__":
    for fold_ in range(5):
        run(fold_)

Fold = 0, AUC = 0.7168783658859279
Fold = 1, AUC = 0.715838531629464
Fold = 2, AUC = 0.7164466090008996
Fold = 3, AUC = 0.715129421431247
Fold = 4, AUC = 0.7166935259054454


# Random Forest with Decomposition

We one-hot encode the full data and then fit TruncatedSVD from scikit-learn on sparse matrix with training + validation data. In this way, we reduce the high dimensional sparse matrix to 120 features and then fit random forest classifier.

In [66]:
import pandas as pd
from scipy import sparse
from sklearn import decomposition
from sklearn import ensemble
from sklearn import metrics
from sklearn import preprocessing

def run(fold):
    # load the full training data with folds
    df = pd.read_csv("../input/aaamlp/cat_train_folds.csv")
    
    # all columns are features except id, target and kfold columns
    features = [f for f in df.columns if f not in ("id", "target", "kfold")]
    
    # fill all NaN values with NONE    
    for col in features:
        df.loc[:, col] = df[col].astype(str).fillna("NONE")
        
    # get training & validation data using folds
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    
    # initialize OneHotEncoder from scikit-learn
    ohe = preprocessing.OneHotEncoder()
    
    # fit ohe on training + validation features
    full_data = pd.concat([df_train[features], df_valid[features]], axis=0)
    ohe.fit(full_data[features])
    
    # transform training data
    x_train = ohe.transform(df_train[features])
    
    # transform validation data
    x_valid = ohe.transform(df_valid[features])
    
    # initialize Truncated SVD. We are reducing the data to 120 components.
    svd = decomposition.TruncatedSVD(n_components=120)
    
    # fit svd on full sparse training data
    full_sparse = sparse.vstack((x_train, x_valid))
    svd.fit(full_sparse)
    
    # transform sparse training data
    x_train = svd.transform(x_train)
    # transform sparse validation data
    x_valid = svd.transform(x_valid)
    
    # initialize random forest model
    model = ensemble.RandomForestClassifier(n_jobs=-1)
    
    # fit model on training data (ohe)
    model.fit(x_train, df_train.target.values)
    
    # predict on validation data
    # we need the probability values as we are calculating AUC
    # we will use the probability of 1s
    valid_preds = model.predict_proba(x_valid)[:, 1]
    # get roc auc score
    auc = metrics.roc_auc_score(df_valid.target.values, valid_preds)
    # print auc
    print(f"Fold = {fold}, AUC = {auc}")

In [67]:
if __name__ == "__main__":
    for fold_ in range(5):
        run(fold_)

Fold = 0, AUC = 0.7068733733177661
Fold = 1, AUC = 0.7062245490824105
Fold = 2, AUC = 0.709164591352216
Fold = 3, AUC = 0.7050484363896786
Fold = 4, AUC = 0.7063249849954278


# XGBoost with Label Encoding

In [68]:
import pandas as pd
import xgboost as xgb
from sklearn import metrics
from sklearn import preprocessing

def run(fold):
    # load the full training data with folds
    df = pd.read_csv("../input/aaamlp/cat_train_folds.csv")
    
    # all columns are features except id, target and kfold columns
    features = [f for f in df.columns if f not in ("id", "target", "kfold")]
    
    # fill all NaN values with NONE    
    for col in features:
        df.loc[:, col] = df[col].astype(str).fillna("NONE")
        
    # now it’s time to label encode the features
    for col in features:        
        lbl = preprocessing.LabelEncoder()      # initialize LabelEncoder for each feature column.        
        lbl.fit(df[col])                        # fit label encoder on all data.        
        df.loc[:, col] = lbl.transform(df[col]) # transform all the data.
    
    # get training data using folds
    df_train = df[df.kfold != fold].reset_index(drop=True)
    # get validation data using folds
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    
    # get training data
    x_train = df_train[features].values
    # get validation data
    x_valid = df_valid[features].values
    
    # initialize xgboost model
    model = xgb.XGBClassifier(n_jobs=-1, max_depth=7, n_estimators=200)
    
    # fit model on training data (ohe)
    model.fit(x_train, df_train.target.values)
    
    # predict on validation data
    # we need the probability values as we are calculating AUC
    # we will use the probability of 1s
    valid_preds = model.predict_proba(x_valid)[:, 1]
    # get roc auc score
    auc = metrics.roc_auc_score(df_valid.target.values, valid_preds)    
    print(f"Fold = {fold}, AUC = {auc}")


In [69]:
if __name__ == "__main__":
    for fold_ in range(5):
        run(fold_)



Fold = 0, AUC = 0.7619427726331363




Fold = 1, AUC = 0.7594102460900842




Fold = 2, AUC = 0.7625341621789485




Fold = 3, AUC = 0.76130659574446




Fold = 4, AUC = 0.7630714164013852
