# load packages

In [1]:
import os
import pandas as pd
from sklearn import preprocessing
# Show where the kernel started so the relative path below can be sanity-checked.
print(f"Current Directory --> {os.getcwd()}")
print("Changing Directory...")
# Competition data directory, expressed relative to the notebook's start directory.
data_dir = os.path.join(os.getcwd(), "../input/cat-in-the-dat-ii")
# Only move when we are not already inside the data directory: once the kernel
# is there the relative path no longer resolves, so an unguarded chdir makes
# re-running this cell raise FileNotFoundError.
if os.path.basename(os.getcwd()) != "cat-in-the-dat-ii":
    os.chdir(data_dir)
# List the (now current) data directory explicitly rather than relying on
# os.chdir() returning None and os.listdir(None) meaning "current directory".
os.listdir()

Current Directory --> /Users/Paritosh_Gupta/Desktop/aamlp/notebooks
Changing Directory...


['test.csv', 'train_folds.csv', 'train.csv', 'sample_submission.csv']

# Data Prep

In [11]:
# read training data
train = pd.read_csv("train.csv")
# read test data
test = pd.read_csv("test.csv")

print(f"Train columns --> {train.columns}")
print(f"Test columns --> {test.columns}")
train_cols = train.columns
test_cols = test.columns
print(f"Columns of Train not present in Test --> {[x for x in train_cols if x not in test_cols]}")

# create a fake target column for test data; -1 acts as a sentinel that lets us
# separate the two sets again after encoding
# (assumes no real training target equals -1 -- TODO confirm)
test.loc[:, "target"] = -1

print("concatenating Train and Test ...")
# concatenate both training and test data so each encoder is fitted on every
# category value that occurs in either set
data = pd.concat([train, test]).reset_index(drop=True)

# make a list of features we are interested in
# id and target is something we should not encode
features = [x for x in train.columns if x not in ["id", "target"]]

# loop over the features list
for feat in features:
    # create a new instance of Label Encoder for each feature
    lbl_enc = preprocessing.LabelEncoder()

    # trick - since its categorical data, we fillna with a string
    # so, no matter its int or float, its converted to string
    temp_col = data[feat].fillna("NONE").astype(str).values

    # actually apply the encoder (it was previously created but never used, so
    # the columns were only stringified); assign the column directly so it
    # takes the encoder's integer dtype instead of being written in place
    data[feat] = lbl_enc.fit_transform(temp_col)

# split the training data and test data using the sentinel target value
train = data[data.target != -1].reset_index(drop=True)
test = data[data.target == -1].reset_index(drop=True)



Train columns --> Index(['id', 'bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4', 'nom_0', 'nom_1',
       'nom_2', 'nom_3', 'nom_4', 'nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9',
       'ord_0', 'ord_1', 'ord_2', 'ord_3', 'ord_4', 'ord_5', 'day', 'month',
       'target'],
      dtype='object')
Test columns --> Index(['id', 'bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4', 'nom_0', 'nom_1',
       'nom_2', 'nom_3', 'nom_4', 'nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9',
       'ord_0', 'ord_1', 'ord_2', 'ord_3', 'ord_4', 'ord_5', 'day', 'month'],
      dtype='object')
Columns of Train not present in Test --> ['target']
concatenating Train and Test ...
(1000000, 25)
Index(['id', 'bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4', 'nom_0', 'nom_1',
       'nom_2', 'nom_3', 'nom_4', 'nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9',
       'ord_0', 'ord_1', 'ord_2', 'ord_3', 'ord_4', 'ord_5', 'day', 'month',
       'target'],
      dtype='object')


# Unknown categories (prevents the model from failing on unseen categories in a live setting)

In [13]:
# load the training CSV into df for the category inspections in the following cells
df = pd.read_csv("train.csv")

In [14]:
# frequency of every ord_2 category, with missing values counted under "NONE"
df["ord_2"].fillna("NONE").value_counts()

Freezing       142726
Warm           124239
Cold            97822
Boiling Hot     84790
Hot             67508
Lava Hot        64840
NONE            18075
Name: ord_2, dtype: int64

In [17]:
# frequency of every ord_4 category, with missing values counted under "NONE"
df["ord_4"].fillna("NONE").value_counts()

N       39978
P       37890
Y       36657
A       36633
R       33045
U       32897
M       32504
X       32347
C       32112
H       31189
Q       30145
T       29723
O       25610
B       25212
E       21871
K       21676
I       19805
NONE    17930
D       17284
F       16721
W        8268
Z        5790
S        4595
G        3404
V        3107
J        1950
L        1657
Name: ord_4, dtype: int64

## creating rare categories

In [20]:
# treat missing ord_4 values as their own "NONE" category
df["ord_4"] = df["ord_4"].fillna("NONE")
print(df["ord_4"].value_counts())

print("##########")
# for every row, look up how often that row's ord_4 category occurs
category_counts = df["ord_4"].value_counts()
row_counts = df["ord_4"].map(category_counts)
# fold every category seen fewer than 2000 times into a single "RARE" bucket
df.loc[row_counts < 2000, "ord_4"] = "RARE"
print(df["ord_4"].value_counts())


N       39978
P       37890
Y       36657
A       36633
R       33045
U       32897
M       32504
X       32347
C       32112
H       31189
Q       30145
T       29723
O       25610
B       25212
E       21871
K       21676
I       19805
NONE    17930
D       17284
F       16721
W        8268
Z        5790
S        4595
G        3404
V        3107
J        1950
L        1657
Name: ord_4, dtype: int64
##########
N       39978
P       37890
Y       36657
A       36633
R       33045
U       32897
M       32504
X       32347
C       32112
H       31189
Q       30145
T       29723
O       25610
B       25212
E       21871
K       21676
I       19805
NONE    17930
D       17284
F       16721
W        8268
Z        5790
S        4595
RARE     3607
G        3404
V        3107
Name: ord_4, dtype: int64


# Check new folds

- Run the script **'../src/cat-in-the-dat-ii/create_folds.py'** to get the csv file

In [8]:
import pandas as pd
# show the working directory that the relative CSV path below is resolved against
print(os.getcwd())
# load the training data that carries the kfold assignment column
df = pd.read_csv("train_folds.csv")
# number of rows assigned to each fold
df.kfold.value_counts()

# training data has 600000 samples, split into five folds of 120000 each

/Users/Paritosh_Gupta/Desktop/aamlp/input/cat-in-the-dat-ii


4    120000
3    120000
2    120000
1    120000
0    120000
Name: kfold, dtype: int64

In [9]:
# target distribution inside each of the five folds (0 through 4)
for fold in range(5):
    fold_rows = df[df.kfold == fold]
    print(fold_rows.target.value_counts())

0    97536
1    22464
Name: target, dtype: int64
0    97536
1    22464
Name: target, dtype: int64
0    97535
1    22465
Name: target, dtype: int64
0    97535
1    22465
Name: target, dtype: int64
0    97535
1    22465
Name: target, dtype: int64
