In [1]:
import numpy as np
import pandas as pd
# Load the training data and preview its first rows.
df = pd.read_csv("train.csv")
df.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0.0,0.0,0.0,F,N,Red,Trapezoid,Hamster,Russia,...,02e7c8990,3.0,Contributor,Hot,c,U,Pw,6.0,3.0,0.0
1,1,1.0,1.0,0.0,F,Y,Red,Star,Axolotl,,...,f37df64af,3.0,Grandmaster,Warm,e,X,pE,7.0,7.0,0.0
2,2,0.0,1.0,0.0,F,N,Red,,Hamster,Canada,...,,3.0,,Freezing,n,P,eN,5.0,9.0,0.0
3,3,,0.0,0.0,F,N,Red,Circle,Hamster,Finland,...,f9d456e57,1.0,Novice,Lava Hot,a,C,,3.0,3.0,0.0
4,4,0.0,,0.0,T,N,Red,Triangle,Hamster,Costa Rica,...,c5361037c,3.0,Grandmaster,Cold,h,C,OZ,5.0,12.0,0.0


In [2]:
# Display the column labels of the loaded frame.
df.columns

Index(['id', 'bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4', 'nom_0', 'nom_1',
       'nom_2', 'nom_3', 'nom_4', 'nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9',
       'ord_0', 'ord_1', 'ord_2', 'ord_3', 'ord_4', 'ord_5', 'day', 'month',
       'target'],
      dtype='object')

In [3]:
# Print every distinct ord_2 value (missing values show up as nan) with a running index.
for position, category in enumerate(df["ord_2"].unique()):
    print("{}: {}".format(position, category))

0: Hot
1: Warm
2: Freezing
3: Lava Hot
4: Cold
5: Boiling Hot
6: nan


In [4]:
from sklearn.preprocessing import LabelEncoder

# Fit a LabelEncoder on ord_2 and store the integer codes next to the raw column.
# NOTE(review): the codes follow the encoder's own class ordering, not the
# temperature order of the categories, and the missing value gets its own code
# (the decoded listing printed further down shows "Boiling Hot" = 0 ... nan = 6).
label_enc = LabelEncoder()
df["ord_2_le"] = label_enc.fit_transform(df["ord_2"])

In [5]:
# Compare the raw ord_2 values with their label-encoded codes (first five rows).
df.loc[:, ["ord_2", "ord_2_le"]].head()

Unnamed: 0,ord_2,ord_2_le
0,Hot,3
1,Warm,5
2,Freezing,2
3,Lava Hot,4
4,Cold,1


In [6]:
# Decode every code that occurs in ord_2_le back to its original category and
# print the pairs in ascending code order.
encoded_labels = sorted(df["ord_2_le"].unique().tolist())
decoded_labels = label_enc.inverse_transform(encoded_labels)
for encoded_label, decoded_label in zip(encoded_labels, decoded_labels):
    print(encoded_label, "-", decoded_label)

0 - Boiling Hot
1 - Cold
2 - Freezing
3 - Hot
4 - Lava Hot
5 - Warm
6 - nan


In [7]:
# ord_2 categories listed from coldest to hottest; a category's position in this
# list becomes its integer code.
ord_2_levels = ["Freezing", "Cold", "Warm", "Hot", "Boiling Hot", "Lava Hot"]
ord_2_label_map = {level: rank for rank, level in enumerate(ord_2_levels)}

# Values without an entry in the map (the missing ones) come out as NaN.
df["ord_2_ordered_mapping"] = df["ord_2"].map(ord_2_label_map)

df.loc[:, ["ord_2", "ord_2_ordered_mapping"]].head(10)

Unnamed: 0,ord_2,ord_2_ordered_mapping
0,Hot,3.0
1,Warm,2.0
2,Freezing,0.0
3,Lava Hot,5.0
4,Cold,1.0
5,Hot,3.0
6,Cold,1.0
7,Cold,1.0
8,Boiling Hot,4.0
9,Lava Hot,5.0


In [8]:
# Rows where ord_2 is missing: the ordered mapping leaves them missing as well.
df.loc[df["ord_2"].isna(), ["ord_2", "ord_2_ordered_mapping"]].head(10)

Unnamed: 0,ord_2,ord_2_ordered_mapping
69,,
186,,
226,,
229,,
276,,
281,,
290,,
307,,
310,,
315,,


In [9]:
# List the distinct nom_1 values (missing values show up as nan).
print("Categories in nom_1:")
for position, category in enumerate(df["nom_1"].unique()):
    print(position, "-", category)

Categories in nom_1:
0 - Trapezoid
1 - Star
2 - nan
3 - Circle
4 - Triangle
5 - Polygon
6 - Square


In [10]:
# List the distinct nom_2 values (missing values show up as nan).
print("Categories in nom_2:")
for position, category in enumerate(df["nom_2"].unique()):
    print(position, "-", category)

Categories in nom_2:
0 - Hamster
1 - Axolotl
2 - Lion
3 - Dog
4 - Cat
5 - Snake
6 - nan


In [11]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode nom_1. The encoder expects a 2-D input, so the column is
# reshaped to a single-feature matrix; the sparse result is densified.
ohe = OneHotEncoder()
nom_1_matrix = df["nom_1"].to_numpy().reshape(-1, 1)
nom_1_ohe = ohe.fit_transform(nom_1_matrix).toarray()

In [12]:
# Wrap the dense one-hot matrix in a frame whose columns are named after the
# categories the encoder learned (the missing value becomes "nom_1_nan").
nom_1_ohe_columns = ["nom_1_" + str(category) for category in ohe.categories_[0]]
nom_1_ohe_df = pd.DataFrame(nom_1_ohe, columns=nom_1_ohe_columns)

nom_1_ohe_df.head()

Unnamed: 0,nom_1_Circle,nom_1_Polygon,nom_1_Square,nom_1_Star,nom_1_Trapezoid,nom_1_Triangle,nom_1_nan
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [13]:
# Count how many ord_1 entries are missing (True) versus present (False).
df["ord_1"].isna().value_counts()

Unnamed: 0_level_0,count
ord_1,Unnamed: 1_level_1
False,279377
True,8729


In [14]:
# Frequency of each observed ord_1 category, most frequent first.
df["ord_1"].value_counts()

Unnamed: 0_level_0,count
ord_1,Unnamed: 1_level_1
Novice,76706
Expert,67285
Contributor,52761
Grandmaster,45978
Master,36647


In [15]:
# Replace missing ord_1 values with the most frequent category of the column.
ord_1_most_frequent = df["ord_1"].mode().iloc[0]
df["ord_1_without_na"] = df["ord_1"].fillna(ord_1_most_frequent)

In [16]:
# Inspect rows where ord_1 was missing to confirm the fill value.
df.loc[df["ord_1"].isna(), ["ord_1", "ord_1_without_na"]].head()

Unnamed: 0,ord_1,ord_1_without_na
2,,Novice
187,,Novice
195,,Novice
211,,Novice
276,,Novice


In [17]:
# Distinct values of ord_1, including the missing-value marker.
df["ord_1"].unique()

array(['Contributor', 'Grandmaster', nan, 'Novice', 'Expert', 'Master'],
      dtype=object)

In [18]:
# Alternative to mode imputation: treat missing ord_1 values as their own
# explicit "UNKNOWN" category.
df["ord_1_with_unknown_cat"] = df["ord_1"].fillna("UNKNOWN")

In [19]:
# Confirm that "UNKNOWN" has replaced the missing value among the distinct categories.
df["ord_1_with_unknown_cat"].unique()

array(['Contributor', 'Grandmaster', 'UNKNOWN', 'Novice', 'Expert',
       'Master'], dtype=object)

In [None]:
# Fill missing values with the most frequent category within each group of another feature

In [20]:
# Count how many nom_5 entries are missing (True) versus present (False).
df["nom_5"].isna().value_counts()

Unnamed: 0_level_0,count
nom_5,Unnamed: 1_level_1
False,279543
True,8563


In [21]:
# Frequency of each observed nom_5 category, most frequent first.
df["nom_5"].value_counts()

Unnamed: 0_level_0,count
nom_5,Unnamed: 1_level_1
03ea75c83,494
360a16627,488
079b76328,479
251c8292d,477
094b85efd,475
...,...
58aa1b824,1
7335087fd,1
0385d0739,1
b3ad70fcb,1


In [22]:
# Most frequent nom_5 category for each value of day.
df.groupby("day")["nom_5"].apply(lambda day_values: day_values.mode().iloc[0])

Unnamed: 0_level_0,nom_5
day,Unnamed: 1_level_1
1.0,0b436e288
2.0,c7b304344
3.0,360a16627
4.0,03ea75c83
5.0,03ea75c83
6.0,edd08fbe3
7.0,079b76328


In [23]:
# Per-row "most frequent nom_5 for this row's day". A day group with no observed
# nom_5 value has an empty mode, so it yields NaN instead of raising.
nom_5_mode_by_day = df.groupby("day")["nom_5"].transform(
    lambda day_values: day_values.mode().iloc[0] if day_values.notna().any() else np.nan
)

# Only fill the gaps: observed nom_5 values are kept, missing ones take their
# day's mode. (Assigning the transform result directly would overwrite every
# row with the group mode and discard the real values.)
# NOTE(review): rows whose day is itself missing presumably get no group mode
# and stay NaN - confirm whether a global fallback is wanted.
df["nom_5_fill_grouped_na"] = df["nom_5"].fillna(nom_5_mode_by_day)

In [24]:
# Inspect rows where nom_5 was missing to see the value chosen for each.
df.loc[df["nom_5"].isna(), ["nom_5", "nom_5_fill_grouped_na"]].head(10)

Unnamed: 0,nom_5,nom_5_fill_grouped_na
14,,c7b304344
19,,edd08fbe3
31,,03ea75c83
38,,079b76328
58,,edd08fbe3
81,,edd08fbe3
83,,03ea75c83
128,,03ea75c83
130,,edd08fbe3
137,,360a16627


In [None]:
# Adding new features

In [25]:
# Number of occurrences of each observed ord_5 category.
ord_5_freq = df["ord_5"].value_counts()
# The counts were originally stored under the misleading name `ord_1_freq`;
# the alias is kept so any cell that still uses the old name keeps working.
ord_1_freq = ord_5_freq

In [26]:
# Frequency encoding: replace each ord_5 category by how often it occurs.
# The counts are computed from ord_5 right here, so the cell does not rely on a
# lookup variable defined (under an ord_1 name) in another cell.
# Rows with a missing ord_5 have no count and therefore get NaN.
df["ord_5_frequency"] = df["ord_5"].map(df["ord_5"].value_counts())

df[["ord_5", "ord_5_frequency"]].head(30)

Unnamed: 0,ord_5,ord_5_frequency
0,Pw,2183.0
1,pE,2056.0
2,eN,1496.0
3,,
4,OZ,2174.0
5,wa,2118.0
6,rg,2102.0
7,PS,2634.0
8,mX,618.0
9,OZ,2174.0


In [None]:
# Features based on how frequently each category occurs within a feature


In [27]:
# A category counts as rare when it occurs fewer than this many times.
rare_frequency_threshold = 1000

# Rows with a missing frequency compare as False, i.e. they are not flagged as rare.
df["ord_5_is_rare"] = df["ord_5_frequency"].lt(rare_frequency_threshold)

df["ord_5_is_rare"].value_counts()

Unnamed: 0_level_0,count
ord_5_is_rare,Unnamed: 1_level_1
False,258949
True,29157


In [28]:
# ord_5 values of the rows flagged as rare; reused for the count and the listing.
rare_ord_5_values = df.loc[df["ord_5_is_rare"], "ord_5"]

print(
    "{} out of {} categories in ord_5 are rare.".format(rare_ord_5_values.nunique(), df["ord_5"].nunique())
)

rare_ord_5_values.unique()

63 out of 190 categories in ord_5 are rare.


array(['mX', 'xF', 'iv', 'Io', 'sY', 'aA', 'tT', 'Wr', 'Dn', 'gL', 'ze',
       'fO', 'MX', 'BX', 'Yr', 'Sk', 'wU', 'kv', 'pl', 'NT', 'Ro', 'ur',
       'XI', 'sF', 'In', 'uZ', 'lR', 'xB', 'gj', 'vw', 'HO', 'ne', 'Oe',
       'mP', 'wJ', 'cg', 'zf', 'WC', 'Yb', 'kB', 'MU', 'kP', 'FH', 'fF',
       'gt', 'Qm', 'Cn', 'pZ', 'nf', 'ja', 'Tg', 'uP', 'uW', 'FB', 'uI',
       'MF', 'RB', 'SL', 'PG', 'vQ', 'eA', 'gV', 'Zv'], dtype=object)