# Feature Engineering on Categorical Data

In [1]:
# # Import necessary dependencies and settings

import pandas as pd
import numpy as np

# Transforming Nominal Features

In [2]:
# Let’s look at a new dataset pertaining to video game sales. 
# This dataset is also available on Kaggle 
# (https://www.kaggle.com/gregorut/videogamesales).

# # Transforming Nominal Features
vg_df = pd.read_csv('datasets_n_images/datasets_module_4/vgsales.csv')
print(vg_df[['Name', 'Platform', 'Year', 'Genre', 'Publisher']].iloc[1:7])

                       Name Platform    Year         Genre Publisher
1         Super Mario Bros.      NES  1985.0      Platform  Nintendo
2            Mario Kart Wii      Wii  2008.0        Racing  Nintendo
3         Wii Sports Resort      Wii  2009.0        Sports  Nintendo
4  Pokemon Red/Pokemon Blue       GB  1996.0  Role-Playing  Nintendo
5                    Tetris       GB  1989.0        Puzzle  Nintendo
6     New Super Mario Bros.       DS  2006.0      Platform  Nintendo


In [3]:
vg_df.shape

(16598, 11)

In [4]:
from sklearn.preprocessing import LabelEncoder

gle = LabelEncoder()
genre_labels = gle.fit_transform(vg_df['Genre'])
genre_mappings = {index: label for index, label in enumerate(gle.classes_)}
print(genre_mappings)

# genre_mappings = { index , value }
# here index is the labels index i.e 0,1, ..
# label in enumerate would fetch all values from gle
#gle.classes_ keep a unique copy of all labels with itself and enumerate is used to print it

{0: 'Action', 1: 'Adventure', 2: 'Fighting', 3: 'Misc', 4: 'Platform', 5: 'Puzzle', 6: 'Racing', 7: 'Role-Playing', 8: 'Shooter', 9: 'Simulation', 10: 'Sports', 11: 'Strategy'}


In [5]:
# From the output, we can see that a mapping scheme has been generated 
# where each genre value is mapped to a number with the help of the 
# LabelEncoder object gle. The transformed labels are stored in the
# genre_labels value. Let’s write it back to the original dataframe 

vg_df['GenreLabel'] = genre_labels
print(vg_df[['Name', 'Platform', 'Year', 'Genre', 'GenreLabel']].iloc[1:7])

                       Name Platform    Year         Genre  GenreLabel
1         Super Mario Bros.      NES  1985.0      Platform           4
2            Mario Kart Wii      Wii  2008.0        Racing           6
3         Wii Sports Resort      Wii  2009.0        Sports          10
4  Pokemon Red/Pokemon Blue       GB  1996.0  Role-Playing           7
5                    Tetris       GB  1989.0        Puzzle           5
6     New Super Mario Bros.       DS  2006.0      Platform           4


# Transforming Ordinal Features

In [7]:
# Ordinal features are similar to nominal features except that order matters
poke_df = pd.read_csv('datasets_n_images/datasets_module_4/Pokemon.csv')
poke_df = poke_df.sample(random_state=1, frac=1)#frac=1 means take sample 
# from 100 percent of the data
#the above step is redundant n optional. its only used for shuffling the data

print(np.unique(poke_df['Generation']))

['Gen 1' 'Gen 2' 'Gen 3' 'Gen 4' 'Gen 5' 'Gen 6']


In [8]:
gen_ord_map = {'Gen 1': 1, 'Gen 2': 2, 'Gen 3': 3, 
               'Gen 4': 4, 'Gen 5': 5, 'Gen 6': 6}

poke_df['GenerationLabel'] = poke_df['Generation'].map(gen_ord_map)
print(poke_df[['Name', 'Generation', 'GenerationLabel']].iloc[4:10])

                    Name Generation  GenerationLabel
242            Octillery      Gen 2                2
764           Helioptile      Gen 6                6
540               Dialga      Gen 4                4
430  DeoxysDefense Forme      Gen 3                3
84              Rapidash      Gen 1                1
642               Swanna      Gen 5                5


# Encoding Categorical Features

# One Hot Encoding Scheme

In [9]:
# Considering we have numeric representation of any categorical feature 
# with m labels, the one hot encoding scheme, encodes or transforms 
# the feature into m binary features, which can only contain a value of 1 
# or 0. Each observation in the categorical feature is thus converted 
# into a vector of size m with only one of the values as 1 
# (indicating it as active). 
# Let’s take our Pokémon dataset and perform some one hot encoding
# transformations on some of its categorical features.

print(poke_df[['Name', 'Generation', 'Legendary']].iloc[4:10])
print("-------------------------------------------------------")
gen_onehot_features = pd.get_dummies(poke_df['Generation'])
print(pd.concat([poke_df[['Name', 'Generation']], gen_onehot_features], axis=1).iloc[4:10])

                    Name Generation  Legendary
242            Octillery      Gen 2      False
764           Helioptile      Gen 6      False
540               Dialga      Gen 4       True
430  DeoxysDefense Forme      Gen 3       True
84              Rapidash      Gen 1      False
642               Swanna      Gen 5      False
-------------------------------------------------------
                    Name Generation  Gen 1  Gen 2  Gen 3  Gen 4  Gen 5  Gen 6
242            Octillery      Gen 2      0      1      0      0      0      0
764           Helioptile      Gen 6      0      0      0      0      0      1
540               Dialga      Gen 4      0      0      0      1      0      0
430  DeoxysDefense Forme      Gen 3      0      0      1      0      0      0
84              Rapidash      Gen 1      1      0      0      0      0      0
642               Swanna      Gen 5      0      0      0      0      1      0


# Feature Hashing Scheme

In [10]:
unique_genres = np.unique(vg_df[['Genre']])
print("Total game genres:", len(unique_genres))
print(unique_genres)

Total game genres: 12
['Action' 'Adventure' 'Fighting' 'Misc' 'Platform' 'Puzzle' 'Racing'
 'Role-Playing' 'Shooter' 'Simulation' 'Sports' 'Strategy']


We can clearly see from the output that there are 12 distinct genres and if we used a one hot encoding scheme on the Genre feature, we would end up having 12 binary features. Instead, we will now use a feature hashing scheme by leveraging scikit-learn's FeatureHasher class, which uses a signed 32-bit version of the Murmurhash3 hash function. The following code shows us how to use the feature hashing scheme where we will pre-set the feature vector size to be 6 (6 features instead of 12).

> https://www.quora.com/Can-you-explain-feature-hashing-in-an-easily-understandable-way

In [11]:
from sklearn.feature_extraction import FeatureHasher

fh = FeatureHasher(n_features=6, input_type='string')
hashed_features = fh.fit_transform(vg_df['Genre'])
hashed_features = hashed_features.toarray()#above it was series,
# we r converting it to an array
print(pd.concat([vg_df[['Name', 'Genre']], 
                 pd.DataFrame(hashed_features)], axis=1).iloc[1:7])

                       Name         Genre    0    1    2    3    4    5
1         Super Mario Bros.      Platform  0.0  2.0  2.0 -1.0  1.0  0.0
2            Mario Kart Wii        Racing -1.0  0.0  0.0  0.0  0.0 -1.0
3         Wii Sports Resort        Sports -2.0  2.0  0.0 -2.0  0.0  0.0
4  Pokemon Red/Pokemon Blue  Role-Playing -1.0  1.0  2.0  0.0  1.0 -1.0
5                    Tetris        Puzzle  0.0  1.0  1.0 -2.0  1.0 -1.0
6     New Super Mario Bros.      Platform  0.0  2.0  2.0 -1.0  1.0  0.0


Why does scikit learn's HashingVectorizer give negative values?
https://stats.stackexchange.com/questions/237857/why-does-scikit-learns-hashingvectorizer-give-negative-values

https://github.com/scikit-learn/scikit-learn/issues/7513

In [14]:
from sklearn.feature_extraction import FeatureHasher

fh = FeatureHasher(n_features=4, input_type='string')
hashed_features = fh.fit_transform(vg_df['Genre'])
hashed_features = hashed_features.toarray()#above it was series, we r converting it to an array
print(pd.concat([vg_df[['Name', 'Genre']], 
                 pd.DataFrame(hashed_features)], axis=1).iloc[1:7])

                       Name         Genre    0    1    2    3
1         Super Mario Bros.      Platform  2.0  0.0  1.0  1.0
2            Mario Kart Wii        Racing -1.0 -1.0  0.0  0.0
3         Wii Sports Resort        Sports -1.0  1.0 -1.0 -1.0
4  Pokemon Red/Pokemon Blue  Role-Playing  1.0 -2.0  1.0  2.0
5                    Tetris        Puzzle  2.0 -3.0  0.0  1.0
6     New Super Mario Bros.      Platform  2.0  0.0  1.0  1.0
