In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the video game sales CSV, then preview rows 1-6 of the descriptive columns.
vg_df = pd.read_csv('vgsales.csv', encoding='utf-8')
vg_df.iloc[1:7][['Name', 'Platform', 'Year', 'Genre', 'Publisher']]

Unnamed: 0,Name,Platform,Year,Genre,Publisher
1,Super Mario Bros.,NES,1985.0,Platform,Nintendo
2,Mario Kart Wii,Wii,2008.0,Racing,Nintendo
3,Wii Sports Resort,Wii,2009.0,Sports,Nintendo
4,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo
5,Tetris,GB,1989.0,Puzzle,Nintendo
6,New Super Mario Bros.,DS,2006.0,Platform,Nintendo


In [3]:
# Distinct genre names found in the Genre column.
genres = np.unique(vg_df['Genre'].values)
genres

array(['Action', 'Adventure', 'Fighting', 'Misc', 'Platform', 'Puzzle',
       'Racing', 'Role-Playing', 'Shooter', 'Simulation', 'Sports',
       'Strategy'], dtype=object)

In [4]:
from sklearn.preprocessing import LabelEncoder

# Fit a label encoder on the genre column; genre_labels holds one integer code per row.
gle = LabelEncoder()
genre_labels = gle.fit_transform(vg_df['Genre'])

# Integer code -> genre name, where the code is the position of the name in gle.classes_.
genre_mappings = dict(enumerate(gle.classes_))
genre_mappings

{0: 'Action',
 1: 'Adventure',
 2: 'Fighting',
 3: 'Misc',
 4: 'Platform',
 5: 'Puzzle',
 6: 'Racing',
 7: 'Role-Playing',
 8: 'Shooter',
 9: 'Simulation',
 10: 'Sports',
 11: 'Strategy'}

In [5]:
# Store the integer genre codes on the frame and show them beside the original genre names.
vg_df['GenreLabel'] = genre_labels
vg_df.iloc[1:7][['Name', 'Platform', 'Year', 'Genre', 'GenreLabel']]

Unnamed: 0,Name,Platform,Year,Genre,GenreLabel
1,Super Mario Bros.,NES,1985.0,Platform,4
2,Mario Kart Wii,Wii,2008.0,Racing,6
3,Wii Sports Resort,Wii,2009.0,Sports,10
4,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,7
5,Tetris,GB,1989.0,Puzzle,5
6,New Super Mario Bros.,DS,2006.0,Platform,4


In [6]:
# Read the Pokemon CSV, shuffle every row with a fixed seed, and renumber the index from 0.
poke_df = (
    pd.read_csv('Pokemon.csv', encoding='utf-8')
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
)
poke_df.head()

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,6,CharizardMega Charizard Y,Fire,Flying,634,78,104,78,159,115,100,1,False
1,460,Abomasnow,Grass,Ice,494,90,92,75,92,85,60,4,False
2,161,Sentret,Normal,,215,35,46,34,35,45,20,2,False
3,667,Litleo,Fire,Normal,369,62,50,58,73,54,72,6,False
4,224,Octillery,Water,,480,75,105,75,105,75,45,2,False


In [7]:
# Distinct values present in the Generation column.
np.unique(poke_df['Generation'].values)

array([1, 2, 3, 4, 5, 6])

In [8]:
# Lookup from generation number to its text label: {1: 'Gen 1', ..., 6: 'Gen 6'}.
gen_ord_map = {gen: 'Gen {}'.format(gen) for gen in range(1, 7)}

# Add the text label for each row's generation and preview it next to the number.
poke_df['GenerationLabel'] = poke_df['Generation'].map(gen_ord_map)
poke_df.iloc[4:10][['Name', 'Generation', 'GenerationLabel']]

Unnamed: 0,Name,Generation,GenerationLabel
4,Octillery,2,Gen 2
5,Helioptile,6,Gen 6
6,Dialga,4,Gen 4
7,DeoxysDefense Forme,3,Gen 3
8,Rapidash,1,Gen 1
9,Swanna,5,Gen 5


In [10]:
# Preview the two categorical attributes that the following cells encode.
poke_df.iloc[4:10][['Name', 'GenerationLabel', 'Legendary']]


Unnamed: 0,Name,GenerationLabel,Legendary
4,Octillery,Gen 2,False
5,Helioptile,Gen 6,False
6,Dialga,Gen 4,True
7,DeoxysDefense Forme,Gen 3,True
8,Rapidash,Gen 1,False
9,Swanna,Gen 5,False


In [12]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# One label encoder per categorical attribute: generation label and legendary flag.
gen_le = LabelEncoder()
leg_le = LabelEncoder()
gen_labels = gen_le.fit_transform(poke_df['GenerationLabel'])
leg_labels = leg_le.fit_transform(poke_df['Legendary'])

# Store the integer codes on the frame (generation first, then legendary).
poke_df['Gen_Label'] = gen_labels
poke_df['Legend_Label'] = leg_labels

# Narrow frame showing each raw attribute beside its integer code.
poke_df_sub = poke_df[['Name', 'GenerationLabel', 'Gen_Label', 'Legendary', 'Legend_Label']]
poke_df_sub.iloc[4:10]

Unnamed: 0,Name,GenerationLabel,Gen_Label,Legendary,Legend_Label
4,Octillery,Gen 2,1,False,0
5,Helioptile,Gen 6,5,False,0
6,Dialga,Gen 4,3,True,1
7,DeoxysDefense Forme,Gen 3,2,True,1
8,Rapidash,Gen 1,0,False,0
9,Swanna,Gen 5,4,False,0


In [17]:
# One-hot encode the integer generation codes; the output columns are named
# after the classes learned by the generation label encoder.
gen_ohe = OneHotEncoder()
gen_feature_labels = list(gen_le.classes_)
gen_feature_arr = gen_ohe.fit_transform(poke_df[['Gen_Label']]).toarray()
gen_features = pd.DataFrame(gen_feature_arr, columns=gen_feature_labels)

# One-hot encode the integer legendary codes; columns are 'Legendary' followed
# by the string form of each class learned by the legendary label encoder.
leg_ohe = OneHotEncoder()
leg_feature_labels = ['Legendary' + name for name in map(str, leg_le.classes_)]
leg_feature_arr = leg_ohe.fit_transform(poke_df[['Legend_Label']]).toarray()
leg_features = pd.DataFrame(leg_feature_arr, columns=leg_feature_labels)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [18]:
# First ten rows of the one-hot generation features.
gen_features.iloc[:10]

Unnamed: 0,Gen 1,Gen 2,Gen 3,Gen 4,Gen 5,Gen 6
0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,1.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,1.0
6,0.0,0.0,0.0,1.0,0.0,0.0
7,0.0,0.0,1.0,0.0,0.0,0.0
8,1.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,1.0,0.0


In [19]:
# First five rows of the one-hot legendary features.
leg_features.head(5)

Unnamed: 0,LegendaryFalse,LegendaryTrue
0,1.0,0.0
1,1.0,0.0
2,1.0,0.0
3,1.0,0.0
4,1.0,0.0


In [21]:
# Place the one-hot columns side by side with the label-encoded subset.
poke_df_ohe = pd.concat([poke_df_sub, gen_features, leg_features], axis=1)

# Display order: each attribute's raw/encoded columns followed by its one-hot columns.
columns = (['Name', 'GenerationLabel', 'Gen_Label'] + gen_feature_labels
           + ['Legendary', 'Legend_Label'] + leg_feature_labels)
poke_df_ohe[columns].iloc[4:10]

Unnamed: 0,Name,GenerationLabel,Gen_Label,Gen 1,Gen 2,Gen 3,Gen 4,Gen 5,Gen 6,Legendary,Legend_Label,LegendaryFalse,LegendaryTrue
4,Octillery,Gen 2,1,0.0,1.0,0.0,0.0,0.0,0.0,False,0,1.0,0.0
5,Helioptile,Gen 6,5,0.0,0.0,0.0,0.0,0.0,1.0,False,0,1.0,0.0
6,Dialga,Gen 4,3,0.0,0.0,0.0,1.0,0.0,0.0,True,1,0.0,1.0
7,DeoxysDefense Forme,Gen 3,2,0.0,0.0,1.0,0.0,0.0,0.0,True,1,0.0,1.0
8,Rapidash,Gen 1,0,1.0,0.0,0.0,0.0,0.0,0.0,False,0,1.0,0.0
9,Swanna,Gen 5,4,0.0,0.0,0.0,0.0,1.0,0.0,False,0,1.0,0.0


In [22]:
# Two new records to run through the already-fitted encoders.
new_poke_df = pd.DataFrame(
    {
        'Name': ['PikaZoom', 'CharyMyToast'],
        'GenerationLabel': ['Gen 3', 'Gen 4'],
        'Legendary': [True, False],
    },
    # Pin the column order explicitly rather than relying on dict ordering.
    columns=['Name', 'GenerationLabel', 'Legendary'],
)
new_poke_df

Unnamed: 0,Name,GenerationLabel,Legendary
0,PikaZoom,Gen 3,True
1,CharyMyToast,Gen 4,False


In [26]:
# Encode the new rows with the encoders fitted earlier (transform only, no refit).
new_gen_labels = gen_le.transform(new_poke_df['GenerationLabel'])
new_leg_labels = leg_le.transform(new_poke_df['Legendary'])

# Store the codes on the new frame (generation first, then legendary).
new_poke_df['Gen_Label'] = new_gen_labels
new_poke_df['Legend_Label'] = new_leg_labels

new_poke_df[['Name', 'GenerationLabel', 'Gen_Label', 'Legendary', 'Legend_Label']]

Unnamed: 0,Name,GenerationLabel,Gen_Label,Legendary,Legend_Label
0,PikaZoom,Gen 3,2,True,1
1,CharyMyToast,Gen 4,3,False,0


In [31]:
# One-hot encode the new rows with the encoders fitted on poke_df (transform only).
new_gen_feature_arr = gen_ohe.transform(new_poke_df[['Gen_Label']]).toarray()
new_leg_feature_arr = leg_ohe.transform(new_poke_df[['Legend_Label']]).toarray()

# Wrap the arrays in frames using the same column names as the original features.
new_gen_features = pd.DataFrame(new_gen_feature_arr, columns=gen_feature_labels)
new_leg_features = pd.DataFrame(new_leg_feature_arr, columns=leg_feature_labels)

new_poke_ohe = pd.concat([new_poke_df, new_gen_features, new_leg_features], axis=1)

# Display order: each attribute's raw/encoded columns followed by its one-hot columns.
columns = (['Name', 'GenerationLabel', 'Gen_Label'] + gen_feature_labels
           + ['Legendary', 'Legend_Label'] + leg_feature_labels)
new_poke_ohe[columns]

Unnamed: 0,Name,GenerationLabel,Gen_Label,Gen 1,Gen 2,Gen 3,Gen 4,Gen 5,Gen 6,Legendary,Legend_Label,LegendaryFalse,LegendaryTrue
0,PikaZoom,Gen 3,2,0.0,0.0,1.0,0.0,0.0,0.0,True,1,0.0,1.0
1,CharyMyToast,Gen 4,3,0.0,0.0,0.0,1.0,0.0,0.0,False,0,1.0,0.0


In [32]:
# One-hot scheme via pandas: one indicator column per generation label.
gen_onehot_features = pd.get_dummies(poke_df['GenerationLabel'])
# Both frames share poke_df's index, so an index join lines the rows up one to one.
poke_df[['Name', 'GenerationLabel']].join(gen_onehot_features).iloc[4:10]

Unnamed: 0,Name,GenerationLabel,Gen 1,Gen 2,Gen 3,Gen 4,Gen 5,Gen 6
4,Octillery,Gen 2,0,1,0,0,0,0
5,Helioptile,Gen 6,0,0,0,0,0,1
6,Dialga,Gen 4,0,0,0,1,0,0
7,DeoxysDefense Forme,Gen 3,0,0,1,0,0,0
8,Rapidash,Gen 1,1,0,0,0,0,0
9,Swanna,Gen 5,0,0,0,0,1,0


In [33]:
# Dummy coding: drop_first=True omits the indicator column of the first level.
gen_dummy_features = pd.get_dummies(poke_df['GenerationLabel'], drop_first=True)
# Both frames share poke_df's index, so an index join lines the rows up one to one.
poke_df[['Name', 'GenerationLabel']].join(gen_dummy_features).iloc[4:10]

Unnamed: 0,Name,GenerationLabel,Gen 2,Gen 3,Gen 4,Gen 5,Gen 6
4,Octillery,Gen 2,1,0,0,0,0
5,Helioptile,Gen 6,0,0,0,0,1
6,Dialga,Gen 4,0,0,1,0,0
7,DeoxysDefense Forme,Gen 3,0,1,0,0,0
8,Rapidash,Gen 1,0,0,0,0,0
9,Swanna,Gen 5,0,0,0,1,0


In [34]:
# Dummy coding with the last level dropped instead of the first:
# build the full one-hot frame, then keep every column except the final one.
gen_onehot_features = pd.get_dummies(poke_df['GenerationLabel'])
gen_dummy_features = gen_onehot_features.iloc[:, :-1]
# Both frames share poke_df's index, so an index join lines the rows up one to one.
poke_df[['Name', 'GenerationLabel']].join(gen_dummy_features).iloc[4:10]

Unnamed: 0,Name,GenerationLabel,Gen 1,Gen 2,Gen 3,Gen 4,Gen 5
4,Octillery,Gen 2,0,1,0,0,0
5,Helioptile,Gen 6,0,0,0,0,0
6,Dialga,Gen 4,0,0,0,1,0
7,DeoxysDefense Forme,Gen 3,0,0,1,0,0
8,Rapidash,Gen 1,1,0,0,0,0
9,Swanna,Gen 5,0,0,0,0,1


In [35]:
# Effect coding: like dummy coding with the last level dropped, except that rows
# belonging to the dropped level are marked with -1 in every remaining column.
gen_onehot_features = pd.get_dummies(poke_df['GenerationLabel'])

# astype(float) returns a new frame rather than a slice of gen_onehot_features,
# so the assignment below does not trigger SettingWithCopyWarning, and the
# columns can hold -1.0 without an implicit dtype change.
gen_effect_features = gen_onehot_features.iloc[:, :-1].astype(float)

# Rows that are all zeros in the kept columns belong to the dropped level.
dropped_level_rows = (gen_effect_features == 0).all(axis=1)
gen_effect_features.loc[dropped_level_rows, :] = -1.

pd.concat([poke_df[['Name', 'GenerationLabel']], gen_effect_features], axis=1).iloc[4:10]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0,Name,GenerationLabel,Gen 1,Gen 2,Gen 3,Gen 4,Gen 5
4,Octillery,Gen 2,0.0,1.0,0.0,0.0,0.0
5,Helioptile,Gen 6,-1.0,-1.0,-1.0,-1.0,-1.0
6,Dialga,Gen 4,0.0,0.0,0.0,1.0,0.0
7,DeoxysDefense Forme,Gen 3,0.0,0.0,1.0,0.0,0.0
8,Rapidash,Gen 1,1.0,0.0,0.0,0.0,0.0
9,Swanna,Gen 5,0.0,0.0,0.0,0.0,1.0


In [38]:
# Distinct genre names and how many of them there are.
unique_genres = np.unique(vg_df[['Genre']].values)
print("Total game genres:", unique_genres.size)
unique_genres

Total game genres: 12


array(['Action', 'Adventure', 'Fighting', 'Misc', 'Platform', 'Puzzle',
       'Racing', 'Role-Playing', 'Shooter', 'Simulation', 'Sports',
       'Strategy'], dtype=object)

In [None]:
from sklearn.feature_extraction import FeatureHasher

# Hash the genre names into a fixed-width representation of 6 features.
fh = FeatureHasher(n_features=6, input_type='string')

# The original line used `-` instead of `=` and passed no data to fit_transform.
# With input_type='string' each sample is an iterable of string tokens, so every
# genre is wrapped in a one-element list to be hashed as a single token.
# NOTE(review): a bare string per sample is, depending on the scikit-learn
# version, either rejected or treated as a sequence of characters - confirm
# against the installed version.
hashed_features = fh.fit_transform([[genre] for genre in vg_df['Genre']])