# Feature Engineering

### Importing the required libraries

In [1]:
import numpy as np
import pandas as pd


### Reading the Dataset

In [2]:
data = pd.read_csv('/Users/manis/Downloads/dataset/pokemon.csv')
data.head(10)

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False
5,5,Charmeleon,Fire,,405,58,64,58,80,65,80,1,False
6,6,Charizard,Fire,Flying,534,78,84,78,109,85,100,1,False
7,6,CharizardMega Charizard X,Fire,Dragon,634,78,130,111,130,85,100,1,False
8,6,CharizardMega Charizard Y,Fire,Flying,634,78,104,78,159,115,100,1,False
9,7,Squirtle,Water,,314,44,48,65,50,64,43,1,False


### As we have seen during data analysis 'Type2' has about 48% of missing values so lets deal with it

In [3]:
data['Type 2'].fillna('Missing', inplace=True)

### We have filled the NaN values with a new value 'Missing'

In [4]:
data.head()

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,Missing,309,39,52,43,60,50,65,1,False


### Lets drop the 'Name' feature

In [5]:
data.drop('Name', axis=1, inplace=True)

In [6]:
data.head()

Unnamed: 0,#,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Fire,Missing,309,39,52,43,60,50,65,1,False


### Lets do One Hot Encoding of the categorical features

In [7]:
pd.pandas.set_option('display.max_columns', None)
data = pd.get_dummies(data, columns=['Type 1', 'Type 2'], drop_first=True)

### Lets separate the independent and dependent features

In [8]:
y_data = data['Legendary']
x_data = data.drop('Legendary', axis=1)

In [9]:
x_data.head()

Unnamed: 0,#,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Type 1_Dark,Type 1_Dragon,Type 1_Electric,Type 1_Fairy,Type 1_Fighting,Type 1_Fire,Type 1_Flying,Type 1_Ghost,Type 1_Grass,Type 1_Ground,Type 1_Ice,Type 1_Normal,Type 1_Poison,Type 1_Psychic,Type 1_Rock,Type 1_Steel,Type 1_Water,Type 2_Dark,Type 2_Dragon,Type 2_Electric,Type 2_Fairy,Type 2_Fighting,Type 2_Fire,Type 2_Flying,Type 2_Ghost,Type 2_Grass,Type 2_Ground,Type 2_Ice,Type 2_Missing,Type 2_Normal,Type 2_Poison,Type 2_Psychic,Type 2_Rock,Type 2_Steel,Type 2_Water
0,1,318,45,49,49,65,65,45,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1,2,405,60,62,63,80,80,60,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,3,525,80,82,83,100,100,80,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,3,625,80,100,123,122,120,80,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,4,309,39,52,43,60,50,65,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [10]:
y_data

0      False
1      False
2      False
3      False
4      False
       ...  
795     True
796     True
797     True
798     True
799     True
Name: Legendary, Length: 800, dtype: bool

### Lets now do standardization using Standard Scaler

In [11]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [12]:
transform = scaler.fit_transform(x_data)

In [13]:
x_trans = pd.DataFrame(transform, columns=x_data.columns)

In [14]:
x_trans.head()

Unnamed: 0,#,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Type 1_Dark,Type 1_Dragon,Type 1_Electric,Type 1_Fairy,Type 1_Fighting,Type 1_Fire,Type 1_Flying,Type 1_Ghost,Type 1_Grass,Type 1_Ground,Type 1_Ice,Type 1_Normal,Type 1_Poison,Type 1_Psychic,Type 1_Rock,Type 1_Steel,Type 1_Water,Type 2_Dark,Type 2_Dragon,Type 2_Electric,Type 2_Fairy,Type 2_Fighting,Type 2_Fire,Type 2_Flying,Type 2_Ghost,Type 2_Grass,Type 2_Ground,Type 2_Ice,Type 2_Missing,Type 2_Normal,Type 2_Poison,Type 2_Psychic,Type 2_Rock,Type 2_Steel,Type 2_Water
0,-1.737705,-0.976765,-0.950626,-0.924906,-0.797154,-0.23913,-0.248189,-0.801503,-1.399637,-0.200779,-0.204124,-0.241249,-0.147348,-0.186893,-0.263664,-0.070888,-0.204124,3.22933,-0.204124,-0.175863,-0.373632,-0.190445,-0.276977,-0.241249,-0.186893,-0.403473,-0.160128,-0.151717,-0.086929,-0.172049,-0.18328,-0.123404,-0.371457,-0.13346,-0.179605,-0.213896,-0.13346,-0.965592,-0.070888,4.746516,-0.207424,-0.13346,-0.16816,-0.13346
1,-1.732902,-0.251088,-0.362822,-0.52413,-0.347917,0.21956,0.291156,-0.285015,-1.399637,-0.200779,-0.204124,-0.241249,-0.147348,-0.186893,-0.263664,-0.070888,-0.204124,3.22933,-0.204124,-0.175863,-0.373632,-0.190445,-0.276977,-0.241249,-0.186893,-0.403473,-0.160128,-0.151717,-0.086929,-0.172049,-0.18328,-0.123404,-0.371457,-0.13346,-0.179605,-0.213896,-0.13346,-0.965592,-0.070888,4.746516,-0.207424,-0.13346,-0.16816,-0.13346
2,-1.7281,0.749845,0.420917,0.092448,0.293849,0.831146,1.010283,0.403635,-1.399637,-0.200779,-0.204124,-0.241249,-0.147348,-0.186893,-0.263664,-0.070888,-0.204124,3.22933,-0.204124,-0.175863,-0.373632,-0.190445,-0.276977,-0.241249,-0.186893,-0.403473,-0.160128,-0.151717,-0.086929,-0.172049,-0.18328,-0.123404,-0.371457,-0.13346,-0.179605,-0.213896,-0.13346,-0.965592,-0.070888,4.746516,-0.207424,-0.13346,-0.16816,-0.13346
3,-1.7281,1.583957,0.420917,0.647369,1.577381,1.503891,1.729409,0.403635,-1.399637,-0.200779,-0.204124,-0.241249,-0.147348,-0.186893,-0.263664,-0.070888,-0.204124,3.22933,-0.204124,-0.175863,-0.373632,-0.190445,-0.276977,-0.241249,-0.186893,-0.403473,-0.160128,-0.151717,-0.086929,-0.172049,-0.18328,-0.123404,-0.371457,-0.13346,-0.179605,-0.213896,-0.13346,-0.965592,-0.070888,4.746516,-0.207424,-0.13346,-0.16816,-0.13346
4,-1.723297,-1.051836,-1.185748,-0.832419,-0.989683,-0.392027,-0.787533,-0.112853,-1.399637,-0.200779,-0.204124,-0.241249,-0.147348,-0.186893,3.792706,-0.070888,-0.204124,-0.309662,-0.204124,-0.175863,-0.373632,-0.190445,-0.276977,-0.241249,-0.186893,-0.403473,-0.160128,-0.151717,-0.086929,-0.172049,-0.18328,-0.123404,-0.371457,-0.13346,-0.179605,-0.213896,-0.13346,1.035635,-0.070888,-0.210681,-0.207424,-0.13346,-0.16816,-0.13346


### We have seen during data analysis that there was class imbalance problem in data so lets deal with it

In [15]:
from imblearn.over_sampling import RandomOverSampler

In [16]:
os = RandomOverSampler()

In [17]:
x, y = os.fit_sample(x_trans, y_data)

In [18]:
y.value_counts()

True     735
False    735
Name: Legendary, dtype: int64

### Now we can see there are equal number of True and False class labels

### Now lets do train test split

In [19]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

### We can apply support vector machine 

In [20]:
from sklearn.svm import SVC
svc = SVC(kernel='linear')

In [21]:
svc.fit(x_train, y_train)

SVC(kernel='linear')

In [22]:
pred = svc.predict(x_test)

### Lets see performance of the model

In [23]:
from sklearn.metrics import classification_report, confusion_matrix

In [24]:
print(confusion_matrix(y_test, pred))

[[176  10]
 [  0 182]]


In [25]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

       False       1.00      0.95      0.97       186
        True       0.95      1.00      0.97       182

    accuracy                           0.97       368
   macro avg       0.97      0.97      0.97       368
weighted avg       0.97      0.97      0.97       368

