# Feature Engineering - OneHot Encoding

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
data=pd.read_csv('Data.csv')
data

Unnamed: 0,Country,Sex,Age,Salary,Purchased
0,Bangladesh,Female,44,72000,No
1,India,Male,27,48000,Yes
2,Pakistan,Female,30,54000,No
3,India,Male,38,61000,No
4,Pakistan,Female,40,65000,Yes
5,Bangladesh,Male,35,58000,Yes
6,India,Female,25,52000,No
7,Bangladesh,Male,48,79000,Yes
8,Pakistan,Female,50,83000,No
9,Bangladesh,Male,37,67000,Yes


In [4]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
#from sklearn.compose import ColumnTransformer

In [5]:
le=LabelEncoder()
data['class']=le.fit_transform(data.Purchased)
data.head()

Unnamed: 0,Country,Sex,Age,Salary,Purchased,class
0,Bangladesh,Female,44,72000,No,0
1,India,Male,27,48000,Yes,1
2,Pakistan,Female,30,54000,No,0
3,India,Male,38,61000,No,0
4,Pakistan,Female,40,65000,Yes,1


In [6]:
data=data.drop('Purchased', axis=1)
data

Unnamed: 0,Country,Sex,Age,Salary,class
0,Bangladesh,Female,44,72000,0
1,India,Male,27,48000,1
2,Pakistan,Female,30,54000,0
3,India,Male,38,61000,0
4,Pakistan,Female,40,65000,1
5,Bangladesh,Male,35,58000,1
6,India,Female,25,52000,0
7,Bangladesh,Male,48,79000,1
8,Pakistan,Female,50,83000,0
9,Bangladesh,Male,37,67000,1


In [7]:
#categorical data
cat_cols = ['Country', 'Sex'] 

#import pandas as pd
df = pd.get_dummies(data, columns = cat_cols)
df

Unnamed: 0,Age,Salary,class,Country_Bangladesh,Country_India,Country_Pakistan,Sex_Female,Sex_Male
0,44,72000,0,1,0,0,1,0
1,27,48000,1,0,1,0,0,1
2,30,54000,0,0,0,1,1,0
3,38,61000,0,0,1,0,0,1
4,40,65000,1,0,0,1,1,0
5,35,58000,1,1,0,0,0,1
6,25,52000,0,0,1,0,1,0
7,48,79000,1,1,0,0,0,1
8,50,83000,0,0,0,1,1,0
9,37,67000,1,1,0,0,0,1


In [8]:
df.drop(['Country_Bangladesh','Sex_Female'],axis=1,inplace=True)
df

Unnamed: 0,Age,Salary,class,Country_India,Country_Pakistan,Sex_Male
0,44,72000,0,0,0,0
1,27,48000,1,1,0,1
2,30,54000,0,0,1,0
3,38,61000,0,1,0,1
4,40,65000,1,0,1,0
5,35,58000,1,0,0,1
6,25,52000,0,1,0,0
7,48,79000,1,0,0,1
8,50,83000,0,0,1,0
9,37,67000,1,0,0,1


In [9]:
X=df.drop('class',axis=1) # Feature Matrix
X

Unnamed: 0,Age,Salary,Country_India,Country_Pakistan,Sex_Male
0,44,72000,0,0,0
1,27,48000,1,0,1
2,30,54000,0,1,0
3,38,61000,1,0,1
4,40,65000,0,1,0
5,35,58000,0,0,1
6,25,52000,1,0,0
7,48,79000,0,0,1
8,50,83000,0,1,0
9,37,67000,0,0,1


In [10]:
y=df['class']
y

0    0
1    1
2    0
3    0
4    1
5    1
6    0
7    1
8    0
9    1
Name: class, dtype: int32

In [11]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
#from sklearn.cross_validation import train_test_split # Depricated

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [13]:
y_train
X_test

Unnamed: 0,Age,Salary,Country_India,Country_Pakistan,Sex_Male
2,30,54000,0,1,0
8,50,83000,0,1,0


In [15]:
y_train
y_test

2    0
8    0
Name: class, dtype: int32

# PCA

In [25]:
from sklearn.decomposition import PCA
pca = PCA()               # All components
#pca = PCA(n_components=1)  # 1 components
#pca = PCA(n_components=2) # 2 components
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)
#print(pca)
print(X_train)

[[ 2.25000179e+03 -1.61160406e+00  1.03192929e+00  2.31844234e-01
  -2.98766139e-01]
 [ 4.24999906e+03  2.83393336e+00 -4.16634189e-01  3.86433085e-01
  -9.66060563e-02]
 [-1.47500032e+04 -1.03923254e+00 -3.39852760e-01 -1.08704119e-02
  -4.54698943e-02]
 [-1.07500057e+04  3.92433647e+00  3.85176430e-01 -2.91846532e-01
   3.26715178e-02]
 [ 1.62500039e+04  6.12483052e-01 -4.74033470e-01 -1.06633553e-01
  -1.22020598e-01]
 [-1.74999864e+03 -2.53119709e+00 -4.21536265e-01 -5.11923313e-01
  -1.20918666e-01]
 [ 9.25000284e+03 -4.52786014e-01  4.75574662e-01 -2.27536870e-01
   4.01952584e-01]
 [-4.74999999e+03 -1.73593318e+00 -2.40623697e-01  5.30533361e-01
   2.49157252e-01]]


In [26]:
explained_variance = pca.explained_variance_ratio_
print(explained_variance)

[9.99999945e-01 5.07305788e-08 2.99219846e-09 1.22358056e-09
 4.85145907e-10]


In [27]:
X_train=X_train[:,0:2]
X_train

array([[ 2.25000179e+03, -1.61160406e+00],
       [ 4.24999906e+03,  2.83393336e+00],
       [-1.47500032e+04, -1.03923254e+00],
       [-1.07500057e+04,  3.92433647e+00],
       [ 1.62500039e+04,  6.12483052e-01],
       [-1.74999864e+03, -2.53119709e+00],
       [ 9.25000284e+03, -4.52786014e-01],
       [-4.74999999e+03, -1.73593318e+00]])

In [28]:
X_test=X_test[:,0:2]
X_test

array([[-8.75000257e+03,  3.41579331e-01],
       [ 2.02500043e+04,  1.54161291e+00]])

In [29]:
from sklearn.ensemble import RandomForestClassifier

In [30]:
model=RandomForestClassifier()

In [34]:
model.fit(X_train,y_train)

RandomForestClassifier()

In [36]:
y_pred=model.predict(X_test)

In [45]:
from sklearn.metrics import confusion_matrix, accuracy_score,classification_report

In [42]:
cm=confusion_matrix(y_pred,y_test)
cm

array([[0, 0],
       [2, 0]], dtype=int64)

In [43]:
accuracy_score(y_pred,y_test)

0.0

In [48]:
cr=classification_report(y_pred,y_test)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [50]:
print(cr)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       0.0
           1       0.00      0.00      0.00       2.0

    accuracy                           0.00       2.0
   macro avg       0.00      0.00      0.00       2.0
weighted avg       0.00      0.00      0.00       2.0



In [39]:
from sklearn.compose import ColumnTransformer

In [None]:
from sklearn.compose import ColumnTransformer
# Country column
ct = ColumnTransformer([("country", OneHotEncoder(), [0])], remainder = 'passthrough')
data = ct.fit_transform(data)
#print(data)

In [None]:
data=data[:,1:]
#data

In [None]:
#data=data.drop('3',axis=1)