In [38]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [39]:
dataset = pd.read_csv('Data.csv')
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [40]:
print(x)

[['Japan' 41.0 73000.0]
 ['Qatar' 27.0 41000.0]
 ['Philippines' 37.0 44000.0]
 ['Indonesia' 35.0 51000.0]
 ['Japan' 30.0 nan]
 ['Brazil' 35.0 28000.0]
 ['Philippines' nan 53000.0]
 ['Indonesia' 55.0 79000.0]
 ['Malaysia' 51.0 83000.0]
 ['Japan' 37.0 67000.0]]


In [41]:
print(y)

['Yes' 'No' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


In [42]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(x[:, 1:3])
x[:, 1:3] = imputer.transform(x[:, 1:3])

In [43]:
print(x)

[['Japan' 41.0 73000.0]
 ['Qatar' 27.0 41000.0]
 ['Philippines' 37.0 44000.0]
 ['Indonesia' 35.0 51000.0]
 ['Japan' 30.0 57666.666666666664]
 ['Brazil' 35.0 28000.0]
 ['Philippines' 38.666666666666664 53000.0]
 ['Indonesia' 55.0 79000.0]
 ['Malaysia' 51.0 83000.0]
 ['Japan' 37.0 67000.0]]


In [44]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
x = np.array(ct.fit_transform(x))

In [45]:
print(x)

[[0.0 0.0 1.0 0.0 0.0 0.0 41.0 73000.0]
 [0.0 0.0 0.0 0.0 0.0 1.0 27.0 41000.0]
 [0.0 0.0 0.0 0.0 1.0 0.0 37.0 44000.0]
 [0.0 1.0 0.0 0.0 0.0 0.0 35.0 51000.0]
 [0.0 0.0 1.0 0.0 0.0 0.0 30.0 57666.666666666664]
 [1.0 0.0 0.0 0.0 0.0 0.0 35.0 28000.0]
 [0.0 0.0 0.0 0.0 1.0 0.0 38.666666666666664 53000.0]
 [0.0 1.0 0.0 0.0 0.0 0.0 55.0 79000.0]
 [0.0 0.0 0.0 1.0 0.0 0.0 51.0 83000.0]
 [0.0 0.0 1.0 0.0 0.0 0.0 37.0 67000.0]]


In [46]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [47]:
print(y)

[1 0 0 0 1 1 0 1 0 1]


In [48]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 1)

In [49]:
print(x_train)

[[0.0 0.0 0.0 0.0 1.0 0.0 38.666666666666664 53000.0]
 [0.0 0.0 1.0 0.0 0.0 0.0 30.0 57666.666666666664]
 [0.0 0.0 1.0 0.0 0.0 0.0 41.0 73000.0]
 [0.0 1.0 0.0 0.0 0.0 0.0 35.0 51000.0]
 [0.0 0.0 0.0 0.0 0.0 1.0 27.0 41000.0]
 [0.0 1.0 0.0 0.0 0.0 0.0 55.0 79000.0]
 [0.0 0.0 0.0 1.0 0.0 0.0 51.0 83000.0]
 [1.0 0.0 0.0 0.0 0.0 0.0 35.0 28000.0]]


In [50]:
print(x_test)

[[0.0 0.0 0.0 0.0 1.0 0.0 37.0 44000.0]
 [0.0 0.0 1.0 0.0 0.0 0.0 37.0 67000.0]]


In [51]:
print(y_train)

[0 1 1 0 0 1 0 1]


In [52]:
print(y_test)

[0 1]


In [53]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train[:, 3:] = sc.fit_transform(x_train[:, 3:])
x_test[:, 3:] = sc.transform(x_test[:, 3:])

In [54]:
print(x_train)

[[0.0 0.0 0.0 -0.3779644730092272 2.6457513110645903 -0.3779644730092272
  -0.04583685080997373 -0.291509957406274]
 [0.0 0.0 1.0 -0.3779644730092272 -0.3779644730092272 -0.3779644730092272
  -0.9992433476574325 -0.03031703557025239]
 [0.0 0.0 1.0 -0.3779644730092272 -0.3779644730092272 -0.3779644730092272
  0.21084951372588087 0.8278882790338192]
 [0.0 1.0 0.0 -0.3779644730092272 -0.3779644730092272 -0.3779644730092272
  -0.4492011379377446 -0.40344978105028334]
 [0.0 0.0 0.0 -0.3779644730092272 -0.3779644730092272 2.6457513110645903
  -1.3292686734892452 -0.96314889927033]
 [0.0 1.0 0.0 -0.3779644730092272 -0.3779644730092272 -0.3779644730092272
  1.750967700941007 1.1637077499658472]
 [0.0 0.0 0.0 2.6457513110645903 -0.3779644730092272 -0.3779644730092272
  1.3109339331652565 1.387587397253866]
 [1.0 0.0 0.0 -0.3779644730092272 -0.3779644730092272 -0.3779644730092272
  -0.4492011379377446 -1.6907577529563906]]


In [55]:
print(x_test)

[[0.0 0.0 0.0 -0.3779644730092272 2.6457513110645903 -0.3779644730092272
  -0.22918425404986942 -0.795239163804316]
 [0.0 0.0 1.0 -0.3779644730092272 -0.3779644730092272 -0.3779644730092272
  -0.22918425404986942 0.49206880810179127]]
