In [2]:
import numpy as np
import pandas as pd

Reading Data

In [3]:
# Load the raw table from disk.
dataSet = pd.read_csv('Data.csv')

# The last column is the target; the first three columns are the features.
Y = dataSet.iloc[:, -1].values
X = dataSet.iloc[:, :3].values

# Show both arrays so the missing entries (nan) are visible before cleaning.
print("feature Matrix/Independent Variables\n", X)
print("Dependent Variables\n", Y)


feature Matrix/Independent Variables
 [['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]
Dependent Variables
 ['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


# Fill missing data

In [4]:
from sklearn.impute import SimpleImputer

# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and
# removed in 0.22; `SimpleImputer` is its replacement. It always imputes
# column by column, which is what `axis = 0` requested, and it expects the
# missing-value marker as `np.nan` rather than the string 'NaN'.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# X is an object array because column 0 holds strings, so cast the two
# numeric columns (1 and 2) to float before computing the column means.
numeric_columns = X[:, 1:3].astype(float)

# Fit on the numeric columns and write the imputed values back in place.
X[:, 1:3] = imputer.fit_transform(numeric_columns)

# Encoding the Categorical Data

Machine learning models operate on numerical values, so non-numerical (categorical) values cause problems unless they are first converted to numbers.

The country column (column 0) contains 3 distinct categories, so we will encode each country as a number.

In [5]:
# Peek at column 0 (the country column) before it is encoded.
X[:,0]

array(['France', 'Spain', 'Germany', 'Spain', 'Germany', 'France', 'Spain',
       'France', 'Germany', 'France'], dtype=object)

In [6]:
from sklearn.preprocessing import LabelEncoder

# Learn the set of distinct country names first, then replace each name in
# column 0 with its integer code.
labelencoder_X = LabelEncoder().fit(X[:, 0])
X[:, 0] = labelencoder_X.transform(X[:, 0])

# Display the encoded column to confirm the strings are now integers.
X[:, 0]

array([0, 2, 1, 2, 1, 0, 2, 0, 1, 0], dtype=object)

We have now encoded the countries as numbers, but this introduces a problem: an ML algorithm will treat these values as ordered (2 > 1 > 0) and try to learn from that ordering, which makes no sense for countries. If the categories had a natural order (e.g. small, medium, large), then representing them with ordered numbers would be reasonable.

Fix: create one binary (0/1) column per category, i.e. 3 columns for these 3 countries (one-hot encoding).

In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# The `categorical_features` argument of OneHotEncoder was removed in
# scikit-learn 0.22; selecting which columns to encode is now done with a
# ColumnTransformer.
column_transformer = ColumnTransformer(
    # One-hot encode column 0; the encoded columns come first in the output.
    transformers=[('country', OneHotEncoder(), [0])],
    # Keep the remaining columns unchanged, appended after the encoded ones.
    remainder='passthrough',
    # Always return a dense array (the original called `.toarray()`).
    sparse_threshold=0,
)

# Cast to float so the result is a plain numeric matrix rather than an
# object array inherited from the mixed-type input.
X = column_transformer.fit_transform(X).astype(float)

Now the first three columns are 0/1 indicators, and exactly one of them is 1 in each row to identify that row's category.

In [8]:
# Display the feature matrix after one-hot encoding.
X

array([[  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          4.40000000e+01,   7.20000000e+04],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          2.70000000e+01,   4.80000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          3.00000000e+01,   5.40000000e+04],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          3.80000000e+01,   6.10000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          4.00000000e+01,   6.37777778e+04],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          3.50000000e+01,   5.80000000e+04],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          3.87777778e+01,   5.20000000e+04],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          4.80000000e+01,   7.90000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          5.00000000e+01,   8.30000000e+04],
       [  1.00000000e+00,   0.0000000

In [9]:
# Encode the target labels as integers with a dedicated encoder.
# The original code created `labelencoder_Y` but then called
# `labelencoder_X.fit_transform(Y)`, which refit the feature encoder on the
# target values and overwrote its learned classes for column 0.
labelencoder_Y = LabelEncoder()
Y = labelencoder_Y.fit_transform(Y)

# Train Test Data split

In [13]:
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed
# in 0.20; `train_test_split` now lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.20, random_state=69
)

In [14]:
# Display the four arrays produced by the split, in order:
# training features, test features, training labels, test labels.
X_train, X_test, Y_train, Y_test

(array([[  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
           3.00000000e+01,   5.40000000e+04],
        [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           3.50000000e+01,   5.80000000e+04],
        [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           4.40000000e+01,   7.20000000e+04],
        [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           3.70000000e+01,   6.70000000e+04],
        [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
           2.70000000e+01,   4.80000000e+04],
        [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
           4.00000000e+01,   6.37777778e+04],
        [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           4.80000000e+01,   7.90000000e+04],
        [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
           3.87777778e+01,   5.20000000e+04]]),
 array([[  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
           3.80000000e+01,   6.10000000e+04],
        [  0.0000