In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 

In [23]:
dataset = pd.read_csv('Data.csv')

# last column not included
X = dataset.iloc[:,:-1].values

# last column included
Y = dataset.iloc[:,3].values

print dataset.shape
print '_'*40
print dataset.head()
print '_'*40
print X
print '_'*40
print Y

(10, 4)
________________________________________
   Country   Age   Salary Purchased
0   France  44.0  72000.0        No
1    Spain  27.0  48000.0       Yes
2  Germany  30.0  54000.0        No
3    Spain  38.0  61000.0        No
4  Germany  40.0      NaN       Yes
________________________________________
[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]
________________________________________
['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


In [24]:
# Taking care of missing data

from sklearn.preprocessing import Imputer 

# mean means replace nan values with mean of that column. Other 
# strategies available are median and most_frequent. In mean the rows with nan were also counted in total count
imputer = Imputer(missing_values = 'NaN', strategy = 'mean',axis = 0)

# the parameter given are only those columns that contain nan which we have seen manually
# 1:3 means columns with index 1,2. Indexing starts from 0.
imputer = imputer.fit(X[:,1:3])

X[:,1:3] = imputer.transform(X[:,1:3])

print '_'*40

print X

________________________________________
[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [25]:
# Encoding categorial Data- Data ie not numeric(Country and Purchased in above case)

from sklearn.preprocessing import OneHotEncoder,LabelEncoder

# new objectof LabelEncoder class
labelEncoder_X = LabelEncoder()

# In 0th column replace 'France','Spain','Germany' by any number throughout
# This is done bcz ML algos generally deal with numbers not strings hence we map for our later ease
X[:,0] = labelEncoder_X.fit_transform(X[:,0])

# parameters an array containing the index of column to be mapped
# In above we converted country column to numbers 
# Now we created columns equal to number of distinct country values and delete the original country col
# where every column corresponds to a value in original column
# if a row has value spain then we just mark 1 in spains new column
oneHotEncoder = OneHotEncoder(categorical_features = [0])
X = oneHotEncoder.fit_transform(X).toarray()

# doing the same as above for Purchased column
labelEncoder_Y = LabelEncoder()
X[:,3] = labelEncoder_Y.fit_transform(X[:,3])
X = oneHotEncoder.fit_transform(X).toarray()
print X

[[  0.00000000e+00   1.00000000e+00   0.00000000e+00   0.00000000e+00
    7.00000000e+00   7.20000000e+04]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00   1.00000000e+00
    0.00000000e+00   4.80000000e+04]
 [  1.00000000e+00   0.00000000e+00   1.00000000e+00   0.00000000e+00
    1.00000000e+00   5.40000000e+04]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00   1.00000000e+00
    4.00000000e+00   6.10000000e+04]
 [  1.00000000e+00   0.00000000e+00   1.00000000e+00   0.00000000e+00
    6.00000000e+00   6.37777778e+04]
 [  0.00000000e+00   1.00000000e+00   0.00000000e+00   0.00000000e+00
    2.00000000e+00   5.80000000e+04]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00   1.00000000e+00
    5.00000000e+00   5.20000000e+04]
 [  0.00000000e+00   1.00000000e+00   0.00000000e+00   0.00000000e+00
    8.00000000e+00   7.90000000e+04]
 [  1.00000000e+00   0.00000000e+00   1.00000000e+00   0.00000000e+00
    9.00000000e+00   8.30000000e+04]
 [  0.00000000e+00   1.00000000e+00  

In [32]:
# Splitting the dataset into test set and training set

from sklearn.cross_validation import train_test_split

# random state is just for sake of course
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)
X_train

array([[  1.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          0.00000000e+00,   6.00000000e+00,   6.37777778e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   3.00000000e+00,   6.70000000e+04],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          1.00000000e+00,   0.00000000e+00,   4.80000000e+04],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          1.00000000e+00,   5.00000000e+00,   5.20000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   8.00000000e+00,   7.90000000e+04],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          1.00000000e+00,   4.00000000e+00,   6.10000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   7.00000000e+00,   7.20000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   2.00000000e+00,   5.80000000e+04]])

In [38]:
# Feature Scaling- In practice it's seen that algos converge much faster if data is scaled properly

# standardization Scaling = (x- min(x))/ sd
# Normalization Scaling = (x- min(x))/ max(x) - min(x)

from sklearn.preprocessing import  StandardScaler

sc_X = StandardScaler()

X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)