# Introduction to machine learning


# installation
1. python 3.6.1
2. virtualenv (pip3 install --user --upgrade virtualenv)


# Create virtualenv for machine learning

$ virtualenv mlenv --python=/usr/local/bin/python3

$ cd mlenv

$ source bin/activate


# Install Python libraries for machine learning
$ pip3 install --upgrade jupyter matplotlib numpy pandas scipy scikit-learn graphviz pydotplus


# Verify
$ python3 -c "import jupyter, matplotlib, numpy, pandas, scipy, sklearn"

# Start jupyter notebook
$ ./bin/jupyter notebook


Press Shift + Enter to execute a code cell.

In [None]:
# Import every library the notebook relies on, so a missing package fails here
# rather than halfway through the tutorial.
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib
import graphviz
# Last expression of the cell: displays the installed pandas version string.
pd.__version__

### แสดง function ของไลบรารี่ ว่ามีอะไรสามารถเรียกใช้บ้าง

In [4]:
# List the names of all attributes exposed by the numpy module.
dir(np)

['ALLOW_THREADS',
 'AxisError',
 'BUFSIZE',
 'CLIP',
 'DataSource',
 'ERR_CALL',
 'ERR_DEFAULT',
 'ERR_IGNORE',
 'ERR_LOG',
 'ERR_PRINT',
 'ERR_RAISE',
 'ERR_WARN',
 'FLOATING_POINT_SUPPORT',
 'FPE_DIVIDEBYZERO',
 'FPE_INVALID',
 'FPE_OVERFLOW',
 'FPE_UNDERFLOW',
 'False_',
 'Inf',
 'Infinity',
 'MAXDIMS',
 'MAY_SHARE_BOUNDS',
 'MAY_SHARE_EXACT',
 'MachAr',
 'NAN',
 'NINF',
 'NZERO',
 'NaN',
 'PINF',
 'PZERO',
 'PackageLoader',
 'RAISE',
 'SHIFT_DIVIDEBYZERO',
 'SHIFT_INVALID',
 'SHIFT_OVERFLOW',
 'SHIFT_UNDERFLOW',
 'ScalarType',
 'Tester',
 'TooHardError',
 'True_',
 'UFUNC_BUFSIZE_DEFAULT',
 'UFUNC_PYVALS_NAME',
 'WRAP',
 '_NoValue',
 '__NUMPY_SETUP__',
 '__all__',
 '__builtins__',
 '__cached__',
 '__config__',
 '__doc__',
 '__file__',
 '__git_revision__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '__version__',
 '_distributor_init',
 '_globals',
 '_import_tools',
 '_mat',
 'abs',
 'absolute',
 'absolute_import',
 'add',
 'add_docstring',
 'add_newdoc',


### แสดงคู่มือของฟังก์ชั่นที่สนใจ

In [5]:
# Print the built-in documentation (signature, parameters, examples) of np.ones.
help(np.ones)

Help on function ones in module numpy.core.numeric:

ones(shape, dtype=None, order='C')
    Return a new array of given shape and type, filled with ones.
    
    Parameters
    ----------
    shape : int or sequence of ints
        Shape of the new array, e.g., ``(2, 3)`` or ``2``.
    dtype : data-type, optional
        The desired data-type for the array, e.g., `numpy.int8`.  Default is
        `numpy.float64`.
    order : {'C', 'F'}, optional
        Whether to store multidimensional data in C- or Fortran-contiguous
        (row- or column-wise) order in memory.
    
    Returns
    -------
    out : ndarray
        Array of ones with the given shape, dtype, and order.
    
    See Also
    --------
    zeros, ones_like
    
    Examples
    --------
    >>> np.ones(5)
    array([ 1.,  1.,  1.,  1.,  1.])
    
    >>> np.ones((5,), dtype=np.int)
    array([1, 1, 1, 1, 1])
    
    >>> np.ones((2, 1))
    array([[ 1.],
           [ 1.]])
    
    >>> s = (2,2)
    >>> np.ones(s)
   

### ทดสอบเรียกใช้ function ones โดยการสร้างอาเรย์ 1 มิติ และมีข้อมูล 5 elements

In [7]:
# Build a 1-D array with five elements, each equal to 1.0.
np.ones(5)

array([ 1.,  1.,  1.,  1.,  1.])

In [9]:
# Call pattern: np.ones((rows, columns)) -- here a 2-row by 3-column array of ones.
np.ones((2, 3))

array([[ 1.,  1.,  1.],
       [ 1.,  1.,  1.]])

### Supervised Learning Example
#### Predict Orange vs Apple

|   Weight  |  Texture  |  Label  |
|-----------|-----------|---------|
|   150g    |    Bumpy  |  Orange |
|   160g    |    Bumpy  |  Orange |
|   170g    |    Bumpy  |  Orange |
|   140g    |    Smooth |  Apple  |
|   130g    |    Smooth |  Apple  |
|   120g    |    Smooth |  Apple  |

In [27]:
import pandas as pd

# Raw fruit measurements: weights carry a unit suffix, texture and label are
# plain strings.
data = {'Weight' : ['150g', '160g', '170g', '140g', '130g', '120g'],
        'Texture': ['Bumpy', 'Bumpy', 'Bumpy', 'Smooth', 'Smooth', 'Smooth' ],
        'Label' : ['Orange', 'Orange', 'Orange', 'Apple', 'Apple', 'Apple']}

# Prepare and clean the data: every column must end up numeric so that it can
# be fed to a classifier.
df = pd.DataFrame.from_dict(data)
df['Label'] = df['Label'].map({'Orange' : 0, 'Apple' : 1})
df['Texture'] = df['Texture'].map({'Bumpy' : 0, 'Smooth' : 1})
# Drop the trailing 'g' unit AND cast to int. Stripping the unit alone leaves
# the column as strings ('150', '160', ...), which is not a numeric feature.
df['Weight'] = df['Weight'].str.rstrip('g').astype(int)
df

Unnamed: 0,Label,Texture,Weight
0,0,0,150
1,0,0,160
2,0,0,170
3,1,1,140
4,1,1,130
5,1,1,120


####  Load data from a CSV file

In [26]:
import pandas as pd

# Location of the orange/apple data set, published as a raw CSV gist.
data_url = "https://gist.githubusercontent.com/merxer/0c08bf233971e99b3f77fedce6511f8c/raw/ff3e34aad81426583fb03bbd68e8965403a71e5a/orange_and_apple.csv"

# Download the file and keep at most the first six data rows.
df = pd.read_csv(filepath_or_buffer=data_url, nrows=6)

# Display the loaded frame as the cell output.
df

Unnamed: 0,Label,Texture,Weight
0,0,0,150
1,0,0,160
2,0,0,170
3,1,1,140
4,1,1,130
5,1,1,120


## First Supervised Learning

In [42]:
import pandas as pd
from sklearn import tree

# Already-encoded orange/apple data set (Label, Texture, Weight as integers).
data_url = "https://gist.githubusercontent.com/merxer/0c08bf233971e99b3f77fedce6511f8c/raw/ff3e34aad81426583fb03bbd68e8965403a71e5a/orange_and_apple.csv"
df = pd.read_csv(data_url, nrows=6)

# Prepare Data: feature matrix (one row per fruit) and the matching labels.
features = df[['Weight', 'Texture']].values
labels = df['Label'].values

# Select Classifier and Training
# In the six rows shown above, Weight and Texture each separate the two classes
# perfectly, so the feature the tree splits on is a random tie-break. Fixing
# random_state makes the learned tree, and the predictions below, reproducible
# from run to run.
clf = tree.DecisionTreeClassifier(random_state=0)
# fit = find patterns in data
clf.fit(features, labels)

# Make Predictions for two unseen fruits given as [Weight, Texture].
clf.predict([[145, 0], [142, 1]])



array([0, 1])

## Second Supervised Learning |> IRIS flower

https://en.wikipedia.org/wiki/Iris_flower_data_set#Data_set


In [67]:
from sklearn.datasets import load_iris

iris = load_iris()

# Show, one item per line and in this order: the feature names, the full
# measurement matrix, the class names, the full label vector, the first sample,
# its label, the container type of the measurements, and the sample count.
print(
    iris.feature_names,
    iris.data,
    iris.target_names,
    iris.target,
    iris.data[0],
    iris.target[0],
    type(iris.data),
    len(iris.data),
    sep='\n',
)

['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
[[ 5.1  3.5  1.4  0.2]
 [ 4.9  3.   1.4  0.2]
 [ 4.7  3.2  1.3  0.2]
 [ 4.6  3.1  1.5  0.2]
 [ 5.   3.6  1.4  0.2]
 [ 5.4  3.9  1.7  0.4]
 [ 4.6  3.4  1.4  0.3]
 [ 5.   3.4  1.5  0.2]
 [ 4.4  2.9  1.4  0.2]
 [ 4.9  3.1  1.5  0.1]
 [ 5.4  3.7  1.5  0.2]
 [ 4.8  3.4  1.6  0.2]
 [ 4.8  3.   1.4  0.1]
 [ 4.3  3.   1.1  0.1]
 [ 5.8  4.   1.2  0.2]
 [ 5.7  4.4  1.5  0.4]
 [ 5.4  3.9  1.3  0.4]
 [ 5.1  3.5  1.4  0.3]
 [ 5.7  3.8  1.7  0.3]
 [ 5.1  3.8  1.5  0.3]
 [ 5.4  3.4  1.7  0.2]
 [ 5.1  3.7  1.5  0.4]
 [ 4.6  3.6  1.   0.2]
 [ 5.1  3.3  1.7  0.5]
 [ 4.8  3.4  1.9  0.2]
 [ 5.   3.   1.6  0.2]
 [ 5.   3.4  1.6  0.4]
 [ 5.2  3.5  1.5  0.2]
 [ 5.2  3.4  1.4  0.2]
 [ 4.7  3.2  1.6  0.2]
 [ 4.8  3.1  1.6  0.2]
 [ 5.4  3.4  1.5  0.4]
 [ 5.2  4.1  1.5  0.1]
 [ 5.5  4.2  1.4  0.2]
 [ 4.9  3.1  1.5  0.1]
 [ 5.   3.2  1.2  0.2]
 [ 5.5  3.5  1.3  0.2]
 [ 4.9  3.1  1.5  0.1]
 [ 4.4  3.   1.3  0.2]
 [ 5.1  3.4  1.5  0.

### Method 1 |> Random testing data

In [3]:
# Import dataset
from sklearn.datasets import load_iris
from sklearn import tree
import numpy as np


iris = load_iris()

# Row indices held out for testing.
test_index = [0, 10, 20, 50, 70, 90, 110, 120, 130]

# Train data: everything except the held-out rows.
train_target = np.delete(iris.target, test_index)
train_data = np.delete(iris.data, test_index, axis = 0)


# Test data: only the held-out rows.
test_target = iris.target[test_index]
test_data = iris.data[test_index]

# Train a classifier
# random_state pins the tie-breaking between equally good splits so the tree,
# its predictions and the exported diagram are the same on every run.
clf = tree.DecisionTreeClassifier(random_state=0)
clf.fit(train_data, train_target)

# Predict label for new flower: expected labels first, predictions second.
print(test_target)
print(clf.predict(test_data))

# Visualize the Tree
# code: http://scikit-learn.org/stable/modules/tree.html
# require graphviz : http://www.graphviz.org/

# sklearn.externals.six has been removed from scikit-learn; the standard
# library's io.StringIO is the equivalent in-memory text buffer.
from io import StringIO
from IPython.display import Image
import graphviz
import pydotplus
dot_data = StringIO()
tree.export_graphviz(clf, out_file=dot_data,
                         feature_names=iris.feature_names,
                         class_names=iris.target_names,
                         filled=True, rounded=True,
                         impurity=False)


# Render the DOT description to a PDF file in the working directory; the call's
# return value is shown as the cell output.
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_pdf("iris.pdf")

[0 0 0 1 1 1 2 2 2]
[0 0 0 1 2 1 2 2 2]


True

### Method 2 |>  Split testing data

In [20]:
# No blanket warnings.simplefilter('ignore') here: the deprecated import that
# triggered the warning is replaced below, and silencing every warning would
# also hide real problems in later cells.

# Import Data
from sklearn.datasets import load_iris
iris = load_iris()
X = iris.data
y = iris.target

# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module has been removed from scikit-learn.
from sklearn.model_selection import train_test_split
# Hold out half of the samples for testing. random_state fixes the shuffle so
# the split (and therefore the accuracy) is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.5, random_state = 0)
print(len(X_train), len(X_test))

from sklearn import tree
# Seeded for the same reason: identical tree on every run.
clf = tree.DecisionTreeClassifier(random_state = 0)
clf.fit(X_train, y_train)

predictions = clf.predict(X_test)

# Fraction of held-out samples whose predicted label matches the true label.
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, predictions))

75 75
0.933333333333


## Change Classifier from DecisionTreeClassifier to KNeighborsClassifier

In [21]:
# No blanket warnings.simplefilter('ignore') here: the deprecated import that
# triggered the warning is replaced below, and silencing every warning would
# also hide real problems in later cells.

# Import Data
from sklearn.datasets import load_iris
iris = load_iris()
X = iris.data
y = iris.target

# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module has been removed from scikit-learn.
from sklearn.model_selection import train_test_split
# Hold out half of the samples for testing. random_state fixes the shuffle so
# the split (and therefore the accuracy) is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.5, random_state = 0)
print(len(X_train), len(X_test))

# Change Classifier to K-nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier()
clf.fit(X_train, y_train)

predictions = clf.predict(X_test)

# Fraction of held-out samples whose predicted label matches the true label.
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, predictions))

75 75
0.96
