In [1]:
from os.path import join
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn import neighbors
from sklearn import metrics

%matplotlib inline

In [2]:
abalone_data = pd.read_csv(join('data', 'abalone.csv')) # 데이터 불러오기
print(abalone_data.shape)
abalone_data.head(10)


(4176, 9)


Unnamed: 0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
0,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
1,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
2,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
3,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7
4,I,0.425,0.3,0.095,0.3515,0.141,0.0775,0.12,8
5,F,0.53,0.415,0.15,0.7775,0.237,0.1415,0.33,20
6,F,0.545,0.425,0.125,0.768,0.294,0.1495,0.26,16
7,M,0.475,0.37,0.125,0.5095,0.2165,0.1125,0.165,9
8,F,0.55,0.44,0.15,0.8945,0.3145,0.151,0.32,19
9,F,0.525,0.38,0.14,0.6065,0.194,0.1475,0.21,14


In [21]:
#Explore data
np_abalone_data = np.array(abalone_data)
print(np_abalone_data[0:5, :])

[['M' 0.35 0.265 0.09 0.2255 0.0995 0.0485 0.07 7]
 ['F' 0.53 0.42 0.135 0.677 0.2565 0.1415 0.21 9]
 ['M' 0.44 0.365 0.125 0.516 0.2155 0.114 0.155 10]
 ['I' 0.33 0.255 0.08 0.205 0.0895 0.0395 0.055 7]
 ['I' 0.425 0.3 0.095 0.3515 0.141 0.0775 0.12 8]]


In [22]:
# Divide input and output variable
datax = np_abalone_data[:, 1:]
datay = np_abalone_data[:,0]
print(datax[0:5, :])
print(datay[0:5])

[[0.35 0.265 0.09 0.2255 0.0995 0.0485 0.07 7]
 [0.53 0.42 0.135 0.677 0.2565 0.1415 0.21 9]
 [0.44 0.365 0.125 0.516 0.2155 0.114 0.155 10]
 [0.33 0.255 0.08 0.205 0.0895 0.0395 0.055 7]
 [0.425 0.3 0.095 0.3515 0.141 0.0775 0.12 8]]
['M' 'F' 'M' 'I' 'I']


In [23]:
# trn-tst split
trnx, tstx, trny, tsty = train_test_split(datax, datay, test_size=0.3)
print(trnx.shape, tstx.shape, trny.shape, tsty.shape)

(2923, 8) (1253, 8) (2923,) (1253,)


In [24]:
# scaling
scaler = MinMaxScaler()
scaler.fit(trnx)
trnx_scale = scaler.transform(trnx)
tstx_scale = scaler.transform(tstx)
print(np.min(trnx_scale[:,0]) , np.max(trnx_scale[:,0]))
print(np.min(tstx_scale[:,0]) , np.max(tstx_scale[:,0]))

0.0 1.0
-0.08029197080291973 0.9416058394160587


In [25]:
k=6
knn_model = neighbors.KNeighborsClassifier(n_neighbors=k)
knn_model.fit(X=trnx, y=trny)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=6, p=2,
                     weights='uniform')

In [26]:
knn_pred_trn = knn_model.predict(X=trnx)
knn_pred_tst = knn_model.predict(X=tstx)

In [27]:
# predict train data and test data (lazy learning)
print(knn_pred_trn)
print(knn_pred_tst)

['M' 'M' 'M' ... 'I' 'F' 'I']
['I' 'F' 'I' ... 'M' 'I' 'I']


In [52]:
# traint error and test error
print(metrics.accuracy_score(trny, knn_pred_trn))
print(metrics.accuracy_score(tsty, knn_pred_tst))

0.6503592199794731
0.547486033519553


In [53]:
# Decision Tree
from sklearn import tree
tree_model = tree.DecisionTreeClassifier(max_depth=5, min_samples_split=4)
tree_model.fit(X=trnx, y=trny)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=5, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=4,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [54]:
tree_pred_trn = tree_model.predict(X=trnx)
tree_pred = tree_model.predict(X=tstx)
print(metrics.accuracy_score(trny, tree_pred_trn))
print(metrics.accuracy_score(tsty, tree_pred))

0.586725966472802
0.5594573024740622


In [39]:
tree_model.feature_importances_

array([0.03231884, 0.        , 0.03770667, 0.07390123, 0.02176186,
       0.64487896, 0.01695998, 0.17247246])

In [55]:
# draw tree graph visualization
from sklearn.tree import export_graphviz
export_graphviz(tree_model, out_file ='tree.dot')
#tree.plot_tree(tree_model)

In [56]:
from sklearn.metrics import confusion_matrix
confusion_matrix(tsty, tree_model.predict(tstx))

array([[ 81,  64, 247],
       [ 21, 309,  80],
       [ 74,  66, 311]], dtype=int64)

In [42]:
from sklearn.tree import DecisionTreeClassifier
tree_model = DecisionTreeClassifier()
tree_model.fit(X=trnx, y=trny)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [61]:
# NN
from sklearn.neural_network import MLPClassifier

clf = MLPClassifier(hidden_layer_sizes = (8,) , max_iter=400)
clf.fit(trnx, trny)
tsty_hat = clf.predict(tstx)

In [62]:
print(clf)
#print(clf.loss_curve_)
print(tsty[0:10])
print(tsty_hat[0:10])

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(8,), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=400,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)
['I' 'M' 'I' 'M' 'I' 'F' 'F' 'F' 'M' 'M']
['I' 'M' 'I' 'I' 'I' 'M' 'M' 'F' 'M' 'M']


In [63]:
clf2 = MLPClassifier(hidden_layer_sizes=(8,13,8,), max_iter=400)
clf2.fit(trnx, trny)
tsty_hat2 = clf2.predict(tstx)
print(tsty[0:10])
print(tsty_hat2[0:10])

['I' 'M' 'I' 'M' 'I' 'F' 'F' 'F' 'M' 'M']
['I' 'M' 'I' 'I' 'I' 'M' 'M' 'F' 'M' 'M']


In [64]:
from sklearn.metrics import accuracy_score
print(accuracy_score(tsty, tsty_hat), accuracy_score(tsty, tsty_hat2))

0.5466879489225858 0.5626496408619314
