# Classification

### Dataset

In [1]:
from sklearn.datasets import load_wine

dataset = load_wine()

In [2]:
for key, value in dataset.items():
    print(key, len(value))

data 178
target 178
target_names 3
DESCR 3482
feature_names 13


In [3]:
print(dataset['DESCR'])

Wine Data Database

Notes
-----
Data Set Characteristics:
    :Number of Instances: 178 (50 in each of three classes)
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- 1) Alcohol
 		- 2) Malic acid
 		- 3) Ash
		- 4) Alcalinity of ash  
 		- 5) Magnesium
		- 6) Total phenols
 		- 7) Flavanoids
 		- 8) Nonflavanoid phenols
 		- 9) Proanthocyanins
		- 10)Color intensity
 		- 11)Hue
 		- 12)OD280/OD315 of diluted wines
 		- 13)Proline
        	- class:
                - class_0
                - class_1
                - class_2
		
    :Summary Statistics:
    
                                   Min   Max   Mean     SD
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 162.0    99.7  14.3
    Total Phenols:     

In [4]:
for index, datum in enumerate(zip(dataset['data'], dataset['target'])):
    if index in [1, 69, 134]:
        print(datum)

(array([1.32e+01, 1.78e+00, 2.14e+00, 1.12e+01, 1.00e+02, 2.65e+00,
       2.76e+00, 2.60e-01, 1.28e+00, 4.38e+00, 1.05e+00, 3.40e+00,
       1.05e+03]), 0)
(array([1.221e+01, 1.190e+00, 1.750e+00, 1.680e+01, 1.510e+02, 1.850e+00,
       1.280e+00, 1.400e-01, 2.500e+00, 2.850e+00, 1.280e+00, 3.070e+00,
       7.180e+02]), 1)
(array([1.251e+01, 1.240e+00, 2.250e+00, 1.750e+01, 8.500e+01, 2.000e+00,
       5.800e-01, 6.000e-01, 1.250e+00, 5.450e+00, 7.500e-01, 1.510e+00,
       6.500e+02]), 2)


### Data splitting

In [5]:
import numpy as np
from sklearn.model_selection import train_test_split

X = np.asarray(dataset['data'])
y = np.asarray(dataset['target'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

### K-nearest neighbors

In [6]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

#Models
neigh_3 = KNeighborsClassifier(n_neighbors=3)
neigh_3_scaling = make_pipeline(StandardScaler(), 
                                 KNeighborsClassifier(n_neighbors=3))
neigh_7 = KNeighborsClassifier(n_neighbors=7)
neigh_7_scaling = make_pipeline(StandardScaler(), 
                                 KNeighborsClassifier(n_neighbors=7))
neigh_11 = KNeighborsClassifier(n_neighbors=11)
neigh_11_scaling = make_pipeline(StandardScaler(), 
                                 KNeighborsClassifier(n_neighbors=11))

#Fitting
neigh_3.fit(X_train, y_train)
neigh_3_scaling.fit(X_train, y_train)
neigh_7.fit(X_train, y_train)
neigh_7_scaling.fit(X_train, y_train)
neigh_11.fit(X_train, y_train)
neigh_11_scaling.fit(X_train, y_train)

#Score
print("K-nearest neighbors\n")
print("3 neighbors: {:.5f}".format(neigh_3.score(X_test, y_test)))
print("3 neighbors + scaling: {:.5f}\n".format(neigh_3_scaling.score(X_test, y_test)))
print("7 neighbors: {:.5f}".format(neigh_7.score(X_test, y_test)))
print("7 neighbors + scaling: {:.5f}\n".format(neigh_7_scaling.score(X_test, y_test)))
print("11 neighbors: {:.5f}".format(neigh_11.score(X_test, y_test)))
print("11 neighbors + scaling: {:.5f}".format(neigh_11_scaling.score(X_test, y_test)))

K-nearest neighbors

3 neighbors: 0.77966
3 neighbors + scaling: 0.94915

7 neighbors: 0.77966
7 neighbors + scaling: 0.98305

11 neighbors: 0.81356
11 neighbors + scaling: 1.00000


### Naive Bayes

In [7]:
from sklearn.naive_bayes import GaussianNB

#Model
Gauss = GaussianNB()

#Fitting
Gauss.fit(X_train, y_train)

#Score
print("Naive Bayes: {:.5f}".format(Gauss.score(X_test, y_test)))

Naive Bayes: 0.98305


### Decision tree

In [8]:
from sklearn.tree import DecisionTreeClassifier

#Models
tree = DecisionTreeClassifier()
tree_random_1 = DecisionTreeClassifier(splitter='random')
tree_random_2 = DecisionTreeClassifier(splitter='random')
tree_random_3 = DecisionTreeClassifier(splitter='random')

#Fitting
tree.fit(X_train, y_train)
tree_random_1.fit(X_train, y_train)
tree_random_2.fit(X_train, y_train)
tree_random_3.fit(X_train, y_train)

#Score
print("Decision tree\n")
print("Tree: {:.5f}".format(tree.score(X_test, y_test)))
print("Tree_random (1): {:.5f}".format(tree_random_1.score(X_test, y_test)))
print("Tree_random (2): {:.5f}".format(tree_random_2.score(X_test, y_test)))
print("Tree_random (3): {:.5f}".format(tree_random_3.score(X_test, y_test)))

Decision tree

Tree: 0.86441
Tree_random (1): 0.88136
Tree_random (2): 0.93220
Tree_random (3): 0.91525


### SVM 

In [9]:
from sklearn.svm import SVC
from sklearn.decomposition import PCA

#Models
SVM_sigm = SVC(kernel='sigmoid')
SVM_sigm_scaling = make_pipeline(StandardScaler(), SVC(kernel='sigmoid'))
SVM_sigm_scaling_PCA = make_pipeline(StandardScaler(), 
                              PCA(n_components=2), SVC(kernel='sigmoid'))

SVM_rbf = SVC(kernel='rbf')
SVM_rbf_scaling = make_pipeline(StandardScaler(), SVC(kernel='rbf'))
SVM_rbf_scaling_PCA = make_pipeline(StandardScaler(), 
                              PCA(n_components=2), SVC(kernel='rbf'))

SVM_linear = SVC(kernel='linear')
SVM_linear_scaling = make_pipeline(StandardScaler(), SVC(kernel='linear'))
SVM_linear_scaling_PCA = make_pipeline(StandardScaler(), 
                                 PCA(n_components=2), SVC(kernel='linear'))


SVM_poly_3 = SVC(kernel='poly', degree=3)
SVM_poly_3_scaling = make_pipeline(StandardScaler(), SVC(kernel='poly'))
SVM_poly_3_scaling_PCA = make_pipeline(StandardScaler(), 
                                 PCA(n_components=2), 
                                 SVC(kernel='poly', degree=3))

#Fitting
SVM_sigm.fit(X_train, y_train)
SVM_sigm_scaling.fit(X_train, y_train)
SVM_sigm_scaling_PCA.fit(X_train, y_train)

SVM_rbf.fit(X_train, y_train)
SVM_rbf_scaling.fit(X_train, y_train)
SVM_rbf_scaling_PCA.fit(X_train, y_train)

SVM_linear.fit(X_train, y_train)
SVM_linear_scaling.fit(X_train, y_train)
SVM_linear_scaling_PCA.fit(X_train, y_train)

SVM_poly_3.fit(X_train, y_train)
SVM_poly_3_scaling.fit(X_train, y_train)
SVM_poly_3_scaling_PCA.fit(X_train, y_train)

#Score
print("SVM\n")
print("Sigmoid: {:.5f}".format(SVM_sigm.score(X_test, y_test)))
print("Sigmoid (scaling): {:.5f}".format(SVM_sigm_scaling.score(X_test, y_test)))
print("Sigmoid (scaling + PCA): {:.5f}\n".format(SVM_sigm_scaling_PCA.score(X_test, y_test)))

print("RBF: {:.5f}".format(SVM_rbf.score(X_test, y_test)))
print("RBF (scaling): {:.5f}".format(SVM_rbf_scaling.score(X_test, y_test)))
print("RBF (scaling + PCA): {:.5f}\n".format(SVM_rbf_scaling_PCA.score(X_test, y_test)))

print("Linear: {:.5f}".format(SVM_linear.score(X_test, y_test)))
print("Linear (scaling): {:.5f}".format(SVM_linear_scaling.score(X_test, y_test)))
print("Linear (scaling + PCA): {:.5f}\n".format(SVM_linear_scaling_PCA.score(X_test, y_test)))

print("Poly-3: {:.5f}".format(SVM_poly_3.score(X_test, y_test)))
print("Poly-3 (scaling): {:.5f}".format(SVM_poly_3_scaling.score(X_test, y_test)))
print("Poly-3 (scaling + PCA): {:.5f}".format(SVM_poly_3_scaling_PCA.score(X_test, y_test)))


SVM

Sigmoid: 0.38983
Sigmoid (scaling): 0.98305
Sigmoid (scaling + PCA): 0.84746

RBF: 0.40678
RBF (scaling): 0.98305
RBF (scaling + PCA): 0.94915

Linear: 0.96610
Linear (scaling): 0.98305
Linear (scaling + PCA): 0.94915

Poly-3: 0.96610
Poly-3 (scaling): 0.96610
Poly-3 (scaling + PCA): 0.94915


### Random Forest

In [10]:
#Bagging & RSM of decision trees
from sklearn.ensemble import RandomForestClassifier

#Model
RF = RandomForestClassifier(n_estimators=50)

#Fitting
RF.fit(X_train, y_train)

#Score
print("Random Forest: {:.5f}".format(RF.score(X_test, y_test)))


Random Forest: 0.98305


### AdaBoost

In [11]:
#Adaptive boosting
from sklearn.ensemble import AdaBoostClassifier

#Models
Ada_DT = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(), 
                            n_estimators=50)
Ada_SVM = AdaBoostClassifier(base_estimator=SVC(kernel='linear'), 
                            n_estimators=50, algorithm='SAMME')
Ada_NB = AdaBoostClassifier(base_estimator=GaussianNB(), 
                            n_estimators=50)

#Fitting
Ada_DT.fit(X_train, y_train)
Ada_SVM.fit(X_train, y_train)
Ada_NB.fit(X_train, y_train)

#Score
print("AdaBoost\n")
print("Decision Tree: {:.5f}".format(Ada_DT.score(X_test, y_test)))
print("SVM: {:.5f}".format(Ada_SVM.score(X_test, y_test)))
print("Naive Bayes: {:.5f}".format(Ada_NB.score(X_test, y_test)))

AdaBoost

Decision Tree: 0.89831
SVM: 0.96610
Naive Bayes: 0.94915


### Gradient Boosting

In [12]:
#Boosting of regression trees
from sklearn.ensemble import GradientBoostingClassifier

#Model
GB = GradientBoostingClassifier(n_estimators=50)

#Fitting
GB.fit(X_train, y_train)

#Score
print("Gradient Boosting: {:.5f}".format(GB.score(X_test, y_test)))

Gradient Boosting: 0.91525


### Voting Classifier

In [13]:
from sklearn.ensemble import VotingClassifier

#Models
RF = RandomForestClassifier(n_estimators=50)
GB = GradientBoostingClassifier(n_estimators=50)
Ada = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=50)

#Voting
VC = VotingClassifier(estimators=[("RF", RF), ("GB", GB), ("Ada", Ada)])

#Fitting
VC.fit(X_train, y_train)

#Score
print("Voting Classifier: {:.5f}".format(VC.score(X_test, y_test)))

Voting Classifier: 0.91525


  if diff:
