<a href="https://colab.research.google.com/github/moralesbang/ml-classification/blob/master/ML_VertebraColumn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1. Carga de librerías

In [0]:
import numpy as np
import sklearn as sk
from numpy import random, matlib
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing

## 2. Carga de datos

In [78]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
data_path = '/content/drive/My Drive/TrabajoSimulación/MachineLearning/column_3C.dat'
X = np.loadtxt(data_path, usecols=np.arange(0, 6))
Y = np.loadtxt(data_path, usecols=(6), dtype='str')

# -- ENCODING CLASSES
le = preprocessing.LabelEncoder()
Y = le.fit_transform(Y)

## 3. Información de los datos


In [80]:
encoded_classes, counts = np.unique(Y, return_counts=True)
print('INFORMACIÓN GENERAL')
print('Número de muestras:', np.size(X, 0))
print('Número de características', np.size(X, 1))
print('Número de clases:', encoded_classes.size)

# -- PRINTING NUMBER OF SAMPLES PER CLASS --
print('\nNÚMERO DE MUESTRAS POR CLASE')

for i in range(0, encoded_classes.size):
  print('Class', le.classes_[i], ':', counts[i])

INFORMACIÓN GENERAL
Número de muestras: 310
Número de características 6
Número de clases: 3

NÚMERO DE MUESTRAS POR CLASE
Class DH : 60
Class NO : 100
Class SL : 150


## 4. Algoritmos de clasificación

### Helpers

In [0]:
def validate(X, Y, classifier):
  scores_train, scores_test = [], []
  skf = StratifiedKFold(n_splits=3)

  for train_index, test_index in skf.split(X, Y):
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]

    # Data normalizing
    # WARN: Try StandardScaler!
    X_train, X_test = preprocessing.scale(X_train), preprocessing.scale(X_test)

    classifier.fit(X_train, Y_train.ravel())

    score_train = classifier.score(X_train, Y_train.ravel())
    score_test = classifier.score(X_test, Y_test.ravel())

    scores_train.append(score_train)
    scores_test.append(score_test)

  train_accuracy, test_accuracy = np.mean(scores_train), np.mean(scores_test)

  return train_accuracy, test_accuracy

### 4.1. Naive Bayes


In [82]:
from sklearn.naive_bayes import GaussianNB

def nb_experiment(X, Y):
  clf = GaussianNB()
  return validate(X, Y, clf)

train_accuracy, test_accuracy = nb_experiment(X, Y)
print('Training Accuracy:\t', train_accuracy)
print('Testing Accuracy:\t', test_accuracy)

Training Accuracy:	 0.8419633225458468
Testing Accuracy:	 0.8323375653472741


### 4.2. KNN (k-nearest neighbors algorithm )

In [0]:
def knn_experiment(X, Y, k=5):
  clf = KNeighborsClassifier(n_neighbors = k) # n_neighbors = 5 by default
  return validate(X, Y, clf)

### 4.1.1. Parámetros más óptimos

In [48]:
accuracy = []

for k in range(3, 16): # -- TESTING FOR DIFFERENT VALUES OF K --
    train_accuracy, test_accuracy = knn_experiment(X, Y, k)
    accuracy.append([train_accuracy, test_accuracy, k])
    
accuracy_matrix  = np.array(accuracy)
most_accurate_index = np.argmax(accuracy_matrix[:, 1])
most_accurate = accuracy[most_accurate_index]

print('Maximum Training Accuracy:\t', most_accurate[0])
print('Maximum Testing Accuracy:\t', most_accurate[1])
print('Value of K:\t\t\t', int(most_accurate[2]))

Maximum Training Accuracy:	 0.8339274267936151
Maximum Testing Accuracy:	 0.8001929300472991
Value of K:			 6


### 4.3. Redes Neuronales Artificiales


In [0]:
# random.seed(19680801)

def ann_experiment(X, Y):
  epochs = 2500
  neurons = 12
  mlp = MLPClassifier(hidden_layer_sizes=(neurons,), activation='logistic', max_iter=epochs)
  return validate(X, Y, mlp)

### 4.4. Random Forest

In [0]:
def rf_experiment(X, Y, n_estimators, max_features):
  clf = RandomForestClassifier(n_estimators=n_estimators, max_features=max_features)
  return validate(X, Y, clf)

### 4.4.1. Parámetros más óptimos

In [124]:
accuracy = []
for n_estimators in [5, 10, 20, 50, 100]:
  train_accuracy, test_accuracy = rf_experiment(X, Y, n_estimators, 'auto')
  accuracy.append([train_accuracy, test_accuracy, n_estimators])
    
accuracy_matrix  = np.array(accuracy)
most_accurate_index = np.argmax(accuracy_matrix[:, 1])
most_accurate = accuracy[most_accurate_index]

print('Maximum Training Accuracy:\t', most_accurate[0])
print('Maximum Testing Accuracy:\t', most_accurate[1])
print('Number of estimators:\t\t', most_accurate[2])
# print('Number of max features:\t\t\t', most_accurate[3])

Maximum Training Accuracy:	 0.9967715710645217
Maximum Testing Accuracy:	 0.8419218322130942
Number of estimators:		 20
