#Lista 3 - Aprendizado de maquina
##Aluno: Pedro Gabriel Castelo Garcez
##Matricula: 535926

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from prettytable import PrettyTable
from sklearn import tree

#Utilidades

In [None]:

def display_accuracy_tables(means, std_deviations):
    """Print a three-column table of metric name, mean and standard deviation.

    Both arguments are indexed by position 0..3; the rows are labelled, in
    that order, "Acuracia", "Revocacao", "Precisao" and "F1". Every value is
    rendered with four decimal places.
    """
    metric_labels = ("Acuracia", "Revocacao", "Precisao", "F1")

    table = PrettyTable()
    table.title = "Metricas"
    table.field_names = ["Metrica", "Media", "Desvio Padrao"]

    for position, label in enumerate(metric_labels):
        mean_text = format(means[position], ".4f")
        std_text = format(std_deviations[position], ".4f")
        table.add_row([label, mean_text, std_text])

    print(table)

#Questao 1

In [None]:
# Load the dataset: the first 21 columns are used as features and the last
# column (kept 2-D, shape (n, 1)) as the target.
df = pd.read_csv("kc2.csv")
x, y = df.values[:, :21], df.values[:, -1:]

# Hold out 20% of the rows as a test split.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [None]:
class KFoldValidator:
    """K-fold cross-validation over a fixed (X, Y) pair.

    The rows are split, in their given order (no shuffling), into ``k``
    contiguous test folds. Fold sizes differ by at most one, so every row
    appears in exactly one test fold even when the number of rows is not a
    multiple of ``k``.

    Attributes set by the constructor:
      folds -- list of ``(train_indices, test_indices)`` index arrays
      X, Y  -- the arrays passed in, indexed by those index arrays
      k     -- number of folds
    """

    def __init__(self, X, Y, k=10):
        """Pre-compute the train/test index arrays for each of the ``k`` folds.

        Raises:
            ValueError: if ``k`` is smaller than 2 or larger than the number
                of rows in ``X`` (either would yield an empty train or test
                set).
        """
        n = X.shape[0]
        if k < 2 or k > n:
            raise ValueError(
                f"k must be between 2 and the number of samples ({n}), got {k}"
            )

        self.k = k
        # array_split spreads the n % k leftover rows over the first folds
        # instead of silently leaving them out of every test fold.
        test_chunks = np.array_split(np.arange(n), k)
        self.folds = [
            (
                np.concatenate(test_chunks[:fold] + test_chunks[fold + 1:]),
                test_indices,
            )
            for fold, test_indices in enumerate(test_chunks)
        ]
        self.X = X
        self.Y = Y

    def validate_model(self, model, calculate_score):
        """Fit and score ``model`` once per fold.

        Args:
            model: object exposing ``fit(X, y)`` and ``predict(X)``.
            calculate_score: callable ``(y_true, predictions)`` returning the
                scores of one fold as a row of numbers.

        Returns:
            Tuple ``(means, std_deviations)``: the column-wise mean and
            standard deviation of the per-fold score rows.
        """
        fold_scores = []
        for train_indices, test_indices in self.folds:
            model.fit(self.X[train_indices], self.Y[train_indices])
            predictions = model.predict(self.X[test_indices])
            fold_scores.append(calculate_score(self.Y[test_indices], predictions))

        # vstack keeps one row per fold whether the callback returned a
        # (1, m) or an (m,) array.
        all_scores = np.vstack(fold_scores)
        return (np.mean(all_scores, axis=0), np.std(all_scores, axis=0))

In [None]:
class KNearestNeighbor:
    """k-nearest-neighbour classifier with Euclidean or Mahalanobis distance.

    ``fit`` stores the training rows and labels and pre-computes the
    pseudo-inverse of the training covariance matrix. ``predict`` labels each
    query row with the most frequent class among its ``k`` nearest training
    rows.
    """

    def __init__(self, distance_func='euclidian', k=1):
        """Configure the classifier.

        Args:
            distance_func: ``'euclidian'`` or ``'mahalanobis'``.
            k: number of neighbours that vote; must be at least 1.

        Raises:
            ValueError: on an unknown ``distance_func`` or ``k < 1``.
        """
        if k < 1:
            raise ValueError(f"k must be at least 1, got {k}")

        self.training_data = None
        self.class_data = None
        self.k = k
        self.inv_cov = None

        neighbour_finders = {
            'mahalanobis': self._mahalanobis_distance,
            'euclidian': self._euclidian_distance,
        }
        if distance_func not in neighbour_finders:
            # Fail at construction instead of leaving self.distance_func unset.
            raise ValueError(
                f"unknown distance_func {distance_func!r}; "
                f"expected one of {sorted(neighbour_finders)}"
            )
        self.distance_func = neighbour_finders[distance_func]

    def _k_nearest(self, distances):
        """Return the indices (in no particular order) of the k smallest distances."""
        # kth = k - 1 puts the k smallest values in the first k slots and,
        # unlike kth = k, stays in bounds when k equals the training size.
        return np.argpartition(distances, self.k - 1)[:self.k]

    def _stack_neighbours(self, neighbours):
        """Turn a list of per-query index arrays into an (n_queries, k) array."""
        if not neighbours:
            return np.empty((0, self.k), dtype=int)
        return np.array(neighbours, dtype=int).reshape(-1, self.k)

    def _mahalanobis_distance(self, X):
        """Return, per row of X, the indices of its k nearest training rows
        under the Mahalanobis distance defined by ``self.inv_cov``."""
        neighbours = []
        for x in X:
            diff = self.training_data - x
            # Per-row quadratic form diff_i @ inv_cov @ diff_i (not the sum
            # over a row of the full n x n product matrix).
            squared = np.sum((diff @ self.inv_cov) * diff, axis=1)
            # Round-off can push a tiny squared distance below zero.
            distance = np.sqrt(np.maximum(squared, 0.0))
            neighbours.append(self._k_nearest(distance))

        return self._stack_neighbours(neighbours)

    def _euclidian_distance(self, X):
        """Return, per row of X, the indices of its k nearest training rows
        under the Euclidean distance."""
        neighbours = []
        for x in X:
            distance = np.sqrt(np.sum((self.training_data - x) ** 2, axis=1))
            neighbours.append(self._k_nearest(distance))

        return self._stack_neighbours(neighbours)

    def fit(self, X, Y):
        """Store the training set and the pseudo-inverse of its covariance.

        Args:
            X: 2-D array, one training sample per row (at least two rows,
                since the covariance is normalised by ``n - 1``).
            Y: labels, indexable by training-row index.
        """
        self.training_data = X
        self.class_data = Y

        features_means = np.mean(X, axis=0).reshape(1, -1)
        centered_features = X - features_means
        n = X.shape[0]
        cov = 1 / (n - 1) * (np.transpose(centered_features) @ centered_features)

        # pinv instead of inv so a singular covariance does not raise.
        self.inv_cov = np.linalg.pinv(cov)

    def predict(self, X):
        """Predict a class for every row of X.

        Returns:
            Integer array of shape (n_queries, 1). When several classes tie
            for the highest count, the smallest label is chosen.
        """
        candidates = self.distance_func(X)
        pred = []
        for candidate in candidates:
            labels, counts = np.unique(self.class_data[candidate], return_counts=True)
            # np.unique returns sorted labels and argmax returns the first
            # maximum, so ties resolve to the smallest label.
            pred.append(int(labels[np.argmax(counts)]))

        return np.array(pred, dtype=int).reshape(-1, 1)

In [None]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def calculate_scores(y_test, predictions):
    """Compute accuracy, recall, precision and F1 for binary labels.

    The class equal to ``1`` is treated as the positive class. A metric whose
    denominator is zero (for example recall when ``y_test`` has no positive
    sample) is reported as 0.0 instead of raising or yielding NaN.

    Args:
        y_test: true labels, shape (n,) or (n, 1).
        predictions: predicted labels, shape (n,) or (n, 1).

    Returns:
        Array of shape (1, 4): ``[[accuracy, recall, precision, f1]]``.

    Raises:
        ValueError: if the two inputs do not hold the same number of labels.
    """
    truth = np.asarray(y_test).reshape(-1)
    predicted = np.asarray(predictions).reshape(-1)
    if truth.shape != predicted.shape:
        raise ValueError(
            f"y_test has {truth.shape[0]} labels but predictions has "
            f"{predicted.shape[0]}"
        )

    correct = truth == predicted
    actual_positive = truth == 1
    predicted_positive = predicted == 1

    true_positives = np.count_nonzero(correct & actual_positive)
    false_negatives = np.count_nonzero(~correct & actual_positive)
    false_positives = np.count_nonzero(~correct & predicted_positive)

    accuracy = _safe_ratio(np.count_nonzero(correct), truth.shape[0])
    recall = _safe_ratio(true_positives, true_positives + false_negatives)
    precision = _safe_ratio(true_positives, true_positives + false_positives)
    f1_score = _safe_ratio(2 * precision * recall, precision + recall)

    return np.array([accuracy, recall, precision, f1_score]).reshape(1, -1)



In [None]:
# Cross-validate on the training split only, with the default of 10 folds
# spelled out.
validator = KFoldValidator(X_train, y_train, k=10)
validator

<__main__.KFoldValidator at 0x7f8c0c6759d0>

In [None]:
# Baseline run: 1-nearest-neighbour with Euclidean distance (the constructor
# defaults, written out explicitly).
knn = KNearestNeighbor(distance_func='euclidian', k=1)
means, std_deviations = validator.validate_model(model=knn, calculate_score=calculate_scores)
display_accuracy_tables(means, std_deviations)

+------------------------------------+
|              Metricas              |
+-----------+--------+---------------+
|  Metrica  | Media  | Desvio Padrao |
+-----------+--------+---------------+
|  Acuracia | 0.6882 |     0.0913    |
| Revocacao | 0.6719 |     0.1752    |
|  Precisao | 0.6942 |     0.0976    |
|     F1    | 0.6720 |     0.1158    |
+-----------+--------+---------------+


In [None]:
# 5-nearest-neighbour with Euclidean distance.
knn = KNearestNeighbor(k=5)
# Unpack into std_deviations (plural), the name the display call reads;
# otherwise the table shows the spread left over from an earlier run.
means, std_deviations = validator.validate_model(knn, calculate_scores)
display_accuracy_tables(means, std_deviations)

+------------------------------------+
|              Metricas              |
+-----------+--------+---------------+
|  Metrica  | Media  | Desvio Padrao |
+-----------+--------+---------------+
|  Acuracia | 0.6353 |     0.0913    |
| Revocacao | 0.9664 |     0.1752    |
|  Precisao | 0.5847 |     0.0976    |
|     F1    | 0.7246 |     0.1158    |
+-----------+--------+---------------+


In [None]:
# 1-nearest-neighbour with Mahalanobis distance.
knn_mahalanobis = KNearestNeighbor(distance_func="mahalanobis")
# Unpack into std_deviations (plural), the name the display call reads;
# otherwise the table shows the spread left over from an earlier run.
means, std_deviations = validator.validate_model(knn_mahalanobis, calculate_scores)
display_accuracy_tables(means, std_deviations)

+------------------------------------+
|              Metricas              |
+-----------+--------+---------------+
|  Metrica  | Media  | Desvio Padrao |
+-----------+--------+---------------+
|  Acuracia | 0.6647 |     0.0913    |
| Revocacao | 0.7099 |     0.1752    |
|  Precisao | 0.6552 |     0.0976    |
|     F1    | 0.6742 |     0.1158    |
+-----------+--------+---------------+


  distance = np.sqrt(np.sum(diff @ self.inv_cov @ diff.T,axis=1)).reshape(-1, 1)


In [None]:
# 5-nearest-neighbour with Mahalanobis distance.
knn_mahalanobis = KNearestNeighbor(k=5, distance_func="mahalanobis")
# Unpack into std_deviations (plural), the name the display call reads;
# otherwise the table shows the spread left over from an earlier run.
means, std_deviations = validator.validate_model(knn_mahalanobis, calculate_scores)
display_accuracy_tables(means, std_deviations)

+------------------------------------+
|              Metricas              |
+-----------+--------+---------------+
|  Metrica  | Media  | Desvio Padrao |
+-----------+--------+---------------+
|  Acuracia | 0.5882 |     0.0913    |
| Revocacao | 0.9532 |     0.1752    |
|  Precisao | 0.5503 |     0.0976    |
|     F1    | 0.6950 |     0.1158    |
+-----------+--------+---------------+


  distance = np.sqrt(np.sum(diff @ self.inv_cov @ diff.T,axis=1)).reshape(-1, 1)


In [None]:
# Decision tree with scikit-learn's default settings.
clf = tree.DecisionTreeClassifier()
# Unpack into std_deviations (plural), the name the display call reads;
# otherwise the table shows the spread left over from an earlier run.
means, std_deviations = validator.validate_model(clf, calculate_scores)
display_accuracy_tables(means, std_deviations)

(17, 1)
(17, 1)
(17, 1)
(17, 1)
(17, 1)
(17, 1)
(17, 1)
(17, 1)
(17, 1)
(17, 1)
(17, 1)
(17, 1)
(17, 1)
(17, 1)
(17, 1)
(17, 1)
(17, 1)
(17, 1)
(17, 1)
(17, 1)
+------------------------------------+
|              Metricas              |
+-----------+--------+---------------+
|  Metrica  | Media  | Desvio Padrao |
+-----------+--------+---------------+
|  Acuracia | 0.7471 |     0.0913    |
| Revocacao | 0.7177 |     0.1752    |
|  Precisao | 0.7600 |     0.0976    |
|     F1    | 0.7349 |     0.1158    |
+-----------+--------+---------------+
