In [None]:
import numpy as np
import pandas as pd

In [None]:
import numpy as np
import pandas as pd

# Carregar os dados
df_kc2 = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/AM/lista-3/kc2.csv")

# Dividindo os dados em características (X) e rótulo (y)
X = df_kc2.iloc[:, :-1].values  # 21 primeiras colunas são características
y = df_kc2.iloc[:, -1].values.reshape(-1, 1)  # Última coluna é a saída

# Normalizando os dados de entrada
X_normalized = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))

df_kc2.head()

Unnamed: 0,4.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00.1,1.000000000000000000e+00.2,4.000000000000000000e+00.1,8.000000000000000000e+00,6.700000000000000400e-01,1.500000000000000000e+00,5.330000000000000071e+00,1.200000000000000000e+01,...,2.000000000000000000e+00,0.000000000000000000e+00.1,0.000000000000000000e+00.2,0.000000000000000000e+00.3,3.000000000000000000e+00,1.000000000000000000e+00.3,3.000000000000000000e+00.1,1.000000000000000000e+00.4,1.000000000000000000e+00.5,0.000000000000000000e+00.4
0,39.0,4.0,1.0,2.0,105.0,520.19,0.07,13.89,37.44,7227.91,...,29.0,1.0,4.0,2.0,12.0,19.0,61.0,44.0,7.0,0.0
1,1.0,1.0,1.0,1.0,6.0,15.51,0.4,2.5,6.2,38.77,...,0.0,0.0,0.0,0.0,5.0,1.0,5.0,1.0,1.0,0.0
2,15.0,1.0,1.0,1.0,55.0,224.81,0.17,5.73,39.25,1287.55,...,12.0,0.0,1.0,0.0,6.0,11.0,34.0,21.0,1.0,0.0
3,12.0,2.0,1.0,2.0,15.0,45.0,0.17,6.0,7.5,270.0,...,8.0,1.0,0.0,0.0,6.0,2.0,11.0,4.0,3.0,0.0
4,8.0,1.0,1.0,1.0,13.0,43.19,0.27,3.75,11.52,161.94,...,6.0,0.0,0.0,0.0,6.0,4.0,8.0,5.0,1.0,0.0


In [None]:
def euclidean_distance(a, b):
    return np.sqrt(np.sum((a - b) ** 2))

def mahalanobis_distance(a, b, VI):
    delta = a - b
    return np.sqrt(np.dot(np.dot(delta, VI), delta.T))


In [None]:
def cross_val_indices(n_samples, n_folds):
    indices = np.arange(n_samples)
    np.random.shuffle(indices)
    fold_sizes = np.full(n_folds, n_samples // n_folds, dtype=int)
    fold_sizes[:n_samples % n_folds] += 1
    current = 0
    folds = []
    for fold_size in fold_sizes:
        start, stop = current, current + fold_size
        folds.append(indices[start:stop])
        current = stop
    return folds

n_samples = X.shape[0]
n_folds = 10
folds = cross_val_indices(n_samples, n_folds)


In [None]:
VI = np.linalg.inv(np.cov(X.T))

def knn_predict(X_train, y_train, X_test, k, distance_metric):
    y_pred = []
    for test_point in X_test:
        distances = []
        for i, train_point in enumerate(X_train):
            if distance_metric == 'euclidean':
                dist = euclidean_distance(test_point, train_point)
            elif distance_metric == 'mahalanobis':
                dist = mahalanobis_distance(test_point, train_point, VI)
            distances.append((dist, y_train[i][0]))  # Acessar o valor escalar de y_train

        distances.sort(key=lambda x: x[0])
        neighbors = [label for _, label in distances[:k]]
        pred = max(set(neighbors), key=neighbors.count)
        y_pred.append(pred)

    return np.array(y_pred)



In [None]:
def evaluate_knn(X, y, folds, k, metric):
    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []

    for i in range(n_folds):
        test_idx = folds[i]
        train_idx = np.hstack([folds[j] for j in range(n_folds) if j != i])

        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        y_pred = knn_predict(X_train, y_train, X_test, k, metric)

        # métricas
        accuracy = np.sum(y_pred == y_test.flatten()) / len(y_test)
        precision_score = np.sum(np.logical_and(y_pred == 1, y_test == 1)) / np.sum(y_pred == 1)
        recall_score = np.sum(np.logical_and(y_pred == 1, y_test == 1)) / np.sum(y_test == 1)
        f1_score = 2 * precision_score * recall_score / (precision_score + recall_score)

        # armazenando as métricas
        precisions.append(precision_score)
        recalls.append(recall_score)
        f1_scores.append(f1_score)
        accuracies.append(accuracy)

    return np.mean(accuracies), np.std(accuracies), np.mean(precisions), np.std(precisions), np.mean(recalls), np.std(recalls), np.mean(f1_scores), np.std(f1_scores)




In [None]:
metrics = ['euclidean', 'mahalanobis']
k_values = [1, 5]

results = {}

for k in k_values:
    for metric in metrics:
        mean_accuracy, std_accuracy, mean_precision, std_precision, mean_recall, std_recall, mean_f1_score, std_f1_score = evaluate_knn(X_normalized, y, folds, k, metric)
        results[(k, metric)] = (mean_accuracy, std_accuracy, mean_precision, std_precision, mean_recall, std_recall, mean_f1_score, std_f1_score)

for key in results:
    k, metric = key
    mean_accuracy, std_accuracy, mean_precision, std_precision, mean_recall, std_recall, mean_f1_score, std_f1_score = results[key]
    print(f"K={k}, Metric={metric}:\n"
          f"  Mean Accuracy={mean_accuracy}, Std Accuracy={std_accuracy}\n"
          f"  Mean Precision={mean_precision}, Std Precision={std_precision}\n"
          f"  Mean Recall={mean_recall}, Std Recall={std_recall}\n"
          f"  Mean F1 Score={mean_f1_score}, Std F1 Score={std_f1_score}\n")


K=1, Metric=euclidean:
  Mean Accuracy=0.6491341991341992, Std Accuracy=0.12346563964251593
  Mean Precision=10.7, Std Precision=1.2688577540449522
  Mean Recall=13.6, Std Recall=1.9078784028338913
  Mean F1 Score=11.842701096005445, Std F1 Score=0.8944571434815637

K=1, Metric=mahalanobis:
  Mean Accuracy=0.6341991341991342, Std Accuracy=0.13628459194663728
  Mean Precision=10.7, Std Precision=1.2688577540449522
  Mean Recall=12.3, Std Recall=1.7916472867168918
  Mean F1 Score=11.2836295878035, Std F1 Score=0.6222798534118399

K=5, Metric=euclidean:
  Mean Accuracy=0.7796536796536795, Std Accuracy=0.07993316298730271
  Mean Precision=10.7, Std Precision=1.2688577540449522
  Mean Recall=10.2, Std Recall=1.8330302779823362
  Mean F1 Score=10.351657694962043, Std F1 Score=1.2984267851579292

K=5, Metric=mahalanobis:
  Mean Accuracy=0.7696969696969698, Std Accuracy=0.07032600439239273
  Mean Precision=10.7, Std Precision=1.2688577540449522
  Mean Recall=9.8, Std Recall=1.661324772583615
 

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

def evaluate_decision_tree(X, y, folds, criterion):
    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []

    for i in range(n_folds):
        test_idx = folds[i]
        train_idx = np.hstack([folds[j] for j in range(n_folds) if j != i])

        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        clf = DecisionTreeClassifier(criterion=criterion)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        precision_score = np.sum(np.logical_and(y_pred == 1, y_test == 1)) / np.sum(y_pred == 1)
        recall_score = np.sum(np.logical_and(y_pred == 1, y_test == 1)) / np.sum(y_test == 1)
        f1_score = 2 * precision_score * recall_score / (precision_score + recall_score)

        precisions.append(precision_score)
        recalls.append(recall_score)
        f1_scores.append(f1_score)
        accuracies.append(accuracy)

    return np.mean(accuracies), np.std(accuracies), np.mean(precisions), np.std(precisions), np.mean(recalls), np.std(recalls), np.mean(f1_scores), np.std(f1_scores)


In [None]:
criteria = ['gini', 'entropy']
results = {}



for criterion in criteria:
    mean_accuracy, std_accuracy, mean_precision, std_precision, mean_recall, std_recall, mean_f1_score, std_f1_score = evaluate_decision_tree(X_normalized, y, folds, criterion)
    results[(k, metric)] = (mean_accuracy, std_accuracy, mean_precision, std_precision, mean_recall, std_recall, mean_f1_score, std_f1_score)


for key in results:
    mean_accuracy, std_accuracy, mean_precision, std_precision, mean_recall, std_recall, mean_f1_score, std_f1_score = results[key]
    print(
          f"  Mean Accuracy={mean_accuracy}, Std Accuracy={std_accuracy}\n"
          f"  Mean Precision={mean_precision}, Std Precision={std_precision}\n"
          f"  Mean Recall={mean_recall}, Std Recall={std_recall}\n"
          f"  Mean F1 Score={mean_f1_score}, Std F1 Score={std_f1_score}\n")



  Mean Accuracy=0.7004329004329004, Std Accuracy=0.06565402846214842
  Mean Precision=10.7, Std Precision=1.2688577540449522
  Mean Recall=10.3, Std Recall=2.238302928559939
  Mean F1 Score=10.29339356295878, Std F1 Score=1.0892693843196826

