In [3]:
# Imports
import numpy as np
from sklearn import tree
from sklearn.naive_bayes import BernoulliNB
from sklearn import neighbors
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_predict
import random

def _scores(y_true, y_pred):
    """Return ``(accuracy, weighted F1)`` of ``y_pred`` against ``y_true``."""
    return (
        accuracy_score(y_true, y_pred),
        f1_score(y_true, y_pred, average='weighted'),
    )


# For each dataset: compare a decision tree, a Bernoulli naive Bayes and a KNN
# classifier on a held-out validation split, then with 10-fold
# cross-validation on the whole dataset.
for fichier in ['spam.csv', 'galaxy_feature_vectors.csv']:
    print(f"\n==={fichier}===\n")
    entries = np.loadtxt(fichier, delimiter=',')

    # Shuffle whole rows in place. The stdlib ``random.shuffle`` must not be
    # used on a 2-D array: its element swap operates on row *views* and ends
    # up duplicating rows instead of permuting them (and its second
    # positional argument was removed in Python 3.11).
    np.random.shuffle(entries)

    # The last column holds the class label, every other column is a feature.
    X = entries[:, :-1]
    Y = entries[:, -1]

    # 70% train; the remaining 30% is split again into test (34%) and
    # validation (66%). The test part is held out and not used below.
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)
    X_test, X_validation, Y_test, Y_validation = train_test_split(
        X_test, Y_test, test_size=0.66
    )

    # DECISION TREE
    print("ARBRE DE DÉCISION -----")
    # ``None`` lets the tree grow until its leaves are pure.
    for depth in [3, 5, 10, None]:
        model = tree.DecisionTreeClassifier(criterion='entropy', max_depth=depth)
        model.fit(X_train, Y_train)

        acc, f1 = _scores(Y_validation, model.predict(X_validation))
        print(f'Correct classification rate for validation dataset (depth: {depth}) = {acc * 100}%')
        print('f1 score: %s' % f1)

    # NAIVE BAYES (BERNOULLI)
    print("")
    print("BERNOULLI -------------")
    model = BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
    model.fit(X_train, Y_train)

    acc, f1 = _scores(Y_validation, model.predict(X_validation))
    print(f'Correct classification rate for the validation dataset = {acc * 100}%')
    print('f1 score: %s' % f1)

    print("BERNOULLI AVEC NORMALISATION")
    # Fit the scaler on the training data only and reuse it for validation, so
    # both sets are mapped with the same min/max and no validation statistics
    # leak into the preprocessing.
    scaler = MinMaxScaler().fit(X_train)
    X_train_norm = scaler.transform(X_train)
    X_valid_norm = scaler.transform(X_validation)
    model = BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
    model.fit(X_train_norm, Y_train)

    acc, f1 = _scores(Y_validation, model.predict(X_valid_norm))
    print(f'Correct classification rate for the validation dataset = {acc * 100}%')
    print('f1 score: %s' % f1)

    # KNN
    print("")
    print("KNN -------------------")
    for nn in [3, 5, 10]:
        for w in ['uniform', 'distance']:
            model = neighbors.KNeighborsClassifier(
                nn, weights=w, algorithm='brute', metric='euclidean'
            )
            model.fit(X_train, Y_train)

            acc, f1 = _scores(Y_validation, model.predict(X_validation))
            print(f'Correct classification rate for the validation dataset (neighbors: {nn}, weight: {w}) = {acc * 100}%')
            print('f1 score: %s' % f1)

    # ==== 10-fold CV on the full dataset ====
    cv_models = [
        ('Decision Tree', tree.DecisionTreeClassifier(max_depth=None)),
        ('Bernoulli', BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)),
        ('KNN', neighbors.KNeighborsClassifier(
            3, weights='distance', algorithm='brute', metric='euclidean'
        )),
    ]
    for title, model in cv_models:
        print(f'\n{title}\n')
        # Each sample is predicted by a model fitted on the folds that do not
        # contain it, so the scores cover the whole dataset.
        y_cv_pred = cross_val_predict(model, X, Y, cv=10)
        acc, f1 = _scores(Y, y_cv_pred)
        print(f'Correct classification rate with 10-fold ({acc * 100}%)')
        print('f1 score: %s' % f1)



===spam.csv===

ARBRE DE DÉCISION -----
Correct classification rate for validation dataset (depth: 3) = 87.38574040219378%
f1 score: 0.8726586755015376
Correct classification rate for validation dataset (depth: 5) = 90.49360146252286%
f1 score: 0.9047440679254657
Correct classification rate for validation dataset (depth: 10) = 93.96709323583181%
f1 score: 0.9396412162576825
Correct classification rate for validation dataset (depth: None) = 95.24680073126143%
f1 score: 0.9524680073126143

BERNOULLI -------------
Correct classification rate for the validation dataset = 90.12797074954297%
f1 score: 0.9009758546794986
BERNOULLI AVEC NORMALISATION
Correct classification rate for the validation dataset = 90.12797074954297%
f1 score: 0.9010803782302913

KNN -------------------
Correct classification rate for the validation dataset (neighbors: 3, weight: uniform) = 85.74040219378428%
f1 score: 0.8579248135680351
Correct classification rate for the validation dataset (neighbors: 3, weight: dis