# Logistic regression, statistical methods, KNN and decision trees

### 1 - Dataset: jsvulnerability_balanced.csv. 

### a) cv = 10 folds. Use the following models:
**Logistic Regression;<br>
Gaussian Discriminant Analysis;<br>
Gaussian Naive Bayes;<br>
KNN (k = 1 and k = 5; distances: Euclidean and Mahalanobis);<br>
Decision tree (Gini and entropy).**

### b) For each model, report the mean and standard deviation of the accuracy, recall, precision and F1-score.

In [7]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import DistanceMetric
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier


In [2]:
# Parse the comma-separated dataset into a single float matrix; the bare
# name on the last line makes the notebook display the loaded array.
database = np.genfromtxt(
    fname="../data/jsvulnerability_balanced.csv",
    delimiter=",",
)
database

array([[7.67400000e+03, 1.27000000e+02, 7.67400000e+03, ...,
        0.00000000e+00, 1.00000000e+02, 0.00000000e+00],
       [2.00000000e+00, 3.13700000e+04, 2.00000000e+00, ...,
        1.00000000e+00, 1.00000000e+02, 0.00000000e+00],
       [1.69700000e+03, 3.30000000e+01, 1.70000000e+03, ...,
        1.00000000e+00, 5.00000000e+01, 0.00000000e+00],
       ...,
       [6.09600000e+03, 5.10000000e+01, 6.10200000e+03, ...,
        2.00000000e+00, 3.33333333e+01, 1.00000000e+00],
       [6.09800000e+03, 3.80000000e+01, 6.10000000e+03, ...,
        1.00000000e+00, 1.00000000e+02, 1.00000000e+00],
       [6.10300000e+03, 5.50000000e+01, 6.10800000e+03, ...,
        1.00000000e+00, 1.11111111e+01, 1.00000000e+00]])

In [3]:
# Columns 0..37 are used as the feature matrix and column 38 as the target
# vector (presumably the last column of the file — confirm against the CSV).
x, y = database[:, :38], database[:, 38]

In [4]:
# Sanity check: display the dimensions of the feature matrix and the target.
(np.shape(x), np.shape(y))

((2404, 38), (2404,))

### a) Cross-validation and evaluation of models:

In [9]:
import warnings

# Install a catch-all "ignore" filter. NOTE: this silences every warning
# category for the rest of the session, not only the ones raised by the
# models fitted below.
warnings.simplefilter("ignore")

In [10]:
# Per-fold containers. Every list receives exactly one array per fold, in
# fold order, so entry i of each prediction list lines up with
# y_test_original[i].
y_test_original = []
pred_lr_1 = []
pred_lr = []
pred_adgq = []
pred_nbg = []
pred_euc_1 = []
pred_mah_1 = []
pred_euc_5 = []
pred_mah_5 = []
pred_tree_gini = []
pred_tree_etp = []

# 10-fold cross-validation with shuffling. The seed is fixed so that the
# folds, and therefore every metric reported below, are reproducible between
# runs.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

for train_index, test_index in kf.split(x):
    X_train, X_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # The scaler is fitted on the training fold only and then applied to both
    # parts, so no statistics of the test fold leak into training.
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    y_test_original.append(y_test)

    # Logistic Regression: default
    clf_lr_1 = LogisticRegression(max_iter=200).fit(X_train_scaled, y_train)  # default: penalty = l2, solver = lbfgs
    pred_clf_lr_1 = clf_lr_1.predict(X_test_scaled)
    pred_lr_1.append(pred_clf_lr_1)

    # Logistic Regression: penalty = 'none', solver='newton-cg'
    # NOTE(review): the string 'none' is deprecated in recent scikit-learn
    # releases in favour of penalty=None — confirm the installed version
    # before changing it.
    clf_lr = LogisticRegression(penalty='none', solver='newton-cg').fit(X_train_scaled, y_train)
    pred_clf_lr = clf_lr.predict(X_test_scaled)
    pred_lr.append(pred_clf_lr)

    # Gaussian Discriminant Analysis - Quadratic:
    clf_adgq = QuadraticDiscriminantAnalysis().fit(X_train_scaled, y_train)
    pred_clf_adgq = clf_adgq.predict(X_test_scaled)
    pred_adgq.append(pred_clf_adgq)

    # Naive Bayes Gaussian:
    clf_nbg = GaussianNB().fit(X_train_scaled, y_train)
    pred_clf_nbg = clf_nbg.predict(X_test_scaled)
    pred_nbg.append(pred_clf_nbg)

    # Inverse covariance of the scaled training fold, shared by both
    # Mahalanobis KNN models below (it only depends on the fold, not on k).
    # np.linalg.inv raises LinAlgError if the covariance matrix is singular.
    mahalanobis_vi = np.linalg.inv(np.cov(X_train_scaled.T))

    # KNN => k = 1
    neigh_euc_1 = KNeighborsClassifier(n_neighbors=1, metric='euclidean').fit(X_train_scaled, y_train)
    pred_neigh_euc_1 = neigh_euc_1.predict(X_test_scaled)
    pred_euc_1.append(pred_neigh_euc_1)

    neigh_mah_1 = KNeighborsClassifier(n_neighbors=1, metric='mahalanobis',
                                       metric_params={'VI': mahalanobis_vi}).fit(X_train_scaled, y_train)
    pred_neigh_mah_1 = neigh_mah_1.predict(X_test_scaled)
    pred_mah_1.append(pred_neigh_mah_1)

    # KNN => k = 5
    neigh_euc_5 = KNeighborsClassifier(n_neighbors=5, metric='euclidean').fit(X_train_scaled, y_train)
    pred_neigh_euc_5 = neigh_euc_5.predict(X_test_scaled)
    pred_euc_5.append(pred_neigh_euc_5)

    neigh_mah_5 = KNeighborsClassifier(n_neighbors=5, metric='mahalanobis',
                                       metric_params={'VI': mahalanobis_vi}).fit(X_train_scaled, y_train)
    pred_neigh_mah_5 = neigh_mah_5.predict(X_test_scaled)
    pred_mah_5.append(pred_neigh_mah_5)

    # Decision Tree: criterion = gini (default)
    clf_tree_gini = tree.DecisionTreeClassifier(criterion='gini').fit(X_train_scaled, y_train)
    pred_clf_tree_gini = clf_tree_gini.predict(X_test_scaled)
    pred_tree_gini.append(pred_clf_tree_gini)

    # Decision Tree: criterion = entropy
    clf_tree_etp = tree.DecisionTreeClassifier(criterion='entropy').fit(X_train_scaled, y_train)
    pred_clf_tree_etp = clf_tree_etp.predict(X_test_scaled)
    pred_tree_etp.append(pred_clf_tree_etp)

### b) Metrics

In [11]:
def metrics(y_test, pred):
    """Print the mean and standard deviation of four metrics over the folds.

    For each fold a ``classification_report`` is computed, and its accuracy
    together with the macro-averaged precision, recall and F1-score are
    collected. The mean and the standard deviation (``np.std`` with its
    default settings) of each metric across the folds are then printed.

    Args:
        y_test: Sequence holding one collection of true labels per fold.
        pred: Sequence holding one collection of predicted labels per fold,
            in the same fold order as ``y_test``.

    Returns:
        None. The summary is printed, not returned.

    Raises:
        ValueError: If ``y_test`` and ``pred`` contain a different number of
            folds.
    """
    # Fail early and explicitly instead of hitting an IndexError halfway
    # through, or silently ignoring surplus predictions.
    if len(y_test) != len(pred):
        raise ValueError(
            f'y_test has {len(y_test)} folds but pred has {len(pred)} folds'
        )

    precision = []
    recall = []
    f1_score = []
    accuracy = []
    for fold_true, fold_pred in zip(y_test, pred):
        report = classification_report(fold_true, fold_pred, output_dict=True)
        macro_avg = report['macro avg']
        precision.append(macro_avg['precision'])
        recall.append(macro_avg['recall'])
        f1_score.append(macro_avg['f1-score'])
        accuracy.append(report['accuracy'])

    print(
        f' Acurácia: {np.mean(accuracy)} +- {np.std(accuracy)} \n'
        f' Precisão: {np.mean(precision)} +- {np.std(precision)} \n'
        f' Revocação: {np.mean(recall)} +- {np.std(recall)} \n'
        f' F1-score: {np.mean(f1_score)} +- {np.std(f1_score)}'
    )

In [13]:
# Summaries for the two logistic-regression variants fitted in the
# cross-validation loop.
print('Logistic Regression - default:')
metrics(y_test_original, pred_lr_1)
print('\n')
# pred_lr holds the predictions of the unpenalised model
# (penalty='none', solver='newton-cg'); L2 is the penalty of the default
# model reported above, so the heading must not say "L2" here.
print('Logistic Regression - penalty none and solver = newton-cg:')
metrics(y_test_original, pred_lr)

Logistic Regression - default:
 Acurácia: 0.7920055325034577 +- 0.02550414360191054 
 Precisão: 0.7942683272219808 +- 0.025906136130885427 
 Revocação: 0.7930790062277875 +- 0.02524614553820512 
 F1-score: 0.7914037689389367 +- 0.025511736109082098


Logistic Regression - penalty none and solver = newton-cg:
 Acurácia: 0.7965923236514523 +- 0.0224722353947443 
 Precisão: 0.7971873789525485 +- 0.023515192015251905 
 Revocação: 0.7970952650106613 +- 0.022878566647799636 
 F1-score: 0.7960591810067392 +- 0.022580766050030118


In [15]:
# Cross-validated summary for the quadratic discriminant analysis model.
print('Gaussian Discriminant Analysis - Quadratic: ')
metrics(y_test=y_test_original, pred=pred_adgq)

Gaussian Discriminant Analysis - Quadratic: 
 Acurácia: 0.7304581604426003 +- 0.03001102170252861 
 Precisão: 0.7747049622025081 +- 0.03366421705779762 
 Revocação: 0.7298496678375547 +- 0.026278677129690932 
 F1-score: 0.7183439376749668 +- 0.02932625818655259


In [16]:
# Cross-validated summary for the Gaussian Naive Bayes model.
print('Gaussian Naive Bayes : ')
metrics(y_test=y_test_original, pred=pred_nbg)

Gaussian Naive Bayes : 
 Acurácia: 0.5990248962655602 +- 0.037772895138225564 
 Precisão: 0.6653546516094611 +- 0.05188123614442771 
 Revocação: 0.5993970291511654 +- 0.029263114607328188 
 F1-score: 0.5548224560597814 +- 0.03880519616665795


In [17]:
# KNN => k = 1 and k = 5, each with Euclidean and Mahalanobis distance.
# Each entry pairs the printed heading with the per-fold predictions.
knn_runs = (
    ('k = 1 and Distance: Euclidean ', pred_euc_1),
    ('k = 1 and Distance: Mahalanobis ', pred_mah_1),
    ('k = 5 and Distance: Euclidean ', pred_euc_5),
    ('k = 5 and Distance: Mahalanobis ', pred_mah_5),
)
for run_number, (heading, fold_predictions) in enumerate(knn_runs):
    # Blank separator between consecutive reports, but not before the first.
    if run_number > 0:
        print('\n')
    print(heading)
    metrics(y_test_original, fold_predictions)

k = 1 and Distance: Euclidean 
 Acurácia: 0.8027991009681881 +- 0.020500033329396174 
 Precisão: 0.8032342294308062 +- 0.021077327770256813 
 Revocação: 0.8019848036718231 +- 0.019350852120842046 
 F1-score: 0.8019114829334469 +- 0.019850727259052996


k = 1 and Distance: Mahalanobis 
 Acurácia: 0.7229062932226833 +- 0.06766440390373905 
 Precisão: 0.7244281029739871 +- 0.06665989460104255 
 Revocação: 0.7237172744885326 +- 0.0665900213113159 
 F1-score: 0.7222915982078252 +- 0.0676187882455649


k = 5 and Distance: Euclidean 
 Acurácia: 0.8040681189488244 +- 0.025505740830760317 
 Precisão: 0.8049147605312642 +- 0.02520351308585955 
 Revocação: 0.8042416207538731 +- 0.02540320468091198 
 F1-score: 0.8033884047757429 +- 0.025543836464920793


k = 5 and Distance: Mahalanobis 
 Acurácia: 0.733366182572614 +- 0.07047669095230319 
 Precisão: 0.7370528497467875 +- 0.07170026307828199 
 Revocação: 0.7350667332038594 +- 0.07028069923859348 
 F1-score: 0.7327350168781636 +- 0.07033666376666577

In [19]:
# Decision trees: one report per split criterion (gini, then entropy).
tree_runs = (
    ('Decision Tree: criterion = gini: ', pred_tree_gini),
    ('Decision Tree: criterion = entropy: ', pred_tree_etp),
)
for run_number, (heading, fold_predictions) in enumerate(tree_runs):
    # Blank separator between the two reports, but not before the first.
    if run_number > 0:
        print('\n')
    print(heading)
    metrics(y_test_original, fold_predictions)

Decision Tree: criterion = gini: 
 Acurácia: 0.8331621715076072 +- 0.03326392853121862 
 Precisão: 0.8332716681237688 +- 0.033085740247334045 
 Revocação: 0.8326057775933879 +- 0.03286953959423914 
 F1-score: 0.8325023892978545 +- 0.03303974804584576


Decision Tree: criterion = entropy: 
 Acurácia: 0.8302489626556018 +- 0.030421659234771872 
 Precisão: 0.8299453271359288 +- 0.030594045499836575 
 Revocação: 0.8301366456454226 +- 0.030344412239406145 
 F1-score: 0.8297229880334713 +- 0.030415730444142464


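The decision tree with the Gini criterion achieved the best mean metrics (accuracy ≈ 0.833 ± 0.033). The entropy tree (accuracy ≈ 0.830 ± 0.030) is within one standard deviation of it, so the difference between the two criteria is not conclusive.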