# Classifiers Tests

This notebook quickly tests different types of classifiers locally, before choosing what is the best approach for deployment.

In [1]:
import numpy as np
import pandas as pd

import os

## Load Data

We load the test data and the training data

In [2]:
def load_data(data_dir, filename):
    """
    Load data and split it into features X and labels y.
        
        Args:
            data_dir : data directory
            filename : data file name
        Outputs :
            X : features
            y : labels
    """
    data = pd.read_csv(os.path.join(data_dir, train_filename), header=None).to_numpy()
    X, y = data[:, 1:], data[:, 0]
    return X, y

data_dir = "plagiarism_data"
train_filename = "train.csv"
test_filename = "test.csv"

X_train, y_train = load_data(data_dir, train_filename)
X_test, y_test = load_data(data_dir, test_filename)

## Create models

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

logreg_clf = LogisticRegression()
logreg_clf.fit(X_train, y_train)

SVC_model = SVC()
SVC_model.fit(X_train, y_train)

KNN_model = KNeighborsClassifier(n_neighbors=5)
KNN_model.fit(X_train, y_train)

rdf_classifier = RandomForestClassifier()
rdf_classifier.fit(X_train, y_train)

xgboost_clf = GradientBoostingClassifier()
xgboost_clf.fit(X_train, y_train)



GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

# Results

We measure the accuracy, precision, and recall of each of this classifier and check how they perform on test data set

In [4]:
def evaluate(model_name, model, X_test, y_test):
    """
    Evaluate a model on a test set given the prediction endpoint.  
    Return binary classification metrics.
    :param predictor: A prediction endpoint
    :param test_features: Test features
    :param test_labels: Class labels for test data
    :param verbose: If True, prints a table of all performance metrics
    :return: A dictionary of performance metrics.
    """
    
    # rounding and squeezing array
    test_preds = model.predict(X_test)
    
    # calculate true positives, false positives, true negatives, false negatives
    tp = np.logical_and(y_test, test_preds).sum()
    fp = np.logical_and(1-y_test, test_preds).sum()
    tn = np.logical_and(1-y_test, 1-test_preds).sum()
    fn = np.logical_and(y_test, 1-test_preds).sum()
    
    # calculate binary classification metrics
    recall = tp / (tp + fn)
    precision = tp / (tp + fp)
    accuracy = (tp + tn) / (tp + fp + tn + fn)
    
    df = pd.DataFrame()
    df["model"] = [model_name]
    df["TP"] = [tp]
    df["FP"] = [fp]
    df["TN"] = [tn]
    df["FN"] = [fn]
    df["recall"] = [recall]
    df["precision"] = [precision]
    df["accuracy"] = [accuracy]

    return df

knn_result = evaluate("KNN", KNN_model, X_test, y_test)
svc_result = evaluate("SVC", SVC_model, X_test, y_test)
logreg_clf_result = evaluate("LogReg", logreg_clf, X_test, y_test)
rdf_classifier_result = evaluate("RFC", rdf_classifier, X_test, y_test)
xgboost_clf_result = evaluate("XGBoost", xgboost_clf, X_test, y_test)

pd.concat([knn_result, svc_result, logreg_clf_result, rdf_classifier_result, xgboost_clf_result])

Unnamed: 0,model,TP,FP,TN,FN,recall,precision,accuracy
0,KNN,35,1,27,7,0.833333,0.972222,0.885714
0,SVC,32,0,28,10,0.761905,1.0,0.857143
0,LogReg,38,3,25,4,0.904762,0.926829,0.9
0,RFC,40,0,28,2,0.952381,1.0,0.971429
0,XGBoost,42,0,28,0,1.0,1.0,1.0


XGBoost algorythm seems to be extremelly effective and does not show any mistakes on the test data set. Hopefully, it is not over fitting. We could check that if we implement a cross validation during the fitting but it requires more data.

In [5]:
type(None)

NoneType