In [1]:
!pip list

Package                Version
---------------------- -----------------------
absl-py                0.10.0
argon2-cffi            20.1.0
astor                  0.8.1
astunparse             1.6.3
attrs                  20.1.0
backcall               0.2.0
bleach                 3.1.5
cachetools             4.1.1
certifi                2020.6.20
cffi                   1.14.2
chainer                7.7.0
chardet                3.0.4
click                  7.1.2
cloudpickle            1.3.0
cntk-gpu               2.7
cupy                   7.8.0
cycler                 0.10.0
Cython                 0.29.21
decorator              4.4.2
defusedxml             0.6.0
dm-sonnet              2.0.0
dm-tree                0.1.5
entrypoints            0.3
enum34                 1.1.10
fastrlock              0.5
filelock               3.0.12
funcsigs               1.0.2
future                 0.18.2
gast                   0.3.3
google-auth            1.21.0
google-auth-oauthlib   0.4.1
google-pasta  

In [2]:
import sys
import os
import numpy as np
import yaml as yl
import matplotlib.pyplot as plt
import argparse
import glob
import pandas as pd
import time

from typing import List, Tuple
from functools import partial

from sklearn import svm, datasets
import sklearn.model_selection as model_selection
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn import manifold
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

%matplotlib inline

In [3]:
def get_model(model_name: str, **kwargs):
    model_name = model_name.lower()
    if model_name == 'svm':
        return svm.SVC(**kwargs)
    elif model_name == 'random_forest':
        return RandomForestClassifier(**kwargs)
    elif model_name == 'knn':
        return KNeighborsClassifier(**kwargs)
    else:
        raise ValueError(f"Invalid model {model_name}")

In [4]:
def do_train_and_predict(model, x_train: np.array, x_test: np.array, y_train: np.array, y_test: np.array):
    # Do fit and predict
    start = time.time()
    model = model.fit(x_train, y_train)
    print(f"Fit took {time.time()-start:.3f} seconds")
    model_predict = model.predict(x_test)
    # Calculate accuracy and F1 score
    model_accuracy = accuracy_score(y_test, model_predict)
    model_f1 = f1_score(y_test, model_predict, average="weighted")
    # Print metrics
    accuracy = model_accuracy*100
    f1_acc = model_f1*100
    print(f"Accuracy: {accuracy:.3f}")
    print(f"F1-Score: {f1_acc:.3f}")
    return accuracy, f1_acc

In [5]:
feature_names = [str(i) for i in range(65)]
def extract_data(data_df: pd.DataFrame, train: float = 0.8, test: float = 0.2, random_state: int = 0, must_scale: bool = False, must_normalize: bool = False):
    x_data = data_df.loc[:, feature_names]
    if must_scale:
        x_data = StandardScaler().fit_transform(x_data)
    if must_normalize:
        x_data = normalize(x_data, norm="l1")
    
    y_data = data_df["class"]
    X_train, X_test, y_train, y_test = model_selection.train_test_split(x_data, y_data, train_size=train, test_size=test, random_state=random_state)
    return X_train, X_test, y_train, y_test

In [6]:
filename = "data/POJBestSeqsSBLP2021_merge_2classes.csv"
df = pd.read_csv(filename)
X_train, X_test, y_train, y_test = extract_data(df)

accuracies = []
degree = 3
for C in [0.5, 1.0, 2.0, 10.0, 20.0, 100.0, 200.0, 500.0, 1000.0, 1500.0, 2000.0, 10000.0]:
    for kernel in ["rbf", "poly"]:
        print(f"Executing SVM with C={C}, kernel={kernel}")
        model = get_model("svm", gamma="scale", C=C, kernel=kernel, degree=degree)
        accuracy, f1 = do_train_and_predict(model, X_train, X_test, y_train, y_test)
        accuracies.append((accuracy, f1, C, kernel, degree))
        print("-"*40)
    
best = list(reversed(sorted(accuracies, key=lambda x: x[0])))[0]
print(f"Best: accuracy={best[0]:.4f}, f1={best[1]:.4f}, C={best[2]}, kernel={best[3]}, degree={best[4]}")

Executing SVM with C=0.5, kernel=rbf
Fit took 0.100 seconds
Accuracy: 77.000
F1-Score: 76.728
----------------------------------------
Executing SVM with C=0.5, kernel=poly
Fit took 0.084 seconds
Accuracy: 60.000
F1-Score: 59.808
----------------------------------------
Executing SVM with C=1.0, kernel=rbf
Fit took 0.082 seconds
Accuracy: 85.333
F1-Score: 84.947
----------------------------------------
Executing SVM with C=1.0, kernel=poly
Fit took 0.077 seconds
Accuracy: 66.667
F1-Score: 66.389
----------------------------------------
Executing SVM with C=2.0, kernel=rbf
Fit took 0.067 seconds
Accuracy: 89.333
F1-Score: 89.156
----------------------------------------
Executing SVM with C=2.0, kernel=poly
Fit took 0.071 seconds
Accuracy: 72.667
F1-Score: 71.581
----------------------------------------
Executing SVM with C=10.0, kernel=rbf
Fit took 0.045 seconds
Accuracy: 92.667
F1-Score: 92.577
----------------------------------------
Executing SVM with C=10.0, kernel=poly
Fit took 0.0

In [7]:
filename = "data/POJBestSeqsSBLP2021_merge_2classes.csv"
df = pd.read_csv(filename)
X_train, X_test, y_train, y_test = extract_data(df, must_normalize=True)

accuracies = []
degree = 3
kernel = "rbf"
for C in [0.5, 1.0, 2.0, 10.0, 20.0, 100.0, 200.0, 500.0, 1000.0, 1500.0, 2000.0, 10000.0]:
    for kernel in ["rbf", "poly"]:
        print(f"Executing SVM with C={C}, kernel={kernel}")
        model = get_model("svm", gamma="scale", C=C, kernel=kernel, degree=degree)
        accuracy, f1 = do_train_and_predict(model, X_train, X_test, y_train, y_test)
        accuracies.append((accuracy, f1, C, kernel, degree))
        print("-"*40)
    
best = list(reversed(sorted(accuracies, key=lambda x: x[0])))[0]
print(f"Best: accuracy={best[0]:.4f}, f1={best[1]:.4f}, C={best[2]}, kernel={best[3]}, degree={best[4]}")

Executing SVM with C=0.5, kernel=rbf
Fit took 0.086 seconds
Accuracy: 83.667
F1-Score: 83.486
----------------------------------------
Executing SVM with C=0.5, kernel=poly
Fit took 0.048 seconds
Accuracy: 87.333
F1-Score: 87.393
----------------------------------------
Executing SVM with C=1.0, kernel=rbf
Fit took 0.070 seconds
Accuracy: 89.333
F1-Score: 89.360
----------------------------------------
Executing SVM with C=1.0, kernel=poly
Fit took 0.039 seconds
Accuracy: 89.333
F1-Score: 89.321
----------------------------------------
Executing SVM with C=2.0, kernel=rbf
Fit took 0.056 seconds
Accuracy: 91.000
F1-Score: 90.956
----------------------------------------
Executing SVM with C=2.0, kernel=poly
Fit took 0.033 seconds
Accuracy: 91.000
F1-Score: 90.936
----------------------------------------
Executing SVM with C=10.0, kernel=rbf
Fit took 0.037 seconds
Accuracy: 93.667
F1-Score: 93.624
----------------------------------------
Executing SVM with C=10.0, kernel=poly
Fit took 0.0

In [8]:
filename = "data/POJBestSeqsSBLP2021_merge_2classes.csv"
df = pd.read_csv(filename)
X_train, X_test, y_train, y_test = extract_data(df)

accuracies = []


for n_estimators in [10, 20, 50, 100, 200, 300, 500, 1000]:
    for criterion in ["gini", "entropy"]:
        print(f"Executing Random Forest with n_estimators={n_estimators}, criterion={criterion}")
        model = get_model("random_forest", n_estimators=n_estimators)
        accuracy, f1 = do_train_and_predict(model, X_train, X_test, y_train, y_test)
        accuracies.append((accuracy, f1, C, kernel, degree))
        print("-"*40)
    
best = list(reversed(sorted(accuracies, key=lambda x: x[0])))[0]
print(f"Best: accuracy={best[0]:.4f}, f1={best[1]:.4f}, C={best[2]}, kernel={best[3]}, degree={best[4]}")

Executing Random Forest with n_estimators=10, criterion=gini
Fit took 0.022 seconds
Accuracy: 93.000
F1-Score: 92.977
----------------------------------------
Executing Random Forest with n_estimators=10, criterion=entropy
Fit took 0.022 seconds
Accuracy: 95.000
F1-Score: 94.979
----------------------------------------
Executing Random Forest with n_estimators=20, criterion=gini
Fit took 0.041 seconds
Accuracy: 93.667
F1-Score: 93.616
----------------------------------------
Executing Random Forest with n_estimators=20, criterion=entropy
Fit took 0.041 seconds
Accuracy: 97.333
F1-Score: 97.319
----------------------------------------
Executing Random Forest with n_estimators=50, criterion=gini
Fit took 0.101 seconds
Accuracy: 96.333
F1-Score: 96.326
----------------------------------------
Executing Random Forest with n_estimators=50, criterion=entropy
Fit took 0.100 seconds
Accuracy: 96.667
F1-Score: 96.665
----------------------------------------
Executing Random Forest with n_estima

In [9]:
filename = "data/POJBestSeqsSBLP2021_merge_2classes.csv"
df = pd.read_csv(filename)
X_train, X_test, y_train, y_test = extract_data(df, must_normalize=True)

accuracies = []

for n_estimators in [10, 20, 50, 100, 200, 300, 500, 1000]:
    for criterion in ["gini", "entropy"]:
        print(f"Executing Random Forest with n_estimators={n_estimators}, criterion={criterion}")
        model = get_model("random_forest", n_estimators=n_estimators)
        accuracy, f1 = do_train_and_predict(model, X_train, X_test, y_train, y_test)
        accuracies.append((accuracy, f1, C, kernel, degree))
        print("-"*40)
    
best = list(reversed(sorted(accuracies, key=lambda x: x[0])))[0]
print(f"Best: accuracy={best[0]:.4f}, f1={best[1]:.4f}, C={best[2]}, kernel={best[3]}, degree={best[4]}")

Executing Random Forest with n_estimators=10, criterion=gini
Fit took 0.023 seconds
Accuracy: 94.000
F1-Score: 93.942
----------------------------------------
Executing Random Forest with n_estimators=10, criterion=entropy
Fit took 0.023 seconds
Accuracy: 94.667
F1-Score: 94.630
----------------------------------------
Executing Random Forest with n_estimators=20, criterion=gini
Fit took 0.046 seconds
Accuracy: 95.333
F1-Score: 95.290
----------------------------------------
Executing Random Forest with n_estimators=20, criterion=entropy
Fit took 0.045 seconds
Accuracy: 95.667
F1-Score: 95.643
----------------------------------------
Executing Random Forest with n_estimators=50, criterion=gini
Fit took 0.113 seconds
Accuracy: 94.667
F1-Score: 94.622
----------------------------------------
Executing Random Forest with n_estimators=50, criterion=entropy
Fit took 0.112 seconds
Accuracy: 94.000
F1-Score: 93.962
----------------------------------------
Executing Random Forest with n_estima

In [11]:
filename = "data/POJBestSeqsSBLP2021_merge_2classes.csv"
df = pd.read_csv(filename)
X_train, X_test, y_train, y_test = extract_data(df)

accuracies = []

for weight in ["uniform", "distance"]:
    for algo in ["ball_tree", "kd_tree"]:
        print(f"Executing KNN with weight={weight}, algo={algo}")
        model = get_model("knn", n=5, algorithm=algo, weights=weight)
        accuracy, f1 = do_train_and_predict(model, X_train, X_test, y_train, y_test)
        accuracies.append((accuracy, f1, C, kernel, degree))
        print("-"*40)
    
best = list(reversed(sorted(accuracies, key=lambda x: x[0])))[0]
print(f"Best: accuracy={best[0]:.4f}, f1={best[1]:.4f}, C={best[2]}, kernel={best[3]}, degree={best[4]}")

Executing KNN with weight=uniform, algo=ball_tree
Fit took 0.024 seconds
Accuracy: 91.667
F1-Score: 91.719
----------------------------------------
Executing KNN with weight=uniform, algo=kd_tree
Fit took 0.006 seconds
Accuracy: 91.667
F1-Score: 91.719
----------------------------------------
Executing KNN with weight=distance, algo=ball_tree
Fit took 0.005 seconds
Accuracy: 91.333
F1-Score: 91.388
----------------------------------------
Executing KNN with weight=distance, algo=kd_tree
Fit took 0.006 seconds
Accuracy: 91.333
F1-Score: 91.388
----------------------------------------
Best: accuracy=91.6667, f1=91.7194, C=10000.0, kernel=poly, degree=3


In [12]:
filename = "data/POJBestSeqsSBLP2021_merge_2classes.csv"
df = pd.read_csv(filename)
X_train, X_test, y_train, y_test = extract_data(df, must_normalize=True)

accuracies = []

for weight in ["uniform", "distance"]:
    for algo in ["ball_tree", "kd_tree"]:
        print(f"Executing KNN with weight={weight}, algo={algo}")
        model = get_model("knn", n=5, algorithm=algo, weights=weight)
        accuracy, f1 = do_train_and_predict(model, X_train, X_test, y_train, y_test)
        accuracies.append((accuracy, f1, C, kernel, degree))
        print("-"*40)
    
best = list(reversed(sorted(accuracies, key=lambda x: x[0])))[0]
print(f"Best: accuracy={best[0]:.4f}, f1={best[1]:.4f}, C={best[2]}, kernel={best[3]}, degree={best[4]}")

Executing KNN with weight=uniform, algo=ball_tree
Fit took 0.003 seconds
Accuracy: 93.000
F1-Score: 93.017
----------------------------------------
Executing KNN with weight=uniform, algo=kd_tree
Fit took 0.004 seconds
Accuracy: 93.000
F1-Score: 93.017
----------------------------------------
Executing KNN with weight=distance, algo=ball_tree
Fit took 0.003 seconds
Accuracy: 93.000
F1-Score: 93.006
----------------------------------------
Executing KNN with weight=distance, algo=kd_tree
Fit took 0.004 seconds
Accuracy: 93.000
F1-Score: 93.006
----------------------------------------
Best: accuracy=93.0000, f1=93.0056, C=10000.0, kernel=poly, degree=3
