In [1]:
!pip list

Package                Version
---------------------- -----------------------
absl-py                0.10.0
argon2-cffi            20.1.0
astor                  0.8.1
astunparse             1.6.3
attrs                  20.1.0
backcall               0.2.0
bleach                 3.1.5
cachetools             4.1.1
certifi                2020.6.20
cffi                   1.14.2
chainer                7.7.0
chardet                3.0.4
click                  7.1.2
cloudpickle            1.3.0
cntk-gpu               2.7
cupy                   7.8.0
cycler                 0.10.0
Cython                 0.29.21
decorator              4.4.2
defusedxml             0.6.0
dm-sonnet              2.0.0
dm-tree                0.1.5
entrypoints            0.3
enum34                 1.1.10
fastrlock              0.5
filelock               3.0.12
funcsigs               1.0.2
future                 0.18.2
gast                   0.3.3
google-auth            1.21.0
google-auth-oauthlib   0.4.1
google-pasta  

In [2]:
import sys
import os
import numpy as np
import yaml as yl
import matplotlib.pyplot as plt
import argparse
import glob
import pandas as pd
import time

from typing import List, Tuple
from functools import partial

from sklearn import svm, datasets
import sklearn.model_selection as model_selection
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn import manifold
from sklearn.preprocessing import StandardScaler

%matplotlib inline

In [3]:
# Path of the CSV to analyse, relative to the notebook's working directory.
filename = "data/POJBestSeqsSBLP2021_merge_2classes.csv"
# Read the whole table into memory. The rendered tail shows two leftover index
# columns ("Unnamed: 0.1", "Unnamed: 0") ahead of "name" and "class".
df = pd.read_csv(filepath_or_buffer=filename)
# Show the last five rows as the cell's output.
df.tail(n=5)

Unnamed: 0.1,Unnamed: 0,name,class,0,1,2,3,4,5,6,...,55,56,57,58,59,60,61,62,63,64
1495,1495,38_1810,2,1.0,20.0,0.0,0.0,0.0,0.0,0.0,...,11.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0
1496,1496,38_1421,2,1.0,24.0,0.0,0.0,0.0,0.0,0.0,...,12.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0
1497,1497,54_1233,2,3.0,29.0,0.0,0.0,0.0,0.0,0.0,...,10.0,0.0,0.0,0.0,0.0,11.0,0.0,0.0,0.0,0.0
1498,1498,38_1978,2,2.0,20.0,0.0,0.0,0.0,0.0,0.0,...,11.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0
1499,1499,38_1685,2,1.0,46.0,0.0,0.0,0.0,0.0,0.0,...,24.0,0.0,1.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0


In [4]:
def get_model(model_name: str, **kwargs):
    """Build an unfitted classifier selected by name.

    The lookup is case-insensitive. Only ``"svm"`` is recognised, which
    yields ``svm.SVC(**kwargs)``.

    Args:
        model_name: Name of the model family to instantiate.
        **kwargs: Forwarded unchanged to the model constructor.

    Returns:
        A new ``svm.SVC`` instance.

    Raises:
        ValueError: If ``model_name`` is not a supported model; the message
            reports the lower-cased name.
    """
    model_name = model_name.lower()
    # Guard clause: reject unknown names before constructing anything.
    if model_name != 'svm':
        raise ValueError(f"Invalid model {model_name}")
    return svm.SVC(**kwargs)

In [5]:
def do_train_and_predict(model, x_train: np.ndarray, x_test: np.ndarray, y_train: np.ndarray, y_test: np.ndarray):
    """Fit ``model`` on the training split and score it on the test split.

    Prints the fit duration, then the accuracy and weighted F1 score, both
    expressed as percentages.

    Args:
        model: Estimator exposing ``fit(x, y)`` (whose return value is used
            as the fitted model) and ``predict(x)``.
        x_train: Training features.
        x_test: Test features.
        y_train: Training labels.
        y_test: Test labels, compared against the predictions.

    Returns:
        Tuple ``(accuracy, f1)`` with both scores scaled to 0-100.
    """
    # perf_counter is monotonic and high-resolution, so the measured interval
    # cannot be distorted by system clock adjustments the way time.time() can.
    fit_start = time.perf_counter()
    model = model.fit(x_train, y_train)
    print(f"Fit took {time.perf_counter()-fit_start:.3f} seconds")
    model_predict = model.predict(x_test)
    # Score the predictions; F1 is averaged across classes weighted by support.
    model_accuracy = accuracy_score(y_test, model_predict)
    model_f1 = f1_score(y_test, model_predict, average="weighted")
    # Report both metrics as percentages.
    accuracy = model_accuracy*100
    f1_acc = model_f1*100
    print(f"Accuracy: {accuracy:.3f}")
    print(f"F1-Score: {f1_acc:.3f}")
    return accuracy, f1_acc

In [6]:
# Feature columns are labelled with the string forms of the integers 0..64.
feature_names = list(map(str, range(65)))


def extract_data(data_df: pd.DataFrame, train: float = 0.8, test: float = 0.2, random_state: int = 0):
    """Split a labelled table into train/test features and targets.

    Features are the columns listed in the module-level ``feature_names``;
    the target is the ``"class"`` column.

    Args:
        data_df: Table holding the feature columns and a ``"class"`` column.
        train: Passed to ``train_test_split`` as ``train_size``.
        test: Passed to ``train_test_split`` as ``test_size``.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    features = data_df.loc[:, feature_names]
    labels = data_df["class"]
    # train_test_split returns a list; convert it so callers still get a tuple.
    return tuple(
        model_selection.train_test_split(
            features,
            labels,
            train_size=train,
            test_size=test,
            random_state=random_state,
        )
    )

In [7]:
# Split the loaded table using extract_data's default proportions and seed.
X_train, X_test, y_train, y_test = extract_data(data_df=df)

In [8]:
# Sweep the SVM regularisation parameter C with kernel and degree held fixed,
# recording one (accuracy, f1, C, kernel, degree) tuple per setting.
# NOTE(review): scikit-learn documents `degree` as used only by the 'poly'
# kernel, so it presumably has no effect with kernel="rbf" — confirm.
# NOTE(review): every score here is measured on X_test/y_test.
accuracies = []
kernel, degree = "rbf", 3
C_VALUES = (
    0.1, 0.5, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0,
    20.0, 100.0, 200.0, 500.0, 1000.0, 1500.0, 2000.0, 10000.0,
)
for C in C_VALUES:
    print(f"Executing SVM with C={C}, kernel={kernel}, degree={degree}")
    svm_params = dict(gamma="scale", C=C, kernel=kernel, degree=degree)
    model = get_model("svm", **svm_params)
    accuracy, f1 = do_train_and_predict(model, X_train, X_test, y_train, y_test)
    accuracies.append((accuracy, f1, C, kernel, degree))
    # Blank line separates the log of consecutive runs.
    print()

Executing SVM with C=0.1, kernel=rbf, degree=3
Fit took 0.180 seconds
Accuracy: 61.000
F1-Score: 60.254

Executing SVM with C=0.5, kernel=rbf, degree=3
Fit took 0.098 seconds
Accuracy: 77.000
F1-Score: 76.728

Executing SVM with C=1.0, kernel=rbf, degree=3
Fit took 0.082 seconds
Accuracy: 85.333
F1-Score: 84.947

Executing SVM with C=2.0, kernel=rbf, degree=3
Fit took 0.067 seconds
Accuracy: 89.333
F1-Score: 89.156

Executing SVM with C=3.0, kernel=rbf, degree=3
Fit took 0.060 seconds
Accuracy: 91.667
F1-Score: 91.510

Executing SVM with C=4.0, kernel=rbf, degree=3
Fit took 0.056 seconds
Accuracy: 91.333
F1-Score: 91.154

Executing SVM with C=5.0, kernel=rbf, degree=3
Fit took 0.053 seconds
Accuracy: 91.333
F1-Score: 91.169

Executing SVM with C=10.0, kernel=rbf, degree=3
Fit took 0.045 seconds
Accuracy: 92.667
F1-Score: 92.577

Executing SVM with C=20.0, kernel=rbf, degree=3
Fit took 0.039 seconds
Accuracy: 94.000
F1-Score: 93.955

Executing SVM with C=100.0, kernel=rbf, degree=3
Fit 

In [9]:
# Stable ascending sort on accuracy (tuple field 0): the final element is the
# highest-accuracy run and, among ties, the one that was recorded last.
ranked = sorted(accuracies, key=lambda run: run[0])
best = ranked[-1]
print(f"Best: accuracy={best[0]:.4f}, f1={best[1]:.4f}, C={best[2]}, kernel={best[3]}, degree={best[4]}")

Best: accuracy=96.3333, f1=96.3479, C=1000.0, kernel=rbf, degree=3
