In [None]:
# C:\Program Files (x86)\mingw-w64\i686-8.1.0-posix-dwarf-rt_v6-rev0\mingw32\bin
%time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.base import BaseEstimator
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, make_scorer

from math import exp, sqrt
from collections import defaultdict

from subprocess import run, PIPE
from sys import stdout
from io import StringIO

In [2]:
def linear(x, y):
    """Linear kernel: the dot product ``x @ y``."""
    return x @ y


def _register_kernel(func, name):
    """Rename ``func`` to ``name`` and publish it as a module-level global.

    The kernels used to be generated with ``exec``, which created real
    module-level functions (``polynomial_2``, ``guassian_1``, ...). Keeping
    the same ``__name__`` and the same global names keeps existing lookups
    such as ``model.kernel.__name__`` working.
    """
    func.__name__ = name
    func.__qualname__ = name
    globals()[name] = func
    return func


def _make_polynomial(degree):
    """Return the polynomial kernel ``(1 + x @ y) ** degree``."""
    def kernel(x, y):
        return (1 + x @ y) ** degree
    return _register_kernel(kernel, f'polynomial_{degree}')


def _make_guassian(beta):
    """Return the Gaussian (RBF) kernel ``exp(-beta * ||x - y||^2)``."""
    def kernel(x, y):
        return exp(-beta * np.linalg.norm(x - y) ** 2)
    return _register_kernel(kernel, f'guassian_{beta}')


# Each name below is a *list* of candidate kernel functions, one list per
# kernel family. Note that `linear` is deliberately rebound from the function
# to a one-element list holding that function.
linear = [linear]
polynomial = [_make_polynomial(p) for p in range(2, 5 + 1)]  # degrees 2..5
guassian = [_make_guassian(b) for b in range(1, 5 + 1)]      # beta 1..5

In [3]:
class SVM(BaseEstimator):
    """Kernel SVM classifier whose dual problem is solved by an external program.

    ``fit`` builds the kernel (Gram) matrix of the training set, pipes it
    together with the labels and ``C`` to the solver executable, and reads
    back one multiplier per training sample followed by the bias term.
    ``predict`` returns the sign of the resulting decision function.

    Parameters
    ----------
    kernel : callable
        ``kernel(a, b)`` returning a scalar similarity of two samples.
    C : float
        Value forwarded to the solver as the last line of its input.

    Notes
    -----
    Labels are presumably expected to be +1 / -1 (``predict`` only ever
    returns those two values) -- confirm against the solver.
    """

    # Executable launched by ``fit``; the path is relative to the working directory.
    SOLVER_PATH = 'smo/cmake-build-debug/smo.exe'

    def __init__(self, kernel=None, C=None):
        super().__init__()
        # Hyper-parameters (the only values exposed through get_params()).
        self.kernel = kernel
        self.C = C
        # State filled in by fit().
        self.x_train = None
        self.y_train = None
        self.lambdas = None    # one multiplier per training sample
        self.b = None          # bias term of the decision function
        self.distances = None  # kernel (Gram) matrix of the training set
        self.io = StringIO()   # text buffer holding the last solver input matrix
        self.score = None

    def calc_matrix(self, x):
        """Store the ``n x n`` kernel matrix of the samples ``x`` in ``self.distances``."""
        x = np.asarray(x)
        n = x.shape[0]
        # An explicit float dtype avoids inferring the type from the first
        # element, and the reshape keeps the (0, 0) shape for empty input.
        gram = np.array([[self.kernel(a, b) for b in x] for a in x], dtype=float)
        self.distances = gram.reshape(n, n)

    def fit(self, x, y):
        """Train on samples ``x`` with labels ``y`` and return ``self``.

        The solver receives on stdin: the sample count, then one row per
        sample (kernel-matrix row followed by the label), then ``C``. Its
        stdout is parsed as whitespace-separated floats: the multipliers
        followed by the bias as the last value.

        Raises
        ------
        ValueError
            If the solver prints nothing or prints a non-numeric token.
        """
        self.x_train = x
        self.y_train = y
        self.calc_matrix(x)

        # Use a fresh buffer on every call; reusing the one created in
        # __init__ would prepend the previous matrix on a second fit().
        self.io = StringIO()
        np.savetxt(self.io, np.c_[self.distances, y], fmt='%.8f')
        args = str(self.distances.shape[0]) + '\n' + self.io.getvalue() + str(self.C)
        completed = run([self.SOLVER_PATH], stdout=PIPE, input=args, encoding='ascii')

        # split() without arguments ignores a trailing newline or blank lines,
        # which split('\n') would turn into '' and make float() fail.
        values = [float(token) for token in completed.stdout.split()]
        if not values:
            raise ValueError(
                f'SMO solver {self.SOLVER_PATH!r} produced no output '
                f'(exit code {completed.returncode})'
            )
        *self.lambdas, self.b = values
        return self

    def predict(self, x):
        """Return an array with +1.0 or -1.0 for every sample in ``x``.

        A decision value of exactly zero is assigned to the +1 class; the
        previous ``res / abs(res)`` produced NaN (or a division error) there.
        """
        ans = []
        for obj in x:
            res = self.b
            for l, xi, yi in zip(self.lambdas, self.x_train, self.y_train):
                res += l * yi * self.kernel(obj, xi)
            ans.append(1.0 if res >= 0 else -1.0)
        return np.asarray(ans)

In [4]:
# Load both datasets (index 0: chips, index 1: geyser) and encode the
# class labels numerically: 'P' -> 1, 'N' -> -1.
dataframes = [
    pd.read_csv(name + '.csv').replace({'P': 1, 'N': -1})
    for name in ['chips', 'geyser']
]
# The trailing semicolon suppresses the notebook's rich display of the frame.
dataframes[0];

In [5]:
# Best estimator of every kernel family, keyed by dataset index:
# models[df_i] == [best linear, best polynomial, best guassian].
models = defaultdict(list)


def compute_models(df_i):
    """Grid-search an SVM per kernel family on dataset ``df_i``.

    For each of the three kernel families a 5-fold ``GridSearchCV`` over the
    family's kernels and a fixed list of ``C`` values is run, scored by
    accuracy. The refitted best estimators are stored in ``models[df_i]``
    in the order linear, polynomial, guassian.

    Calling the function again for the same ``df_i`` replaces the previous
    results instead of appending duplicates.
    """
    df = dataframes[df_i]
    x_raw = df[['x', 'y']].to_numpy()
    y_raw = df['class'].to_numpy()

    # Start from an empty result list so re-running the cell is idempotent.
    best_estimators = models[df_i]
    best_estimators.clear()

    for kernel_type in [linear, polynomial, guassian]:
        param_grid = {'kernel': kernel_type,
                      'C': [0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0]}

        # The `iid` argument is no longer passed: it was removed from
        # GridSearchCV in scikit-learn 0.24 and passing it raises TypeError.
        clf = GridSearchCV(SVM(), param_grid, cv=5, scoring='accuracy')
        clf.fit(x_raw, y_raw)
        best_estimators.append(clf.best_estimator_)

In [6]:
def plot(df_i, model):
    """Draw the decision regions of ``model`` behind the points of dataset ``df_i``.

    ``model.predict`` is evaluated on a 50 x 50 grid spanning the bounding
    box of the data. The predictions are shown as filled contours and the
    samples are scattered on top, coloured by their class.
    """
    df = dataframes[df_i]
    # A complex "step" of 50j asks ogrid for 50 evenly spaced points, ends included.
    xx = np.ogrid[df['x'].min():df['x'].max():50j]
    yy = np.ogrid[df['y'].min():df['y'].max():50j]

    # contourf with 1-D coordinates expects Z[row, col] to be the value at
    # (xx[col], yy[row]), so iterate y in the outer loop and reshape to
    # (len(yy), len(xx)). Iterating x first would draw the regions transposed.
    grid = [(xi, yi) for yi in yy for xi in xx]
    zz = np.asarray(model.predict(grid)).reshape(len(yy), len(xx))

    plt.contourf(xx, yy, zz, cmap=plt.cm.coolwarm, alpha=0.8)
    plt.scatter(df['x'], df['y'], c=df['class'], cmap=plt.cm.coolwarm)
    plt.xlabel('x')
    plt.ylabel('y')
    plt.show()

In [None]:
# Run the grid search for dataset index 1 only ('geyser' in `dataframes`);
# the run for index 0 ('chips') is left disabled below.
# compute_models(0)
compute_models(1)
# Show the collected best estimators as the cell output.
models

In [None]:
# For every dataset index, list the chosen C and kernel name of each best estimator.
for dataset_index in models:
    print(dataset_index)
    for estimator in models[dataset_index]:
        print(estimator.C, estimator.kernel.__name__)

In [None]:
# Decision regions of the first estimator stored for dataset 1, i.e. the
# one selected from the `linear` kernel family (the first family searched).
plot(1, models[1][0])

In [None]:
# NOTE(review): `show` is not defined in any cell visible here, so this call
# raises NameError on a fresh kernel. It is presumably a leftover from an
# earlier version of the plotting helper -- confirm, then replace or remove.
show('geyser')