In [1]:
import os
from pathlib import Path

import pandas as pd
from sklearn.svm import SVC

In [2]:
data_path = Path(os.getcwd()).parent/"data"

df = pd.read_csv(data_path/"svm-data.csv", header=None)

In [3]:
X = df.iloc[:, 1:].to_numpy()
y = df.iloc[:, 0].to_numpy()

In [4]:
X, y

(array([[0.7 , 0.29],
        [0.23, 0.55],
        [0.72, 0.42],
        [0.98, 0.68],
        [0.48, 0.39],
        [0.34, 0.73],
        [0.44, 0.06],
        [0.4 , 0.74],
        [0.18, 0.18],
        [0.53, 0.53]]),
 array([0., 1., 0., 0., 0., 1., 0., 1., 0., 1.]))

In [5]:
clf = SVC(kernel="linear", C=100000, random_state=241)

In [6]:
clf.fit(X, y)

SVC(C=100000, kernel='linear', random_state=241)

In [7]:
clf.support_

array([3, 4, 9], dtype=int32)

## SVM texts 

In [8]:
from sklearn import datasets
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, KFold
import numpy as np

In [9]:
newsgroups = datasets.fetch_20newsgroups(
    subset="all",
    categories=["alt.atheism", "sci.space"]
)

In [10]:
newsgroups.data[0]

'From: 9051467f@levels.unisa.edu.au (The Desert Brat)\nSubject: Re: Keith Schneider - Stealth Poster?\nOrganization: Cured, discharged\nLines: 24\n\nIn article <1pa0f4INNpit@gap.caltech.edu>, keith@cco.caltech.edu (Keith Allan Schneider) writes:\n\n> But really, are you threatened by the motto, or by the people that use it?\n\nEvery time somone writes something and says it is merely describing the norm,\nit is infact re-inforcing that norm upon those programmed not to think for\nthemselves. The motto is dangerous in itself, it tells the world that every\n*true* American is god-fearing, and puts down those who do not fear gods. It\ndoesn\'t need anyone to make it dangerous, it does a good job itself by just\nexisting on your currency.\n\n> keith\n\nThe Desert Brat\n-- \nJohn J McVey, Elc&Eltnc Eng, Whyalla, Uni S Australia,    ________\n9051467f@levels.unisa.edu.au      T.S.A.K.C.            \\/Darwin o\\\nFor replies, mail to whjjm@wh.whyalla.unisa.edu.au      /\\________/\nDisclaimer:

In [11]:
X = newsgroups.data
y = newsgroups.target

In [12]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(X)

In [None]:
grid = {"C": np.power(10.0, np.arange(-5, 6))}
cv = KFold(n_splits=5, shuffle=True, random_state=241)
clf = SVC(kernel="linear", random_state=241)
gs = GridSearchCV(clf, grid, scoring="accuracy", cv=cv, verbose=2)
gs.fit(X, y)

Fitting 5 folds for each of 11 candidates, totalling 55 fits
[CV] C=1e-05 .........................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .......................................... C=1e-05, total=   3.4s
[CV] C=1e-05 .........................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.4s remaining:    0.0s


[CV] .......................................... C=1e-05, total=   3.4s
[CV] C=1e-05 .........................................................
[CV] .......................................... C=1e-05, total=   3.1s
[CV] C=1e-05 .........................................................
[CV] .......................................... C=1e-05, total=   3.5s
[CV] C=1e-05 .........................................................
[CV] .......................................... C=1e-05, total=   3.2s
[CV] C=0.0001 ........................................................
[CV] ......................................... C=0.0001, total=   4.0s
[CV] C=0.0001 ........................................................
[CV] ......................................... C=0.0001, total=   3.4s
[CV] C=0.0001 ........................................................
[CV] ......................................... C=0.0001, total=   3.5s
[CV] C=0.0001 ........................................................
[CV] .

In [None]:
gs.best_params_

In [None]:
clf = SVC(kernel="linear", random_state=241, C=1.0)
clf.fit(X, y)

In [None]:
idxs = np.argsort([abs(i) for i in clf.coef_.A])[0][-10:]

In [None]:
print(idxs), idxs.shape

In [None]:
names = tfidf.get_feature_names()

In [None]:
res = [names[i] for i in idxs]

with open("res", "w") as f:
    f.write(",".join(sorted(res)))

In [None]:
",".join(sorted(res))