# Support Vector Machines

In [1]:
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler

from sklearn.svm import SVC, LinearSVC, OneClassSVM # LinearSVC работи по-оптимизирано; SVC има повече неща
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.metrics import accuracy_score, f1_score, classification_report, make_scorer

In [3]:
income_data = pd.read_csv("data/adult.data", header = None, sep = ", ", engine = "python")

In [4]:
income_data.columns = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "income_class"]

In [5]:
income_data

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income_class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [6]:
income_attributes = income_data.drop(columns = ["income_class"])
income_classes = income_data.income_class

In [7]:
income_attributes = pd.get_dummies(income_attributes)

In [8]:
scaler = MinMaxScaler()
income_attributes_scaled = scaler.fit_transform(income_attributes)

In [9]:
income_atrributes_train, income_attributes_test, income_classes_train, income_classes_test = train_test_split(
    income_attributes_scaled,
    income_classes,
    stratify = income_classes,
    test_size = 0.2
)

In [10]:
income_atrributes_train.shape, income_attributes_test.shape, income_classes_train.shape, income_classes_test.shape

((26048, 108), (6513, 108), (26048,), (6513,))

In [11]:
linear_classifier = LinearSVC(C = 1e6, max_iter = 10000)

In [12]:
linear_classifier.fit(income_atrributes_train, income_classes_train)



In [13]:
linear_classifier.coef_ # тези коефициенти могат да ни послужат за интерпретация(може да направим feature importances)
# колкото по-голяма е неговата абсолютна стойност толкова по-важен е

array([[ 0.6910769 ,  0.62594488,  1.22407816,  9.83562937,  1.18092045,
         1.01262751,  0.13511989,  0.07083236,  0.28531036, -0.38577491,
        -0.43002799, -0.01173176,  0.31371971, -0.13755244, -0.9130573 ,
         0.40494151, -0.16586239, -0.17054877, -0.09150146,  0.5931198 ,
        -0.0965634 ,  0.12967101,  0.07012678, -0.3151357 ,  0.12150085,
         0.4681857 ,  0.02780969,  0.22328684, -2.88981363,  0.48342381,
         0.13419726, -0.41727259,  0.44328147,  0.39232993, -0.36514654,
        -0.3995905 , -0.42087612, -0.30588773, -0.25065502,  0.38757029,
        -0.01146441,  0.21355606,  0.08488141, -0.34365284, -0.09315936,
         0.05123059, -0.55565108, -1.28490696,  0.53323706,  0.05025301,
         0.08965487,  0.20735914, -0.15141485, -0.11993311,  0.21690884,
        -0.54750279, -0.70375202, -0.20060466,  0.28172167, -0.38847821,
        -0.1355315 , -0.15243401, -0.42120316,  0.02448481, -0.74067095,
        -0.33249113,  0.10896946,  0.54149513,  0.1

In [14]:
linear_grid_search = GridSearchCV(
    LinearSVC(max_iter = 1000), 
        param_grid = {
        "C": [0.01, 0.1, 1, 10, 100],
        "loss": ["hinge", "squared"]
    },
    scoring = make_scorer(f1_score, pos_label = ">50K")
)

In [15]:
linear_grid_search.fit(income_atrributes_train, income_classes_train)

25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\stoic\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\stoic\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\svm\_classes.py", line 257, in fit
    self.coef_, self.intercept_, n_iter_ = _fit_liblinear(
  File "C:\Users\stoic\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\svm\_base.py", line 1204, in _fit_liblinear
    solver_type = _get_liblinear_solver_type(multi_class, penalty, los

In [16]:
linear_grid_search.best_estimator_

In [17]:
linear_grid_search.best_params_

{'C': 100, 'loss': 'hinge'}

In [18]:
linear_grid_search.cv_results_

{'mean_fit_time': array([0.06004758, 0.02699084, 0.06942339, 0.03064008, 0.09555507,
        0.02458539, 0.3680285 , 0.02751746, 1.39891438, 0.03391209]),
 'std_fit_time': array([0.00715298, 0.00239394, 0.00299908, 0.00484951, 0.00748572,
        0.00130972, 0.03796774, 0.00048161, 0.12130962, 0.00140951]),
 'mean_score_time': array([0.02711973, 0.        , 0.03197093, 0.        , 0.0272223 ,
        0.        , 0.02616167, 0.        , 0.03042035, 0.        ]),
 'std_score_time': array([0.00187333, 0.        , 0.00271262, 0.        , 0.00212209,
        0.        , 0.00114446, 0.        , 0.00101204, 0.        ]),
 'param_C': masked_array(data=[0.01, 0.01, 0.1, 0.1, 1, 1, 10, 10, 100, 100],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'param_loss': masked_array(data=['hinge', 'squared', 'hinge', 'squared', 'hinge',
                    'squared', 'hinge', 'squared', 'hi

In [19]:
test_predictions = linear_grid_search.best_estimator_.predict(income_attributes_test)

In [20]:
train_predictions = linear_grid_search.best_estimator_.predict(income_atrributes_train)

In [21]:
f1_score(income_classes_test, test_predictions, pos_label = ">50K")

0.5792880258899678

In [22]:
print(classification_report(income_classes_test, test_predictions))

              precision    recall  f1-score   support

       <=50K       0.85      0.96      0.90      4945
        >50K       0.79      0.46      0.58      1568

    accuracy                           0.84      6513
   macro avg       0.82      0.71      0.74      6513
weighted avg       0.83      0.84      0.82      6513



In [23]:
print(classification_report(income_classes_train, train_predictions))

              precision    recall  f1-score   support

       <=50K       0.85      0.96      0.90     19775
        >50K       0.80      0.46      0.59      6273

    accuracy                           0.84     26048
   macro avg       0.82      0.71      0.74     26048
weighted avg       0.84      0.84      0.83     26048



In [24]:
# see Cover's theorem

In [25]:
# see Kernel function (eg. Polinomila features)

In [26]:
svc = SVC(kernel = "poly", degree = 2, max_iter = 1000)

In [27]:
svc.fit(income_atrributes_train, income_classes_train)



In [28]:
svc.decision_function(income_atrributes_train[:10])

array([-0.36518566, -0.18334076,  0.06452041, -0.10795701,  0.21561405,
        0.44382225, -0.03668643, -0.10249734, -0.24132868,  0.29165165])

In [29]:
svc.predict(income_atrributes_train[:10])

array(['<=50K', '<=50K', '>50K', '<=50K', '>50K', '>50K', '<=50K',
       '<=50K', '<=50K', '>50K'], dtype=object)

In [30]:
test_predictions_2 = svc.predict(income_attributes_test)

In [31]:
train_predictions_2 = svc.predict(income_atrributes_train)

In [32]:
print(classification_report(income_classes_test, test_predictions_2))

              precision    recall  f1-score   support

       <=50K       0.90      0.58      0.71      4945
        >50K       0.38      0.79      0.51      1568

    accuracy                           0.63      6513
   macro avg       0.64      0.69      0.61      6513
weighted avg       0.77      0.63      0.66      6513



In [33]:
print(classification_report(income_classes_train, train_predictions_2))

              precision    recall  f1-score   support

       <=50K       0.90      0.60      0.72     19775
        >50K       0.39      0.79      0.52      6273

    accuracy                           0.65     26048
   macro avg       0.64      0.70      0.62     26048
weighted avg       0.78      0.65      0.67     26048



In [34]:
gaussian_svc = SVC(kernel = "rbf", gamma = 0.1, C = 100)
gaussian_svc.fit(income_atrributes_train, income_classes_train)
test_predictions_3 = gaussian_svc.predict(income_attributes_test)
train_predictions_3 = svc.predict(income_atrributes_train)

In [35]:
print(classification_report(income_classes_test, test_predictions_3))

              precision    recall  f1-score   support

       <=50K       0.88      0.92      0.90      4945
        >50K       0.71      0.59      0.65      1568

    accuracy                           0.84      6513
   macro avg       0.80      0.76      0.77      6513
weighted avg       0.84      0.84      0.84      6513



In [36]:
print(classification_report(income_classes_train, train_predictions_3))

              precision    recall  f1-score   support

       <=50K       0.90      0.60      0.72     19775
        >50K       0.39      0.79      0.52      6273

    accuracy                           0.65     26048
   macro avg       0.64      0.70      0.62     26048
weighted avg       0.78      0.65      0.67     26048



## K nearest neighbours

In [37]:
knn = KNeighborsClassifier(n_neighbors = 20)

In [38]:
knn.fit(income_atrributes_train, income_classes_train)

In [39]:
knn.predict(income_atrributes_train)

array(['<=50K', '<=50K', '>50K', ..., '<=50K', '>50K', '<=50K'],
      dtype=object)

In [40]:
# see Voranoi diagram
f1_score(income_classes_test, knn.predict(income_attributes_test), pos_label = ">50K")

0.6238859180035651

## Anomaly detection

In [41]:
one_class_svm = OneClassSVM(nu = 0.1)

In [42]:
one_class_svm.fit(income_atrributes_train)

In [43]:
train_predictions_4 = one_class_svm.predict(income_atrributes_train)

In [44]:
len(train_predictions_4[train_predictions_4 == -1]) # аномалии

2605

In [45]:
len(train_predictions_4[train_predictions_4 == 1]) # Нормални примери

23443

In [46]:
test_predictions_4 = one_class_svm.predict(income_attributes_test)

In [47]:
len(test_predictions_4[test_predictions_4 == -1]) # аномалии

631

In [48]:
len(test_predictions_4[test_predictions_4 == 1]) # Нормални примери

5882