# Cluster classifier
In this notebook we train a classifier on the previously found partition, so that we can classify projects into one of the group types of the resulting cluster typology.

In [1]:
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

#### Prepare data for training and validation

In [4]:
# Load the feature vectors and the partition from disk.
# Context managers close the file handles deterministically; the bare
# open(...) calls used before left them open until garbage collection.
# NOTE(review): pickle.load can execute arbitrary code -- only load these
# files if they come from a trusted source.
with open("data/feature_vecs_8to20.p", "rb") as vecs_file:
    feature_vecs_8to20 = pickle.load(vecs_file)

with open("data/final_partition", "rb") as partition_file:
    partition = pickle.load(partition_file)

In [5]:
# Build two aligned lists: for every key in the partition, its feature vector
# (X_vecs) and the partition value assigned to it (y).
#
# The original cell indexed `feature_vecs_8to20_crop`, a name that is never
# defined in this notebook, so a fresh "Restart & Run All" failed with a
# NameError. The vectors loaded from data/feature_vecs_8to20.p are used instead.
# NOTE(review): if a since-removed cell cropped the vectors before this step,
# that preprocessing has to be restored here -- confirm.
X_vecs = []
y = []
for key, val in partition.items():
    X_vecs.append(feature_vecs_8to20[key])
    y.append(val)

In [6]:
# Hold out 20% of the samples as a test set; the fixed random_state makes
# the split reproducible across runs.
(X_train_vec, X_test_vec,
 y_train_vec, y_test_vec) = train_test_split(X_vecs, y, random_state=0, test_size=0.2)

We perform a cross-validated grid search to find the best hyperparameter set for the support vector machine.

In [7]:
# Hyper-parameter grid explored by the cross-validated search
tuned_parameters = [{'gamma': [0.003, 0.004, 0.005, 0.006],
                     'C': [8, 9, 10, 11, 12]}]

# Scoring functions to optimise, one grid search per entry.
# NOTE(review): 'neg_mean_squared_error' treats the class labels as numbers;
# if the labels are nominal cluster ids, a classification metric may be more
# meaningful -- confirm.
scores = ['neg_mean_squared_error', 'accuracy']

for score in scores:
    print()
    print("# Tuning hyper-parameters for %s" % score)
    print()

    # C=1 is only the estimator's starting value; the search overrides C and
    # gamma with each combination from the grid.
    clf = GridSearchCV(SVC(C=1, kernel="rbf"), tuned_parameters, scoring=score)
    clf.fit(X_train_vec, y_train_vec)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    # Report mean score and two standard deviations for every grid point
    for idx, params in enumerate(clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (means[idx], stds[idx] * 2, params))


# Tuning hyper-parameters for neg_mean_squared_error





Best parameters set found on development set:

{'C': 12, 'gamma': 0.006}

Grid scores on development set:

-0.018 (+/-0.006) for {'C': 8, 'gamma': 0.003}
-0.018 (+/-0.012) for {'C': 8, 'gamma': 0.004}
-0.015 (+/-0.003) for {'C': 8, 'gamma': 0.005}
-0.015 (+/-0.003) for {'C': 8, 'gamma': 0.006}
-0.016 (+/-0.006) for {'C': 9, 'gamma': 0.003}
-0.016 (+/-0.006) for {'C': 9, 'gamma': 0.004}
-0.015 (+/-0.003) for {'C': 9, 'gamma': 0.005}
-0.015 (+/-0.003) for {'C': 9, 'gamma': 0.006}
-0.016 (+/-0.006) for {'C': 10, 'gamma': 0.003}
-0.015 (+/-0.003) for {'C': 10, 'gamma': 0.004}
-0.017 (+/-0.009) for {'C': 10, 'gamma': 0.005}
-0.015 (+/-0.003) for {'C': 10, 'gamma': 0.006}
-0.016 (+/-0.006) for {'C': 11, 'gamma': 0.003}
-0.015 (+/-0.003) for {'C': 11, 'gamma': 0.004}
-0.017 (+/-0.009) for {'C': 11, 'gamma': 0.005}
-0.013 (+/-0.009) for {'C': 11, 'gamma': 0.006}
-0.016 (+/-0.006) for {'C': 12, 'gamma': 0.003}
-0.017 (+/-0.009) for {'C': 12, 'gamma': 0.004}
-0.017 (+/-0.009) for {'C': 12, 'gamm



Best parameters set found on development set:

{'C': 12, 'gamma': 0.006}

Grid scores on development set:

0.997 (+/-0.002) for {'C': 8, 'gamma': 0.003}
0.997 (+/-0.003) for {'C': 8, 'gamma': 0.004}
0.998 (+/-0.002) for {'C': 8, 'gamma': 0.005}
0.997 (+/-0.001) for {'C': 8, 'gamma': 0.006}
0.997 (+/-0.002) for {'C': 9, 'gamma': 0.003}
0.997 (+/-0.002) for {'C': 9, 'gamma': 0.004}
0.998 (+/-0.002) for {'C': 9, 'gamma': 0.005}
0.997 (+/-0.001) for {'C': 9, 'gamma': 0.006}
0.997 (+/-0.002) for {'C': 10, 'gamma': 0.003}
0.998 (+/-0.002) for {'C': 10, 'gamma': 0.004}
0.997 (+/-0.002) for {'C': 10, 'gamma': 0.005}
0.997 (+/-0.001) for {'C': 10, 'gamma': 0.006}
0.997 (+/-0.002) for {'C': 11, 'gamma': 0.003}
0.998 (+/-0.002) for {'C': 11, 'gamma': 0.004}
0.997 (+/-0.002) for {'C': 11, 'gamma': 0.005}
0.998 (+/-0.002) for {'C': 11, 'gamma': 0.006}
0.997 (+/-0.002) for {'C': 12, 'gamma': 0.003}
0.997 (+/-0.002) for {'C': 12, 'gamma': 0.004}
0.997 (+/-0.002) for {'C': 12, 'gamma': 0.005}
0.998 (+

We set the hyperparameters according to the best result of the grid search and train the model on the training data.

In [8]:
# Fit the RBF-kernel SVM with the hyper-parameters reported as best by the
# grid search (C=12, gamma=0.006). fit() returns the estimator itself, so the
# construction and training can be chained.
svm = SVC(C=12, gamma=0.006, kernel='rbf').fit(X_train_vec, y_train_vec)
svm

SVC(C=12, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.006, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [9]:
# Accuracy of the trained SVM on the held-out test split
accuracy_score(y_true=y_test_vec, y_pred=svm.predict(X_test_vec))

0.9961538461538462

In [10]:

# F1 score on the test split under each averaging mode (None yields one score
# per class). The prediction is computed once and reused instead of calling
# svm.predict() separately for every metric.
y_pred_test = svm.predict(X_test_vec)
for average in (None, "micro", "macro", "weighted"):
    # "%s" renders None as "None", which matches the label of the first line
    print("%s: " % average, f1_score(y_test_vec, y_pred_test, average=average))

None:  [0.9977221  1.         0.99526066 0.98591549 0.98550725]
micro:  0.9961538461538462
macro:  0.9928810997027299
weighted:  0.9961579464795132


In [11]:
# Persist the trained classifier. The context manager guarantees the file is
# flushed and closed once the dump finishes (the bare open(...) never closed it).
with open('./data/svm_classifier.p', 'wb') as model_file:
    pickle.dump(svm, model_file)