In [1]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
import preprocessing

## Importing the data and Preprocessing

In [2]:
X, y, class_names = preprocessing.create_data_tensor()

  Y[sl] = X[sl]
  Y[sl] = X[sl]
  Y[sl] /= 2  # halve the component at -N/2
  temp = Y[sl]
  Y[sl] = temp  # set that equal to the component at -N/2


In [3]:
print('Dimensions of X:', X.shape)
print('Possible Classes:', class_names.values())

Dimensions of X: (2565, 22, 57)
Possible Classes: dict_values(['make', 'polite', 'draw', 'soon', 'money', 'cost', 'when', 'innocent', 'pen', 'name', 'know', 'paper', 'no', 'I', 'tray', 'research', 'computer_PC_', 'ready', 'God', 'what', 'wait_notyet_', 'building', 'yes', 'different', 'sad', 'man', 'right', 'later', 'all', 'hurry', 'his_hers', 'hear', 'danger', 'eat', 'drink', 'share', 'thank', 'you', 'temper', 'juice', 'hurt', 'wild', 'please', 'give', 'come', 'glove', 'forget', 'more', 'which', 'shop', 'lose', 'maybe', 'stubborn', 'question', 'where', 'sorry', 'spend', 'girl', 'Norway', 'write', 'science', 'zero', 'buy', 'happy', 'hot', 'not', 'take', 'will', 'head', 'go', 'is_true_', 'think', 'why', 'deaf', 'answer', 'surprise', 'how', 'read', 'love', 'flash', 'boy', 'voluntary', 'hello', 'cold', 'change_mind_', 'mine', 'crazy', 'responsible', 'who', 'joke', 'same', 'wrong', 'alive', 'us', 'exit'])


In [4]:
X_flat = preprocessing.flatten_data(X)
print('Dimensions of X after flattening: ', X_flat.shape)

Dimensions of X after flattening:  (2565, 1254)


In [5]:
X_train, X_test, y_train, y_test = train_test_split(X_flat, y, test_size=0.3)
print('Training set dimensions: ', X_train.shape)
print('Test set dimensions: ', X_test.shape)

Training set dimensions:  (1795, 1254)
Test set dimensions:  (770, 1254)


## Setting up parameters for GridSearch

In [10]:
# tuned_parameters = [{'kernel': ['linear'], 'C': [0.01, 0.1, 1, 10]}]
tuned_parameters = [{'kernel': ['rbf'], 'C': [1, 10, 100], 'gamma': [0.01, 0.1]}, {'kernel': ['linear'], 'C': [0.01, 0.1, 1, 10]}]

In [11]:
scores = ['precision', 'recall']
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(SVC(), tuned_parameters, cv=3,
                       scoring='%s_macro' % score, n_jobs=-1, verbose=5)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(metrics.classification_report(y_true, y_pred))
    print()

# Tuning hyper-parameters for precision

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   45.1s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  1.9min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  1.9min finished


Best parameters set found on development set:

{'C': 10, 'kernel': 'linear'}

Grid scores on development set:

0.554 (+/-0.017) for {'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}
0.832 (+/-0.042) for {'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}
0.904 (+/-0.016) for {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
0.849 (+/-0.032) for {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
0.923 (+/-0.009) for {'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}
0.849 (+/-0.032) for {'C': 100, 'gamma': 0.1, 'kernel': 'rbf'}
0.407 (+/-0.039) for {'C': 0.01, 'kernel': 'linear'}
0.885 (+/-0.006) for {'C': 0.1, 'kernel': 'linear'}
0.930 (+/-0.013) for {'C': 1, 'kernel': 'linear'}
0.931 (+/-0.013) for {'C': 10, 'kernel': 'linear'}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         9
           1       1.00      1.00      1.00         5
        

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   53.8s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  2.0min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  2.0min finished


Best parameters set found on development set:

{'C': 1, 'kernel': 'linear'}

Grid scores on development set:

0.551 (+/-0.018) for {'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}
0.780 (+/-0.053) for {'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}
0.889 (+/-0.019) for {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
0.815 (+/-0.042) for {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
0.907 (+/-0.014) for {'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}
0.815 (+/-0.042) for {'C': 100, 'gamma': 0.1, 'kernel': 'rbf'}
0.403 (+/-0.012) for {'C': 0.01, 'kernel': 'linear'}
0.866 (+/-0.012) for {'C': 0.1, 'kernel': 'linear'}
0.918 (+/-0.017) for {'C': 1, 'kernel': 'linear'}
0.917 (+/-0.017) for {'C': 10, 'kernel': 'linear'}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         9
           1       1.00      1.00      1.00         5
         