# SVM Pipeline

In [1]:
# !python -m spacy download es_core_news_md

In [2]:
# import
from datasets import load_dataset
import pandas as pd
import numpy as np
import scipy
from collections import defaultdict, Counter
import os
import json
from itertools import compress
import pickle

from sklearn.model_selection import train_test_split, cross_validate, RandomizedSearchCV
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.feature_selection import SequentialFeatureSelector # requires sklearn 0.24 and above
from sklearn.svm import SVC
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score, recall_score, precision_score, classification_report, accuracy_score


import spacy
from spacy.lang.es.stop_words import STOP_WORDS

In [3]:
nlp = spacy.load("es_core_news_md")
np.random.seed(123)

## Load data

In [4]:
# read train and val data
with open('../data/train_features.json', 'r') as f:
    train_feat = json.load(f)
with open('../data/val_features.json', 'r') as f:
    val_feat = json.load(f)

train_feat_df = pd.DataFrame(train_feat)
val_feat_df = pd.DataFrame(val_feat)


X_train = train_feat_df.drop(['level'], axis=1)
X_val = val_feat_df.drop(['level'], axis=1)

y_train = train_feat_df['level'].tolist()
y_val = val_feat_df['level'].tolist()

In [5]:
print(f'Number of features: {len(X_train.columns)}')
display(X_train.head())

Number of features: 45


Unnamed: 0,preprocessed_text,total_tokens,total_tokens_w/o_stopwords,avg_sent_length,proportion_of_A_level_tokens,proportion_of_A_level_types,num_connectives,logical_operator_density,pronoun_density,type_token_ratio,...,PROPN,PUNCT,SCONJ,SYM,VERB,X,EOL,SPACE,CONTENT,FUNCTION
0,¡estoy encantada! desde esta mañana respiro el...,2585,1118,35.410959,0.339893,0.18255,18,0.050386,0.047407,0.360155,...,0.011605,0.109865,0.032882,0.0,0.07853,0.0,0.0,0.0,0.635376,0.364624
1,era un mañana a fines del mes de abril. el bue...,1539,622,14.941748,0.326367,0.214612,18,0.03497,0.067268,0.388564,...,0.010396,0.152697,0.038337,0.005198,0.107862,0.0,0.0,0.0,0.621914,0.378086
2,a mi perro curro le gusta pasear por el parque...,159,70,22.714286,0.557143,0.395833,7,0.039216,0.143885,0.578616,...,0.012579,0.075472,0.025157,0.0,0.169811,0.0,0.0,0.0,0.673469,0.326531
3,"en la ribera ven, sigue de la mano al que te a...",291,117,22.384615,0.307692,0.25,9,0.043011,0.024648,0.580756,...,0.013746,0.189003,0.034364,0.0,0.089347,0.0,0.0,0.003436,0.629787,0.370213
4,la vuelta a la patria mirad al peregrino10 ¡cu...,401,193,21.105263,0.295337,0.23125,5,0.046997,0.028205,0.551122,...,0.01995,0.164589,0.022444,0.0,0.109726,0.0,0.0,0.004988,0.654655,0.345345


## Model set up

In [6]:
feat_names = list(X_train.drop(['preprocessed_text'], axis=1).columns)

scoring = ['accuracy']

# dictionary to store results for comparison
results_df = {}

# tokenizer
def tokenizer(text):
    return [tok.text for tok in nlp(text)]

In [7]:
def validate_and_display(preprocessor, model, name, train_set, val_set, results_df):
    '''
    Construct a sklearn pipeline using the given preprocessor and model, then train the pipeline and return its validation results.
    
    preprocessor: (sklearn ColumnTransformer) sklearn object for feature transformation
    model: (sklearn Classifier) initialized sklearn classifier
    name: (str) a name that is shown when the result is displayed
    train_set: (DataFrame) the input train set
    val_set: (DataFrame) the input validation set
    results_df: (dict) the dictionary to store validation results
    
    return: (dict) results_df
    '''
    pipeline = make_pipeline(
        preprocessor, model
    )
    
    pipeline.fit(train_set, y_train)
    y_pred_val = pipeline.predict(val_set)
    
    results_df[name] = {'train score': pipeline.score(train_set, y_train), 'validation score': accuracy_score(y_pred_val, y_val)}
    
    print('Classification report on validation:')
    print(classification_report(y_pred_val, y_val))
    return results_df

## Baseline model
The baseline model only uses bag of word feature

In [8]:
X_train_base = X_train['preprocessed_text']
X_val_base = X_val['preprocessed_text']

results_df = validate_and_display(CountVectorizer(max_features=30_000, ngram_range=(1,2), tokenizer=tokenizer), 
                                  SVC(random_state=123), 
                                  'SVM baseline', 
                                  X_train_base,
                                  X_val_base,
                                  results_df)
display(pd.DataFrame(results_df))

Classification report on validation:
              precision    recall  f1-score   support

          A1       0.73      0.73      0.73        11
          A2       0.00      0.00      0.00         0
           B       1.00      0.71      0.83        21

    accuracy                           0.72        32
   macro avg       0.58      0.48      0.52        32
weighted avg       0.91      0.72      0.80        32



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,SVM baseline
train score,0.70428
validation score,0.71875


## Forward feature selection
Use SequentialFeatureSelector from sklearn to perform forward and backward feature selection

In [9]:
# keep track of the best validation score and the best train score to determine the current best model
best_val_score = results_df['SVM baseline']['validation score']
best_train_score = results_df['SVM baseline']['train score']
best_feats = ['preprocessed_text']

In [10]:
def sfs_and_validate(n_features, direction, results_df):
    '''
    Generate a feature selection pipeline for svm models, train the pipeline and store the validation results
    
    n_features: (int) argument passed into the `n_features_to_select` argument in SequentialFeatureSelector
    direction: (str) {'forward', 'backward'}, argument passe dinto the direction argument in SequentialFeatureSelector
    results_df: (dict) the dictionary to store validation results
    
    return: (dict) results_df
    '''
    numeric_features = feat_names
    text_feature = 'preprocessed_text'
    sfs_X_train = X_train.drop(['preprocessed_text'], axis=1)
    
    # initialize selector
    sfs = SequentialFeatureSelector(SVC(random_state=123), n_features_to_select=n_features, scoring='accuracy', 
                                    direction=direction)
    
    sfs_preprocessor = make_column_transformer(
        (StandardScaler(), numeric_features)
    )
    sfs_pipeline = make_pipeline(
        sfs_preprocessor,
        sfs,
        SVC(random_state=123)
    )
    
    # fit
    sfs_pipeline.fit(sfs_X_train, y_train)
    
    # features selected
    feats_selected = list(compress(sfs_X_train.columns, sfs_pipeline[1].get_support()))
    print(f'features selected: {feats_selected}')
    
    # fit on selected features
    val_preprocessor = make_column_transformer(
        (StandardScaler(), feats_selected),
        (TfidfVectorizer(max_features=30_000, ngram_range=(1,2), tokenizer=tokenizer), text_feature)
    )
    
    val_model = SVC(random_state=123)
    
    input_X_train = X_train[feats_selected+['preprocessed_text']]
    input_X_val = X_val[feats_selected+['preprocessed_text']]
    
    results_df = validate_and_display(val_preprocessor, val_model, f'SVM + {n_features}', input_X_train, input_X_val, results_df)
    return results_df, feats_selected

In [11]:
def loop_sfs_and_pick(max_n_features, direction, results_df, best_val_score, best_train_score, best_feats, early_stop = 3):
    '''
    Loop and find best i features for all i <= max_n_features.
    Append results and return the best scores along with the list of features selected.
    
    max_n_features: (int) the max number of features to select
    direction: (str) {'forward', 'backward'}, argument passe dinto the direction argument in SequentialFeatureSelector
    results_df: (dict) the dictionary to store validation results
    best_val_score: (float) current best validation score obtained by a svm model
    best_train_score: (float) current best train score obtained by a svm model
    best_feats: (list) current list of features that the best model is trained on
    early_stop: (int) should be < max_n_features. Break the loop if the performance does not improve for this many iterations. Default is 3
    
    return: (dict) results_df, (float) best_val_score, (float) best_train_score, (list) best_feats
    '''
    early_stop_count = 0
    if direction == 'forward':
        order = range(1, max_n_features+1)
    else:
        order = reversed(range(1, max_n_features))
        
    for i in order:
        print(f'Picking the top {i} feature(s)')
        results_df, feats = sfs_and_validate(i, direction, results_df)
        
        # update best scores if encounter a better model
        print(f'Current best val score: {best_val_score}')
        print(f'Current best train score: {best_train_score}')
        
        if results_df[f'SVM + {i}']['validation score'] > best_val_score: # better val score
            early_stop_count = 0
            
            print('Found a better model, update best scores')
            best_val_score = results_df[f'SVM + {i}']['validation score']
            best_train_score = results_df[f'SVM + {i}']['train score']
            best_feats = feats
            print(f'Current best val score: {best_val_score}')
            print(f'Current best train score: {best_train_score}')
            
        elif results_df[f'SVM + {i}']['validation score'] == best_val_score \
        and results_df[f'SVM + {i}']['train score'] < best_train_score: # same val score but less overfitting
            early_stop_count = 0
            
            print('Found a better model, update best scores')
            best_val_score = results_df[f'SVM + {i}']['validation score']
            best_train_score = results_df[f'SVM + {i}']['train score']
            best_feats = feats
            print(f'Current best val score: {best_val_score}')
            print(f'Current best train score: {best_train_score}')
        else: # early stop mechanism
            early_stop_count += 1
            
        print('--------------------')
        if early_stop_count == early_stop:
            break
        
    return results_df, best_val_score, best_train_score, best_feats

In [12]:
# forward search with early stop
results_df, best_val_score_fwd, best_train_score_fwd, best_feats_fwd = loop_sfs_and_pick(len(feat_names), 'forward', results_df, 
                                                                             best_val_score, best_train_score, best_feats, early_stop=5)

Picking the top 1 feature(s)
features selected: ['syllables_per_sentence']
Classification report on validation:
              precision    recall  f1-score   support

          A1       0.82      0.75      0.78        12
          A2       0.00      0.00      0.00         0
           B       1.00      0.75      0.86        20

    accuracy                           0.75        32
   macro avg       0.61      0.50      0.55        32
weighted avg       0.93      0.75      0.83        32

Current best val score: 0.71875
Current best train score: 0.7042801556420234
Found a better model, update best scores
Current best val score: 0.75
Current best train score: 0.8871595330739299
--------------------
Picking the top 2 feature(s)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


features selected: ['proportion_of_A_level_types', 'syllables_per_sentence']
Classification report on validation:
              precision    recall  f1-score   support

          A1       0.64      0.64      0.64        11
          A2       0.00      0.00      0.00         2
           B       0.93      0.74      0.82        19

    accuracy                           0.66        32
   macro avg       0.52      0.46      0.49        32
weighted avg       0.77      0.66      0.71        32

Current best val score: 0.75
Current best train score: 0.8871595330739299
--------------------
Picking the top 3 feature(s)
features selected: ['proportion_of_A_level_types', 'num_connectives', 'syllables_per_sentence']
Classification report on validation:
              precision    recall  f1-score   support

          A1       0.64      0.70      0.67        10
          A2       0.17      1.00      0.29         1
           B       0.93      0.67      0.78        21

    accuracy                  

In [13]:
# backward search with early stop
results_df, best_val_score_bwd, best_train_score_bwd, best_feats_bwd = loop_sfs_and_pick(len(feat_names), 'backward', results_df, 
                                                                             best_val_score, best_train_score, best_feats, early_stop=5)

Picking the top 43 feature(s)
features selected: ['total_tokens', 'total_tokens_w/o_stopwords', 'avg_sent_length', 'proportion_of_A_level_tokens', 'proportion_of_A_level_types', 'num_connectives', 'logical_operator_density', 'pronoun_density', 'type_token_ratio', 'avg_rank_of_lemmas_in_freq_list', 'fernandez_huerta_score', 'syllables_per_sentence', 'avg_degree_of_abstraction', 'min_degree_of_abstraction', 'avg_ambiguation_all_words', 'avg_ambiguation_content_words', 'noun_phrase_density', 'avg_parse_tree_depth', 'Imp', 'Past', 'Pres', 'ADJ', 'ADP', 'ADV', 'AUX', 'CONJ', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X', 'EOL', 'SPACE', 'CONTENT', 'FUNCTION']
Classification report on validation:
              precision    recall  f1-score   support

          A1       0.82      0.82      0.82        11
          A2       0.17      0.33      0.22         3
           B       0.93      0.78      0.85        18

    accuracy               

In [14]:
# results from forward search
print('Results from forward search:')
print(f'Best validation score: {best_val_score_fwd}')
print(f'Best train score: {best_train_score_fwd}')
print(f'Number of features selected: {len(best_feats_fwd)}')
print(f'Features: {best_feats_fwd}')

print('--------------------')

# results from backward search
print('Results from backward search:')
print(f'Best validation score: {best_val_score_bwd}')
print(f'Best train score: {best_train_score_bwd}')
print(f'Number of features selected: {len(best_feats_bwd)}')
print(f'Features: {best_feats_bwd}')

Results from forward search:
Best validation score: 0.75
Best train score: 0.8871595330739299
Number of features selected: 1
Features: ['syllables_per_sentence']
--------------------
Results from backward search:
Best validation score: 0.75
Best train score: 0.8404669260700389
Number of features selected: 36
Features: ['total_tokens', 'total_tokens_w/o_stopwords', 'avg_sent_length', 'proportion_of_A_level_types', 'num_connectives', 'logical_operator_density', 'avg_rank_of_lemmas_in_freq_list', 'fernandez_huerta_score', 'syllables_per_sentence', 'avg_degree_of_abstraction', 'min_degree_of_abstraction', 'avg_ambiguation_all_words', 'noun_phrase_density', 'avg_parse_tree_depth', 'Imp', 'Past', 'Pres', 'ADJ', 'ADP', 'ADV', 'AUX', 'CONJ', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PUNCT', 'SCONJ', 'VERB', 'X', 'EOL', 'CONTENT', 'FUNCTION']


In [15]:
# compare the two results and pick one from the two

# if either one has better validation score than the other, then update best scores
if best_val_score_fwd > best_val_score_bwd:
    best_val_score = best_val_score_fwd
    best_train_score = best_train_score_fwd
    best_feats = best_feats_fwd
elif best_val_score_fwd < best_val_score_bwd:
    best_val_score = best_val_score_bwd
    best_train_score = best_train_score_bwd
    best_feats = best_feats_bwd
else: # if they have equal val scores, store the less overfitting one
    if best_train_score_fwd < best_train_score_bwd:
        best_val_score = best_val_score_fwd
        best_train_score = best_train_score_fwd
        best_feats = best_feats_fwd
    else:
        best_val_score = best_val_score_bwd
        best_train_score = best_train_score_bwd
        best_feats = best_feats_bwd

## Best model

In [16]:
text_feature = 'preprocessed_text'

# best model input matrices
best_X_train = X_train[best_feats+['preprocessed_text']]
best_X_val = X_val[best_feats+['preprocessed_text']]

# best model pipeline
best_preprocessor = make_column_transformer(
    (StandardScaler(), best_feats),
    (TfidfVectorizer(max_features=30_000, ngram_range=(1,2), tokenizer=tokenizer), text_feature)
)
best_model = SVC(random_state=123)
best_pipeline = make_pipeline(
    best_preprocessor, best_model
)


## Hyperparameter tuning

In [17]:
# gamma and C value range taken from https://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf
param_grid = {
    "svc__gamma": 2.0 ** np.arange(-15, 3),
    "svc__C": 2.0 ** np.arange(-5, 15)
}

random_search = RandomizedSearchCV(best_pipeline, 
                                   scoring='accuracy', 
                                   param_distributions=param_grid, 
                                   n_jobs=-1, 
                                   n_iter=15, 
                                   cv=3,
                                   return_train_score=True,
                                   verbose=10) # default n_iter=10
random_search.fit(best_X_train, y_train) 

Fitting 3 folds for each of 15 candidates, totalling 45 fits


RandomizedSearchCV(cv=3,
                   estimator=Pipeline(steps=[('columntransformer',
                                              ColumnTransformer(transformers=[('standardscaler',
                                                                               StandardScaler(),
                                                                               ['total_tokens',
                                                                                'total_tokens_w/o_stopwords',
                                                                                'avg_sent_length',
                                                                                'proportion_of_A_level_types',
                                                                                'num_connectives',
                                                                                'logical_operator_density',
                                                                                'avg_rank_o

In [18]:
pd.DataFrame(random_search.cv_results_)[
    [
        'mean_test_score',
        'mean_train_score',
        'param_svc__gamma',
        'param_svc__C',
        'mean_fit_time',
        'rank_test_score',
    ]
].set_index("rank_test_score").sort_index()

Unnamed: 0_level_0,mean_test_score,mean_train_score,param_svc__gamma,param_svc__C,mean_fit_time
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.712358,1.0,0.0625,512.0,35.35357
2,0.712175,0.78596,0.001953,8.0,35.033666
2,0.712175,0.780124,3.1e-05,512.0,35.026665
4,0.71213,0.708214,0.000122,16.0,34.955272
5,0.712084,0.838479,0.007812,4.0,34.90893
6,0.704378,0.717916,0.000244,16.0,31.536939
7,0.50187,1.0,0.5,2048.0,35.399941
8,0.474692,1.0,4.0,256.0,31.927849
8,0.474692,0.474704,0.0625,0.03125,32.938977
8,0.474692,0.474704,0.000977,0.125,34.640057


In [19]:
print("Random Search best hyperparameters: %s" % (random_search.best_params_))
print("Random Search best model score: %0.3f" % (random_search.best_score_))
print(
    "Train score on the full train set: %0.3f" % (random_search.score(best_X_train, y_train))
)
print(
    "Validation score on the full validation set: %0.3f" % (random_search.score(best_X_val, y_val))
)

Random Search best hyperparameters: {'svc__gamma': 0.0625, 'svc__C': 512.0}
Random Search best model score: 0.712
Train score on the full train set: 1.000
Validation score on the full validation set: 0.781


## Update best model, train and evaluate

In [20]:
# validation score
if random_search.score(best_X_val, y_val) > best_val_score:
    best_model = SVC(random_state=123, gamma=random_search.best_params_['svc__gamma'], C=random_search.best_params_['svc__C'])
else:
    best_model = SVC(random_state=123)
    
best_pipeline = make_pipeline(
        best_preprocessor, best_model
    )
best_pipeline.fit(best_X_train, y_train)
    
results_df['SVM best'] = {'train score': best_pipeline.score(best_X_train, y_train), 
                          'validation score': best_pipeline.score(best_X_val, y_val)}


In [21]:
# test score
with open('../data/test_features.json', 'r') as f:
    test_feat = json.load(f)

test_feat_df = pd.DataFrame(test_feat)

X_test = test_feat_df.drop(['level'], axis=1)[best_feats+['preprocessed_text']]
y_test = test_feat_df['level'].tolist()

results_df['SVM best']['test score'] = best_pipeline.score(X_test, y_test)

In [22]:
display(pd.DataFrame(results_df)[['SVM best']])

Unnamed: 0,SVM best
train score,1.0
validation score,0.78125
test score,0.78125


In [26]:
# save model
best_model_trained = best_pipeline[1]
pickle.dump(best_model_trained, open('../models/svm_best_0611', 'wb'))

## Output analysis

In [23]:
y_pred_val = best_pipeline.predict(best_X_val)
assert len(y_pred_val) == len(y_val)

In [25]:
for i in range(len(y_val)):
    if y_pred_val[i] != y_val[i]:
        print(f'Text: {best_X_val["preprocessed_text"][i][:500]}...')
        print(f'Predicted: {y_pred_val[i]}')
        print(f'Gold: {y_val[i]}')
        print('-----------')

Text: tres palabras un jornalero pobre llegó por la noche a una posada. estaba muy cansado y tenía hambre y sed. pero no tenía dinero. sin dinero no pudo obtener nada. ¿cómo obtener dinero para comer? se sentó a una mesa. a la mesa estaban sentados dos panaderos que comían y bebían. el jornalero les contaba de sus viajes. su cuento era muy interesante y ellos lo escuchaban atentamente. finalmente él les dijo: —- propongo una apuesta. diré tres palabras que vds. no pueden repetir. —es absurdo,—contesta...
Predicted: A1
Gold: A2
-----------
Text: me llamo elena sánchez y viajé a roma por primera vez hace seis años, en 2011. visité la ciudad italiana con mi novio durante cinco días. lo que más nos gustó fue el coliseo, pero también estuvimos en la fontana di trevi y en el vaticano. el momento más divertido del viaje fue cuando estábamos dando un paseo por el barrio del trastévere y entramos en una tienda de mascotas para comprar una tortuga. ¡siete años después, aún es nuestra mascota! mi