## Set up

In [None]:
# !python -m spacy download es_core_news_md

In [43]:
# import
from datasets import load_dataset
import pandas as pd
import numpy as np
import scipy
from collections import defaultdict, Counter
import os
import json
from itertools import compress
import pickle

from sklearn.model_selection import train_test_split, cross_validate, RandomizedSearchCV
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.feature_selection import SequentialFeatureSelector # requires sklearn 0.24 and above
from sklearn.svm import SVC
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score, recall_score, precision_score, classification_report


import spacy
from spacy.lang.es.stop_words import STOP_WORDS

In [3]:
nlp = spacy.load("es_core_news_md")
np.random.seed(123)

## Prepare data

In [4]:
# read train and val data
with open(os.path.abspath('../data/train_features.json'), 'r') as f:
    train_feat = json.load(f)
with open(os.path.abspath('../data/val_features.json'), 'r') as f:
    val_feat = json.load(f)

train_feat_df = pd.DataFrame(train_feat)
val_feat_df = pd.DataFrame(val_feat)


X_train = train_feat_df.drop(['level'], axis=1)
X_val = val_feat_df.drop(['level'], axis=1)

y_train = [level[0] for level in train_feat_df['level'].tolist()]
y_train_fine = train_feat_df['level'].tolist()
y_val = [level[0] for level in val_feat_df['level'].tolist()]
y_val_fine = val_feat_df['level'].tolist()

In [5]:
X_train.head()

Unnamed: 0,preprocessed_text,total_tokens,total_tokens_w/o_stopwords,avg_sent_length,proportion_of_A_level_tokens,proportion_of_A_level_types,num_connectives,logical_operator_density,pronoun_density,type_token_ratio,...,PROPN,PUNCT,SCONJ,SYM,VERB,X,EOL,SPACE,CONTENT,FUNCTION
0,capítulo � las actuaciones en el dorado se suc...,596,235,9.933333,0.357447,0.262774,9,0.036522,0.056738,0.39094,...,0.052013,0.233221,0.016779,0.003356,0.104027,0.0,0.0,0.0,0.635165,0.364835
1,de lo que le sucedió a nuestro caballero cuand...,2995,1211,39.407895,0.327002,0.190896,16,0.065078,0.082009,0.304508,...,0.035392,0.147245,0.063105,0.001669,0.122204,0.0,0.0,0.0,0.586505,0.413495
2,mi abuelo tomás tiene ochenta años y vive con ...,121,47,17.285714,0.638298,0.514286,5,0.034188,0.080357,0.628099,...,0.0,0.082645,0.033058,0.0,0.190083,0.0,0.0,0.0,0.630631,0.369369
3,capítulo vi y vii (resumen) del donoso y gran ...,3817,1524,38.555556,0.311024,0.199755,21,0.061162,0.075514,0.282683,...,0.043752,0.132827,0.065234,0.000262,0.123395,0.0,0.0,0.0,0.603506,0.396494
4,frases de cortesía el señor blanco:—buenos día...,86,34,6.142857,0.558824,0.454545,3,0.02381,0.02381,0.476744,...,0.104651,0.302326,0.0,0.0,0.034884,0.0,0.0,0.0,0.716667,0.283333


## Model set up

In [6]:
feat_names = list(X_train.drop(['preprocessed_text'], axis=1).columns)

scoring = ['accuracy']

# results dictionary
results_df = {}

# tokenizer
def tokenizer(text):
    return [tok.text for tok in nlp(text)]

In [7]:
def cv_and_display(preprocessor, model, name, train_set, results_df, fine_grained=False):
    '''
    train model and return the cross validation results
    
    preprocessor: (sklearn ColumnTransformer) sklearn object for feature transformation
    model: (sklearn Classifier) initialized sklearn classifier
    name: (str) a name that is shown when the result is displayed
    train_set: (DataFrame) the input train set encoding features
    results_df: (dict) the dictionary to store cross validation results
    fine_grained: (str) {True, False} True: model trains with 4 class classification instead of 2. Default is False
    
    return: (dict) results_df
    '''
    pipeline = make_pipeline(
        preprocessor, model
    )
    if fine_grained == False:
        scores = cross_validate(pipeline, train_set, y_train, scoring = scoring, return_train_score=True)
    else:
        scores = cross_validate(pipeline, train_set, y_train_fine, scoring = scoring, return_train_score=True)
    
    results_df[name] = pd.DataFrame(scores).mean()
#     display(pd.DataFrame(results_df))
    return results_df

## Experiment 1 - Baseline
Develop the baseline model with only bag of word feature

In [8]:
X_train_base = X_train['preprocessed_text']
X_val_base = X_val['preprocessed_text']

results_df = cv_and_display(CountVectorizer(max_features=30_000, ngram_range=(1,2), tokenizer=tokenizer), 
                         SVC(random_state=123), 
                         'SVM baseline', 
                         X_train_base,
                         results_df)
display(pd.DataFrame(results_df))

Unnamed: 0,SVM baseline
fit_time,17.271622
score_time,4.277825
test_accuracy,0.800974
train_accuracy,0.837548


## Classification analysis - Baseline

In [9]:
baseline_pipeline = Pipeline(
    steps=[
        ("bow", CountVectorizer(max_features=30_000, ngram_range=(1,2), tokenizer=tokenizer)), 
        ("model", SVC(random_state=123))
    ]
)

In [10]:
baseline_pipeline.fit(X_train_base, y_train)

Pipeline(steps=[('bow',
                 CountVectorizer(max_features=30000, ngram_range=(1, 2),
                                 tokenizer=<function tokenizer at 0x108465af0>)),
                ('model', SVC(random_state=123))])

In [11]:
baseline_pipeline.classes_

array(['A', 'B'], dtype='<U1')

In [12]:
y_pred = baseline_pipeline.predict(X_train_base)
print(classification_report(y_pred, y_train))

## the model seems to be predicting some B level texts as A level
## precision: 71% of texts that are predicted as A level are actually A level
## recall: 99% of texts that are actually A level are predicted as A level

              precision    recall  f1-score   support

           A       0.74      0.97      0.84       106
           B       0.98      0.78      0.87       171

    accuracy                           0.86       277
   macro avg       0.86      0.88      0.85       277
weighted avg       0.89      0.86      0.86       277



In [13]:
y_val_pred = baseline_pipeline.predict(X_val_base)
print(classification_report(y_val_pred, y_val))

              precision    recall  f1-score   support

           A       0.62      1.00      0.77        10
           B       1.00      0.71      0.83        21

    accuracy                           0.81        31
   macro avg       0.81      0.86      0.80        31
weighted avg       0.88      0.81      0.81        31



## Forward feature selection - Simple
Forward feature selection (simple) adds features to the model one by one, and retrains at every iteration. This is a simple approach to see which features contribute to the model the most by examining the cross validation score

In [14]:
# add features one at a time
text_feature = 'preprocessed_text'
model = SVC(random_state=123)

for i in range(1,len(feat_names)+1):
    incl_feats = feat_names[:i]
    incl_cols = ['preprocessed_text'] + incl_feats
    input_X = X_train[incl_cols]
    numeric_features = feat_names[:i]
    
    preprocessor = make_column_transformer(
        (StandardScaler(), numeric_features),
        (TfidfVectorizer(max_features=30_000, ngram_range=(1,2), tokenizer=tokenizer), text_feature)
    )
    
    results_df = cv_and_display(preprocessor, model, f'SVM + {incl_feats[-1]}', input_X, results_df)

display(pd.DataFrame(results_df))

Unnamed: 0,SVM baseline,SVM + total_tokens,SVM + total_tokens_w/o_stopwords,SVM + avg_sent_length,SVM + proportion_of_A_level_tokens,SVM + proportion_of_A_level_types,SVM + num_connectives,SVM + logical_operator_density,SVM + pronoun_density,SVM + type_token_ratio,...,SVM + PROPN,SVM + PUNCT,SVM + SCONJ,SVM + SYM,SVM + VERB,SVM + X,SVM + EOL,SVM + SPACE,SVM + CONTENT,SVM + FUNCTION
fit_time,17.779408,17.269422,16.977124,17.31278,17.115083,17.216004,18.69506,18.386396,17.818401,17.377884,...,18.223208,17.468371,17.277654,17.226802,18.811928,18.492685,18.704315,18.88259,18.238134,19.427591
score_time,4.39849,4.248667,4.215148,4.252367,4.207298,4.260027,4.550556,4.34467,4.45188,4.356583,...,4.61117,4.246609,4.225385,4.287266,4.677798,4.614491,4.721105,4.523897,4.552824,4.955831
test_accuracy,0.800974,0.913442,0.880779,0.848182,0.859091,0.859026,0.859091,0.855325,0.840909,0.855455,...,0.859091,0.866429,0.866364,0.855584,0.859221,0.859221,0.859221,0.851948,0.851948,0.855584
train_accuracy,0.837548,0.987371,0.978334,0.920566,0.914272,0.919681,0.925078,0.92057,0.915152,0.919673,...,0.925983,0.924182,0.935897,0.934092,0.935902,0.935902,0.935902,0.935906,0.937707,0.939517


In [19]:
# sort by test_accuracy and display
display(pd.DataFrame(results_df).sort_values('test_accuracy', 1, ascending=False))

Unnamed: 0,SVM + total_tokens,SVM + total_tokens_w/o_stopwords,SVM + DET,SVM + CCONJ,SVM + ADJ,SVM + CONJ,SVM + AUX,SVM + ADP,SVM + PUNCT,SVM + SCONJ,...,SVM + avg_sent_length,SVM + min_degree_of_abstraction,SVM + avg_degree_of_abstraction,SVM + syllables_per_sentence,SVM + pronoun_density,SVM + avg_ambiguation_all_words,SVM + avg_rank_of_lemmas_in_freq_list,SVM + avg_ambiguation_content_words,SVM + fernandez_huerta_score,SVM baseline
fit_time,17.269422,16.977124,17.372564,17.281626,18.413441,17.212542,17.331981,17.423291,17.468371,17.277654,...,17.31278,17.186274,17.191589,17.352464,17.818401,17.276768,17.254481,17.255753,17.214096,17.779408
score_time,4.248667,4.215148,4.388665,4.267236,4.562978,4.250835,4.296562,4.516386,4.246609,4.225385,...,4.252367,4.239893,4.315977,4.272356,4.45188,4.25286,4.330015,4.344602,4.265563,4.39849
test_accuracy,0.913442,0.880779,0.877143,0.873506,0.873442,0.86987,0.86987,0.86987,0.866429,0.866364,...,0.848182,0.848117,0.840909,0.840909,0.840909,0.840909,0.840909,0.840844,0.833701,0.800974
train_accuracy,0.987371,0.978334,0.932306,0.927789,0.917859,0.925083,0.925083,0.918772,0.924182,0.935897,...,0.920566,0.912441,0.915148,0.916053,0.915152,0.914247,0.927793,0.912454,0.921467,0.837548


## Forward feature selection - Greedy
Forward feature selection (greedy) is the feature selection process accomplished through sklearn's `SequentialFeatureSelector` function. It determines the top k features (k is a variable) using feature importance determined by the model

In [14]:
# uncomment below if running feature forward selection (simple) before running this part

# baseline_score = results_df['SVM baseline']
# results_df = {'SVM baseline': baseline_score}

In [17]:
def perform_sfs_cv_and_display(n_features, direction, results_df):
    '''
    generate a feature selection pipeline for svm models and perform cross validation
    
    n_features: (int) argument passed into the `n_features_to_select` argument in SequentialFeatureSelector
    direction: (str) {'forward', 'backward'}, argument passe dinto the direction argument in SequentialFeatureSelector
    results_df: (dict) the dictionary to store cross validation results
    
    return: (dict) results_df
    '''
    numeric_features = feat_names
    text_feature = 'preprocessed_text'
    sfs_X_train = X_train.drop(['preprocessed_text'], axis=1)
    
    # initialize selector
    sfs = SequentialFeatureSelector(SVC(random_state=123), n_features_to_select=n_features, scoring='accuracy', direction=direction)
    
    sfs_preprocessor = make_column_transformer(
        (StandardScaler(), numeric_features)
    )
    sfs_pipeline = make_pipeline(
        sfs_preprocessor,
        sfs,
        SVC(random_state=123)
    )
    
    # fit
    sfs_pipeline.fit(sfs_X_train, y_train)
    
    # features selected
    feats_selected = list(compress(sfs_X_train.columns, sfs_pipeline[1].get_support()))
    print(f'features selected: {feats_selected}')
    
    # cv with selected features
    cv_preprocessor = make_column_transformer(
        (StandardScaler(), feats_selected),
        (TfidfVectorizer(max_features=30_000, ngram_range=(1,2), tokenizer=tokenizer), text_feature)
    )
    
    cv_model = SVC(random_state=123)
    cv_X_train = X_train[feats_selected+['preprocessed_text']]
    
    results_df = cv_and_display(cv_preprocessor, cv_model, f'SVM + {n_features}', cv_X_train, results_df)
    display(pd.DataFrame(results_df))
    
    return results_df

In [18]:
# loop through number of features to find the best combination
for i in range(1, 40):
    results_df = perform_sfs_cv_and_display(i, 'forward', results_df)

features selected: ['avg_sent_length']


Unnamed: 0,SVM baseline,SVM + 1
fit_time,17.271622,16.656463
score_time,4.277825,4.133008
test_accuracy,0.800974,0.851688
train_accuracy,0.837548,0.956671


features selected: ['avg_sent_length', 'num_connectives']


Unnamed: 0,SVM baseline,SVM + 1,SVM + 2
fit_time,17.271622,16.656463,17.706907
score_time,4.277825,4.133008,4.207182
test_accuracy,0.800974,0.851688,0.884481
train_accuracy,0.837548,0.956671,0.935017


features selected: ['avg_sent_length', 'num_connectives', 'type_token_ratio']


Unnamed: 0,SVM baseline,SVM + 1,SVM + 2,SVM + 3
fit_time,17.271622,16.656463,17.706907,16.708823
score_time,4.277825,4.133008,4.207182,4.152337
test_accuracy,0.800974,0.851688,0.884481,0.873636
train_accuracy,0.837548,0.956671,0.935017,0.929603


features selected: ['avg_sent_length', 'num_connectives', 'type_token_ratio', 'VERB']


Unnamed: 0,SVM baseline,SVM + 1,SVM + 2,SVM + 3,SVM + 4
fit_time,17.271622,16.656463,17.706907,16.708823,17.152427
score_time,4.277825,4.133008,4.207182,4.152337,4.168415
test_accuracy,0.800974,0.851688,0.884481,0.873636,0.880844
train_accuracy,0.837548,0.956671,0.935017,0.929603,0.944042


features selected: ['avg_sent_length', 'num_connectives', 'type_token_ratio', 'PROPN', 'VERB']


Unnamed: 0,SVM baseline,SVM + 1,SVM + 2,SVM + 3,SVM + 4,SVM + 5
fit_time,17.271622,16.656463,17.706907,16.708823,17.152427,16.648613
score_time,4.277825,4.133008,4.207182,4.152337,4.168415,4.125915
test_accuracy,0.800974,0.851688,0.884481,0.873636,0.880844,0.877403
train_accuracy,0.837548,0.956671,0.935017,0.929603,0.944042,0.94405


features selected: ['avg_sent_length', 'num_connectives', 'type_token_ratio', 'CONJ', 'PROPN', 'VERB']


Unnamed: 0,SVM baseline,SVM + 1,SVM + 2,SVM + 3,SVM + 4,SVM + 5,SVM + 6
fit_time,17.271622,16.656463,17.706907,16.708823,17.152427,16.648613,16.649096
score_time,4.277825,4.133008,4.207182,4.152337,4.168415,4.125915,4.080338
test_accuracy,0.800974,0.851688,0.884481,0.873636,0.880844,0.877403,0.877403
train_accuracy,0.837548,0.956671,0.935017,0.929603,0.944042,0.94405,0.94405


features selected: ['total_tokens', 'avg_sent_length', 'num_connectives', 'type_token_ratio', 'CONJ', 'PROPN', 'VERB']


Unnamed: 0,SVM baseline,SVM + 1,SVM + 2,SVM + 3,SVM + 4,SVM + 5,SVM + 6,SVM + 7
fit_time,17.271622,16.656463,17.706907,16.708823,17.152427,16.648613,16.649096,16.711594
score_time,4.277825,4.133008,4.207182,4.152337,4.168415,4.125915,4.080338,4.150274
test_accuracy,0.800974,0.851688,0.884481,0.873636,0.880844,0.877403,0.877403,0.891688
train_accuracy,0.837548,0.956671,0.935017,0.929603,0.944042,0.94405,0.94405,0.946749


features selected: ['total_tokens', 'avg_sent_length', 'num_connectives', 'type_token_ratio', 'CONJ', 'PROPN', 'SCONJ', 'VERB']


Unnamed: 0,SVM baseline,SVM + 1,SVM + 2,SVM + 3,SVM + 4,SVM + 5,SVM + 6,SVM + 7,SVM + 8
fit_time,17.271622,16.656463,17.706907,16.708823,17.152427,16.648613,16.649096,16.711594,16.561997
score_time,4.277825,4.133008,4.207182,4.152337,4.168415,4.125915,4.080338,4.150274,4.211319
test_accuracy,0.800974,0.851688,0.884481,0.873636,0.880844,0.877403,0.877403,0.891688,0.898831
train_accuracy,0.837548,0.956671,0.935017,0.929603,0.944042,0.94405,0.94405,0.946749,0.944935


features selected: ['total_tokens', 'avg_sent_length', 'num_connectives', 'type_token_ratio', 'avg_degree_of_abstraction', 'CONJ', 'PROPN', 'SCONJ', 'VERB']


Unnamed: 0,SVM baseline,SVM + 1,SVM + 2,SVM + 3,SVM + 4,SVM + 5,SVM + 6,SVM + 7,SVM + 8,SVM + 9
fit_time,17.271622,16.656463,17.706907,16.708823,17.152427,16.648613,16.649096,16.711594,16.561997,16.581702
score_time,4.277825,4.133008,4.207182,4.152337,4.168415,4.125915,4.080338,4.150274,4.211319,4.157915
test_accuracy,0.800974,0.851688,0.884481,0.873636,0.880844,0.877403,0.877403,0.891688,0.898831,0.90961
train_accuracy,0.837548,0.956671,0.935017,0.929603,0.944042,0.94405,0.94405,0.946749,0.944935,0.945844


features selected: ['total_tokens', 'avg_sent_length', 'num_connectives', 'type_token_ratio', 'avg_degree_of_abstraction', 'CONJ', 'PROPN', 'SCONJ', 'VERB', 'X']


Unnamed: 0,SVM baseline,SVM + 1,SVM + 2,SVM + 3,SVM + 4,SVM + 5,SVM + 6,SVM + 7,SVM + 8,SVM + 9,SVM + 10
fit_time,17.271622,16.656463,17.706907,16.708823,17.152427,16.648613,16.649096,16.711594,16.561997,16.581702,16.728968
score_time,4.277825,4.133008,4.207182,4.152337,4.168415,4.125915,4.080338,4.150274,4.211319,4.157915,4.116423
test_accuracy,0.800974,0.851688,0.884481,0.873636,0.880844,0.877403,0.877403,0.891688,0.898831,0.90961,0.90961
train_accuracy,0.837548,0.956671,0.935017,0.929603,0.944042,0.94405,0.94405,0.946749,0.944935,0.945844,0.945844


features selected: ['total_tokens', 'avg_sent_length', 'num_connectives', 'type_token_ratio', 'avg_degree_of_abstraction', 'CONJ', 'PROPN', 'SCONJ', 'VERB', 'X', 'EOL']


Unnamed: 0,SVM baseline,SVM + 1,SVM + 2,SVM + 3,SVM + 4,SVM + 5,SVM + 6,SVM + 7,SVM + 8,SVM + 9,SVM + 10,SVM + 11
fit_time,17.271622,16.656463,17.706907,16.708823,17.152427,16.648613,16.649096,16.711594,16.561997,16.581702,16.728968,16.886045
score_time,4.277825,4.133008,4.207182,4.152337,4.168415,4.125915,4.080338,4.150274,4.211319,4.157915,4.116423,4.113965
test_accuracy,0.800974,0.851688,0.884481,0.873636,0.880844,0.877403,0.877403,0.891688,0.898831,0.90961,0.90961,0.90961
train_accuracy,0.837548,0.956671,0.935017,0.929603,0.944042,0.94405,0.94405,0.946749,0.944935,0.945844,0.945844,0.945844


features selected: ['total_tokens', 'avg_sent_length', 'num_connectives', 'type_token_ratio', 'avg_degree_of_abstraction', 'CONJ', 'PROPN', 'SCONJ', 'VERB', 'X', 'EOL', 'SPACE']


Unnamed: 0,SVM baseline,SVM + 1,SVM + 2,SVM + 3,SVM + 4,SVM + 5,SVM + 6,SVM + 7,SVM + 8,SVM + 9,SVM + 10,SVM + 11,SVM + 12
fit_time,17.271622,16.656463,17.706907,16.708823,17.152427,16.648613,16.649096,16.711594,16.561997,16.581702,16.728968,16.886045,16.619428
score_time,4.277825,4.133008,4.207182,4.152337,4.168415,4.125915,4.080338,4.150274,4.211319,4.157915,4.116423,4.113965,4.147485
test_accuracy,0.800974,0.851688,0.884481,0.873636,0.880844,0.877403,0.877403,0.891688,0.898831,0.90961,0.90961,0.90961,0.909675
train_accuracy,0.837548,0.956671,0.935017,0.929603,0.944042,0.94405,0.94405,0.946749,0.944935,0.945844,0.945844,0.945844,0.950361


features selected: ['total_tokens', 'avg_sent_length', 'num_connectives', 'logical_operator_density', 'type_token_ratio', 'avg_degree_of_abstraction', 'CONJ', 'PROPN', 'SCONJ', 'VERB', 'X', 'EOL', 'SPACE']


Unnamed: 0,SVM baseline,SVM + 1,SVM + 2,SVM + 3,SVM + 4,SVM + 5,SVM + 6,SVM + 7,SVM + 8,SVM + 9,SVM + 10,SVM + 11,SVM + 12,SVM + 13
fit_time,17.271622,16.656463,17.706907,16.708823,17.152427,16.648613,16.649096,16.711594,16.561997,16.581702,16.728968,16.886045,16.619428,16.657985
score_time,4.277825,4.133008,4.207182,4.152337,4.168415,4.125915,4.080338,4.150274,4.211319,4.157915,4.116423,4.113965,4.147485,4.135383
test_accuracy,0.800974,0.851688,0.884481,0.873636,0.880844,0.877403,0.877403,0.891688,0.898831,0.90961,0.90961,0.90961,0.909675,0.902403
train_accuracy,0.837548,0.956671,0.935017,0.929603,0.944042,0.94405,0.94405,0.946749,0.944935,0.945844,0.945844,0.945844,0.950361,0.953059


features selected: ['total_tokens', 'total_tokens_w/o_stopwords', 'avg_sent_length', 'num_connectives', 'logical_operator_density', 'type_token_ratio', 'avg_degree_of_abstraction', 'CONJ', 'PROPN', 'SCONJ', 'VERB', 'X', 'EOL', 'SPACE']


Unnamed: 0,SVM baseline,SVM + 1,SVM + 2,SVM + 3,SVM + 4,SVM + 5,SVM + 6,SVM + 7,SVM + 8,SVM + 9,SVM + 10,SVM + 11,SVM + 12,SVM + 13,SVM + 14
fit_time,17.271622,16.656463,17.706907,16.708823,17.152427,16.648613,16.649096,16.711594,16.561997,16.581702,16.728968,16.886045,16.619428,16.657985,16.709198
score_time,4.277825,4.133008,4.207182,4.152337,4.168415,4.125915,4.080338,4.150274,4.211319,4.157915,4.116423,4.113965,4.147485,4.135383,4.141061
test_accuracy,0.800974,0.851688,0.884481,0.873636,0.880844,0.877403,0.877403,0.891688,0.898831,0.90961,0.90961,0.90961,0.909675,0.902403,0.906039
train_accuracy,0.837548,0.956671,0.935017,0.929603,0.944042,0.94405,0.94405,0.946749,0.944935,0.945844,0.945844,0.945844,0.950361,0.953059,0.951254


features selected: ['total_tokens', 'total_tokens_w/o_stopwords', 'avg_sent_length', 'num_connectives', 'logical_operator_density', 'type_token_ratio', 'avg_degree_of_abstraction', 'CONJ', 'PROPN', 'SCONJ', 'SYM', 'VERB', 'X', 'EOL', 'SPACE']


Unnamed: 0,SVM baseline,SVM + 1,SVM + 2,SVM + 3,SVM + 4,SVM + 5,SVM + 6,SVM + 7,SVM + 8,SVM + 9,SVM + 10,SVM + 11,SVM + 12,SVM + 13,SVM + 14,SVM + 15
fit_time,17.271622,16.656463,17.706907,16.708823,17.152427,16.648613,16.649096,16.711594,16.561997,16.581702,16.728968,16.886045,16.619428,16.657985,16.709198,16.698334
score_time,4.277825,4.133008,4.207182,4.152337,4.168415,4.125915,4.080338,4.150274,4.211319,4.157915,4.116423,4.113965,4.147485,4.135383,4.141061,4.152087
test_accuracy,0.800974,0.851688,0.884481,0.873636,0.880844,0.877403,0.877403,0.891688,0.898831,0.90961,0.90961,0.90961,0.909675,0.902403,0.906039,0.891558
train_accuracy,0.837548,0.956671,0.935017,0.929603,0.944042,0.94405,0.94405,0.946749,0.944935,0.945844,0.945844,0.945844,0.950361,0.953059,0.951254,0.946741


features selected: ['total_tokens', 'total_tokens_w/o_stopwords', 'avg_sent_length', 'num_connectives', 'logical_operator_density', 'type_token_ratio', 'avg_degree_of_abstraction', 'CONJ', 'PART', 'PROPN', 'SCONJ', 'SYM', 'VERB', 'X', 'EOL', 'SPACE']


Unnamed: 0,SVM baseline,SVM + 1,SVM + 2,SVM + 3,SVM + 4,SVM + 5,SVM + 6,SVM + 7,SVM + 8,SVM + 9,SVM + 10,SVM + 11,SVM + 12,SVM + 13,SVM + 14,SVM + 15,SVM + 16
fit_time,17.271622,16.656463,17.706907,16.708823,17.152427,16.648613,16.649096,16.711594,16.561997,16.581702,16.728968,16.886045,16.619428,16.657985,16.709198,16.698334,16.976476
score_time,4.277825,4.133008,4.207182,4.152337,4.168415,4.125915,4.080338,4.150274,4.211319,4.157915,4.116423,4.113965,4.147485,4.135383,4.141061,4.152087,4.187807
test_accuracy,0.800974,0.851688,0.884481,0.873636,0.880844,0.877403,0.877403,0.891688,0.898831,0.90961,0.90961,0.90961,0.909675,0.902403,0.906039,0.891558,0.89539
train_accuracy,0.837548,0.956671,0.935017,0.929603,0.944042,0.94405,0.94405,0.946749,0.944935,0.945844,0.945844,0.945844,0.950361,0.953059,0.951254,0.946741,0.947646


features selected: ['total_tokens', 'total_tokens_w/o_stopwords', 'avg_sent_length', 'num_connectives', 'logical_operator_density', 'type_token_ratio', 'avg_degree_of_abstraction', 'min_degree_of_abstraction', 'CONJ', 'PART', 'PROPN', 'SCONJ', 'SYM', 'VERB', 'X', 'EOL', 'SPACE']


KeyboardInterrupt: 

In [19]:
# sort by test_accuracy and display
display(pd.DataFrame(results_df).sort_values('test_accuracy', 1, ascending=False))

# pick SVM + 9 due to less features and less overfitting

Unnamed: 0,SVM + 12,SVM + 9,SVM + 11,SVM + 10,SVM + 14,SVM + 13,SVM + 8,SVM + 16,SVM + 7,SVM + 15,SVM + 2,SVM + 4,SVM + 6,SVM + 5,SVM + 3,SVM + 1,SVM baseline
fit_time,16.619428,16.581702,16.886045,16.728968,16.709198,16.657985,16.561997,16.976476,16.711594,16.698334,17.706907,17.152427,16.649096,16.648613,16.708823,16.656463,17.271622
score_time,4.147485,4.157915,4.113965,4.116423,4.141061,4.135383,4.211319,4.187807,4.150274,4.152087,4.207182,4.168415,4.080338,4.125915,4.152337,4.133008,4.277825
test_accuracy,0.909675,0.90961,0.90961,0.90961,0.906039,0.902403,0.898831,0.89539,0.891688,0.891558,0.884481,0.880844,0.877403,0.877403,0.873636,0.851688,0.800974
train_accuracy,0.950361,0.945844,0.945844,0.945844,0.951254,0.953059,0.944935,0.947646,0.946749,0.946741,0.935017,0.944042,0.94405,0.94405,0.929603,0.956671,0.837548


## Experiment 2 - Train with selected features
This part is the train the model with features selected through feature selection

In [40]:
top_9_features = ['total_tokens', 'avg_sent_length', 'num_connectives', 
                   'type_token_ratio', 'avg_degree_of_abstraction', 'CONJ', 'PROPN', 'SCONJ', 'VERB']
text_feature = 'preprocessed_text'

In [41]:
preprocessor = make_column_transformer(
        (StandardScaler(), top_9_features),
        (TfidfVectorizer(max_features=30_000, ngram_range=(1,2), tokenizer=tokenizer), text_feature)
    )
pipeline = make_pipeline(
        preprocessor, SVC(random_state=123)
)
X_train_final = X_train[top_9_features+['preprocessed_text']]
X_val_final = X_val[top_9_features+['preprocessed_text']]

In [42]:
pipeline.fit(X_train_final, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('standardscaler',
                                                  StandardScaler(),
                                                  ['total_tokens',
                                                   'avg_sent_length',
                                                   'num_connectives',
                                                   'type_token_ratio',
                                                   'avg_degree_of_abstraction',
                                                   'CONJ', 'PROPN', 'SCONJ',
                                                   'VERB']),
                                                 ('tfidfvectorizer',
                                                  TfidfVectorizer(max_features=30000,
                                                                  ngram_range=(1,
                                                                               2),
          

## Classification analysis - Feature selection

In [25]:
pipeline.classes_

array(['A', 'B'], dtype='<U1')

In [26]:
y_pred = pipeline.predict(X_train_final)
print(classification_report(y_pred, y_train))

              precision    recall  f1-score   support

           A       0.94      0.94      0.94       140
           B       0.94      0.94      0.94       137

    accuracy                           0.94       277
   macro avg       0.94      0.94      0.94       277
weighted avg       0.94      0.94      0.94       277



In [27]:
y_pred_val = pipeline.predict(X_val_final)
print(classification_report(y_pred_val, y_val))X_train_most_feats

              precision    recall  f1-score   support

           A       0.75      0.86      0.80        14
           B       0.87      0.76      0.81        17

    accuracy                           0.81        31
   macro avg       0.81      0.81      0.81        31
weighted avg       0.81      0.81      0.81        31



## Hyperparameter tuning

In [28]:
# gamma and C value range taken from https://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf
param_grid = {
    "svc__gamma": 2.0 ** np.arange(-15, 3),
    "svc__C": 2.0 ** np.arange(-5, 15)
}

random_search = RandomizedSearchCV(pipeline, 
                                   scoring='accuracy', 
                                   param_distributions=param_grid, 
                                   n_jobs=-1, 
                                   n_iter=10, 
                                   cv=5,
                                   return_train_score=True,
                                   verbose=10) # default n_iter=10
random_search.fit(X_train_final, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('columntransformer',
                                              ColumnTransformer(transformers=[('standardscaler',
                                                                               StandardScaler(),
                                                                               ['total_tokens',
                                                                                'avg_sent_length',
                                                                                'num_connectives',
                                                                                'type_token_ratio',
                                                                                'avg_degree_of_abstraction',
                                                                                'CONJ',
                                                                                'PROPN',
                                  

In [29]:
pd.DataFrame(random_search.cv_results_)[
    [
        'mean_test_score',
        'mean_train_score',
        'param_svc__gamma',
        'param_svc__C',
        'mean_fit_time',
        'rank_test_score',
    ]
].set_index("rank_test_score").sort_index()

Unnamed: 0_level_0,mean_test_score,mean_train_score,param_svc__gamma,param_svc__C,mean_fit_time
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.866364,0.888985,0.007812,4.0,41.692707
2,0.855584,1.0,0.5,2048.0,43.885067
3,0.851688,0.865525,0.001953,8.0,43.819568
4,0.830065,0.834817,0.000244,16.0,40.5815
5,0.786948,0.813138,0.000122,16.0,33.63478
6,0.567143,0.559504,0.0625,0.03125,44.745799
7,0.527143,1.0,4.0,256.0,42.594443
7,0.527143,1.0,4.0,16.0,40.346616
9,0.505455,0.505418,0.000977,0.125,41.117939
9,0.505455,0.505418,1.0,0.0625,43.295684


In [31]:
print("Random Search best hyperparameters: %s" % (random_search.best_params_))
print("Random Search best model score: %0.3f" % (random_search.best_score_))
print(
    "Train score on the full train set: %0.3f" % (random_search.score(X_train_final, y_train))
)

Random Search best hyperparameters: {'svc__gamma': 0.0078125, 'svc__C': 4.0}
Random Search best model score: 0.866
Train score on the full train set: 0.895


## Experiment 3 - without ambiguity & abstraction
This part is just to make sure that the previous best model does not out-perform the model obtained through feature selection

In [33]:
X_train_most_feats = X_train.drop(['avg_degree_of_abstraction', 'min_degree_of_abstraction', 
                                   'avg_ambiguation_all_words', 'avg_ambiguation_content_words'], axis=1)
X_val_most_feats = X_val.drop(['avg_degree_of_abstraction', 'min_degree_of_abstraction', 
                                   'avg_ambiguation_all_words', 'avg_ambiguation_content_words'], axis=1)

most_feats = X_train_most_feats.drop(['preprocessed_text'], axis=1).columns
text_feature = 'preprocessed_text'

preprocessor = make_column_transformer(
        (StandardScaler(), most_feats),
        (TfidfVectorizer(max_features=30_000, ngram_range=(1,2), tokenizer=tokenizer), text_feature)
    )
pipeline = make_pipeline(
        preprocessor, SVC(random_state=123)
)

In [34]:
pipeline.fit(X_train_most_feats, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('standardscaler',
                                                  StandardScaler(),
                                                  Index(['total_tokens', 'total_tokens_w/o_stopwords', 'avg_sent_length',
       'proportion_of_A_level_tokens', 'proportion_of_A_level_types',
       'num_connectives', 'logical_operator_density', 'pronoun_density',
       'type_token_ratio', 'avg_rank_of_lemmas_in_freq_list',
       'fernandez_huerta_score', 'syllables_per_sentence', 'ADJ', 'ADP', 'ADV',
       'AUX', 'CONJ', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON',
       'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X', 'EOL', 'SPACE',
       'CONTENT', 'FUNCTION'],
      dtype='object')),
                                                 ('tfidfvectorizer',
                                                  TfidfVectorizer(max_features=30000,
                                                              

In [35]:
pipeline.classes_

array(['A', 'B'], dtype='<U1')

In [36]:
y_pred = pipeline.predict(X_train_most_feats)
print(classification_report(y_pred, y_train))

              precision    recall  f1-score   support

           A       0.96      0.93      0.94       144
           B       0.93      0.95      0.94       133

    accuracy                           0.94       277
   macro avg       0.94      0.94      0.94       277
weighted avg       0.94      0.94      0.94       277



In [37]:
y_pred_val = pipeline.predict(X_val_most_feats)
print(classification_report(y_pred_val, y_val))

              precision    recall  f1-score   support

           A       0.81      0.87      0.84        15
           B       0.87      0.81      0.84        16

    accuracy                           0.84        31
   macro avg       0.84      0.84      0.84        31
weighted avg       0.84      0.84      0.84        31



## Fine-grained classification
Using the best model obtained from the above 3 experiments to perform a fine-grained classification

In [39]:
results_df_fine = {}
preprocessor = make_column_transformer(
        (StandardScaler(), top_10_features),
        (TfidfVectorizer(max_features=30_000, ngram_range=(1,2), tokenizer=tokenizer), text_feature)
    )

cv_and_display(preprocessor, SVC(random_state=123), 'SVM_fine + best feats', X_train_final, results_df_fine, True)

{'SVM_fine + best feats': fit_time          16.392156
 score_time         4.067747
 test_accuracy      0.664221
 train_accuracy     0.787905
 dtype: float64}

## Save best model

In [45]:
best_model = pipeline[1]
pickle.dump(best_model, open('../models/svm_best_0603', 'wb'))