## Set up

In [45]:
# !python -m spacy download es_core_news_md

In [24]:
# import
from datasets import load_dataset
import pandas as pd
import numpy as np
import scipy
from collections import defaultdict, Counter
import os
import json

from sklearn.model_selection import train_test_split, cross_validate, RandomizedSearchCV
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score, recall_score, precision_score, classification_report


import spacy
from spacy.lang.es.stop_words import STOP_WORDS

In [3]:
nlp = spacy.load("es_core_news_md")

## Prepare data

In [4]:
# read train and val data
with open(os.path.abspath('../data/X_train.txt'), 'r') as f:
    X_train = f.read().strip().split('#'*20)
with open(os.path.abspath('../data/X_val.txt'), 'r') as f:
    X_val = f.read().strip().split('#'*20)
with open(os.path.abspath('../data/y_train.txt'), 'r') as f:
    y_train = f.read().strip().split('\n')
with open(os.path.abspath('../data/y_val.txt'), 'r') as f:
    y_val = f.read().strip().split('\n')

In [14]:
# read train and val data
with open(os.path.abspath('../data/train_features.json'), 'r') as f:
    train_feat = json.load(f)
with open(os.path.abspath('../data/val_features.json'), 'r') as f:
    val_feat = json.load(f)

In [46]:
X_train_feat = train_feat_df.drop(['level'], axis=1)
y_train_feat = [level[0] for level in train_feat_df['level'].tolist()]
X_val_feat = val_feat_df.drop(['level'], axis=1)
y_val_feat = [level[0] for level in val_feat_df['level'].tolist()]
y_train_fine = train_feat_df['level'].tolist()
y_val_fine = val_feat_df['level'].tolist()

In [20]:
print(X_train[0])

33. EL CANAL DE SUEZ
El proyecto del canal moderno a través del
istmo de Suez, para facilitar el paso de los
buques desde el Mediterráneo al Mar Rojo,
nació de Napoleón el Grande durante su invasión
de Egipto. Pero muchísimos siglos antes
de él, esto es, 1,300 años antes de la Era cristiana,
se construyó un canal desde un ramal
del Nilo hasta el Mar Rojo. Ese canal fué
obstruido varias veces por la arena y en el año
767 de nuestra Era fué destruido por el califa
Almanzor.
En 1854, un ingeniero francés, Fernando de
Lesseps, obtuvo del virrey de Egipto, Said-Bajá,
una concesión a favor de una Compañía
por espacio de noventa y nueve años
para construir un canal navegable a través del
istmo. Organizóse la Compañía en 1858 con un
capital en acciones de 200,000,000 de francos,
que en 1867 fué necesario ampliar con otros
100 millones. Las obras duraron once años.
El canal tiene de un extremo a otro 162
kilómetros de largo; pero una cuarta parte de
esa longitud consiste en lagos naturales. La


## Model set up

In [5]:
scoring = ['accuracy']

# results dictionary
results_df = {}

# tokenizer
def tokenizer(text):
    return [tok.text for tok in nlp(text)]

In [49]:
def cv_and_display(preprocessor, model, name, train_set, fine_grained=False):
    '''
    train model and display cross validation results
    
    preprocessor: (sklearn ColumnTransformer) sklearn object for feature transformation
    model: (sklearn Classifier) initialized sklearn classifier
    name: (str) a name that is shown when the result is displayed
    train_set: (DataFrame) the input train set encoding features
    '''
    pipeline = make_pipeline(
        preprocessor, model
    )
    if fine_grained == False:
        scores = cross_validate(pipeline, train_set, y_train, scoring = scoring, return_train_score=True)
    else:
        scores = cross_validate(pipeline, train_set, y_train_fine, scoring = scoring, return_train_score=True)
    
    results_df[name] = pd.DataFrame(scores).mean()
    display(pd.DataFrame(results_df))

## Baseline

In [7]:
cv_and_display(CountVectorizer(max_features=30_000, ngram_range=(1,2), tokenizer=tokenizer), SVC(random_state=123), 'SVM baseline', X_train)

Unnamed: 0,SVM baseline
fit_time,22.103144
score_time,5.216887
test_accuracy,0.804805
train_accuracy,0.827651


## Classification analysis - Baseline

In [9]:
baseline_pipeline = Pipeline(
    steps=[
        ("bow", CountVectorizer(max_features=30_000, ngram_range=(1,2), tokenizer=tokenizer)), 
        ("model", SVC(random_state=123))
    ]
)

In [10]:
baseline_pipeline.fit(X_train, y_train)

Pipeline(steps=[('bow',
                 CountVectorizer(max_features=30000, ngram_range=(1, 2),
                                 tokenizer=<function tokenizer at 0x147a44e50>)),
                ('model', SVC(random_state=123))])

In [11]:
baseline_pipeline.classes_

array(['A', 'B'], dtype='<U1')

In [12]:
y_pred = baseline_pipeline.predict(X_train)
print(classification_report(y_pred, y_train))

## the model seems to be predicting some B level texts as A level
## precision: 71% of texts that are predicted as A level are actually A level
## recall: 99% of texts that are actually A level are predicted as A level

              precision    recall  f1-score   support

           A       0.71      0.99      0.83       101
           B       0.99      0.77      0.87       176

    accuracy                           0.85       277
   macro avg       0.85      0.88      0.85       277
weighted avg       0.89      0.85      0.85       277



In [13]:
y_val_pred = baseline_pipeline.predict(X_val)
print(classification_report(y_val_pred, y_val))

              precision    recall  f1-score   support

           A       0.75      0.92      0.83        13
           B       0.93      0.78      0.85        18

    accuracy                           0.84        31
   macro avg       0.84      0.85      0.84        31
weighted avg       0.86      0.84      0.84        31



## With features

In [17]:
train_feat_df = pd.DataFrame(train_feat)
val_feat_df = pd.DataFrame(val_feat)

train_feat_df.head()

Unnamed: 0,preprocessed_text,total_tokens,total_tokens_w/o_stopwords,avg_sent_length,proportion_of_A_level_tokens,proportion_of_A_level_types,num_connectives,logical_operator_density,pronoun_density,type_token_ratio,...,PUNCT,SCONJ,SYM,VERB,X,EOL,SPACE,CONTENT,FUNCTION,level
0,el canal de suez el proyecto del canal moderno...,293,124,22.538462,0.346774,0.23913,5,0.013841,0.02807,0.474403,...,0.112628,0.006826,0.0,0.054608,0.0,0.0,0.0,0.688462,0.311538,A2
1,el tonto vivían en cierto pueblo un labriego y...,1803,694,11.967105,0.35879,0.25,25,0.044007,0.070665,0.313367,...,0.169163,0.042152,0.001109,0.144759,0.0,0.0,0.0,0.600936,0.399064,A2
2,una lección de español el maestro:—¿qué lecció...,299,132,7.475,0.484848,0.465909,8,0.031034,0.038194,0.454849,...,0.210702,0.010033,0.0,0.120401,0.0,0.0,0.0,0.673729,0.326271,A1
3,capítulo i que trata de la condición y ejercic...,2153,869,61.514286,0.336018,0.178637,17,0.064261,0.070612,0.340455,...,0.130051,0.061774,0.0,0.092429,0.0,0.0,0.0,0.604378,0.395622,A2
4,capítulo —¿qué hora es? —pregunta guillermo mi...,1621,654,9.210227,0.477064,0.430556,16,0.047158,0.043786,0.26897,...,0.25293,0.016039,0.003701,0.107341,0.0,0.0,0.0,0.650622,0.349378,A1


In [31]:
passthrough_features = []
categorical_features = []
numeric_features = ['total_tokens', 'total_tokens_w/o_stopwords', 'avg_sent_length', 'proportion_of_A_level_tokens', 
                    'proportion_of_A_level_types', 'num_connectives', 'logical_operator_density', 'pronoun_density', 
                    'type_token_ratio', 'avg_rank_of_lemmas_in_freq_list', 'fernandez_huerta_score', 'syllables_per_sentence',
                    'ADJ', 'ADP', 'ADV', 'AUX', 'CONJ', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT',
                    'SCONJ', 'SYM', 'VERB', 'X', 'EOL', 'SPACE', 'CONTENT', 'FUNCTION']
text_feature = 'preprocessed_text'

preprocessor = make_column_transformer(
    (StandardScaler(), numeric_features),
    (TfidfVectorizer(max_features=30_000, ngram_range=(1,2), tokenizer=tokenizer), text_feature),
)

all_feat_model = SVC(random_state=123)

cv_and_display(preprocessor, all_feat_model, 'SVM + all feats', X_train_feat)

Unnamed: 0,SVM baseline,SVM + all feats
fit_time,22.103144,17.394472
score_time,5.216887,4.266932
test_accuracy,0.804805,0.834091
train_accuracy,0.827651,0.925087


## Classification analysis - Full model

In [32]:
all_feat_pipeline = make_pipeline(
        preprocessor, all_feat_model
)

In [33]:
all_feat_pipeline.fit(X_train_feat, y_train_feat)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('standardscaler',
                                                  StandardScaler(),
                                                  ['total_tokens',
                                                   'total_tokens_w/o_stopwords',
                                                   'avg_sent_length',
                                                   'proportion_of_A_level_tokens',
                                                   'proportion_of_A_level_types',
                                                   'num_connectives',
                                                   'logical_operator_density',
                                                   'pronoun_density',
                                                   'type_token_ratio',
                                                   'avg_rank_of_lemmas_in_freq_list',
                                                   'fernandez_huert

In [34]:
all_feat_pipeline.classes_

array(['A', 'B'], dtype='<U1')

In [35]:
y_pred_feat = all_feat_pipeline.predict(X_train_feat)
print(classification_report(y_pred_feat, y_train_feat))

              precision    recall  f1-score   support

           A       0.92      0.91      0.92       141
           B       0.91      0.92      0.92       136

    accuracy                           0.92       277
   macro avg       0.92      0.92      0.92       277
weighted avg       0.92      0.92      0.92       277



In [36]:
y_val_pred_feat = all_feat_pipeline.predict(X_val_feat)
print(classification_report(y_val_pred_feat, y_val_feat))

              precision    recall  f1-score   support

           A       0.94      0.88      0.91        17
           B       0.87      0.93      0.90        14

    accuracy                           0.90        31
   macro avg       0.90      0.91      0.90        31
weighted avg       0.91      0.90      0.90        31



## Hyperparameter tuning

In [42]:
# gamma and C value range taken from https://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf
param_grid = {
    "svc__gamma": 2.0 ** np.arange(-15, 3),
    "svc__C": 2.0 ** np.arange(-5, 15)
}

preprocessor = make_column_transformer(
    (StandardScaler(), numeric_features),
    (TfidfVectorizer(max_features=30_000, ngram_range=(1,2), tokenizer=tokenizer), text_feature),
)

best_model = make_pipeline(
        preprocessor, SVC(random_state=123)
)

random_search = RandomizedSearchCV(best_model, 
                                   scoring='accuracy', 
                                   param_distributions=param_grid, 
                                   n_jobs=-1, 
                                   n_iter=50, 
                                   cv=5,
                                   return_train_score=True,
                                   verbose=10) # default n_iter=10
random_search.fit(X_train_feat, y_train_feat)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  7.0min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed: 10.1min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed: 12.8min
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed: 15.4min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed: 18.1min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed: 20.0min
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed: 23.7min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 26.5min
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed: 29.6min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 32

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('columntransformer',
                                              ColumnTransformer(transformers=[('standardscaler',
                                                                               StandardScaler(),
                                                                               ['total_tokens',
                                                                                'total_tokens_w/o_stopwords',
                                                                                'avg_sent_length',
                                                                                'proportion_of_A_level_tokens',
                                                                                'proportion_of_A_level_types',
                                                                                'num_connectives',
                                                                                'logica

In [43]:
pd.DataFrame(random_search.cv_results_)[
    [
        'mean_test_score',
        'mean_train_score',
        'param_svc__gamma',
        'param_svc__C',
        'mean_fit_time',
        'rank_test_score',
    ]
].set_index("rank_test_score").sort_index()

Unnamed: 0_level_0,mean_test_score,mean_train_score,param_svc__gamma,param_svc__C,mean_fit_time
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.87026,1.0,0.000244,8192.0,37.460074
2,0.87,1.0,0.03125,256.0,36.905247
3,0.866688,1.0,0.000244,4096.0,37.130772
4,0.862987,1.0,0.000244,16384.0,36.935218
5,0.862922,0.990074,3.1e-05,16384.0,37.860035
5,0.862922,1.0,0.015625,4096.0,37.093434
5,0.862922,1.0,0.015625,64.0,37.05843
8,0.855909,1.0,0.000488,2048.0,37.039209
9,0.848636,1.0,0.000977,16384.0,36.982854
9,0.848636,1.0,0.000977,2048.0,37.252066


In [44]:
print("Random Search best hyperparameters: %s" % (random_search.best_params_))
print("Random Search best model score: %0.3f" % (random_search.best_score_))
print(
    "Train score on the full train set: %0.3f" % (random_search.score(X_train_feat, y_train_feat))
)

Random Search best hyperparameters: {'svc__gamma': 0.000244140625, 'svc__C': 8192.0}
Random Search best model score: 0.870
Train score on the full train set: 1.000


## Fine-grained

In [50]:
cv_and_display(preprocessor, all_feat_model, 'SVM_fine + all feats', X_train_feat, True)

Unnamed: 0,SVM baseline,SVM + all feats,SVM_fine + all feats
fit_time,22.103144,17.394472,17.68778
score_time,5.216887,4.266932,4.212239
test_accuracy,0.804805,0.834091,0.581234
train_accuracy,0.827651,0.925087,0.812311


In [51]:
# gamma and C value range taken from https://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf
param_grid = {
    "svc__gamma": 2.0 ** np.arange(-15, 3),
    "svc__C": 2.0 ** np.arange(-5, 15)
}

preprocessor = make_column_transformer(
    (StandardScaler(), numeric_features),
    (TfidfVectorizer(max_features=30_000, ngram_range=(1,2), tokenizer=tokenizer), text_feature),
)

best_model = make_pipeline(
        preprocessor, SVC(random_state=123)
)

random_search = RandomizedSearchCV(best_model, 
                                   scoring='accuracy', 
                                   param_distributions=param_grid, 
                                   n_jobs=-1, 
                                   n_iter=20, 
                                   cv=5,
                                   return_train_score=True,
                                   verbose=10) # default n_iter=10
random_search.fit(X_train_feat, y_train_fine)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


KeyboardInterrupt: 

In [None]:
pd.DataFrame(random_search.cv_results_)[
    [
        'mean_test_score',
        'mean_train_score',
        'param_svc__gamma',
        'param_svc__C',
        'mean_fit_time',
        'rank_test_score',
    ]
].set_index("rank_test_score").sort_index()

In [None]:
print("Random Search best hyperparameters: %s" % (random_search.best_params_))
print("Random Search best model score: %0.3f" % (random_search.best_score_))
print(
    "Train score on the full train set: %0.3f" % (random_search.score(X_train_feat, y_train_feat))
)