## Set up

In [1]:
# !python -m spacy download es_core_news_md

/opt/miniconda3/bin/python: No module named spacy


In [42]:
# import
from datasets import load_dataset
import pandas as pd
import numpy as np
import scipy
from collections import defaultdict, Counter
import os

from sklearn.model_selection import train_test_split, cross_validate, RandomizedSearchCV
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score, recall_score, precision_score, classification_report


import spacy
from spacy.lang.es.stop_words import STOP_WORDS

In [26]:
nlp = spacy.load("es_core_news_md")

## Prepare data

In [25]:
# read train and val data
with open(os.path.abspath('../data/X_train.txt'), 'r') as f:
    X_train = f.read().strip().split('#'*20)
with open(os.path.abspath('../data/X_val.txt'), 'r') as f:
    X_val = f.read().strip().split('#'*20)
with open(os.path.abspath('../data/y_train.txt'), 'r') as f:
    y_train = f.read().strip().split('\n')
with open(os.path.abspath('../data/y_val.txt'), 'r') as f:
    y_val = f.read().strip().split('\n')

## Model set up

In [32]:
scoring = ['accuracy']

# results dictionary
results_df = {}

# tokenizer
def tokenizer(text):
    return [tok.text for tok in nlp(text)]

In [33]:
def cv_and_display(pipeline, name, train_set):
    '''
    train model and display cross validation results
    
    pipeline: (Pipeline) sklearn pipeline object
    name: (str) a name that is shown when the result is displayed
    train_set: (DataFrame) the input train set encoding features
    '''
    scores = cross_validate(pipeline, train_set, y_train, scoring = scoring, return_train_score=True)
    
    results_df[name] = pd.DataFrame(scores).mean()
    display(pd.DataFrame(results_df))

## Baseline

In [34]:
baseline_pipeline = Pipeline(
    steps=[
        ("vectorizer", CountVectorizer(max_features=30_000, ngram_range=(1,2), tokenizer=tokenizer)),
        ("model", SVC(random_state=123))
    ]
)

In [38]:
cv_and_display(baseline_pipeline, 'SVM baseline', X_train)

Unnamed: 0,SVM baseline
fit_time,26.199509
score_time,6.226296
test_accuracy,0.804805
train_accuracy,0.827651


## Classification analysis

In [41]:
baseline_pipeline.fit(X_train, y_train)

Pipeline(steps=[('vectorizer',
                 CountVectorizer(max_features=30000, ngram_range=(1, 2),
                                 tokenizer=<function tokenizer at 0x141a43160>)),
                ('model', SVC(random_state=123))])

In [43]:
baseline_pipeline.classes_

array(['A', 'B'], dtype='<U1')

In [46]:
y_pred = baseline_pipeline.predict(X_train)
print(classification_report(y_pred, y_train))

## the model seems to be predicting some B level texts as A level
## precision: 71% of texts that are predicted as A level are actually A level
## recall: 99% of texts that are actually A level are predicted as A level

              precision    recall  f1-score   support

           A       0.71      0.99      0.83       101
           B       0.99      0.77      0.87       176

    accuracy                           0.85       277
   macro avg       0.85      0.88      0.85       277
weighted avg       0.89      0.85      0.85       277



In [47]:
y_val_pred = baseline_pipeline.predict(X_val)
print(classification_report(y_val_pred, y_val))

              precision    recall  f1-score   support

           A       0.75      0.92      0.83        13
           B       0.93      0.78      0.85        18

    accuracy                           0.84        31
   macro avg       0.84      0.85      0.84        31
weighted avg       0.86      0.84      0.84        31

