In [1]:
# boilerplate
import os
import sys
from pathlib import Path

# Put the parent of the working directory on the import path.
# NOTE(review): presumably this is what makes the local `preprocessing`
# module importable — confirm against the repository layout.
path = Path(os.getcwd())
parent_dir = str(path.parent)
# Re-running this cell must not keep growing sys.path with duplicates.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [2]:
import numpy as np

from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.feature_extraction import DictVectorizer

from preprocessing import SentencesGenerate

### Carga de los datos

In [30]:
# Minimum sentence length passed to the SentencesGenerate loader.
MIN_LENGTH = 4

# Dataset splits, given relative to the notebook's working directory.
PATH_TRAIN = '../../dataset/train.txt'
PATH_DEV = '../../dataset/dev.txt'
PATH_TEST = '../../dataset/test.txt'

In [31]:
# Load each dataset split from its file. `train` and `test` are consumed
# further down through their `.X` / `.y` attributes; `dev` is loaded but not
# used in the cells visible here.
# NOTE(review): `min_length` presumably filters out sentences shorter than
# MIN_LENGTH tokens — confirm in preprocessing.SentencesGenerate.
dev = SentencesGenerate(path=PATH_DEV, min_length=MIN_LENGTH)
test = SentencesGenerate(path=PATH_TEST, min_length=MIN_LENGTH)
train = SentencesGenerate(path=PATH_TRAIN, min_length=MIN_LENGTH)

### Preparación de los datos

Creación de la ventana deslizante: utilizamos una ventana de tamaño *4* siendo la palabra central la tercera posición de la ventana, mientras que la primera y segunda posición indican palabras previas y la cuarta posición una palabra posterior. Cada palabra dentro de la ventana es caracterizada de la siguiente manera:
   
* *Token* en minúscula
* Característica *booleana* indicando si se encuentra en mayúscula (únicamente la palabra central)
* Característica *booleana* indicando si es de tipo numérico (posiciones 2, 3 y 4)
* Característica *booleana* indicando si comienza con mayúscula (posiciones 2, 3 y 4)


In [32]:
def word2features(sent, i):
    """Build the sliding-window feature dict for the token at position ``i``.

    The window covers up to two tokens to the left and one to the right of
    the central token.

    Parameters
    ----------
    sent : sequence of str
        Tokens of one sentence.
    i : int
        Index of the central token inside ``sent``.

    Returns
    -------
    dict
        Always contains ``lower``, ``isupper``, ``istitle``, ``isdigit``
        (central token), ``pword`` and ``nword``. ``pwistitle``/``pwisdigit``
        are present only when a previous token exists, ``ppword`` only when
        two previous tokens exist, and ``nwistitle``/``nwisdigit`` only when
        a next token exists. At a sentence boundary the corresponding
        ``pword``/``nword`` is the empty string.
    """
    token = sent[i]

    # Central token: lower-cased form plus its three shape flags.
    features = dict(
        lower=token.lower(),
        isupper=token.isupper(),
        istitle=token.istitle(),
        isdigit=token.isdigit(),
    )

    # Left context.
    if i <= 0:
        # Sentence start: mark the missing neighbour with an empty token.
        features['pword'] = ''
    else:
        left = sent[i - 1]
        features['pword'] = left.lower()
        features['pwistitle'] = left.istitle()
        features['pwisdigit'] = left.isdigit()
        # Second token to the left contributes only its lower-cased form.
        if i > 1:
            features['ppword'] = sent[i - 2].lower()

    # Right context.
    if i >= len(sent) - 1:
        # Sentence end: mark the missing neighbour with an empty token.
        features['nword'] = ''
    else:
        right = sent[i + 1]
        features['nword'] = right.lower()
        features['nwistitle'] = right.istitle()
        features['nwisdigit'] = right.isdigit()

    return features

def sent2features(sent):
    """Return one ``word2features`` dict per token of ``sent``, in token order."""
    features = []
    for position in range(len(sent)):
        features.append(word2features(sent, position))
    return features

In [33]:
def transform(sentences, tags):
    """Flatten sentences and their tag sequences into token-level samples.

    Parameters
    ----------
    sentences : iterable of token sequences
        One entry per sentence.
    tags : iterable of tag sequences
        Paired positionally with ``sentences`` via ``zip``.

    Returns
    -------
    tuple of (list, list)
        Feature dicts for every token of every sentence, and the tags
        collected from ``tags`` in the same order.
    """
    features, labels = [], []
    for tokens, sent_tags in zip(sentences, tags):
        features.extend(sent2features(tokens))
        labels.extend(sent_tags)
    return features, labels

In [34]:
# Flatten the train and test splits into token-level feature dicts and tags.
X_train, y_train = transform(sentences=train.X, tags=train.y)
X_test, y_test = transform(sentences=test.X, tags=test.y)

### Definición del *pipeline*

In [35]:
# Fixed seed: LinearSVC shuffles the data when it uses the dual coordinate
# descent solver, so an unseeded estimator can give slightly different models
# from run to run. The seed has no effect when the primal solver is used.
SEED = 42

clf = LinearSVC(random_state=SEED)
# DictVectorizer turns the per-token feature dicts into a sparse matrix.
vectorizer = DictVectorizer()
pipeline = Pipeline([('vect', vectorizer), ('clf', clf)])

In [36]:
# Sanity check: feature dict and tag of the first token in the test split.
X_test[0], y_test[0]

({'lower': 'la',
  'isupper': False,
  'istitle': True,
  'isdigit': False,
  'pword': '',
  'nword': 'coruña',
  'nwistitle': True,
  'nwisdigit': False},
 'B-LOC')

### Entrenamiento

In [37]:
# Fit the vectorizer and the classifier on the flattened training tokens.
pipeline.fit(X_train, y_train)

Pipeline(steps=[('vect', DictVectorizer()), ('clf', LinearSVC())])

### Predicción

In [38]:
# Predict one tag per test token with the fitted pipeline.
y_predict = pipeline.predict(X_test)

### Métricas

In [39]:
# Per-tag precision / recall / F1 on the test split. classification_report
# returns a plain-text table, hence the print().
# The `O` tag accounts for most of the support in the recorded output, so
# accuracy and the weighted average are dominated by it; the macro average
# reflects the entity tags better.
print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

       B-LOC       0.79      0.74      0.76      1083
      B-MISC       0.63      0.49      0.55       339
       B-ORG       0.79      0.83      0.81      1397
       B-PER       0.83      0.88      0.85       735
       I-LOC       0.75      0.58      0.66       325
      I-MISC       0.68      0.40      0.50       557
       I-ORG       0.78      0.67      0.73      1104
       I-PER       0.86      0.92      0.89       634
           O       0.99      1.00      0.99     45127

    accuracy                           0.97     51301
   macro avg       0.79      0.72      0.75     51301
weighted avg       0.96      0.97      0.96     51301

