# POS-tagging with a classical sequence tagging model

Import necessary modules. 

In [1]:
!pip install -U 'scikit-learn<0.24'
!pip install -U spacy

from typing import List, Dict, Union
import pandas as pd
import numpy as np
import sklearn

import nltk
import spacy

! python3 -m spacy download en
nlp = spacy.load("en_core_web_sm")

Collecting scikit-learn<0.24
  Downloading scikit_learn-0.23.2-cp37-cp37m-manylinux1_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 5.1 MB/s 
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.0.2
    Uninstalling scikit-learn-1.0.2:
      Successfully uninstalled scikit-learn-1.0.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
yellowbrick 1.4 requires scikit-learn>=1.0.0, but you have scikit-learn 0.23.2 which is incompatible.
imbalanced-learn 0.8.1 requires scikit-learn>=0.24, but you have scikit-learn 0.23.2 which is incompatible.[0m
Successfully installed scikit-learn-0.23.2
Collecting spacy
  Downloading spacy-3.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.2 MB)
[K     |████████████████████████████████| 6.2 MB 5.2 MB/s 
Collecti

## Corpus visualization: NLTK Brown corpus

In [2]:
from nltk.corpus import brown
nltk.download('brown')
nltk.download('universal_tagset')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


True

In [3]:
brown: nltk.corpus.util.LazyCorpusLoader
print(brown.words())

sents = brown.tagged_sents(tagset='universal')
print(sents[10])

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]
[('It', 'PRON'), ('urged', 'VERB'), ('that', 'ADP'), ('the', 'DET'), ('city', 'NOUN'), ('``', '.'), ('take', 'VERB'), ('steps', 'NOUN'), ('to', 'PRT'), ('remedy', 'VERB'), ("''", '.'), ('this', 'DET'), ('problem', 'NOUN'), ('.', '.')]


## Feature Extraction

In [4]:
def word2features(sent: spacy.tokens.Doc, i: int) -> Dict[str, Union[str, bool]]:
    features: Dict[str, Union[str, bool]] = {}

    bos = sent[i].is_sent_start
    eos = sent[i].is_sent_end
    word = sent[i].text
    leng = len(sent)
    
    features = {
        "word.lower()": word.lower(),
        "word[-3:]": word[-3:].lower(),
        "word[-2:]": word[-3:].lower(),
        "word.isupper()": word.isupper(),
        "word.istitle()": word.istitle(),
        "word.isdigit()": word.isdigit()
    }
    
    if bos and leng>1:
        a_word = sent[i+1].text
        features.update({"+1:word.lower()": a_word.lower(),
                        "+1:word.istitle()": a_word.istitle(),
                        "+1:word.isupper()": a_word.isupper(),
                        "BOS": True})
        
    elif eos and leng>1:
        b_word = sent[i-1].text
        features.update({"-1:word.lower()": b_word.lower(),
                        "-1:word.istitle()": b_word.istitle(),
                        "-1:word.isupper()": b_word.isupper(),
                        "EOS": True})
        
    elif (eos or bos) and leng<=1:
        features.update({"BOS": True,
                        "EOS": True})
        
    else:
        b_word = sent[i-1].text
        a_word = sent[i+1].text
        features.update({"-1:word.lower()": b_word.lower(),
                        "-1:word.istitle()": b_word.istitle(),
                        "-1:word.istitle()": b_word.istitle(),
                        "+1:word.lower()": a_word.lower(),
                        "+1:word.istitle()": a_word.istitle(),
                        "+1:word.isupper()": a_word.isupper()})           
    
    return features

In [5]:
def sent2features(sent: spacy.tokens.Doc) -> List[Dict[str, Union[str, bool]]]:
    features_list: List[Dict[str, Union[str, bool]]] = []

    for i in range(0, len(sent)):
        features_list.append(word2features(sent, i))

    return features_list


In [6]:
from spacy.tokens import Doc
words = ['this', 'is', 'A', '1', 'sentence', '.']
spaces = [True, True, True, True, False, False]
doc = Doc(nlp.vocab, words=words, spaces=spaces)

sent2features(doc)

[{'+1:word.istitle()': False,
  '+1:word.isupper()': False,
  '+1:word.lower()': 'is',
  'BOS': True,
  'word.isdigit()': False,
  'word.istitle()': False,
  'word.isupper()': False,
  'word.lower()': 'this',
  'word[-2:]': 'his',
  'word[-3:]': 'his'},
 {'+1:word.istitle()': True,
  '+1:word.isupper()': True,
  '+1:word.lower()': 'a',
  '-1:word.istitle()': False,
  '-1:word.lower()': 'this',
  'word.isdigit()': False,
  'word.istitle()': False,
  'word.isupper()': False,
  'word.lower()': 'is',
  'word[-2:]': 'is',
  'word[-3:]': 'is'},
 {'+1:word.istitle()': False,
  '+1:word.isupper()': False,
  '+1:word.lower()': '1',
  '-1:word.istitle()': False,
  '-1:word.lower()': 'is',
  'word.isdigit()': False,
  'word.istitle()': True,
  'word.isupper()': True,
  'word.lower()': 'a',
  'word[-2:]': 'a',
  'word[-3:]': 'a'},
 {'+1:word.istitle()': False,
  '+1:word.isupper()': False,
  '+1:word.lower()': 'sentence',
  '-1:word.istitle()': True,
  '-1:word.lower()': 'a',
  'word.isdigit()': T

In [7]:
x: List[List[Dict[str, Union[str, bool]]]] = []
y: List[List[str]] = []

for sent in sents:
    words = []
    pos = []
    spaces = []
    for word in sent:
        words.append(word[0])
        pos.append(word[1])
        if word[1] == '.':
            spaces.append(False)
        else:
            spaces.append(True)

    doc = Doc(nlp.vocab, words=words, spaces=spaces)
    x.append(sent2features(doc))
    y.append(pos)

In [8]:
from sklearn.model_selection import train_test_split

x_train: List[Dict[str, Union[str, bool]]] = []
x_test: List[Dict[str, Union[str, bool]]] = []
y_train: List[str] = []
y_test: List[str] = []

x_train, x_test = train_test_split(x, test_size=0.2, shuffle=True, random_state=42)
y_train, y_test = train_test_split(y, test_size=0.2, shuffle=True, random_state=42)

## Training the POS tagging model

import the sklearn-crfsuite package.

if you see **AttributeError: 'CRF' object has no attribute 'keep_tempfiles'** error, then downgrade your **sklearn** module under 0.24

!pip install -U 'scikit-learn<0.24'

In [9]:
# If the sklearn_crfsuite package is not intalled
# uncomment the following line:
!pip install sklearn-crfsuite
!pip install -U 'scikit-learn<0.24'
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

Collecting sklearn-crfsuite
  Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl (12 kB)
Collecting python-crfsuite>=0.8.3
  Downloading python_crfsuite-0.9.8-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (965 kB)
[K     |████████████████████████████████| 965 kB 4.8 MB/s 
[?25hInstalling collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.8 sklearn-crfsuite-0.3.6


In [10]:
crf = sklearn_crfsuite.CRF(
    algorithm='ap',
    max_iterations=100
    )

crf.fit(x_train, y_train)



CRF(algorithm='ap', keep_tempfiles=None, max_iterations=100)

## Evaluation

In [11]:
y_pred = crf.predict(x_test)

print(metrics.flat_accuracy_score(y_test, y_pred))
print(metrics.sequence_accuracy_score(y_test, y_pred))

0.9774584244180281
0.6827694454133241


## L-BFGS

In [12]:
crf2 = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=1000,
    all_possible_transitions=True
)

crf2.fit(x_train, y_train)



CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=1000)

In [13]:
y_pred = crf.predict(x_test)

print(metrics.flat_accuracy_score(y_test, y_pred))
print(metrics.flat_f1_score(y_test, y_pred, average="weighted"))

0.9774584244180281
0.9773079833210477
