<a href="https://colab.research.google.com/github/muhanangmahrub/named-entity-recognizer-aps/blob/main/training_crfs_sklearn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install sklearn_crfsuite

Collecting sklearn_crfsuite
  Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-crfsuite>=0.9.7 (from sklearn_crfsuite)
  Downloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl (10 kB)
Downloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-crfsuite, sklearn_crfsuite
Successfully installed python-crfsuite-0.9.11 sklearn_crfsuite-0.5.0


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd

df = pd.read_csv('/content/drive/MyDrive/data/NER dataset.csv', encoding='latin1')
df.head(5)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [4]:
def convert_crfs_format(df):
    sentences = []
    current_sentence = []

    for _, row in df.iterrows():
        sentence_marker = row["Sentence #"]
        if isinstance(sentence_marker, str) and sentence_marker.startswith("Sentence:"):
            # A new sentence starts, save the previous one if not empty
            if current_sentence:
                sentences.append(current_sentence)

            # Reset for the new sentence
            current_sentence = []

        current_sentence.append((str(row['Word']), row['POS'], row['Tag']))
        if current_sentence:
            sentences.append(current_sentence)

    return sentences

sents = convert_crfs_format(df[:35000])

In [5]:
sents[0]

[('Thousands', 'NNS', 'O'),
 ('of', 'IN', 'O'),
 ('demonstrators', 'NNS', 'O'),
 ('have', 'VBP', 'O'),
 ('marched', 'VBN', 'O'),
 ('through', 'IN', 'O'),
 ('London', 'NNP', 'B-geo'),
 ('to', 'TO', 'O'),
 ('protest', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('war', 'NN', 'O'),
 ('in', 'IN', 'O'),
 ('Iraq', 'NNP', 'B-geo'),
 ('and', 'CC', 'O'),
 ('demand', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('withdrawal', 'NN', 'O'),
 ('of', 'IN', 'O'),
 ('British', 'JJ', 'B-gpe'),
 ('troops', 'NNS', 'O'),
 ('from', 'IN', 'O'),
 ('that', 'DT', 'O'),
 ('country', 'NN', 'O'),
 ('.', '.', 'O')]

In [6]:
train_sents = sents[:int(0.8 * len(sents))]
test_sents = sents[int(0.8 * len(sents)):]

In [7]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]
    # return [(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [8]:
sent2features(train_sents[0])[0]

{'bias': 1.0,
 'word.lower()': 'thousands',
 'word[-3:]': 'nds',
 'word[-2:]': 'ds',
 'word.isupper()': False,
 'word.istitle()': True,
 'word.isdigit()': False,
 'postag': 'NNS',
 'postag[:2]': 'NN',
 'BOS': True,
 '+1:word.lower()': 'of',
 '+1:word.istitle()': False,
 '+1:word.isupper()': False,
 '+1:postag': 'IN',
 '+1:postag[:2]': 'IN'}

In [9]:
%%time
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

CPU times: user 3.67 s, sys: 514 ms, total: 4.18 s
Wall time: 12.6 s


In [10]:
%%time

import sklearn_crfsuite

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 2min 29s, sys: 861 ms, total: 2min 30s
Wall time: 2min 32s


In [11]:
labels = list(crf.classes_)
labels.remove('O')
labels

['B-geo',
 'B-gpe',
 'B-per',
 'I-geo',
 'B-org',
 'I-org',
 'B-tim',
 'B-art',
 'I-art',
 'I-per',
 'I-gpe',
 'I-tim',
 'B-nat',
 'B-eve',
 'I-eve',
 'I-nat']

In [12]:
from sklearn_crfsuite import metrics

y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels, zero_division=False)

0.7095883898291645

In [13]:
# group B and I results
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3, zero_division=False
))

              precision    recall  f1-score   support

       B-art      0.000     0.000     0.000       126
       I-art      0.000     0.000     0.000         0
       B-eve      0.000     0.000     0.000        26
       I-eve      0.000     0.000     0.000        26
       B-geo      0.807     0.629     0.707      6689
       I-geo      0.734     0.546     0.626      1587
       B-gpe      0.559     0.923     0.696      2321
       I-gpe      0.000     0.000     0.000        25
       B-nat      0.392     0.632     0.484        95
       I-nat      1.000     1.000     1.000        60
       B-org      0.588     0.632     0.609      3683
       I-org      0.787     0.643     0.708      3156
       B-per      0.735     0.779     0.756      2746
       I-per      0.770     0.917     0.837      3033
       B-tim      0.887     0.748     0.811      3042
       I-tim      0.875     0.429     0.576       913

   micro avg      0.721     0.700     0.710     27528
   macro avg      0.508   

In [14]:
import joblib

crf = joblib.dump(crf, 'crf.joblib')

In [15]:
crf = joblib.load('crf.joblib')

In [16]:
def extract_features(word, i, new_sentence):
    """Extract features from a sentence for CRF prediction."""
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit()
    }

    if i > 0:
        word1 = new_sentence[i - 1]  # Access previous word from new_sentence
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper()
        })
    else:
        features['BOS'] = True

    if i < len(new_sentence) - 1:
        word1 = new_sentence[i + 1]  # Access next word from new_sentence
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper()
        })
    else:
        features['EOS'] = True

    return features

In [19]:
# New sentence for inference
sample = 'The Israeli army has killed a Palestinian youth in the northern Gaza Strip and wounded at least three other people .'
new_sentence = sample.split()

X_test = [extract_features(word, i, new_sentence) for i, word in enumerate(new_sentence)]
X_test = [X_test]  # Wrap X_test in a list to match the expected format

In [20]:
# Predict labels for the new sentence
y_pred = crf.predict(X_test)

# Output results
for word, label in zip(new_sentence, y_pred[0]):
    print(f"{word}: {label}")

The: O
Israeli: B-gpe
army: O
has: O
killed: O
a: O
Palestinian: B-gpe
youth: O
in: O
the: O
northern: O
Gaza: B-geo
Strip: I-geo
and: O
wounded: O
at: O
least: O
three: O
other: O
people: O
.: O
