## Preparing Dataset

In [79]:
# Training examples: each entry pairs one tokenised utterance with its BIO
# entity tags (Brand, Category, Attribute, OrderID), one tag per token.
annotated_examples = [
    (
        ["tapaiko", "storema", "Redmi", "Note", "9", "Pro", "mobile", "xa"],
        ["O", "O", "B-Brand", "I-Brand", "I-Brand", "I-Brand", "B-Category", "O"],
    ),
    (
        ["Redmi", "9", "ko", "price", "kati", "ho"],
        ["B-Brand", "I-Brand", "O", "B-Attribute", "O", "O"],
    ),
    (
        ["Samsung", "ko", "mobile", "kati", "ho"],
        ["B-Brand", "O", "B-Category", "O", "O"],
    ),
    (
        ["mero", "order", "number", "ORD1234", "track", "garidinu"],
        ["O", "O", "O", "B-OrderID", "O", "O"],
    ),
    (
        ["Redmi", "phone", "kati", "ho"],
        ["B-Brand", "B-Category", "O", "O"],
    ),
    (
        ["Redmi", "ra", "Samsung", "ko", "mobile", "kati", "ho"],
        ["B-Brand", "O", "B-Brand", "O", "B-Category", "O", "O"],
    ),
    (
        ["maile", "gareko", "order", "ORD989", "track", "garidinu"],
        ["O", "O", "O", "B-OrderID", "O", "O"],
    ),
    # Add more (tokens, tags) pairs here
]

# Split the pairs into the two parallel lists used by the rest of the notebook.
sentences = [tokens for tokens, _ in annotated_examples]
labels = [tags for _, tags in annotated_examples]

## Importing Libraries

In [80]:
import numpy as np
from sklearn_crfsuite import CRF
from sklearn_crfsuite.metrics import flat_classification_report
import re

## Creating Features

In [81]:
def word_shape(word):
    """Return the character-class pattern of ``word``.

    Uppercase characters become ``"X"``, lowercase characters ``"x"`` and
    digits ``"d"``; every other character is copied through unchanged.
    For example ``"Redmi"`` gives ``"Xxxxx"``.
    """
    def classify(ch):
        # Checks run in the order upper -> lower -> digit -> passthrough.
        if ch.isupper():
            return "X"
        if ch.islower():
            return "x"
        if ch.isdigit():
            return "d"
        return ch

    return "".join(classify(ch) for ch in word)

In [82]:
def create_word_features(sentence, index):
    """Build the feature dict describing the token at ``index`` of ``sentence``.

    The dict always holds the token's lowercase form, its upper/title/digit
    flags and its ``word_shape`` pattern. For the previous and next tokens,
    when they exist, the lowercase form and the title/upper/digit flags are
    added; otherwise a ``"BOS"`` or ``"EOS"`` marker is set to ``True``.

    Args:
        sentence: Sequence of token strings.
        index: Position of the token to describe.

    Returns:
        dict mapping feature names to feature values.
    """
    token = sentence[index]
    last_index = len(sentence) - 1

    # Features of the token itself.
    features = {}
    features["word.lower"] = token.lower()
    features["word.isupper"] = token.isupper()
    features["word.istitle"] = token.istitle()
    features["word.shape"] = word_shape(token)
    features["word.isdigit"] = token.isdigit()

    # Left context: flag the sentence start, otherwise describe the neighbour.
    if index <= 0:
        features["BOS"] = True
    else:
        before = sentence[index - 1]
        features["prev_word.lower"] = before.lower()
        features["prev_word.istitle"] = before.istitle()
        features["prev_word.isupper"] = before.isupper()
        features["prev_word.isdigit"] = before.isdigit()

    # Right context: flag the sentence end, otherwise describe the neighbour.
    if index >= last_index:
        features["EOS"] = True
    else:
        after = sentence[index + 1]
        features["next_word.lower"] = after.lower()
        features["next_word.istitle"] = after.istitle()
        features["next_word.isupper"] = after.isupper()
        features["next_word.isdigit"] = after.isdigit()

    return features

## Preparing Training Data

In [83]:
def prepare_data(sentences, labels):
    """Convert tokenised sentences and their tag sequences into CRF inputs.

    Args:
        sentences: Iterable of token sequences (each a sequence of strings).
        labels: Iterable of tag sequences, parallel to ``sentences``.

    Returns:
        Tuple ``(X, y)``. ``X[i]`` is the list of per-token feature dicts for
        sentence ``i`` (built with ``create_word_features``) and ``y[i]`` is
        the tag sequence that was passed in for that sentence.

    Raises:
        ValueError: If the number of sentences differs from the number of tag
            sequences, or a sentence and its tag sequence differ in length.
    """
    sentences = list(sentences)
    labels = list(labels)

    # zip() would silently drop trailing examples on a count mismatch.
    if len(sentences) != len(labels):
        raise ValueError(
            "sentences and labels must have the same length "
            "(got {} and {})".format(len(sentences), len(labels))
        )

    X, y = [], []

    for position, (sentence, sentence_labels) in enumerate(zip(sentences, labels)):
        # Every token needs exactly one tag; catch misaligned annotations early.
        if len(sentence) != len(sentence_labels):
            raise ValueError(
                "example {}: {} tokens but {} labels".format(
                    position, len(sentence), len(sentence_labels)
                )
            )

        X.append([create_word_features(sentence, i) for i in range(len(sentence))])
        y.append(sentence_labels)

    return X, y

In [84]:
# Build per-token feature dicts (X_train) and matching tag sequences (y_train).
# NOTE(review): `labels` is rebound to marginal probabilities in a later cell,
# so re-running this cell after that one feeds the wrong object here.
X_train, y_train = prepare_data(sentences, labels)

In [85]:
# Inspect the feature dicts generated for the first training sentence.
X_train[0]

[{'word.lower': 'tapaiko',
  'word.isupper': False,
  'word.istitle': False,
  'word.shape': 'xxxxxxx',
  'word.isdigit': False,
  'BOS': True,
  'next_word.lower': 'storema',
  'next_word.istitle': False,
  'next_word.isupper': False,
  'next_word.isdigit': False},
 {'word.lower': 'storema',
  'word.isupper': False,
  'word.istitle': False,
  'word.shape': 'xxxxxxx',
  'word.isdigit': False,
  'prev_word.lower': 'tapaiko',
  'prev_word.istitle': False,
  'prev_word.isupper': False,
  'prev_word.isdigit': False,
  'next_word.lower': 'redmi',
  'next_word.istitle': True,
  'next_word.isupper': False,
  'next_word.isdigit': False},
 {'word.lower': 'redmi',
  'word.isupper': False,
  'word.istitle': True,
  'word.shape': 'Xxxxx',
  'word.isdigit': False,
  'prev_word.lower': 'storema',
  'prev_word.istitle': False,
  'prev_word.isupper': False,
  'prev_word.isdigit': False,
  'next_word.lower': 'note',
  'next_word.istitle': True,
  'next_word.isupper': False,
  'next_word.isdigit': False

In [86]:
# Tag sequence of the first training sentence (one tag per token).
y_train[0]

['O', 'O', 'B-Brand', 'I-Brand', 'I-Brand', 'I-Brand', 'B-Category', 'O']

## Training the CRF Model

In [87]:
# CRF hyperparameters, kept in one place so they are easy to tune.
# Per the sklearn-crfsuite docs: c1 / c2 are the L1 / L2 regularisation
# coefficients, and all_possible_transitions also generates transition
# features for tag pairs that never occur in the training data.
crf_settings = {
    "algorithm": "lbfgs",
    "c1": 0.1,
    "c2": 0.2,
    "max_iterations": 100,
    "all_possible_transitions": True,
}

crf = CRF(**crf_settings)

In [88]:
# Train the CRF on the feature dicts and tag sequences prepared above.
crf.fit(X_train, y_train)

In [89]:
def predict_entities(crf_model: CRF, sentence: list):
    """Predict one entity tag per token of ``sentence``.

    Args:
        crf_model: Fitted CRF used for prediction.
        sentence: Token strings of a single sentence.

    Returns:
        The predicted tag sequence for ``sentence`` (first row of the
        model's batch prediction).
    """
    features = []
    for position, _ in enumerate(sentence):
        features.append(create_word_features(sentence, position))

    # The model predicts on batches, so wrap the sentence and unwrap the result.
    batch_predictions = crf_model.predict([features])
    return batch_predictions[0]

In [90]:
# Sanity check: every token here also appears in the training sentences.
new_sentence = [
    "Redmi", "mobile", "kati", "ho",
]
predicted_labels = predict_entities(crf_model=crf, sentence=new_sentence)
print(predicted_labels)

['B-Brand' 'B-Category' 'O' 'O']


In [91]:
# Brand ("Apple") and price-related tokens that are absent from the training set.
new_sentence = [
    "Apple", "mobile", "Rs", "100000", "vanda", "sasto", "ho",
]
predicted_labels = predict_entities(crf_model=crf, sentence=new_sentence)
print(predicted_labels)

['B-Brand' 'B-Category' 'O' 'O' 'O' 'O' 'O']


In [92]:
# Mixed query: an unseen brand plus an order ID ("ORD12") not in the training set.
new_sentence = [
    "Apple", "mobile", "order", "gareko", "id", "ORD12", "track", "garidinu",
]
predicted_labels = predict_entities(crf_model=crf, sentence=new_sentence)
print(predicted_labels)

['B-Brand' 'B-Category' 'O' 'O' 'O' 'B-OrderID' 'O' 'O']


In [93]:
# Same "<brand> ra <brand> ko mobile kati ho" pattern as a training sentence,
# but with two brands ("Oppo", "Vivo") that are not in the training set.
new_sentence = [
    "Oppo", "ra", "Vivo", "ko", "mobile", "kati", "ho",
]
predicted_labels = predict_entities(crf_model=crf, sentence=new_sentence)
print(predicted_labels)

['B-Brand' 'O' 'B-Brand' 'O' 'B-Category' 'O' 'O']


In [94]:
# Known tokens in a new combination: brand, category and attribute together.
new_sentence = [
    "Redmi", "mobile", "ko", "price", "kati", "ho",
]
predicted_labels = predict_entities(crf_model=crf, sentence=new_sentence)
print(predicted_labels)

['B-Brand' 'B-Category' 'O' 'B-Attribute' 'O' 'O']


In [95]:
# Unseen brand ("Huawei") with a category word ("calculator") not in the training set.
new_sentence = [
    "Huawei", "calculator", "ko", "price", "kati", "ho",
]
predicted_labels = predict_entities(crf_model=crf, sentence=new_sentence)
print(predicted_labels)

['B-Brand' 'O' 'O' 'B-Attribute' 'O' 'O']


## Evaluation

In [96]:
# A single held-out sentence (a flat token list, not a list of sentences)
# together with its expected tags.
# NOTE(review): y_test is defined here but never compared against y_pred in
# this cell, so no metric is actually computed.
X_test = [
    "Casio", "calculator", "ko", "price", "kati", "ho",
]
y_test = [
    "B-Brand", "B-Category", "O", "B-Attribute", "O", "O",
]

X_test_features = [
    create_word_features(X_test, position) for position, _ in enumerate(X_test)
]

# predict() works on batches, so the sentence is wrapped in a one-element list.
y_pred = crf.predict([X_test_features])

y_pred

array([['B-Brand', 'O', 'O', 'B-Attribute', 'O', 'O']], dtype=object)

In [97]:
# Per-token marginal probabilities for the test sentence: one
# {tag: probability} dict per token.
# NOTE(review): this rebinds `labels`, which held the training tag sequences
# defined in the first cell; re-running the training-data cell after this one
# would pick up these marginals instead. Consider a distinct name.
marginals_batch = crf.predict_marginals([X_test_features])
labels = marginals_batch[0]

labels

array([{'O': 0.0876059968925813, 'B-Brand': 0.7340555607162239, 'I-Brand': 0.07023751092973064, 'B-Category': 0.038001674764988655, 'B-Attribute': 0.03462219903527901, 'B-OrderID': 0.03547705766119632},
       {'O': 0.3350934164311626, 'B-Brand': 0.0988055039088461, 'I-Brand': 0.19101552750279535, 'B-Category': 0.23256099842037847, 'B-Attribute': 0.060283371356210466, 'B-OrderID': 0.08224118238060696},
       {'O': 0.899847297363934, 'B-Brand': 0.016777104532296364, 'I-Brand': 0.024117264562654565, 'B-Category': 0.02037916594012317, 'B-Attribute': 0.01863006116380619, 'B-OrderID': 0.020249106437185766},
       {'O': 0.3107754933152471, 'B-Brand': 0.026105709025310928, 'I-Brand': 0.012191983875474837, 'B-Category': 0.1622303374647279, 'B-Attribute': 0.45336399300913727, 'B-OrderID': 0.03533248331010196},
       {'O': 0.8845411968550176, 'B-Brand': 0.021812661110052617, 'I-Brand': 0.016731481307206864, 'B-Category': 0.03175378990731769, 'B-Attribute': 0.019203568032846584, 'B-OrderID': 0

In [98]:
# Scratch check: max() over dict items with a value-based key returns the
# (key, value) pair that has the largest value.
dict_ = dict(a=1, b=2)

max(dict_.items(), key=lambda pair: pair[1])

('b', 2)

In [99]:
# Scratch check: show the (key, value) pairs that max() iterates over.
dict_.items()

dict_items([('a', 1), ('b', 2)])

In [100]:
# Minimum marginal probability the most likely tag must exceed to be kept.
threshold = 0.5

final_predictions = []

# `labels` holds one {tag: probability} dict per token (set by the marginals cell).
for token_probs in labels:
    top_tag = max(token_probs, key=token_probs.get)
    # Low-confidence tokens fall back to the outside tag "O".
    final_predictions.append(top_tag if token_probs[top_tag] > threshold else "O")

In [101]:
# Show the thresholded tags; tokens below the threshold appear as "O".
final_predictions

['B-Brand', 'O', 'O', 'O', 'O', 'O']