## Preparing Dataset

In [1]:
# Training examples: each entry of `sentences` is one pre-tokenised query and
# the entry of `labels` at the same position holds one BIO tag per token
# (tags used here: O, B-/I-Brand, B-Category, B-Attribute, B-OrderID).
sentences = [
    ["tapaiko", "storema", "Redmi", "Note", "9", "Pro", "mobile", "xa"],
    ["Redmi", "9", "ko", "price", "kati", "ho"],
    ["Samsung", "ko", "mobile", "kati", "ho"],
    ["mero", "order", "number", "ORD1234", "track", "garidinu"],
    ["Redmi", "phone", "kati", "ho"],
    ["Redmi", "ra", "Samsung", "ko", "mobile", "kati", "ho"],
    ["maile", "gareko", "order", "ORD989", "track", "garidinu"],
    # Add more training examples
]

labels = [
    ["O", "O", "B-Brand", "I-Brand", "I-Brand", "I-Brand", "B-Category", "O"],
    ["B-Brand", "I-Brand", "O", "B-Attribute", "O", "O"],
    ["B-Brand", "O", "B-Category", "O", "O"],
    ["O", "O", "O", "B-OrderID", "O", "O"],
    ["B-Brand", "B-Category", "O", "O"],
    ["B-Brand", "O", "B-Brand", "O", "B-Category", "O", "O"],
    ["O", "O", "O", "B-OrderID", "O", "O"],
    # Corresponding labels (keep in the same order as `sentences`)
]

# Fail fast on annotation mistakes: a missing or extra tag would otherwise only
# surface later, when features are prepared or the model is trained.
assert len(sentences) == len(labels), "each sentence needs exactly one label sequence"
assert all(len(tokens) == len(tags) for tokens, tags in zip(sentences, labels)), (
    "each sentence must have exactly one label per token"
)

## Importing Libraries

In [2]:
import numpy as np
from sklearn_crfsuite import CRF
from sklearn_crfsuite.metrics import flat_classification_report
import re

## Creating Features

In [5]:
def word_shape(word):
    """Return the orthographic shape of ``word``.

    Each uppercase character becomes ``"X"``, each lowercase character ``"x"``
    and each digit ``"d"``; every other character is kept as-is.
    Example: ``"ORD1234"`` -> ``"XXXdddd"``.
    """

    def symbol_for(ch):
        # The checks run in this fixed order: upper, lower, digit, fallback.
        if ch.isupper():
            return "X"
        if ch.islower():
            return "x"
        if ch.isdigit():
            return "d"
        return ch

    return "".join(map(symbol_for, word))

In [12]:
def create_word_features(sentence, index):
    """Build the feature dict for the token at ``sentence[index]``.

    The dict describes the token itself (lowercased form, casing flags, shape,
    digit flag) plus the same lowercase/casing/digit information for the
    previous and next tokens. When there is no previous token the key
    ``"BOS"`` is set to True; when there is no next token ``"EOS"`` is set.

    Args:
        sentence: Sequence of token strings.
        index: Position of the token to describe.

    Returns:
        dict mapping feature names to values for that token.
    """
    token = sentence[index]

    # Features of the token itself.
    features = {}
    features["word.lower"] = token.lower()
    features["word.isupper"] = token.isupper()
    features["word.istitle"] = token.istitle()
    features["word.shape"] = word_shape(token)
    features["word.isdigit"] = token.isdigit()

    # Left context, or the beginning-of-sentence marker when there is none.
    if index <= 0:
        features["BOS"] = True
    else:
        left = sentence[index - 1]
        features["prev_word.lower"] = left.lower()
        features["prev_word.istitle"] = left.istitle()
        features["prev_word.isupper"] = left.isupper()
        features["prev_word.isdigit"] = left.isdigit()

    # Right context, or the end-of-sentence marker when there is none.
    if index >= len(sentence) - 1:
        features["EOS"] = True
    else:
        right = sentence[index + 1]
        features["next_word.lower"] = right.lower()
        features["next_word.istitle"] = right.istitle()
        features["next_word.isupper"] = right.isupper()
        features["next_word.isdigit"] = right.isdigit()

    return features

## Preparing Training Data

In [13]:
def prepare_data(sentences, labels):
    """Convert tokenised sentences and their tag sequences into CRF inputs.

    Args:
        sentences: Iterable of token sequences; each sentence must support
            ``len()`` and integer indexing.
        labels: Iterable of tag sequences, parallel to ``sentences``, with one
            tag per token.

    Returns:
        Tuple ``(X, y)``. ``X[i]`` is the list of feature dicts produced by
        ``create_word_features`` for every token of ``sentences[i]``; ``y[i]``
        is the label sequence passed in for that sentence.

    Raises:
        ValueError: If the number of sentences differs from the number of
            label sequences, or if a sentence and its label sequence have
            different lengths.
    """
    # Materialise both inputs so their lengths can be compared even when
    # generators are passed; plain zip() would silently drop the surplus items.
    sentences = list(sentences)
    labels = list(labels)
    if len(sentences) != len(labels):
        raise ValueError(
            f"got {len(sentences)} sentences but {len(labels)} label sequences"
        )

    X, y = [], []

    for position, (sentence, sentence_labels) in enumerate(zip(sentences, labels)):
        # Check alignment before building features so the error points at the
        # offending example instead of surfacing later during training.
        if len(sentence) != len(sentence_labels):
            raise ValueError(
                f"sentence {position} has {len(sentence)} tokens "
                f"but {len(sentence_labels)} labels"
            )

        X.append([create_word_features(sentence, i) for i in range(len(sentence))])
        y.append(sentence_labels)

    return X, y

In [14]:
# Build the per-token feature dicts (X_train) and the matching label sequences (y_train).
X_train, y_train = prepare_data(sentences, labels)

In [17]:
# Inspect the feature dicts generated for the tokens of the first training sentence.
X_train[0]

[{'word.lower': 'tapaiko',
  'word.isupper': False,
  'word.istitle': False,
  'word.shape': 'xxxxxxx',
  'word.isdigit': False,
  'BOS': True,
  'next_word.lower': 'storema',
  'next_word.istitle': False,
  'next_word.isupper': False,
  'next_word.isdigit': False},
 {'word.lower': 'storema',
  'word.isupper': False,
  'word.istitle': False,
  'word.shape': 'xxxxxxx',
  'word.isdigit': False,
  'prev_word.lower': 'tapaiko',
  'prev_word.istitle': False,
  'prev_word.isupper': False,
  'prev_word.isdigit': False,
  'next_word.lower': 'redmi',
  'next_word.istitle': True,
  'next_word.isupper': False,
  'next_word.isdigit': False},
 {'word.lower': 'redmi',
  'word.isupper': False,
  'word.istitle': True,
  'word.shape': 'Xxxxx',
  'word.isdigit': False,
  'prev_word.lower': 'storema',
  'prev_word.istitle': False,
  'prev_word.isupper': False,
  'prev_word.isdigit': False,
  'next_word.lower': 'note',
  'next_word.istitle': True,
  'next_word.isupper': False,
  'next_word.isdigit': False

In [18]:
# Label sequence for the first training sentence.
y_train[0]

['O', 'O', 'B-Brand', 'I-Brand', 'I-Brand', 'I-Brand', 'B-Category', 'O']

## Training the CRF Model

In [20]:
# Configure the CRF tagger. The parameter notes below follow the
# sklearn-crfsuite CRF API; confirm them against the installed version.
crf = CRF(
    algorithm="lbfgs",  # training algorithm
    c1=0.1,  # L1 regularization coefficient
    c2=0.2,  # L2 regularization coefficient
    max_iterations=100,  # upper bound on optimizer iterations
    all_possible_transitions=True  # also generate label transitions not seen in the training data
)

In [21]:
# Train the tagger on the prepared feature sequences and their label sequences.
crf.fit(X_train, y_train)

In [23]:
def predict_entities(crf_model: CRF, sentence: list):
    """Predict one entity tag per token of ``sentence`` using ``crf_model``.

    Args:
        crf_model: Fitted CRF model exposing ``predict``.
        sentence: List of token strings.

    Returns:
        The predicted tag sequence for ``sentence``, i.e. the first (and only)
        entry of the batch returned by ``crf_model.predict``.
    """
    token_features = []
    for position in range(len(sentence)):
        token_features.append(create_word_features(sentence, position))

    # The sentence is passed to predict() wrapped in a one-element list, so the
    # tags for this sentence are the first element of the result.
    batch_predictions = crf_model.predict([token_features])
    return batch_predictions[0]

In [24]:
# Sanity check: a short query built only from tokens that occur in the training sentences.
new_sentence = ["Redmi", "mobile", "kati", "ho"]
predicted_labels = predict_entities(crf, new_sentence)
print(predicted_labels)

['B-Brand' 'B-Category' 'O' 'O']


In [25]:
# "Apple", "Rs", "100000", "vanda" and "sasto" do not occur in the training sentences.
new_sentence = ["Apple", "mobile", "Rs", "100000", "vanda", "sasto", "ho"]
predicted_labels = predict_entities(crf, new_sentence)
print(predicted_labels)

['B-Brand' 'B-Category' 'O' 'O' 'O' 'O' 'O']


In [26]:
# Order-tracking query whose brand ("Apple") and order ID ("ORD12") are not in the training sentences.
new_sentence = ["Apple", "mobile", "order", "gareko", "id", "ORD12", "track", "garidinu"]
predicted_labels = predict_entities(crf, new_sentence)
print(predicted_labels)

['B-Brand' 'B-Category' 'O' 'O' 'O' 'B-OrderID' 'O' 'O']


In [27]:
# Same pattern as the training sentence "Redmi ra Samsung ko mobile kati ho",
# but with two brand tokens that are not in the training sentences.
new_sentence = ["Oppo", "ra", "Vivo", "ko", "mobile", "kati", "ho"]
predicted_labels = predict_entities(crf, new_sentence)
print(predicted_labels)

['B-Brand' 'O' 'B-Brand' 'O' 'B-Category' 'O' 'O']


In [38]:
# Price query combining a brand, a category word and "price"; this exact
# token sequence does not appear in the training sentences.
new_sentence = ["Redmi", "mobile", "ko", "price", "kati", "ho"]
predicted_labels = predict_entities(crf, new_sentence)
print(predicted_labels)

['B-Brand' 'B-Category' 'O' 'B-Attribute' 'O' 'O']


In [51]:
# Neither "Huawei" nor "calculator" occurs in the training sentences; in the
# recorded output "calculator" is tagged O rather than B-Category.
new_sentence = ["Huawei", "calculator", "ko", "price", "kati", "ho"]
predicted_labels = predict_entities(crf, new_sentence)
print(predicted_labels)

['B-Brand' 'O' 'O' 'B-Attribute' 'O' 'O']
