## Entity Recognition

In [1]:
import os
import json

### Preparing Dataset

In [2]:
# Read the raw JSON text. The encoding is pinned so the result does not depend
# on the platform's default codec (assumes data.json is UTF-8, the encoding
# required for JSON interchange).
with open("./data.json", "r", encoding="utf-8") as f:
    data = f.read()

In [3]:
# Parse the JSON text into Python objects. `data` is rebound from the raw
# string to the parsed value, so re-running this cell on its own fails
# (json.loads rejects a non-string argument) -- re-run the read cell first.
data = json.loads(data)

In [4]:
# Show only the first training pair to confirm the [tokens, tags] layout.
for train_pair in data[:1]:
    print(train_pair)

[['malai', 'Samsung', 'Galaxy', 'ko', 'mobile', 'dekhaunu'], ['O', 'B-Brand', 'B-Product', 'O', 'B-Category', 'O']]


In [5]:
# Each example is a [tokens, tags] pair; separate them into two parallel lists.
train_sentences = [example[0] for example in data]
train_labels = [example[1] for example in data]

In [6]:
# Sanity check: the two lists are built in lockstep, so the counts must match.
len(train_sentences), len(train_labels)

(107, 107)

In [7]:
# Print the first three sentences token by token, each next to its tag.
first_three = zip(train_sentences[:3], train_labels[:3])
for sentence_no, (tokens, tags) in enumerate(first_three):
    print(sentence_no)
    for position, token in enumerate(tokens):
        print(f"{token} : {tags[position]}")
    print("\n")

0
malai : O
Samsung : B-Brand
Galaxy : B-Product
ko : O
mobile : B-Category
dekhaunu : O


1
tapai : O
ko : O
store : O
ma : O
Redmi : B-Product
Note : I-Product
9 : I-Product
ko : O
mobile : B-Category
xa : O


2
malai : O
Redmi : B-Product
Note : I-Product
9 : I-Product
ko : O
mobile : B-Category
chaiyeko : O
xa : O




## Importing Libraries

In [8]:
import numpy as np
from sklearn_crfsuite import CRF
from sklearn_crfsuite.metrics import flat_classification_report
import re

## Features

In [9]:
def get_word_shape(word):
    """
    Return a compressed shape signature for a word.

    Every character is mapped to a class symbol: "X" for uppercase, "x" for
    lowercase, "d" for digits, and the character itself for anything else.
    Runs of the same symbol are collapsed to a single occurrence, so
    "ORD123abc" becomes "Xdx".

    Args:
        word: the token to summarise
    Returns:
        str: the collapsed shape string (empty for an empty word)
    """
    def classify(char):
        # Order matters: uppercase and lowercase are tested before digits.
        if char.isupper():
            return "X"
        if char.islower():
            return "x"
        if char.isdigit():
            return "d"
        return char

    symbols = []
    for char in word:
        symbol = classify(char)
        # Only record a symbol when it differs from the one just before it.
        if not symbols or symbols[-1] != symbol:
            symbols.append(symbol)
    return "".join(symbols)

In [10]:
# Runs of the same character class collapse to a single symbol.
get_word_shape("ORD123"), get_word_shape("ORD"), get_word_shape("ORD123abc")

('Xd', 'X', 'Xdx')

In [11]:

# Alternating classes are kept in order; only adjacent repeats are merged.
get_word_shape("oO11o1o")

'xXdxdx'

In [12]:
# Gazetteer lists consulted by get_ecommerce_features for the
# is_brand / is_category features.
feature_dict = dict(
    brands=["Xiaomi", "Samsung", "Casio", "Apple"],
    category=["mobile", "TV", "calculator", "laptop"],
)

In [13]:
def get_ecommerce_features(sentence: list[str], index: int) -> dict:
    """
    Creates a feature dictionary for a given word in a sentence.

    The features describe the word itself (digit presence, gazetteer
    membership, word shape) and its immediate neighbours (lower-cased form
    plus title/upper/digit flags). A missing neighbour is signalled with
    "BOS" (no previous word) or "EOS" (no next word) instead.

    Gazetteer membership is tested case-insensitively on both sides, so
    capitalised entries in ``feature_dict`` such as "Samsung" or "TV" match
    regardless of how the token is cased.

    Args:
        sentence: list of words in a sentence
        index: index of the word in the sentence
    Returns:
        dict: features
    Raises:
        IndexError: if index is out of range for sentence
    """
    word = sentence[index]
    word_lower = word.lower()

    # Normalise the gazetteer entries too: they are stored with capital
    # letters, so comparing a lower-cased token against them verbatim would
    # never match a brand (or "TV").
    brands = {brand.lower() for brand in feature_dict["brands"]}
    categories = {category.lower() for category in feature_dict["category"]}

    features = {
        "has_number": bool(re.search(r"\d", word)),
        "is_brand": word_lower in brands,
        "is_category": word_lower in categories,
        "word_shape": get_word_shape(word),
    }

    # Right-hand context, or an end-of-sentence marker.
    if index < len(sentence) - 1:
        word_next = sentence[index + 1]
        features.update({
            "next_word.lower": word_next.lower(),
            "next_word.istitle": word_next.istitle(),
            "next_word.isupper": word_next.isupper(),
            "next_word.isdigit": word_next.isdigit()
        })
    else:
        features["EOS"] = True

    # Left-hand context, or a beginning-of-sentence marker.
    if index > 0:
        word_prev = sentence[index - 1]
        features.update({
            "prev_word.lower": word_prev.lower(),
            "prev_word.istitle": word_prev.istitle(),
            "prev_word.isupper": word_prev.isupper(),
            "prev_word.isdigit": word_prev.isdigit()
        })
    else:
        features["BOS"] = True

    return features

## Preparing Training Data

In [14]:
def prepare_data(sentences: list[list[str]], labels: list[list[str]]) -> tuple[list, list]:
    """
    Build per-token feature dicts for every sentence, paired with its labels.

    Args:
        sentences: tokenised sentences, one list of words per sentence
        labels: tag sequences, one list of tags per sentence, aligned
            token-for-token with ``sentences``
    Returns:
        tuple: (X, y) where X[k] is the list of feature dicts for sentence k
        and y[k] is that sentence's label list (the same object passed in)
    Raises:
        ValueError: if the number of sentences and label sequences differ,
            or if a sentence and its label sequence differ in length
    """
    # Materialise so len() works for any iterable input.
    sentences = list(sentences)
    labels = list(labels)

    # zip() would silently drop the surplus entries, hiding a data problem.
    if len(sentences) != len(labels):
        raise ValueError(
            f"got {len(sentences)} sentences but {len(labels)} label sequences"
        )

    X, y = [], []

    for position, (sentence, sentence_labels) in enumerate(zip(sentences, labels)):
        # Check alignment before extracting features for this sentence.
        if len(sentence) != len(sentence_labels):
            raise ValueError(
                f"sentence {position} has {len(sentence)} tokens but "
                f"{len(sentence_labels)} labels"
            )
        X.append([get_ecommerce_features(sentence, i) for i in range(len(sentence))])
        y.append(sentence_labels)
    return X, y

In [15]:
# Turn the tokenised sentences into per-token feature dicts for the CRF.
X_train, y_train = prepare_data(train_sentences, train_labels)

In [16]:
# One feature sequence and one label sequence per sentence.
len(X_train), len(y_train)

(107, 107)

In [17]:
# Spot-check a mid-sentence token: it has neighbours on both sides, so both
# next_word.* and prev_word.* features are present.
train_sentences[0][4], X_train[0][4]

('mobile',
 {'has_number': False,
  'is_brand': False,
  'is_category': True,
  'word_shape': 'x',
  'next_word.lower': 'dekhaunu',
  'next_word.istitle': False,
  'next_word.isupper': False,
  'next_word.isdigit': False,
  'prev_word.lower': 'ko',
  'prev_word.istitle': False,
  'prev_word.isupper': False,
  'prev_word.isdigit': False})

In [18]:
# Spot-check a sentence-initial token (index 0): BOS is set and no
# prev_word.* features are emitted.
train_sentences[6][0], X_train[6][0], y_train[6][0]

('malai',
 {'has_number': False,
  'is_brand': False,
  'is_category': False,
  'word_shape': 'x',
  'next_word.lower': 'mobile',
  'next_word.istitle': False,
  'next_word.isupper': False,
  'next_word.isdigit': False,
  'BOS': True},
 'O')

## Model Training

In [19]:
# Linear-chain CRF configuration (parameter meanings per the sklearn-crfsuite
# CRF documentation):
#   algorithm="lbfgs"             -- L-BFGS gradient-based training
#   c1 / c2                       -- L1 / L2 regularisation coefficients
#   max_iterations                -- cap on optimiser iterations
#   all_possible_transitions=True -- also generate transition features for
#                                    label pairs never seen in training
crf = CRF(
    algorithm="lbfgs",
    c1=0.1,
    c2=0.2,
    max_iterations=100,
    all_possible_transitions=True
)

In [25]:
# Fit on the whole prepared set; no held-out split is made in the cells
# visible here, so the predictions below are a qualitative check only.
crf.fit(X_train, y_train)

In [26]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell so all dependencies are visible in one place.
import joblib

# Save the model
joblib.dump(crf, 'crf_model.pkl')

# Load the model
# NOTE(review): joblib.load unpickles the file, so only load model files from
# a trusted source. The prediction cells visible after this one pass `crf`,
# not `crf_loaded`, to predict_entities.
crf_loaded = joblib.load('crf_model.pkl')

## Prediction

In [21]:
def predict_entities(crf_model: CRF, sentence: list):
    """
    Predict one entity tag per token of a tokenised sentence.

    Args:
        crf_model: fitted model exposing ``predict``
        sentence: list of words in the sentence
    Returns:
        the predicted tag sequence for the sentence
    """
    token_features = []
    for position in range(len(sentence)):
        token_features.append(get_ecommerce_features(sentence, position))

    # predict() works on a batch of sequences: wrap the single sentence in a
    # list and take the first (only) result back out.
    batch_predictions = crf_model.predict([token_features])
    return batch_predictions[0]

In [22]:
# Quick check on a single hand-tokenised sentence.
new_sentence = ["Redmi", "mobile", "kati", "ho"]
predicted_labels = predict_entities(crf, new_sentence)
print(predicted_labels)

['B-Product' 'B-Category' 'O' 'O']


In [29]:
# Ad-hoc sentences for a manual smoke test of the trained model. They are
# kept as raw strings here; the prediction loop splits them on whitespace.
test_sentences = [
    "malai Redmi Note 9 ko barema bujhnu xa",
    "Xiaomi ko Redmi mobile available xa ki xaina",
    "Casio ko calculator Dharan ma deliver garidinu",
    "3 ta mobile Kathmandu ma delivery",
    "mero order id ORD12345 track garidinu",
    "Apple ko laptop available xa",
    "Sony ko TV Dharan ma 2 ta delivery garne",
    "Dharan ma delivery hunxa ki hudaina",
    "Samsung Galaxy s21 lai kati parchha",
    "Redmi"
]

In [30]:
# Tag every test sentence and print each token beside its predicted label.
for text in test_sentences:
    sentence = text.split()
    predicted_labels = predict_entities(crf, sentence)
    for idx, token in enumerate(sentence):
        print(f"{token}: {predicted_labels[idx]}")
    print("Another one***********************\n")

malai: O
Redmi: B-Product
Note: I-Product
9: I-Product
ko: O
barema: O
bujhnu: O
xa: O
Another one***********************

Xiaomi: B-Brand
ko: O
Redmi: B-Product
mobile: B-Category
available: O
xa: O
ki: O
xaina: O
Another one***********************

Casio: B-Brand
ko: O
calculator: B-Category
Dharan: B-Location
ma: O
deliver: O
garidinu: O
Another one***********************

3: B-Quantity
ta: O
mobile: B-Category
Kathmandu: B-Location
ma: B-Location
delivery: O
Another one***********************

mero: O
order: O
id: O
ORD12345: B-Order_Number
track: O
garidinu: O
Another one***********************

Apple: B-Brand
ko: O
laptop: B-Category
available: O
xa: O
Another one***********************

Sony: B-Brand
ko: O
TV: B-Category
Dharan: B-Location
ma: O
2: B-Quantity
ta: O
delivery: O
garne: O
Another one***********************

Dharan: B-Location
ma: O
delivery: O
hunxa: O
ki: O
hudaina: O
Another one***********************

Samsung: B-Brand
Galaxy: B-Product
s21: I-Product
lai: O
kati