<a href="https://colab.research.google.com/github/mayanksingh-27/NER-system-using-CRF-/blob/main/Untitled35.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Colab-only step: prompt for a file upload into the runtime. The annotated
# corpus read further down (hinglish_ner_data_extended.txt) is supplied here.
# NOTE(review): this import fails outside Google Colab; when running locally,
# place the data file next to the notebook and skip this cell.
from google.colab import files
uploaded = files.upload()

Saving hinglish_ner_data_extended.txt to hinglish_ner_data_extended.txt


In [2]:
!pip install sklearn-crfsuite

Collecting sklearn-crfsuite
  Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-crfsuite>=0.9.7 (from sklearn-crfsuite)
  Downloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl (10 kB)
Downloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.11 sklearn-crfsuite-0.5.0


In [3]:
import pandas as pd
import sklearn_crfsuite
from sklearn.model_selection import train_test_split
from sklearn_crfsuite import metrics


def read_conll(filename):
    """Read a CoNLL-style token/tag file into parallel sentence and label lists.

    Every non-blank line holds whitespace-separated columns. The first column
    is taken as the token and the last column as its tag, so plain two-column
    files and wider layouts (e.g. ``token POS tag``) are both accepted. Blank
    lines separate sentences; runs of blank lines do not create empty
    sentences.

    Args:
        filename: Path to the UTF-8 encoded annotation file. A leading
            byte-order mark, if present, is ignored.

    Returns:
        A ``(sentences, labels)`` tuple. ``sentences[k]`` is the list of tokens
        of the k-th sentence and ``labels[k]`` is the list of tags aligned
        with it.

    Raises:
        ValueError: If a non-blank line has fewer than two columns.
    """
    sentences, labels = [], []
    words, tags = [], []
    # 'utf-8-sig' also decodes BOM-less UTF-8, but strips a BOM that would
    # otherwise be glued onto the very first token of the file.
    with open(filename, 'r', encoding='utf-8-sig') as f:
        for line_no, line in enumerate(f, start=1):
            columns = line.split()
            if not columns:
                # Blank line: close the sentence collected so far, if any.
                if words:
                    sentences.append(words)
                    labels.append(tags)
                    words, tags = [], []
                continue
            if len(columns) < 2:
                raise ValueError(
                    f"{filename}: line {line_no}: expected at least 2 columns "
                    f"(token and tag), got {len(columns)}: {line!r}"
                )
            words.append(columns[0])
            tags.append(columns[-1])
    # The file may end without a trailing blank line.
    if words:
        sentences.append(words)
        labels.append(tags)
    return sentences, labels

# Load the annotated corpus: `sentences` holds one token list per sentence and
# `labels` the tag list aligned with each of them.
sentences, labels = read_conll('hinglish_ner_data_extended.txt')

In [4]:
def word2features(sent, i):
    """Return the CRF feature dict for the token at position ``i`` of ``sent``.

    The dict describes the token itself (constant bias, lower-cased form, last
    three characters, upper/title/digit flags) together with the lower-cased
    form and title/upper flags of its immediate left and right neighbours.
    Where a neighbour is missing, ``'BOS'`` (no left neighbour) or ``'EOS'``
    (no right neighbour) is set instead.

    Args:
        sent: Sequence of token strings making up one sentence.
        i: Index of the token to describe.

    Returns:
        dict mapping feature names to their values.
    """
    token = sent[i]

    # Features of the token itself.
    feats = {}
    feats['bias'] = 1.0
    feats['word.lower()'] = token.lower()
    feats['word[-3:]'] = token[-3:]
    feats['word.isupper()'] = token.isupper()
    feats['word.istitle()'] = token.istitle()
    feats['word.isdigit()'] = token.isdigit()

    # Left context, or the beginning-of-sentence marker.
    if i <= 0:
        feats['BOS'] = True
    else:
        left = sent[i - 1]
        feats['-1:word.lower()'] = left.lower()
        feats['-1:word.istitle()'] = left.istitle()
        feats['-1:word.isupper()'] = left.isupper()

    # Right context, or the end-of-sentence marker.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        right = sent[i + 1]
        feats['+1:word.lower()'] = right.lower()
        feats['+1:word.istitle()'] = right.istitle()
        feats['+1:word.isupper()'] = right.isupper()

    return feats

def sent2features(sent):
    """Build the list of per-token feature dicts for one tokenised sentence.

    Args:
        sent: Sequence of token strings.

    Returns:
        list with one ``word2features`` dict per token, in sentence order.
    """
    feature_dicts = []
    for position in range(len(sent)):
        feature_dicts.append(word2features(sent, position))
    return feature_dicts

In [5]:
# Featurise every sentence; the tag sequences are used as targets unchanged.
X = [sent2features(s) for s in sentences]
y = labels

# Hold-out configuration, named so the split is easy to tune and reproduce.
# train_test_split is imported in the notebook's import cell.
TEST_SIZE = 0.2
SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED
)

# CRF trained with the L-BFGS algorithm for at most 100 iterations;
# all_possible_transitions=True also generates transition features for label
# pairs that never occur in the training data.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)

crf.fit(X_train, y_train)

In [6]:
# Predict tag sequences for the held-out sentences and print a flattened
# (token-level) precision/recall/F1 report.
# NOTE(review): the report includes the 'O' tag, which makes up most of the
# test tokens, so the overall accuracy mostly reflects non-entity tokens;
# read the per-entity rows when judging NER quality.
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       B-LOC       1.00      0.80      0.89         5
       B-ORG       0.89      1.00      0.94        16
       B-PER       1.00      0.90      0.95        10
       I-LOC       1.00      1.00      1.00         1
       I-ORG       1.00      1.00      1.00         4
       I-PER       1.00      1.00      1.00         5
           O       1.00      1.00      1.00        87

    accuracy                           0.98       128
   macro avg       0.98      0.96      0.97       128
weighted avg       0.99      0.98      0.98       128



In [7]:
# Sample sentences for a qualitative check of the trained tagger. Entity names
# appear both capitalised and lower-cased (e.g. "Google" vs "google").
test_sentences = [
    "aaj main zomato se biryani mangwaya in hyderabad",
    "kal Priya aur amit google me interview dene gaye",
    "Ola ka cab driver bangalore se tha",
    "mumbai mein bhai Sneha ne amazon se phone order kiya",
    "Kolkata ke weather ke liye google dekh raha tha",
    "ravi ne facebook aur twitter dono uninstall kar diye",
    "aaj ka meeting Infosys campus me hai",
    "Google aur Flipkart me kaafi farak hai",
    "aaj Anjali ne New Delhi me cricket match dekha",
    "bhaiya ne bola Swiggy late deliver karta hai"
]

In [8]:
# Tag each sample sentence and list only the tokens predicted as entities.
for sentence in test_sentences:
    tokens = sentence.split()
    # predict() works on a batch, so wrap the single sentence and unwrap it.
    preds = crf.predict([sent2features(tokens)])[0]
    print(f"\n Sentence: {sentence}")
    for word, tag in zip(tokens, preds):
        if tag == 'O':
            continue  # not part of any entity
        print(f"  {word} → {tag}")


 Sentence: aaj main zomato se biryani mangwaya in hyderabad

 Sentence: kal Priya aur amit google me interview dene gaye
  Priya → B-ORG

 Sentence: Ola ka cab driver bangalore se tha
  Ola → B-ORG

 Sentence: mumbai mein bhai Sneha ne amazon se phone order kiya
  Sneha → B-ORG

 Sentence: Kolkata ke weather ke liye google dekh raha tha
  Kolkata → B-ORG

 Sentence: ravi ne facebook aur twitter dono uninstall kar diye

 Sentence: aaj ka meeting Infosys campus me hai
  Infosys → B-ORG

 Sentence: Google aur Flipkart me kaafi farak hai
  Google → B-ORG
  Flipkart → B-ORG

 Sentence: aaj Anjali ne New Delhi me cricket match dekha
  Anjali → B-PER
  New → B-PER
  Delhi → B-LOC

 Sentence: bhaiya ne bola Swiggy late deliver karta hai
  Swiggy → B-ORG
