In [39]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [42]:
# df=pd.read_csv('/Users/lukishyadav/Desktop/engineering/case_studies/ner_casestudy/data/ner_dataset.csv',encoding='unicode_escape')

In [53]:
# Load the dataset with a specified encoding
file_path = '/Users/lukishyadav/Desktop/engineering/case_studies/ner_casestudy/data/ner_dataset.csv'  # Replace with your file path
data = pd.read_csv(file_path, encoding='ISO-8859-1')

In [54]:
# Display the first few rows of the dataset to understand its structure
print(data.head())

    Sentence #           Word  POS Tag
0  Sentence: 1      Thousands  NNS   O
1          NaN             of   IN   O
2          NaN  demonstrators  NNS   O
3          NaN           have  VBP   O
4          NaN        marched  VBN   O


In [55]:
# # Drop rows with NaN values
data = data.dropna(subset=['Word','POS','Tag'])

In [56]:
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [57]:
# Group the data by sentences
data['Sentence #'] = data['Sentence #'].ffill()  # Fill forward to propagate sentence IDs
sentences = data.groupby('Sentence #').apply(lambda s: [(w, p, t) for w, p, t in zip(s['Word'].values.tolist(),
                                                                                     s['POS'].values.tolist(),
                                                                                     s['Tag'].values.tolist())])



In [58]:
# Convert the groupby object to a list of sentences
sentences = [s for s in sentences]


In [59]:
# Split the dataset into training, validation, and test sets (20% for test)
train_sentences, test_sentences = train_test_split(sentences, test_size=0.20, random_state=42)
train_sentences, val_sentences = train_test_split(train_sentences, test_size=0.25, random_state=42)  # 0.25 * 0.80 = 0.20


In [60]:
# Print the number of sentences in each split
print(f"Training sentences: {len(train_sentences)}")
print(f"Validation sentences: {len(val_sentences)}")
print(f"Test sentences: {len(test_sentences)}")

Training sentences: 28769
Validation sentences: 9590
Test sentences: 9590


In [61]:
# Extract words and tags from the sentences for training, validation, and test sets
def extract_words_tags(sentences):
    words = [[word for word, pos, tag in sentence] for sentence in sentences]
    tags = [[tag for word, pos, tag in sentence] for sentence in sentences]
    return words, tags

In [62]:
train_words, train_tags = extract_words_tags(train_sentences)
val_words, val_tags = extract_words_tags(val_sentences)
test_words, test_tags = extract_words_tags(test_sentences)


In [63]:
# Sample output
print(train_words[0])
print(train_tags[0])

['Shortly', 'before', 'midday', 'local', 'time', 'Thursday', ',', 'the', 'torch', 'entered', 'Turin', 'through', 'the', 'Piazza', 'Massaua', '.']
['O', 'O', 'B-tim', 'O', 'B-tim', 'I-tim', 'O', 'O', 'O', 'O', 'B-org', 'O', 'O', 'B-geo', 'I-geo', 'O']


In [64]:
# Evaluation metrics: Precision, Recall, F1-score
from sklearn.metrics import classification_report

In [65]:
# Example baseline model (assigns 'O' to every word)
def baseline_model(words):
    return [['O'] * len(sentence) for sentence in words]

In [66]:
# Evaluate the baseline model
baseline_predictions = baseline_model(test_words)
print(classification_report([tag for sentence in test_tags for tag in sentence],
                            [tag for sentence in baseline_predictions for tag in sentence]))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       B-art       0.00      0.00      0.00        72
       B-eve       0.00      0.00      0.00        74
       B-geo       0.00      0.00      0.00      7481
       B-gpe       0.00      0.00      0.00      3196
       B-nat       0.00      0.00      0.00        40
       B-org       0.00      0.00      0.00      3962
       B-per       0.00      0.00      0.00      3370
       B-tim       0.00      0.00      0.00      3986
       I-art       0.00      0.00      0.00        43
       I-eve       0.00      0.00      0.00        69
       I-geo       0.00      0.00      0.00      1486
       I-gpe       0.00      0.00      0.00        49
       I-nat       0.00      0.00      0.00        19
       I-org       0.00      0.00      0.00      3311
       I-per       0.00      0.00      0.00      3337
       I-tim       0.00      0.00      0.00      1324
           O       0.85      1.00      0.92    177485

    accuracy              

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [67]:
# Shortcomings: baseline model assigns 'O' to every word, leading to poor precision, recall, and F1-score


In [68]:
# Improved model (using Conditional Random Fields - CRF)
import sklearn_crfsuite
from sklearn_crfsuite import metrics


In [70]:
# Feature extraction function
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features

In [71]:
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for word, postag, label in sent]


In [72]:
# Extract features and labels for training and testing
X_train = [sent2features(s) for s in train_sentences]
y_train = [sent2labels(s) for s in train_sentences]
X_val = [sent2features(s) for s in val_sentences]
y_val = [sent2labels(s) for s in val_sentences]
X_test = [sent2features(s) for s in test_sentences]
y_test = [sent2labels(s) for s in test_sentences]

In [73]:
%time
# Train the CRF model
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=False
)

crf.fit(X_train, y_train)


In [74]:
# Make predictions using the CRF model
y_pred = crf.predict(X_test)


In [75]:
# Evaluate the CRF model
print(metrics.flat_classification_report(y_test, y_pred, labels=crf.classes_, digits=3))


              precision    recall  f1-score   support

           O      0.992     0.994     0.993    177485
       B-tim      0.918     0.879     0.898      3986
       I-tim      0.829     0.740     0.782      1324
       B-org      0.789     0.739     0.763      3962
       B-geo      0.859     0.902     0.880      7481
       I-geo      0.825     0.788     0.806      1486
       B-gpe      0.966     0.939     0.952      3196
       B-per      0.849     0.825     0.837      3370
       I-per      0.850     0.892     0.871      3337
       B-nat      0.562     0.225     0.321        40
       I-org      0.799     0.797     0.798      3311
       I-gpe      0.862     0.510     0.641        49
       B-art      0.500     0.194     0.280        72
       I-art      0.429     0.140     0.211        43
       B-eve      0.521     0.338     0.410        74
       I-nat      1.000     0.158     0.273        19
       I-eve      0.472     0.246     0.324        69

    accuracy              

In [76]:
# Future scope: 
# - Experiment with different feature sets
# - Use deep learning models such as BiLSTM-CRF or BERT
# - Fine-tune hyperparameters of the CRF model

In [77]:
train_sentences

[[('Shortly', 'RB', 'O'),
  ('before', 'IN', 'O'),
  ('midday', 'NN', 'B-tim'),
  ('local', 'JJ', 'O'),
  ('time', 'NN', 'B-tim'),
  ('Thursday', 'NNP', 'I-tim'),
  (',', ',', 'O'),
  ('the', 'DT', 'O'),
  ('torch', 'NN', 'O'),
  ('entered', 'VBD', 'O'),
  ('Turin', 'NNP', 'B-org'),
  ('through', 'IN', 'O'),
  ('the', 'DT', 'O'),
  ('Piazza', 'NNP', 'B-geo'),
  ('Massaua', 'NNP', 'I-geo'),
  ('.', '.', 'O')],
 [('There', 'EX', 'O'),
  ('has', 'VBZ', 'O'),
  ('been', 'VBN', 'O'),
  ('a', 'DT', 'O'),
  ('global', 'JJ', 'O'),
  ('shift', 'NN', 'O'),
  ('in', 'IN', 'O'),
  ('dietary', 'JJ', 'O'),
  ('habits', 'NNS', 'O'),
  ('towards', 'IN', 'O'),
  ('calorie', 'NN', 'O'),
  ('rich', 'JJ', 'O'),
  ('foods', 'NNS', 'O'),
  (',', ',', 'O'),
  ('as', 'RB', 'O'),
  ('well', 'RB', 'O'),
  ('as', 'IN', 'O'),
  ('a', 'DT', 'O'),
  ('universal', 'JJ', 'O'),
  ('decrease', 'NN', 'O'),
  ('in', 'IN', 'O'),
  ('exercise', 'NN', 'O'),
  ('.', '.', 'O')],
 [('The', 'DT', 'O'),
  ('attack', 'NN', 'O'),
