In [None]:
#%pip install scikit-learn
#%pip install sklearn_crfsuite
#%pip install streamlit
#%pip install bs4
#%pip install lxml

Collecting lxml
  Using cached lxml-5.4.0-cp310-cp310-win_amd64.whl.metadata (3.6 kB)
Using cached lxml-5.4.0-cp310-cp310-win_amd64.whl (3.8 MB)
Installing collected packages: lxml
Successfully installed lxml-5.4.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd
from collections import Counter
from utils import transform_label, transform_probs, get_entity_info, CLASS_MAP  # Import custom utility functions
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
# Install sklearn-crfsuite library
#%pip install sklearn-crfsuite
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
import xml.etree.ElementTree as ET

# helper functions
def identity_tokenizer(x):
    return x

def identity_preprocessor(x):
    return x


# Load the training data from the JSON file
df = pd.read_json("./data/processed/hindi_train.json", orient="records", lines=True)

# Split the data into training and validation sets
df_train, df_valid = train_test_split(df, test_size=0.10, random_state=42)

# Print the shapes of the training and validation sets
print("Training data shape:", df_train.shape)
print("Validation data shape:", df_valid.shape)

# Count the occurrence of labels in the training data
label_counts = sum([Counter(l) for l in df.labels], Counter())
print("Label Counts:", label_counts)

# Convert labels to indices based on the maximum occurrence
X_train = df_train["tokens"]
y_train = df_train.labels.apply(pd.Series).idxmax(axis=1)

X_valid = df_valid["tokens"]
y_valid = df_valid.labels.apply(pd.Series).idxmax(axis=1)


# Define and train a RandomForestClassifier pipeline
clf = Pipeline([
    ("tfidf", TfidfVectorizer(tokenizer=identity_tokenizer, preprocessor=identity_preprocessor)),
    ("model", RandomForestClassifier())
])
clf.fit(df_train["tokens"], y_train)

# Evaluate the model on the training and validation sets
print("Training Set Performance:")
print(classification_report(y_true=y_train, y_pred=clf.predict(df_train["tokens"])))
print("Validation Set Performance:")
print(classification_report(y_true=y_valid, y_pred=clf.predict(df_valid["tokens"])))

# Further evaluation of the model with transformed labels and probabilities
y_train_transformed = transform_label(y_train)
y_valid_transformed = transform_label(y_valid)
print("Transformed Labels - Training Set Performance:")
print(classification_report(y_true=y_train_transformed, y_pred=transform_label(clf.predict(df_train["tokens"]))))
print("Transformed Labels - Validation Set Performance:")
print(classification_report(y_true=y_valid_transformed, y_pred=transform_label(clf.predict(df_valid["tokens"]))))

# Generate predictions and probabilities
y_valid_proba = clf.predict_proba(df_valid["tokens"])

# Convert class labels to corresponding categories
clf_map = {c: c.split(".")[0] for c in clf.classes_}

# Transform probabilities and evaluate
print("Transformed Probabilities - Training Set Performance:")
print(classification_report(y_true=y_train_transformed, y_pred=transform_probs(clf.predict_proba(df_train["tokens"]), clf)))
print("Transformed Probabilities - Validation Set Performance:")
print(classification_report(y_true=y_valid_transformed, y_pred=transform_probs(y_valid_proba, clf)))

# Define feature extraction functions for CRF
def word2features(sent, i):
    word = sent[i][0]
    #postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        #'postag': postag,
        #'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        #postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            #'-1:postag': postag1,
            #'-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        #postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            #'+1:postag': postag1,
            #'+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, label in sent]

def sent2tokens(sent):
    return [token for token, label in sent]


def df_to_Xy(df):
    sentences = [list(zip(*row[["tokens", "tags"]])) for i, row in df.iterrows()]
    X = [sent2features(s) for s in sentences]
    y = [sent2labels(s) for s in sentences]
    return X, y

# Prepare data for CRF
X_train, y_train = df_to_Xy(df_train)
X_valid, y_valid = df_to_Xy(df_valid)

# Train CRF model
crf = sklearn_crfsuite.CRF(
    algorithm='ap',
    max_iterations=100,
    all_possible_transitions=False
)
crf.fit(X_train, y_train)

# Make predictions with CRF model
y_pred = crf.predict(X_valid)

# Evaluate CRF model performance
labels = list(crf.classes_)
labels.remove('O')
f1_score = metrics.flat_f1_score(y_valid, y_pred, average='weighted', labels=labels)
#print("CRF Model F1 Score:", f1_score)

# Load test data
df_test = pd.read_json("./data/processed/hindi_test.json", orient="records", lines=True)

def generate_xml(entity_info, docid, simple=False):
    root = ET.Element("DOCUMENT") 
    docid_e = ET.Element("DOCID")
    docid_e.text = f"{docid}"
    root.append(docid_e) 
    for info in entity_info:
        label = info["label"]
        subtype = None
        if label.startswith(tuple(CLASS_MAP.keys())):
            label, subtype = label.split(".")
            label = CLASS_MAP[label]
        elif simple:
            continue
        if label.endswith("-ARG"):
            label = label.split("-ARG")[0]
            subtype = None
        elem = ET.Element(label.upper())
        if subtype:
            elem.attrib["TYPE"] = subtype.upper()
        elem.text = info["entity_phrase"]
        root.append(elem)
    return root

from pathlib import Path
base_path = Path(f"./data/Output/")

def create_files(df_test):
    X_test, y_test = df_to_Xy(df_test)
    y_pred = crf.predict(X_test)
    for pred, (idx, row) in zip(y_pred, df_test.iterrows()):
        entity_info = get_entity_info(pred, row["tokens"])
        docid = row["docid"]
        for task, simple in enumerate([True, False], start=1):
            root = generate_xml(entity_info, docid, simple=simple)
            out_path = base_path / f"Task_{task}" / f"{docid}.xml"
            with open(out_path, "wb") as fp:  # Open file for writing in binary mode
                tree = ET.ElementTree(root)
                tree.write(fp, encoding="utf-8")

# Generate XML output files
create_files(df_test)
print("Output saved in data\Output files in XML format")


Training data shape: (609, 4)
Validation data shape: (68, 4)
Label Counts: Counter({'MAN_MADE_EVENT.SUICIDE_ATTACK': 326, 'MAN_MADE_EVENT.TERRORIST_ATTACK': 287, 'MAN_MADE_EVENT.SHOOT_OUT': 280, 'NATURAL_EVENT.EARTHQUAKE': 263, 'MAN_MADE_EVENT.VEHICULAR_COLLISION': 250, 'MAN_MADE_EVENT.FIRE': 228, 'MAN_MADE_EVENT.INDUSTRIAL_ACCIDENT': 194, 'NATURAL_EVENT.HEAT_WAVE': 185, 'MAN_MADE_EVENT.TRAIN_COLLISION': 139, 'NATURAL_EVENT.LAND_SLIDE': 138, 'NATURAL_EVENT.FLOODS': 136, 'NATURAL_EVENT.AVALANCHES': 135, 'MAN_MADE_EVENT.RIOTS': 125, 'NATURAL_EVENT.HURRICANE': 117, 'MAN_MADE_EVENT.TRANSPORT_HAZARDS': 115, 'NATURAL_EVENT.FOREST_FIRE': 114, 'NATURAL_EVENT.TORNADO': 112, 'NATURAL_EVENT.HAIL_STORMS': 103, 'NATURAL_EVENT.STORM': 101, 'NATURAL_EVENT.COLD_WAVE': 99, 'NATURAL_EVENT.VOLCANO': 96, 'MAN_MADE_EVENT.AVIATION_HAZARD': 93, 'NATURAL_EVENT.BLIZZARD': 67, 'NATURAL_EVENT.CYCLONE': 66, 'MAN_MADE_EVENT.NORMAL_BOMBING': 47, 'MAN_MADE_EVENT.ARMED_CONFLICTS': 23, 'NATURAL_EVENT.DROUGHT': 5, 'NAT



Training Set Performance:
                                    precision    recall  f1-score   support

    MAN_MADE_EVENT.ARMED_CONFLICTS       1.00      1.00      1.00         3
    MAN_MADE_EVENT.AVIATION_HAZARD       1.00      1.00      1.00        15
               MAN_MADE_EVENT.FIRE       1.00      1.00      1.00        30
MAN_MADE_EVENT.INDUSTRIAL_ACCIDENT       1.00      1.00      1.00        32
     MAN_MADE_EVENT.NORMAL_BOMBING       1.00      1.00      1.00         4
              MAN_MADE_EVENT.RIOTS       1.00      1.00      1.00         9
          MAN_MADE_EVENT.SHOOT_OUT       1.00      1.00      1.00        28
     MAN_MADE_EVENT.SUICIDE_ATTACK       1.00      1.00      1.00        39
   MAN_MADE_EVENT.TERRORIST_ATTACK       1.00      1.00      1.00        78
    MAN_MADE_EVENT.TRAIN_COLLISION       1.00      1.00      1.00        14
  MAN_MADE_EVENT.TRANSPORT_HAZARDS       1.00      1.00      1.00        25
MAN_MADE_EVENT.VEHICULAR_COLLISION       1.00      1.00      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                                    precision    recall  f1-score   support

    MAN_MADE_EVENT.AVIATION_HAZARD       0.00      0.00      0.00         1
               MAN_MADE_EVENT.FIRE       1.00      0.88      0.93         8
MAN_MADE_EVENT.INDUSTRIAL_ACCIDENT       0.75      1.00      0.86         3
     MAN_MADE_EVENT.NORMAL_BOMBING       0.00      0.00      0.00         1
          MAN_MADE_EVENT.SHOOT_OUT       1.00      0.86      0.92         7
     MAN_MADE_EVENT.SUICIDE_ATTACK       1.00      1.00      1.00         2
   MAN_MADE_EVENT.TERRORIST_ATTACK       0.70      1.00      0.82         7
    MAN_MADE_EVENT.TRAIN_COLLISION       0.50      1.00      0.67         2
  MAN_MADE_EVENT.TRANSPORT_HAZARDS       0.00      0.00      0.00         4
MAN_MADE_EVENT.VEHICULAR_COLLISION       0.78      1.00      0.88         7
          NATURAL_EVENT.AVALANCHES       0.00      0.00      0.00         0
            NATURAL_EVENT.BLIZZARD       0.67      0.50      0.57         4
           

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Output saved in data\Output files in XML format


In [2]:
# Full Evaluation Script for Task 1 and Task 2

import pandas as pd
from collections import Counter
from sklearn.metrics import classification_report
from sklearn_crfsuite import metrics as crf_metrics
from utils import transform_label, transform_probs

# === TASK 1: Evaluate CRF Sequence Labeling ===
def evaluate_crf_model(crf_model, X_valid, y_valid, task_name="Task 1"):
    print(f"\n==== {task_name} - CRF Sequence Labeling Evaluation ====")
    
    y_pred = crf_model.predict(X_valid)
    
    # Exclude 'O' label from entity evaluation
    labels = list(crf_model.classes_)
    if 'O' in labels:
        labels.remove('O')

    report = crf_metrics.flat_classification_report(
        y_true=y_valid, y_pred=y_pred, labels=labels, digits=3
    )
    print(report)

    # Save report to file
    with open("task1_crf_report.txt", "w", encoding="utf-8") as f:
        f.write(report)
    print("✅ Task 1 CRF evaluation saved to 'task1_crf_report.txt'")


# === TASK 2: Evaluate Classifier (Random Forest) ===
def evaluate_classifier_model(clf_model, df_train, df_valid):
    print("\n==== Task 2 - Random Forest Event Type Classification ====")
    
    # Get majority labels for each instance (used as ground truth)
    y_train = df_train.labels.apply(pd.Series).idxmax(axis=1)
    y_valid = df_valid.labels.apply(pd.Series).idxmax(axis=1)

    # Train and validation predictions
    y_train_pred = clf_model.predict(df_train["tokens"])
    y_valid_pred = clf_model.predict(df_valid["tokens"])

    print("\n📊 Raw Label Evaluation:")
    train_report = classification_report(y_train, y_train_pred)
    valid_report = classification_report(y_valid, y_valid_pred)
    print("--- Training ---\n", train_report)
    print("--- Validation ---\n", valid_report)

    # Save reports
    with open("task2_train_report.txt", "w", encoding="utf-8") as f:
        f.write(train_report)
    with open("task2_valid_report.txt", "w", encoding="utf-8") as f:
        f.write(valid_report)

    print("✅ Task 2 evaluation saved to 'task2_train_report.txt' and 'task2_valid_report.txt'")

    # Transformed label-based evaluation (optional)
    print("\n🔁 Transformed Label Evaluation (e.g., Manmade/Natural only):")
    y_train_t = transform_label(y_train)
    y_valid_t = transform_label(y_valid)
    y_train_pred_t = transform_label(y_train_pred)
    y_valid_pred_t = transform_label(y_valid_pred)

    print(classification_report(y_train_t, y_train_pred_t))
    print(classification_report(y_valid_t, y_valid_pred_t))


# === USAGE EXAMPLE ===
# Ensure these objects are already defined in your main script:
# - crf
# - X_valid, y_valid from CRF
# - clf (RandomForest pipeline)
# - df_train, df_valid from JSON

# Call these after training
evaluate_crf_model(crf_model=crf, X_valid=X_valid, y_valid=y_valid)
evaluate_classifier_model(clf_model=clf, df_train=df_train, df_valid=df_valid)



==== Task 1 - CRF Sequence Labeling Evaluation ====


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                                      precision    recall  f1-score   support

                        B-REASON-ARG      0.000     0.000     0.000        24
                        I-REASON-ARG      0.000     0.000     0.000        69
                          B-TIME-ARG      0.747     0.378     0.502       172
                         B-PLACE-ARG      0.543     0.231     0.324       407
    B-MAN_MADE_EVENT.TRAIN_COLLISION      0.400     0.333     0.364        12
    I-MAN_MADE_EVENT.TRAIN_COLLISION      0.400     0.067     0.114        30
                   B-PARTICIPANT-ARG      0.545     0.109     0.182       110
                   I-PARTICIPANT-ARG      0.579     0.090     0.156       122
                    B-CASUALTIES-ARG      0.562     0.389     0.460       162
                    I-CASUALTIES-ARG      0.683     0.276     0.393       695
                 B-AFTER_EFFECTS-ARG      0.077     0.007     0.013       144
                 I-AFTER_EFFECTS-ARG      0.455     0.014     0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Key Contributions:

Implemented a CRF-based sequence labeling model to identify event arguments (e.g., TIME, PLACE, CASUALTIES), evaluated on BIO-tagged Hindi dataset.

Achieved moderate F1 scores for high-support labels like B-TIME-ARG (0.50), B-CASUALTIES-ARG (0.46), and B-EARTHQUAKE (0.78); struggled with underrepresented labels.

Built a Random Forest classifier for event type classification, attaining 99% accuracy on binary classification (Natural vs. Manmade events) and 76% macro F1-score on fine-grained labels during validation.

Automated full pipeline including XML parsing, feature engineering, model training, and metric reporting using scikit-learn and sklearn-crfsuite.

Event Detection in Hindi News Articles | NLP Web Application
Developed and deployed a full-stack NLP solution to extract structured event information from Hindi news articles using CRF and Random Forest models.

Achieved micro-F1 score of 0.282 for BIO-tagged argument extraction via a custom CRF sequence labeling model.

Built a TF-IDF + Random Forest classifier for event type classification, attaining up to 99% accuracy on binary (Manmade/Natural) and 79% macro-F1 on fine-grained event types.

Enabled real-time user input and automatic XML news parsing via Streamlit, with support for multilingual tokenization and labeled predictions.

Deployed publicly on Streamlit Cloud to demonstrate model predictions and support qualitative analysis for shared task evaluation.

In [4]:
#Save the RandomForestClassifier Pipeline (Task 2)

import joblib

# Save the classifier pipeline
joblib.dump(clf, "models/task2_event_classifier.pkl")

# To load it later:
# clf = joblib.load("./models/task2_event_classifier.pkl")


['models/task2_event_classifier.pkl']

In [5]:
# Save the CRF Model (Task 1)

# Save CRF model
crf_filename = "models/task1_sequence_labeler.crf"
joblib.dump(crf, crf_filename)


['models/task1_sequence_labeler.crf']