In [32]:
import pandas as pd
from sklearn.metrics import classification_report

In [33]:
# STEP 1: Load the CoNLL-2003 dataset

def load_data(file_path):
    # Read the dataset and preprocess it
    df = pd.read_csv(file_path, sep=" ", header=None, names=["Word", "POS", "Chunk", "NE"])
    df = df[df["Word"] != "-DOCSTART-"]  # Remove -DOCSTART- lines

    # Remove empty lines
    df = df.dropna(subset=["NE"]) 
    df = df.dropna(subset=["Word"])

    def update_ne(ne):
        ne_mapping = {
            'B-ORG': 'ORG',
            'O': 'O',
            'B-MISC': 'MISC',
            'B-PER': 'PER',
            'I-PER': 'PER',
            'B-LOC': 'LOC',
            'I-ORG': 'ORG',
            'I-MISC': 'MISC',
            'I-LOC': 'LOC'
        }
        return ne_mapping.get(ne, ne)
    
    # Update NE values
    df["NE"] = df["NE"].apply(update_ne)

    return df

train_df = load_data("conll2003/train.txt")
test_df = load_data("conll2003/test.txt")

# Store all NER tags
ne_tags = list(train_df["NE"].unique())

In [34]:
test_df[:10]

Unnamed: 0,Word,POS,Chunk,NE
1,SOCCER,NN,B-NP,O
2,-,:,O,O
3,JAPAN,NNP,B-NP,LOC
4,GET,VB,B-VP,O
5,LUCKY,NNP,B-NP,O
6,WIN,NNP,I-NP,O
7,",",",",O,O
8,CHINA,NNP,B-NP,PER
9,IN,IN,B-PP,O
10,SURPRISE,DT,B-NP,O


In [35]:
# STEP 2: Find the majority class in training data
majority_class = train_df["NE"].value_counts().idxmax()
majority_class

'O'

In [40]:
# STEP 3: "Predict" with majority class (from training data) for every row in test dataset
majority_predictions = [majority_class] * len(test_df["NE"])

# STEP 4: Evaluate baseline using classification_report
report = classification_report(test_df["NE"], majority_predictions, zero_division=0)

print(report)

with open("ner_results_baseline.txt", "w", encoding="utf-8") as f:
    f.write(report)

Majority Baseline Classification Report:
              precision    recall  f1-score   support

         LOC       0.00      0.00      0.00      1925
        MISC       0.00      0.00      0.00       918
           O       0.82      1.00      0.90     37894
         ORG       0.00      0.00      0.00      2496
         PER       0.00      0.00      0.00      2773

    accuracy                           0.82     46006
   macro avg       0.16      0.20      0.18     46006
weighted avg       0.68      0.82      0.74     46006

