In [1]:
import os
import pickle
import re

import nltk
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors, Word2Vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

nltk.download("punkt")

[nltk_data] Downloading package punkt to C:\Users\Malathi
[nltk_data]     M\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [93]:
# Location of the zipped, tab-separated news dump. Set the NEWS_TSV_PATH environment
# variable to run the notebook on another machine; the default is the path the
# notebook was originally written against.
NEWS_TSV_PATH = os.environ.get(
    "NEWS_TSV_PATH",
    "C:/Users/Malathi M/OneDrive/Documents/MDTE25/guvi final project/Main project/news.tsv.zip",
)

df = pd.read_csv(NEWS_TSV_PATH, sep="\t")
print(df.head())


  News ID Category         Topic  \
0  N10000   sports        soccer   
1  N10001     news  newspolitics   
2  N10002     news        newsus   
3  N10003     news  newspolitics   
4  N10004     news     newsworld   

                                            Headline  \
0  Predicting Atlanta United's lineup against Col...   
1  Mitch McConnell: DC statehood push is 'full bo...   
2            Home In North Highlands Damaged By Fire   
3  Meghan McCain blames 'liberal media' and 'thir...   
4                            Today in History: Aug 1   

                                           News body  \
0  Only FIVE internationals allowed, count em, FI...   
1  WASHINGTON -- Senate Majority Leader Mitch McC...   
2  NORTH HIGHLANDS (CBS13)   Fire damaged a home ...   
3  Meghan McCain is speaking out after a journali...   
4  1714: George I becomes King Georg Ludwig, Elec...   

                                Title entity  \
0  {"Atlanta United's": 'Atlanta United FC'}   
1            

In [3]:
# Number of missing values in each column of the freshly loaded frame.
df.isna().sum()

News ID            0
Category           0
Topic              0
Headline           0
News body         58
Title entity       0
Entity content     0
dtype: int64

In [95]:
# Rows without a headline or a body are dropped, then the index is renumbered from 0.
df = df.dropna(subset=["Headline", "News body"])
df = df.reset_index(drop=True)


In [96]:
# Re-count missing values per column after the rows with missing text were dropped.
df.isna().sum()

News ID           0
Category          0
Topic             0
Headline          0
News body         0
Title entity      0
Entity content    0
dtype: int64

In [79]:
# Preview the normalised (stripped, lower-cased) column names WITHOUT renaming the
# frame. The following cells index columns by their original names ("Headline",
# "News body", "Category", "Title entity"), so assigning the lower-cased names back
# to df.columns makes a top-to-bottom run fail with KeyError.
df.columns.str.strip().str.lower()


Index(['news id', 'category', 'topic', 'headline', 'news body', 'title entity',
       'entity content'],
      dtype='object')

In [97]:
# Headline and body are each stripped, joined with a single space, stripped again,
# and any remaining missing value becomes an empty string.
df["text"] = (
    (df["Headline"].str.strip() + " " + df["News body"].str.strip())
    .str.strip()
    .fillna("")
)

# Feature (combined text) and target (category).
X, y = df["text"], df["Category"]

In [98]:
# From here on only the headline and the two entity columns are used.
df = df.loc[:, ["Headline", "Title entity", "Entity content"]]

In [99]:
# Work on a fixed-seed random half of the rows to reduce computation,
# then renumber the index from 0.
df = df.sample(frac=0.5, random_state=42)
df = df.reset_index(drop=True)

In [100]:
import ast

def parse_dict(x):
    """Parse a dict written as a Python literal into a real ``dict``.

    Args:
        x: Text such as ``"{'surface form': 'Expanded name'}"``. Any other
            value (missing data, malformed text, a non-dict literal) is
            tolerated.

    Returns:
        The parsed dictionary, or ``{}`` when ``x`` cannot be parsed as a
        literal or does not evaluate to a dict.
    """
    try:
        parsed = ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Only the errors literal_eval raises for bad input are treated as
        # "no entities"; anything else (e.g. KeyboardInterrupt) propagates.
        return {}
    # A valid literal that is not a dict (list, string, number) carries no
    # entity mapping either.
    return parsed if isinstance(parsed, dict) else {}

# Parsed form of each row's "Title entity" text ({} when it cannot be parsed).
df["title_entity_dict"] = df["Title entity"].map(parse_dict)

In [101]:
import re

def clean_text_ner(text):
    """Return ``text`` as a string with markup, links and extra whitespace removed.

    The input is converted with ``str()``; then, in order, ``<...>`` tags are
    deleted, tokens starting with ``http`` or ``www`` are deleted, and every
    run of whitespace is collapsed to one space. The result is stripped.
    """
    cleaned = str(text)
    substitutions = (
        (r"<.*?>", ""),
        (r"http\S+|www\S+", ""),
        (r"\s+", " "),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Cleaned copy of every headline.
df["clean_text"] = df["Headline"].map(clean_text_ner)

In [102]:
def tokenize(text):
    """Split ``text`` on runs of whitespace and return the pieces as a list."""
    words = text.split()
    return words

# Whitespace tokens of each cleaned headline.
df["tokens"] = df["clean_text"].map(tokenize)


In [103]:
import spacy

In [104]:
# Small English spaCy pipeline (not referenced again in the cells shown here).
nlp = spacy.load("en_core_web_sm")

# Both text columns are forced to str so the string operations below never
# receive a non-string value.
for column in ("Headline", "Title entity"):
    df[column] = df[column].astype(str)

# Country names treated as locations (exact, case-sensitive match).
COUNTRIES = ["United States", "India", "Brazil", "China", "Mexico", "Canada"]
# Two or more capitalised alphabetic words, e.g. "Meghan Markle".
PERSON_PATTERN = r"^[A-Z][a-z]+(\s[A-Z][a-z]+)+$"
# Substrings that mark an expanded name as an organisation.
ORG_KEYWORDS = ["Corporation", "Authority", "Committee", "Association", "University", "Agency", "Company", "FC", "Ltd"]


def infer_entity_type(expanded):
    """Guess a coarse entity type from an entity's expanded name.

    The checks run from most to least specific: known country, organisation
    keyword, then the generic capitalised-words person pattern. The person
    pattern must come last because it also matches names such as
    "United States" or "Ford Motor Company".

    Args:
        expanded: Expanded entity name; converted with ``str()`` and stripped
            before matching.

    Returns:
        One of ``"LOCATION"``, ``"ORG"``, ``"PERSON"`` or ``"MISC"``.
    """
    expanded = str(expanded).strip()

    if expanded in COUNTRIES:
        return "LOCATION"

    if any(keyword in expanded for keyword in ORG_KEYWORDS):
        return "ORG"

    if re.match(PERSON_PATTERN, expanded):
        return "PERSON"

    return "MISC"

def _normalize_token(word):
    """Lower-case ``word``, strip surrounding ``.,!?`` and drop a trailing possessive."""
    word = word.lower().strip(".,!?")
    if word.endswith(("'s", "\u2019s")):
        word = word[:-2]
    return word


def convert_to_bio(text, entity_string):
    """Tag the whitespace tokens of ``text`` with BIO labels.

    ``entity_string`` is the text of a dict literal mapping a surface form
    (as it appears in the headline) to an expanded entity name. Every
    occurrence of a surface form in ``text`` is tagged ``B-<type>`` on its
    first token and ``I-<type>`` on the rest, where ``<type>`` comes from
    ``infer_entity_type(expanded)``. Matching ignores case, surrounding
    ``.,!?`` and a trailing possessive ``'s`` on both sides, so the surface
    form "Atlanta United's" matches the tokens "Atlanta United's".

    Args:
        text: Headline text; split on whitespace.
        entity_string: Dict literal as text. ``"{}"``, unparseable text and
            non-dict literals all mean "no entities".

    Returns:
        ``(tokens, tags)``: two lists of equal length. Tokens are returned
        unmodified; tags default to ``"O"``.
    """
    tokens = text.split()
    tags = ["O"] * len(tokens)

    if entity_string == "{}":
        return tokens, tags

    try:
        ent_dict = ast.literal_eval(entity_string)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return tokens, tags
    if not isinstance(ent_dict, dict):
        # A valid literal that is not a mapping has no .items() to iterate.
        return tokens, tags

    norm_tokens = [_normalize_token(w) for w in tokens]

    for surface, expanded in ent_dict.items():
        surface_tokens = [_normalize_token(w) for w in str(surface).split()]
        surface_tokens = [w for w in surface_tokens if w]
        n = len(surface_tokens)

        # An empty surface form would otherwise "match" at every position.
        if n == 0 or n > len(tokens):
            continue

        # Only windows that fit entirely inside the token list can match.
        starts = [
            i for i in range(len(tokens) - n + 1)
            if norm_tokens[i:i + n] == surface_tokens
        ]
        if not starts:
            continue

        ent_type = infer_entity_type(expanded)
        for i in starts:
            tags[i] = f"B-{ent_type}"
            tags[i + 1:i + n] = [f"I-{ent_type}"] * (n - 1)

    return tokens, tags



# Convert every headline and its entity annotation into parallel token / BIO-tag
# lists. Zipping the two columns avoids building a Series per row (iterrows) and
# avoids rebinding IPython's `_` output variable as a loop target.
bio_pairs = [
    convert_to_bio(headline, title_entity)
    for headline, title_entity in zip(df["Headline"], df["Title entity"])
]
sentences = [tokens for tokens, _tags in bio_pairs]
labels = [tags for _tokens, tags in bio_pairs]

print("DATA READY — Samples:", len(sentences))

DATA READY — Samples: 56881


In [107]:
# Build one tag list per row from the row's tokens and its two entity columns.
# NOTE(review): `create_ner_tags` is not defined in any cell visible in this
# notebook (the execution counts skip from 104 to 107), so this cell raises
# NameError on a fresh kernel. Restore its defining cell before relying on this.
# The tags it returns presumably have to come from `label_list` below - TODO confirm.
df["ner_tags"] = df.apply(
    lambda row: create_ner_tags(
        row["tokens"],
        row["Entity content"],
        row["Title entity"]
    ),
    axis=1
)


In [109]:
# Keep only the rows whose tag list is exactly as long as the token list.
df = df[df["tokens"].map(len) == df["ner_tags"].map(len)]


In [110]:
from datasets import Dataset

# Build a Hugging Face dataset from the two NER columns. The length filter applied
# to df can leave gaps in its index; preserve_index=False keeps that index from
# being exported as an extra "__index_level_0__" column next to the features.
ner_dataset = Dataset.from_pandas(df[["tokens", "ner_tags"]], preserve_index=False)
ner_dataset


Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 28440
})

In [111]:
# Example BIO label inventory - adjust if needed. The position in the list is the
# integer id used for training.
label_list = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
id2label = dict(enumerate(label_list))
label2id = {tag: idx for idx, tag in id2label.items()}


In [112]:
from transformers import AutoTokenizer

# Tokenizer for the same "bert-base-cased" checkpoint the model cell loads.
tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-cased",
)


In [113]:
# Every example is padded or truncated to this many sub-word positions.
max_len = 128


def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word tags to sub-words.

    Args:
        examples: Batch mapping with ``"tokens"`` (one list of words per
            example) and ``"ner_tags"`` (one list of tag strings per example).

    Returns:
        The tokenizer output for the batch with an added ``"labels"`` entry:
        one id list per example. The first sub-word of every word carries
        ``label2id`` of that word's tag; positions with no word (special
        tokens, padding) and the remaining sub-words of a word carry ``-100``.
    """
    encoded = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        padding="max_length",
        truncation=True,
        max_length=max_len,
    )

    batch_label_ids = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        row_label_ids = []
        last_word = None

        for current_word in encoded.word_ids(batch_index=row):
            # Only the first sub-word of a real word gets that word's label.
            starts_new_word = current_word is not None and current_word != last_word
            if starts_new_word:
                row_label_ids.append(label2id[word_tags[current_word]])
            else:
                row_label_ids.append(-100)
            last_word = current_word

        batch_label_ids.append(row_label_ids)

    encoded["labels"] = batch_label_ids
    return encoded


In [114]:
from datasets import Dataset

# Inputs created by the earlier cells:
#   df["tokens"]   -> list[list[str]]
#   df["ner_tags"] -> list[list[str]]

# Train on a fixed-seed random half of the tagged rows to keep the run short.
df_small = df.sample(frac=0.5, random_state=42)

dataset = Dataset.from_dict({
    "tokens": df_small["tokens"].tolist(),
    "ner_tags": df_small["ner_tags"].tolist()
})

# The split is seeded as well, so the train/test partition (and therefore the
# reported metrics) is the same on every run.
dataset = dataset.train_test_split(test_size=0.2, seed=42)

# Replace the raw columns with the tokenizer outputs and aligned label ids.
tokenized_ds = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dataset["train"].column_names
)


Map:   0%|          | 0/11376 [00:00<?, ? examples/s]

Map:   0%|          | 0/2844 [00:00<?, ? examples/s]

In [115]:
from transformers import AutoModelForTokenClassification

# "bert-base-cased" with a token-classification head that has one output per
# entry of label_list; both label maps are passed to from_pretrained.
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-cased",
    label2id=label2id,
    id2label=id2label,
    num_labels=len(label_list),
)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [116]:
from transformers import TrainingArguments

# Short fine-tuning run: training stops at step 850, loss is logged every
# 50 steps, no checkpoints are saved and no external reporting is configured.
training_args = TrainingArguments(
    output_dir="./bert_ner",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    max_steps=850,
    logging_steps=50,
    save_strategy="no",
    report_to="none",
)


In [117]:
import numpy as np
from sklearn.metrics import classification_report

def compute_metrics(p):
    """Compute weighted token-level precision, recall and F1.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` is reduced with
            ``argmax`` over axis 2; ``labels`` holds the gold ids, with
            ``-100`` marking positions to skip.

    Returns:
        Dict with ``"precision"``, ``"recall"`` and ``"f1"`` taken from the
        ``"weighted avg"`` row of ``classification_report``.
    """
    scores, gold = p
    predicted_ids = np.argmax(scores, axis=2)

    kept_predictions = []
    kept_gold = []
    for predicted_row, gold_row in zip(predicted_ids, gold):
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            if gold_id == -100:
                continue
            kept_predictions.append(id2label[predicted_id])
            kept_gold.append(id2label[gold_id])

    weighted = classification_report(kept_gold, kept_predictions, output_dict=True)["weighted avg"]
    return {
        "precision": weighted["precision"],
        "recall": weighted["recall"],
        "f1": weighted["f1-score"],
    }


In [118]:
from transformers import Trainer

# Wire the model, arguments, tokenized splits and metric function together,
# then start training.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
)

trainer.train()


  trainer = Trainer(


Step,Training Loss
50,0.2352
100,0.0003
150,0.0002
200,0.0002
250,0.0001
300,0.0001
350,0.0001
400,0.0001
450,0.0001
500,0.0001


TrainOutput(global_step=850, training_loss=0.013946388042531907, metrics={'train_runtime': 4609.9836, 'train_samples_per_second': 1.475, 'train_steps_per_second': 0.184, 'total_flos': 444224566579200.0, 'train_loss': 0.013946388042531907, 'epoch': 0.5977496483825597})

In [121]:
# Write the fine-tuned model and its tokenizer to the same directory so both can
# be loaded from one path. The tokenizer call is last, so the tuple it returns is
# what the cell displays.
model.save_pretrained("./bert_ner_final")
tokenizer.save_pretrained("./bert_ner_final")

('./bert_ner_final\\tokenizer_config.json',
 './bert_ner_final\\special_tokens_map.json',
 './bert_ner_final\\vocab.txt',
 './bert_ner_final\\added_tokens.json',
 './bert_ner_final\\tokenizer.json')

In [32]:
import numpy as np
from seqeval.metrics import f1_score


Model	rouge1	rouge2	rougeL	rougeLsum	Average Score
bert	0.74819271	0.686270584	0.6888	0.6884	0.7077
lstm_bahdanau	0.685955	0.300094	0.5139	0.2253	0.5433
TextRank	0.5028	0.3924	0.4402	0.4402	0.4451
TF-IDF	0.4836	0.3779	0.4144	0.4144	0.4253