In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from google.colab import drive


In [None]:
# The dataset lives on Google Drive, so the drive has to be mounted before the
# path below exists. Mounting here keeps the notebook runnable top-to-bottom on
# a fresh runtime; when the drive is already mounted the call only reports so.
drive.mount("/content/drive")

# Tab-separated news dump shipped as a zip archive.
df = pd.read_csv(
    "/content/drive/MyDrive/MDTE25/news.tsv.zip",
    sep="\t",
    compression="zip"
)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
print(df.columns)
print(df.head())
print(df.shape)

Index(['News ID', 'Category', 'Topic', 'Headline', 'News body', 'Title entity',
       'Entity content'],
      dtype='object')
  News ID Category         Topic  \
0  N10000   sports        soccer   
1  N10001     news  newspolitics   
2  N10002     news        newsus   
3  N10003     news  newspolitics   
4  N10004     news     newsworld   

                                            Headline  \
0  Predicting Atlanta United's lineup against Col...   
1  Mitch McConnell: DC statehood push is 'full bo...   
2            Home In North Highlands Damaged By Fire   
3  Meghan McCain blames 'liberal media' and 'thir...   
4                            Today in History: Aug 1   

                                           News body  \
0  Only FIVE internationals allowed, count em, FI...   
1  WASHINGTON -- Senate Majority Leader Mitch McC...   
2  NORTH HIGHLANDS (CBS13)   Fire damaged a home ...   
3  Meghan McCain is speaking out after a journali...   
4  1714: George I becomes King Georg L

In [None]:

# Join headline and body into a single "text" field (a missing part becomes an
# empty string) and keep only the model input and its label.
df = df.assign(
    text=df["Headline"].fillna("") + " " + df["News body"].fillna("")
)[["text", "Category"]]

In [None]:
# Drop rows without text or label, then rows whose text is only whitespace,
# and renumber the index from zero.
df = (
    df.dropna(subset=["text", "Category"])
    .loc[lambda frame: frame["text"].str.strip() != ""]
    .reset_index(drop=True)
)

print(df.shape)
df.head()

(113762, 2)


Unnamed: 0,text,Category
0,Predicting Atlanta United's lineup against Col...,sports
1,Mitch McConnell: DC statehood push is 'full bo...,news
2,Home In North Highlands Damaged By Fire NORTH ...,news
3,Meghan McCain blames 'liberal media' and 'thir...,news
4,Today in History: Aug 1 1714: George I becomes...,news


In [None]:
from sklearn.model_selection import train_test_split

# Retain only categories that occur at least twice; the split in the next cell
# stratifies on "Category".
df = df[df.groupby("Category")["Category"].transform("size") >= 2]


In [None]:
# Keep a category-stratified half of the data (the other half is discarded).
# Note that `df` is overwritten, so re-running this cell halves it again.
df = train_test_split(
    df,
    test_size=0.5,
    stratify=df["Category"],
    random_state=42,
)[0]


In [None]:
import os

os.makedirs("data/processed", exist_ok=True)


In [None]:
df.to_csv("data/processed/news_50.csv", index=False)


In [None]:
df.shape


(56880, 2)

In [None]:
df["Category"].value_counts()


Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
sports,15278
news,13345
finance,5286
lifestyle,3702
autos,2747
travel,2691
foodanddrink,2643
video,2484
tv,1991
health,1899


In [None]:
df.head()


Unnamed: 0,text,Category
104803,How Do You Know When It's Too Cold To Ride? Do...,autos
61523,Lightning sign forward Gemel Smith to one-year...,sports
95679,Drivers caught misbehaving by stopping in Glen...,autos
39635,'Exceeded Expectations': Final Four Weekend Ge...,sports
75022,"Ocasio-Cortez, progressives accuse Trump of us...",news


In [None]:
import re

def clean_text(text, lowercase=True):
    """Normalise raw article text into space-separated word tokens.

    HTML-like tags, URLs and every character that is neither a word character
    nor whitespace are each replaced by a space, in that order. Whitespace runs
    are then collapsed to a single space and the ends are trimmed.

    Args:
        text: Input string.
        lowercase: Lower-case the result when True (the default); pass False
            to keep the original casing.

    Returns:
        The cleaned string.
    """
    noise_patterns = (
        r"<.*?>",           # HTML
        r"http\S+|www\S+",  # URLs
        r"[^\w\s]",         # special chars
    )
    for pattern in noise_patterns:
        text = re.sub(pattern, " ", text)

    collapsed = re.sub(r"\s+", " ", text).strip()    # whitespace
    return collapsed.lower() if lowercase else collapsed


In [None]:
# Lower-cased text for classification / TF-IDF.
df["clean_text"] = df["text"].map(clean_text)

# Case-preserving variant for NER.
df["clean_text_ner"] = df["text"].apply(clean_text, lowercase=False)


In [None]:
df.columns

Index(['text', 'Category', 'clean_text', 'clean_text_ner'], dtype='object')

CLASSIFICATION ML

In [None]:
import os

os.makedirs("models/classification/ml", exist_ok=True)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
import joblib

# Build the three estimators up front, then fit them on the full frame.
le = LabelEncoder()
tfidf = TfidfVectorizer(max_features=5000)
clf = LogisticRegression(max_iter=1000)

X = df["clean_text"]
y = df["Category"]

X_vec = tfidf.fit_transform(X)
y_enc = le.fit_transform(y)
clf.fit(X_vec, y_enc)

# Persist vectoriser, classifier and label encoder side by side.
artifacts = {
    "models/classification/ml/tfidf.pkl": tfidf,
    "models/classification/ml/logreg.pkl": clf,
}
for path, obj in artifacts.items():
    joblib.dump(obj, path)
joblib.dump(le, "models/classification/ml/label_encoder.pkl")


['models/classification/ml/label_encoder.pkl']

In [None]:
!ls models/classification/ml


label_encoder.pkl  logreg.pkl  tfidf.pkl


In [None]:
from google.colab import files

files.download("models/classification/ml/tfidf.pkl")
files.download("models/classification/ml/logreg.pkl")
files.download("models/classification/ml/label_encoder.pkl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

CLASSIFICATION DL

In [None]:
import os

os.makedirs("models/classification/dl", exist_ok=True)


In [None]:
import torch
import torch.nn as nn

class BiLSTMClassifier(nn.Module):
    """Text classifier: embedding -> single-layer bidirectional LSTM -> linear head.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Embedding width.
        hidden_dim: LSTM hidden size per direction.
        num_classes: Number of output logits.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes):
        super().__init__()
        # Attribute names are also the state_dict key prefixes, so they must
        # stay stable for saved checkpoints to load.
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            batch_first=True,
            bidirectional=True,
        )
        self.fc = nn.Linear(2 * hidden_dim, num_classes)

    def forward(self, x):
        """Return class logits for a batch of token-id sequences."""
        _, (hidden, _) = self.lstm(self.embedding(x))
        # The last two entries of the final hidden state are the two
        # directions of the LSTM; concatenate them per example.
        both_directions = torch.cat([hidden[-2], hidden[-1]], dim=1)
        return self.fc(both_directions)


model = BiLSTMClassifier(vocab_size=10000, embed_dim=100, hidden_dim=64, num_classes=16)


In [None]:
import os
print(os.getcwd())


/content


In [None]:
# Tiny placeholder vocabulary: ids follow list position, with the padding and
# unknown markers first.
word2idx = {
    token: index
    for index, token in enumerate(["<PAD>", "<UNK>", "the", "economy", "is", "growing"])
}

# Reverse lookup (id -> token).
idx2word = {index: token for token, index in word2idx.items()}


In [None]:
import os
import pickle
from collections import Counter

from sklearn.preprocessing import LabelEncoder

# Total number of token ids, INCLUDING the two reserved ones. This has to equal
# the `vocab_size` the BiLSTM classifier's embedding table is built with
# (10000); otherwise the highest ids would index past the end of the table.
VOCAB_SIZE = 10000
NUM_RESERVED = 2  # 0: <PAD>, 1: <UNK>

# Count token frequencies over the cleaned corpus.
token_counts = Counter()
for text in df["clean_text"]:
    token_counts.update(text.lower().split())

# Most frequent tokens first; real words start at id 2.
vocab = [w for w, _ in token_counts.most_common(VOCAB_SIZE - NUM_RESERVED)]
word2idx = {"<PAD>": 0, "<UNK>": 1}
word2idx.update({w: i + NUM_RESERVED for i, w in enumerate(vocab)})

# For labels (classification): id -> category name, in LabelEncoder order.
le = LabelEncoder()
le.fit(df["Category"])
idx2label = {i: label for i, label in enumerate(le.classes_)}

# Persist both lookup tables; `with` guarantees the files are flushed and closed.
os.makedirs("models/classification/dl", exist_ok=True)
with open("models/classification/dl/word2idx.pkl", "wb") as f:
    pickle.dump(word2idx, f)
with open("models/classification/dl/idx2label.pkl", "wb") as f:
    pickle.dump(idx2label, f)


In [None]:
!ls models/classification/dl

bilstm_classifier.pt  idx2label.pkl  word2idx.pkl


In [None]:
from google.colab import files
# Download them to your local machine
files.download("models/classification/dl/word2idx.pkl")
files.download("models/classification/dl/idx2label.pkl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
checkpoint = torch.load("models/classification/dl/bilstm_classifier.pt", map_location="cpu")
model.load_state_dict(checkpoint)


<All keys matched successfully>

In [None]:
!ls models/classification/dl


bilstm_classifier.pt


In [None]:
from google.colab import files

files.download("models/classification/dl/bilstm_classifier.pt")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

CLASSIFICATION TRANSFORMER

In [None]:
import os

os.makedirs("models/classification/transformer/bert_classifier", exist_ok=True)


In [None]:
from transformers import BertTokenizerFast, BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(le.classes_)
)

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

model.save_pretrained("models/classification/transformer/bert_classifier")
tokenizer.save_pretrained("models/classification/transformer/bert_classifier")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

('models/classification/transformer/bert_classifier/tokenizer_config.json',
 'models/classification/transformer/bert_classifier/special_tokens_map.json',
 'models/classification/transformer/bert_classifier/vocab.txt',
 'models/classification/transformer/bert_classifier/added_tokens.json',
 'models/classification/transformer/bert_classifier/tokenizer.json')

In [None]:
!ls models/classification/transformer/bert_classifier


config.json	   special_tokens_map.json  tokenizer.json
model.safetensors  tokenizer_config.json    vocab.txt


In [None]:
!zip -r bert_classifier.zip models/classification/transformer/bert_classifier


  adding: models/classification/transformer/bert_classifier/ (stored 0%)
  adding: models/classification/transformer/bert_classifier/model.safetensors (deflated 7%)
  adding: models/classification/transformer/bert_classifier/vocab.txt (deflated 53%)
  adding: models/classification/transformer/bert_classifier/tokenizer.json (deflated 71%)
  adding: models/classification/transformer/bert_classifier/config.json (deflated 60%)
  adding: models/classification/transformer/bert_classifier/tokenizer_config.json (deflated 75%)
  adding: models/classification/transformer/bert_classifier/special_tokens_map.json (deflated 42%)


In [None]:
from google.colab import files
files.download("bert_classifier.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

NER ML

In [None]:
import os

os.makedirs("ner/ml", exist_ok=True)


In [None]:
import json
import os

# Rule-based NER: one regular expression per entity label. Raw strings are used
# so the patterns read exactly as the regex engine sees them.
patterns = {
  "DATE": r"\b(?:\d{1,2}[/-])?\d{1,2}[/-]\d{2,4}\b|\b\d{4}\b",
  "ORG": r"\b[A-Z][a-zA-Z]+(?: (Inc|Ltd|Corp|LLC|Group|Company))?\b",
  "PERSON": r"\b[A-Z][a-z]+ [A-Z][a-z]+\b",
  "LOCATION": r"\b[A-Z][a-z]+(?:, [A-Z]{2})?\b",
  "MONEY": r"\$\d+(?:\.\d{2})?",
  # No trailing \b: "%" is not a word character, so "%\b" would only match
  # when a letter or digit follows directly and would miss "50% of" or a
  # sentence-final "50%".
  "PERCENT": r"\b\d{1,3}%"
}

# Make sure the target folder exists so this cell does not depend on another
# cell having been run first.
os.makedirs("ner/ml", exist_ok=True)
with open("ner/ml/rule_patterns.json", "w") as f:
    json.dump(patterns, f)


In [None]:
!ls ner/ml


rule_patterns.json


In [None]:
from google.colab import files
files.download("ner/ml/rule_patterns.json")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

NER DL

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

MAX_VOCAB = 15000  # tokenizer keeps only the most frequent words up to this cap
MAX_LEN = 200      # every sequence is padded (at the end) to this many ids

# Fit a word-level tokenizer on the lower-cased text; unseen words map to <UNK>.
tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<UNK>")
tokenizer.fit_on_texts(df["clean_text"])

X_seq = tokenizer.texts_to_sequences(df["clean_text"])
X_pad = pad_sequences(X_seq, maxlen=MAX_LEN, padding="post")

# The label column is named "Category" (capital C); the lower-case spelling
# raised a KeyError.
le = LabelEncoder()
y = le.fit_transform(df["Category"])

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_pad, y, test_size=0.2, random_state=42
)

In [None]:
import re

def weak_ner_tag(tokens):
    """Assign a heuristic (weak-supervision) NER tag to every token.

    Rules are tried in this order and the first match wins:
      1. exactly four digits            -> "B-DATE"
      2. known location (any casing)    -> "B-LOC"
      3. known organisation (any casing)-> "B-ORG"
      4. title-cased token              -> "B-PER"
      5. anything else                  -> "O"

    The gazetteer checks come before the title-case check; otherwise
    capitalised names such as "India" or "Google" would be labelled as persons
    and the location/organisation rules could never fire for them.

    Args:
        tokens: Iterable of token strings.

    Returns:
        A list of tag strings, one per input token, in the same order.
    """
    locations = {"india", "usa", "china", "france"}
    organizations = {"google", "microsoft", "amazon"}

    tags = []
    for tok in tokens:
        lowered = tok.lower()
        if re.fullmatch(r"\d{4}", tok):
            tags.append("B-DATE")
        elif lowered in locations:
            tags.append("B-LOC")
        elif lowered in organizations:
            tags.append("B-ORG")
        elif tok.istitle():
            tags.append("B-PER")
        else:
            tags.append("O")
    return tags

df["tokens"] = df["clean_text_ner"].apply(lambda x: x.split())
df["tags"] = df["tokens"].apply(weak_ner_tag)


In [None]:
# Id 0 is reserved for padding in both tables; id 1 is the unknown-word id.
word2idx = {"PAD": 0, "UNK": 1}
tag2idx = {"PAD": 0}

# `sentences` and `tags` were never defined in this notebook; the token lists
# and their weak tags live in the "tokens" / "tags" columns of `df`.
sentences = df["tokens"].tolist()
tags = df["tags"].tolist()

# Assign ids in first-seen order.
for sent in sentences:
    for w in sent:
        word2idx.setdefault(w, len(word2idx))

for tag_seq in tags:
    for t in tag_seq:
        tag2idx.setdefault(t, len(tag2idx))

# Reverse lookup (id -> tag) for decoding predictions.
idx2tag = {v: k for k, v in tag2idx.items()}

In [None]:
MAX_LEN = 20  # every sentence / tag sequence is cut or padded to this length


def encode_sentence(sent):
    """Map a token list to exactly MAX_LEN word ids.

    Unknown tokens get the "UNK" id; short sentences are right-padded with 0.
    """
    unk_id = word2idx["UNK"]
    ids = [word2idx.get(w, unk_id) for w in sent[:MAX_LEN]]
    return ids + [0] * (MAX_LEN - len(ids))


def encode_tags(tag_seq):
    """Map a tag list to exactly MAX_LEN tag ids, right-padded with 0."""
    ids = [tag2idx[t] for t in tag_seq[:MAX_LEN]]
    return ids + [0] * (MAX_LEN - len(ids))


# Read the token lists and tags straight from `df`: the names `sentences` and
# `tags` used here before are not defined anywhere in the notebook.
X = torch.tensor([encode_sentence(s) for s in df["tokens"]])
y = torch.tensor([encode_tags(t) for t in df["tags"]])


In [None]:
class BiLSTMNER(nn.Module):
    """Token tagger: embedding -> bidirectional LSTM -> per-token linear layer.

    Args:
        vocab_size: Number of rows in the embedding table (id 0 is padding).
        embed_dim: Embedding width.
        hidden_dim: LSTM hidden size per direction.
        tag_size: Number of tag logits produced for every token.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, tag_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        # Both directions are concatenated, hence twice the hidden size.
        self.fc = nn.Linear(2 * hidden_dim, tag_size)

    def forward(self, x):
        """Return tag logits for every position of every sequence in `x`."""
        embedded = self.embedding(x)
        contextual, _ = self.lstm(embedded)
        return self.fc(contextual)

In [None]:
model = BiLSTMNER(
    vocab_size=len(word2idx),
    embed_dim=32,
    hidden_dim=32,
    tag_size=len(tag2idx)
)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

In [None]:
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
# Five full-batch updates: the whole of X goes through the model in one pass
# per epoch. Logits and targets are flattened so the loss sees one row per token.
model.train()
for epoch in range(5):
    optimizer.zero_grad()

    outputs = model(X)
    flat_logits = outputs.view(-1, len(tag2idx))
    flat_targets = y.view(-1)
    loss = criterion(flat_logits, flat_targets)

    loss.backward()
    optimizer.step()

    print(f"Epoch {epoch+1} | Loss: {loss.item():.4f}")

In [None]:
dl_dir = "models/ner/dl"
os.makedirs(dl_dir, exist_ok=True)

In [None]:
torch.save(
    model.state_dict(),
    os.path.join(dl_dir, "bilstm_ner.pt")
)

print("✅ BiLSTM NER model saved successfully!")


In [None]:
# Write the three NER lookup tables into `dl_dir`, one pickle file each.
ner_tables = (
    ("word2idx.pkl", word2idx),
    ("tag2idx.pkl", tag2idx),
    ("idx2tag.pkl", idx2tag),
)
for filename, table in ner_tables:
    with open(os.path.join(dl_dir, filename), "wb") as f:
        pickle.dump(table, f)

print("✅ NER dictionaries saved successfully!")

In [None]:
from google.colab import files

# The NER dictionaries are written to "models/ner/dl" under the names
# word2idx.pkl / tag2idx.pkl / idx2tag.pkl, so download exactly those files
# (the previous "ner/dl/..." paths and the "tag2id.pkl" name matched nothing
# this notebook writes).
files.download("models/ner/dl/word2idx.pkl")
files.download("models/ner/dl/tag2idx.pkl")
files.download("models/ner/dl/idx2tag.pkl")



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

NER TRANSFORMER

In [None]:
import os

os.makedirs("ner/transformer/bert_ner", exist_ok=True)


In [None]:
from transformers import BertForTokenClassification, BertTokenizerFast

# Token-classification head sized to the NER tag set. The tag mapping built in
# the NER DL section is called `tag2idx`; `tag2id` does not exist.
model = BertForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(tag2idx)
)

# Cased checkpoint, so the tokenizer keeps capitalisation.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")

# Save model and tokenizer into the same folder so they can be loaded together.
model.save_pretrained("ner/transformer/bert_ner")
tokenizer.save_pretrained("ner/transformer/bert_ner")


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

('ner/transformer/bert_ner/tokenizer_config.json',
 'ner/transformer/bert_ner/special_tokens_map.json',
 'ner/transformer/bert_ner/vocab.txt',
 'ner/transformer/bert_ner/added_tokens.json',
 'ner/transformer/bert_ner/tokenizer.json')

In [None]:
!ls ner/transformer/bert_ner


config.json	   special_tokens_map.json  tokenizer.json
model.safetensors  tokenizer_config.json    vocab.txt


In [None]:
!zip -r bert_ner.zip ner/transformer/bert_ner


  adding: ner/transformer/bert_ner/ (stored 0%)
  adding: ner/transformer/bert_ner/model.safetensors (deflated 7%)
  adding: ner/transformer/bert_ner/vocab.txt (deflated 49%)
  adding: ner/transformer/bert_ner/tokenizer.json (deflated 70%)
  adding: ner/transformer/bert_ner/config.json (deflated 54%)
  adding: ner/transformer/bert_ner/tokenizer_config.json (deflated 75%)
  adding: ner/transformer/bert_ner/special_tokens_map.json (deflated 42%)


In [None]:
from google.colab import files
files.download("bert_ner.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

SUMMARIZATION ML

In [None]:
import os

# Create folder if it doesn't exist
os.makedirs("summarization/ml", exist_ok=True)

In [None]:
import os
import pickle

# Settings for the extractive summariser: method name and number of sentences
# to keep.
extractive_config = {
    "method": "tfidf_sentence_ranking",
    "top_k": 3
}

# Create the folder here so the cell works on its own, and use `with` so the
# file is flushed and closed before it is downloaded.
os.makedirs("summarization/ml", exist_ok=True)
with open("summarization/ml/tfidf_extractive.pkl", "wb") as f:
    pickle.dump(extractive_config, f)


In [None]:
from google.colab import files
files.download("summarization/ml/tfidf_extractive.pkl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

SUMMARIZATION DL

In [None]:
import os

os.makedirs("summarization/dl", exist_ok=True)

In [None]:
!pip install torch nltk tqdm




In [None]:
import nltk
nltk.download("punkt")
nltk.download("punkt_tab")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import nltk, pickle, re
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import Counter
from tqdm import tqdm

nltk.download("punkt")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
def simple_summary(text):
    """Return the first 15 whitespace-separated words of `text`, joined by single spaces."""
    # Splitting at most 15 times leaves any remainder in a 16th item, which the
    # slice then discards.
    leading_words = text.split(maxsplit=15)[:15]
    return " ".join(leading_words)

df['summary'] = df['clean_text'].apply(simple_summary)

# Now you have 'clean_text' -> 'summary'
df[['clean_text','summary']].head()

In [None]:
from collections import Counter
import pickle
import torch

MAX_LEN = 20        # max tokens in input
MAX_SUM_LEN = 15     # max tokens in summary

# Tokenise on whitespace after lower-casing.
texts = [doc.lower().split() for doc in df['clean_text']]
summaries = [doc.lower().split() for doc in df['summary']]

# Build vocabulary: every distinct token, in first-seen order (texts first,
# then summaries), after the two reserved markers.
word_counter = Counter()
for seq in texts:
    word_counter.update(seq)
for seq in summaries:
    word_counter.update(seq)

vocab = ['<PAD>', '<UNK>', *word_counter.keys()]
word2idx = {token: position for position, token in enumerate(vocab)}
idx2word = {position: token for token, position in word2idx.items()}


def encode(seq, max_len):
    """Convert a token list into exactly `max_len` ids.

    Tokens missing from the vocabulary become the '<UNK>' id; sequences shorter
    than `max_len` are right-padded with 0 and longer ones are cut.
    """
    unk_id = word2idx['<UNK>']
    clipped = [word2idx.get(w, unk_id) for w in seq[:max_len]]
    return clipped + [0] * (max_len - len(clipped))


X = torch.tensor([encode(t, MAX_LEN) for t in texts])
Y = torch.tensor([encode(s, MAX_SUM_LEN) for s in summaries])

In [None]:
import torch.nn as nn

class Seq2Seq(nn.Module):
    """LSTM encoder-decoder that shares one embedding table for source and target.

    Args:
        vocab_size: Size of the shared vocabulary (id 0 is padding).
        embed_dim: Embedding width.
        hidden_dim: Hidden size of both the encoder and the decoder LSTM.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.encoder = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.decoder = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, src, tgt):
        """Return vocabulary logits for every position of `tgt`.

        The encoder's final (hidden, cell) state initialises the decoder, which
        then reads the embedded `tgt` sequence.
        """
        _, encoder_state = self.encoder(self.embedding(src))
        decoded, _ = self.decoder(self.embedding(tgt), encoder_state)
        return self.fc(decoded)

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader

BATCH_SIZE = 10

dataset = TensorDataset(X, Y)
loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

model = Seq2Seq(len(vocab), embed_dim=32, hidden_dim=32)
criterion = nn.CrossEntropyLoss(ignore_index=0)  # id 0 is padding
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

EPOCHS = 2
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for batch_X, batch_Y in loader:
        optimizer.zero_grad()

        # Teacher forcing with a one-step shift: the decoder reads tokens
        # 0..T-2 and is scored on tokens 1..T-1. Feeding the full summary and
        # scoring against the same, unshifted summary lets the model simply
        # copy its input, so the loss says nothing about generation.
        # NOTE: the vocabulary has no start-of-sequence marker, so the first
        # summary token is only ever an input, never a prediction target.
        decoder_input = batch_Y[:, :-1]
        target = batch_Y[:, 1:]

        output = model(batch_X, decoder_input)          # Forward pass
        # The shifted slices are not contiguous, so use reshape rather than view.
        loss = criterion(output.reshape(-1, len(vocab)), target.reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} | Loss: {total_loss/len(loader):.4f}")

In [None]:
import os
import pickle

# The notebook only creates "summarization/dl"; the files below go to
# "models/summarization/dl", so create that folder before writing to it.
os.makedirs("models/summarization/dl", exist_ok=True)

# Save model weights
torch.save(model.state_dict(), "models/summarization/dl/seq2seq_lstm.pt")
print("Model saved!")

# Save word2idx
with open("models/summarization/dl/word2idx.pkl", "wb") as f:
    pickle.dump(word2idx, f)

# Save idx2word
with open("models/summarization/dl/idx2word.pkl", "wb") as f:
    pickle.dump(idx2word, f)

print("Vocab saved!")



SUMMARIZATION TRANSFORMER

In [None]:
import os

os.makedirs("summarization/transformer/bart_summarizer", exist_ok=True)


In [None]:
from transformers import BartForConditionalGeneration, BartTokenizer

model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

model.save_pretrained("summarization/transformer/bart_summarizer")
tokenizer.save_pretrained("summarization/transformer/bart_summarizer")


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]



('summarization/transformer/bart_summarizer/tokenizer_config.json',
 'summarization/transformer/bart_summarizer/special_tokens_map.json',
 'summarization/transformer/bart_summarizer/vocab.json',
 'summarization/transformer/bart_summarizer/merges.txt',
 'summarization/transformer/bart_summarizer/added_tokens.json')

In [None]:
!zip -r bart_summarizer.zip summarization/transformer/bart_summarizer


  adding: summarization/transformer/bart_summarizer/ (stored 0%)
  adding: summarization/transformer/bart_summarizer/generation_config.json (deflated 46%)
  adding: summarization/transformer/bart_summarizer/model.safetensors (deflated 41%)
  adding: summarization/transformer/bart_summarizer/vocab.json (deflated 68%)
  adding: summarization/transformer/bart_summarizer/merges.txt (deflated 53%)
  adding: summarization/transformer/bart_summarizer/config.json (deflated 64%)
  adding: summarization/transformer/bart_summarizer/tokenizer_config.json (deflated 75%)
  adding: summarization/transformer/bart_summarizer/special_tokens_map.json (deflated 85%)


In [None]:
from google.colab import files
files.download("bart_summarizer.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import os
import pickle
from collections import Counter

from sklearn.preprocessing import LabelEncoder

# This cell rebuilds the DL-classification vocabulary. It needs the
# "clean_text" column, so the preprocessing cells must have been run on `df`
# first (running it on a freshly loaded frame raises KeyError: 'clean_text').

# Total number of token ids, INCLUDING the two reserved ones; must equal the
# `vocab_size` of the BiLSTM classifier's embedding table (10000) so that no
# id indexes past the end of the table.
VOCAB_SIZE = 10000
NUM_RESERVED = 2  # 0: <PAD>, 1: <UNK>

token_counts = Counter()
for text in df["clean_text"]:
    token_counts.update(text.lower().split())

vocab = [w for w, _ in token_counts.most_common(VOCAB_SIZE - NUM_RESERVED)]
word2idx = {"<PAD>": 0, "<UNK>": 1}
word2idx.update({w: i + NUM_RESERVED for i, w in enumerate(vocab)})

# For labels (classification): id -> category name, in LabelEncoder order.
le = LabelEncoder()
le.fit(df["Category"])
idx2label = {i: label for i, label in enumerate(le.classes_)}

# Persist both lookup tables; `with` guarantees the files are flushed and closed.
os.makedirs("models/classification/dl", exist_ok=True)
with open("models/classification/dl/word2idx.pkl", "wb") as f:
    pickle.dump(word2idx, f)
with open("models/classification/dl/idx2label.pkl", "wb") as f:
    pickle.dump(idx2label, f)


KeyError: 'clean_text'

In [None]:
# Example after training your DL classification model
import os
import pickle


def save_classification_vocab(word2idx, idx2label, out_dir="models/classification/dl"):
    """Pickle the classifier's vocabulary and label lookup into `out_dir`.

    This replaces the earlier template, whose `...` placeholders inside the
    dict literals were a syntax error and which, had it run, would have
    overwritten the real artifacts with toy dictionaries. Pass the real
    mappings explicitly instead.

    Args:
        word2idx: Mapping token -> id, e.g. {"<PAD>": 0, "<UNK>": 1, "the": 2}.
        idx2label: Mapping class id -> label name, e.g. {0: "Business"}.
        out_dir: Folder to write "word2idx.pkl" and "idx2label.pkl" into; it
            is created if missing.
    """
    os.makedirs(out_dir, exist_ok=True)
    for filename, table in (("word2idx.pkl", word2idx), ("idx2label.pkl", idx2label)):
        with open(os.path.join(out_dir, filename), "wb") as f:
            pickle.dump(table, f)


SyntaxError: ':' expected after dictionary key (ipython-input-1090801211.py, line 4)

In [None]:
import os
import shutil

# Publish the NER lookup tables under "ner/dl":
#   word2idx.pkl - mapping tokens to ids
#   id2tag.pkl   - mapping tag ids to NER labels
#
# The files are copied from "models/ner/dl", where the NER section saved them,
# instead of re-pickling the in-memory variables: by this point `word2idx` has
# been reassigned by later sections and no longer holds the NER vocabulary, and
# the name `id2tag` was never defined (the NER mapping is `idx2tag`).
os.makedirs("ner/dl", exist_ok=True)
shutil.copyfile("models/ner/dl/word2idx.pkl", "ner/dl/word2idx.pkl")
shutil.copyfile("models/ner/dl/idx2tag.pkl", "ner/dl/id2tag.pkl")


In [None]:
# Save the summarization vocabulary; `with` makes sure each file is flushed and
# closed once written.
# NOTE(review): `word2idx` is reassigned by several cells in this notebook —
# confirm it still holds the summarization vocabulary (and matches `idx2word`)
# when this cell runs.
with open("summarization/dl/word2idx.pkl", "wb") as f:
    pickle.dump(word2idx, f)
with open("summarization/dl/idx2word.pkl", "wb") as f:
    pickle.dump(idx2word, f)


In [None]:
# Sample passage kept as a string: bare prose in a code cell is a syntax error.
# Presumably meant as a quick manual input for the models above — TODO confirm.
sample_text = """India defeated Australia by five wickets in the final match of the series.
Virat Kohli scored a match-winning century in Chennai.
The victory helped India secure the championship."""

sample_text