<a href="https://colab.research.google.com/github/m-numan1/100-Deep-Learning-Projects/blob/main/News_Category_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Creating a model for News Category CLassification##

**STEPS**

* Import Libraries
* Preprocess the data
* (Remove Punctuation, lowercase, remove special characters)
* tokenize dataset
* make vocabulary
* now convert tokenzie dataset into numerical values based on indice in vocab
* make dataloaders
* design the model
* Train the model
* Evaluate


In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import re
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch.nn.functional as F
from nltk.tokenize import word_tokenize
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stopword = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [6]:
# downloading kaggle dataset
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

!kaggle datasets download -d rmisra/news-category-dataset


Dataset URL: https://www.kaggle.com/datasets/rmisra/news-category-dataset
License(s): Attribution 4.0 International (CC BY 4.0)
Downloading news-category-dataset.zip to /content
  0% 0.00/26.5M [00:00<?, ?B/s]
100% 26.5M/26.5M [00:00<00:00, 1.24GB/s]


In [7]:
!unzip news-category-dataset.zip

Archive:  news-category-dataset.zip
  inflating: News_Category_Dataset_v3.json  


In [8]:
df = pd.read_json('News_Category_Dataset_v3.json', lines=True)

In [9]:
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [10]:
df["news"] = df["headline"] + " " + df["short_description"]

In [11]:
# need to covert y into numbers
y = df.category
y = pd.factorize(y)[0]

In [12]:
y

array([0, 0, 1, ..., 6, 6, 6])

In [13]:
# cleaning the data, removing the punctuation, special characters

def clean_text(text):
    text = re.sub('[^a-zA-Z]', ' ', text)  # remove numbers and punctuation
    text = text.lower()                   # lowercase
    tokens = word_tokenize(text)          # tokenize
    tokens = [word for word in tokens if word not in stopword]  # remove stopwords
    return tokens

df['cleaned_news'] = df['news'].apply(clean_text)

In [14]:
df.cleaned_news[1]

['american',
 'airlines',
 'flyer',
 'charged',
 'banned',
 'life',
 'punching',
 'flight',
 'attendant',
 'video',
 'subdued',
 'passengers',
 'crew',
 'fled',
 'back',
 'aircraft',
 'confrontation',
 'according',
 'u',
 'attorney',
 'office',
 'los',
 'angeles']

In [15]:
# making a vocabulary
vocab = {'<unkn>':1, '<pad>':0}

for sentence in df.cleaned_news:
  for word in sentence:
    if word not in vocab:
      vocab[word] = len(vocab)

In [16]:
len(vocab)

86093

In [17]:
numerical_sentence = []

def word_to_index(text , vocab):
  numerical_sentence = []
  for word in text:
    for i in word:
      if i in vocab:
        numerical_sentence.append(vocab[i])
      else:
        numerical_sentence.append(vocab['<unkn>'])
  return numerical_sentence

# numerical_sentence = word_to_index(df.cleaned_news, vocab)


In [18]:
#
df["encoded_news"] = df["cleaned_news"].apply(lambda x: word_to_index(x, vocab))

In [19]:
# Applying padding
max_length = 100

def custom_padd(seq , max_len = max_length):
  if len(seq)< max_len :
    return seq + [vocab['<pad>']] * (max_len - len(seq))
  else:
    return seq[:max_len]


df["padded_news"] = df["encoded_news"].apply(custom_padd)



In [20]:
len(df.padded_news[1])

100

In [21]:
# Converting to tensors now

X = torch.tensor(df["padded_news"].tolist(), dtype= torch.long)
y = torch.tensor(y, dtype= torch.long)

In [22]:
# splitting data

from sklearn.model_selection import train_test_split

In [23]:
x_train,x_test , y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [24]:

train_data = TensorDataset(x_train, y_train)
test_data = TensorDataset(x_test, y_test)


In [25]:
train_dataloader = DataLoader(train_data, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=32, shuffle=False)

In [38]:
# making a model

class LstmModel(nn.Module):
  def __init__(self, vocab_size, embed_dim , hidden_dim, output_dim , padd_idx):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim= embed_dim, padding_idx= padd_idx)
    self.lstm = nn.LSTM(embed_dim , hidden_dim , batch_first= True, bidirectional= True)
    self.fc = nn.Linear(hidden_dim * 2, output_dim)

  def forward(self, x):
    embedding = self.embedding(x)
    lstm_out, (hidden, cell) = self.lstm(embedding)
    lstm_out = F.dropout(lstm_out, p = 0.3) # Apply dropout to lstm_out
    output = self.fc(torch.cat((hidden[-2], hidden[-1]), dim = 1))
    return output

In [39]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [40]:
num_classes = len(df.category.unique())
num_classes

42

In [41]:

epoch = 20
learning_rate = 1e-3

model = LstmModel(len(vocab), embed_dim= 100 ,hidden_dim=256, output_dim=42, padd_idx=0 ).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr= learning_rate)

In [42]:
# Training loop

for i in range(epoch):
  total_loss = 0
  for x_batch, y_batch in train_dataloader:
    x_batch, y_batch = x_batch.to(device), y_batch.to(device)
    optimizer.zero_grad()
    output = model(x_batch)
    loss = criterion(output, y_batch)
    loss.backward()
    optimizer.step()
    total_loss += loss.item()
  print(f"Epoch {i+1}/{epoch}, Loss: {total_loss/len(train_dataloader)}")

Epoch 1/20, Loss: 2.7180655063347725
Epoch 2/20, Loss: 2.2375944245064114
Epoch 3/20, Loss: 2.032687279576904
Epoch 4/20, Loss: 1.8967809260266955
Epoch 5/20, Loss: 1.7919210529459366
Epoch 6/20, Loss: 1.7025991727387002
Epoch 7/20, Loss: 1.6233878105418058
Epoch 8/20, Loss: 1.5500268803613972
Epoch 9/20, Loss: 1.482614353785512
Epoch 10/20, Loss: 1.418238775048417
Epoch 11/20, Loss: 1.3589247676008278
Epoch 12/20, Loss: 1.3036053962244698
Epoch 13/20, Loss: 1.2473313098794336
Epoch 14/20, Loss: 1.198297696211202
Epoch 15/20, Loss: 1.1541435782248155
Epoch 16/20, Loss: 1.114965338832346
Epoch 17/20, Loss: 1.0781512968604814
Epoch 18/20, Loss: 1.0455971263216797
Epoch 19/20, Loss: 1.0441919272979905
Epoch 20/20, Loss: 0.9878389277196973


In [43]:
# evaluating the model

model.eval()

with torch.no_grad():
  correct = 0
  total = 0
  for x_batch, y_batch in test_dataloader:
    x_batch, y_batch = x_batch.to(device), y_batch.to(device)
    output = model(x_batch)
    _, predicted = torch.max(output.data, 1)
    total += y_batch.size(0)
    correct += (predicted == y_batch).sum().item()

  print(f"Accuracy: {100 * correct / total}%")

Accuracy: 45.2584355462225%


In [44]:
# evaluate for training data


model.eval()

with torch.no_grad():
  for x_batch, y_batch in train_dataloader:
    x_batch, y_batch = x_batch.to(device), y_batch.to(device)
    output = model(x_batch)
    _, predicted = torch.max(output.data, 1)
    total += y_batch.size(0)
    correct += (predicted == y_batch).sum().item()

  print(f"Accuracy: {100 * correct / total}%")


Accuracy: 68.92763223832728%


In [64]:
y, uniques = pd.factorize(df['category'])

def idx_to_label(idx):
  return uniques[idx]

In [69]:
z = "Ronaldo Is the best player of Football"
z = clean_text(z)             # e.g., lowercasing, punctuation removal
z = word_to_index(z, vocab)   # convert tokens to indices
z = custom_padd(z)            # pad/truncate to max_seq_len
z = torch.tensor(z, dtype=torch.long).unsqueeze(0).to(device)

model.eval()
with torch.no_grad():
    output = model(z)
    predicted_class = torch.argmax(output, dim=1).item()

print("Predicted class index:", predicted_class)

# If you have label mapping
print("Predicted label:", idx_to_label(predicted_class))


Predicted class index: 6
Predicted label: SPORTS
