<a href="https://colab.research.google.com/github/ons13/taskmanagermicroservice/blob/main/TextClassificationForTopicPrediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

In [None]:
# Choose the compute device: the GPU when PyTorch can see one, otherwise the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cpu'

In [None]:
from pathlib import Path
import zipfile
import requests

data_path = Path("data/")
text_path = data_path / "bbc"
if text_path.is_dir():
    print(f"{text_path} directory exists.")
else:
    print(f"Did not find {text_path} directory, creating one...")
    # Only the parent directory is created up front. text_path itself is created
    # after the archive has been fetched and opened, so a failed download does
    # not leave an empty data/bbc behind that would make the next run skip it.
    data_path.mkdir(parents=True, exist_ok=True)
    zip_path = data_path / "bbc.zip"

    print("Downloading data...")
    request = requests.get(
        "https://github.com/HoussemBeltifa/Topic_Prediction/raw/main/bbc.zip",
        timeout=60,  # do not hang the notebook forever on a stalled connection
    )
    # Fail loudly on an HTTP error instead of writing an error page into bbc.zip
    request.raise_for_status()
    with open(zip_path, "wb") as f:
        f.write(request.content)

    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        print("Unzipping data...")
        text_path.mkdir(parents=True, exist_ok=True)
        zip_ref.extractall(text_path)


Did not find data/bbc directory, creating one...
Downloading data...
Unzipping data...


In [None]:
import os
def walk_through_dir(dir_path):
  """Print a one-line summary for every directory found under dir_path.

  Args:
    dir_path (str or pathlib.Path): root directory to traverse.

  Returns:
    None. For dir_path and each directory below it, one line is printed
    giving the number of immediate subdirectories, the number of files
    (articles) and the path of that directory.
  """
  for current_dir, subdir_names, file_names in os.walk(dir_path):
    n_dirs, n_files = len(subdir_names), len(file_names)
    print(f"There are {n_dirs} directories and {n_files} article in '{current_dir}'.")

In [None]:
walk_through_dir(text_path)

There are 1 directories and 0 article in 'data/bbc'.
There are 2 directories and 0 article in 'data/bbc/bbc'.
There are 5 directories and 0 article in 'data/bbc/bbc/train'.
There are 0 directories and 408 article in 'data/bbc/bbc/train/business'.
There are 0 directories and 320 article in 'data/bbc/bbc/train/tech'.
There are 0 directories and 408 article in 'data/bbc/bbc/train/sport'.
There are 0 directories and 333 article in 'data/bbc/bbc/train/politics'.
There are 0 directories and 308 article in 'data/bbc/bbc/train/entertainment'.
There are 5 directories and 0 article in 'data/bbc/bbc/test'.
There are 0 directories and 102 article in 'data/bbc/bbc/test/business'.
There are 0 directories and 81 article in 'data/bbc/bbc/test/tech'.
There are 0 directories and 103 article in 'data/bbc/bbc/test/sport'.
There are 0 directories and 84 article in 'data/bbc/bbc/test/politics'.
There are 0 directories and 78 article in 'data/bbc/bbc/test/entertainment'.


In [None]:
# Locate the train and test splits inside the extracted archive
train_dir = text_path.joinpath("bbc", "train")
test_dir = text_path.joinpath("bbc", "test")

train_dir, test_dir

(PosixPath('data/bbc/bbc/train'), PosixPath('data/bbc/bbc/test'))

In [None]:
!pip install nltk
!pip install scikit-learn



In [None]:
import os

def load_data_from_folder(folder_path):
    """Read every article below ``folder_path`` into ``(text, category)`` pairs.

    ``folder_path`` is expected to contain one subdirectory per category, each
    holding the article files of that category; the subdirectory name is used
    as the label.

    Args:
        folder_path (str or pathlib.Path): directory holding the category folders.

    Returns:
        list[tuple[str, str]]: one ``(file contents, category name)`` tuple per
        article file, in directory-listing order.

    Raises:
        UnicodeDecodeError: if an article file is not valid UTF-8.
    """
    data = []
    for category in os.listdir(folder_path):
        category_path = os.path.join(folder_path, category)
        # Stray files next to the category folders (README, .DS_Store, ...) are
        # not categories; listing them would raise NotADirectoryError.
        if not os.path.isdir(category_path):
            continue
        for filename in os.listdir(category_path):
            file_path = os.path.join(category_path, filename)
            # Likewise ignore anything inside a category that is not a regular file
            if not os.path.isfile(file_path):
                continue
            with open(file_path, 'r', encoding='utf-8') as file:
                data.append((file.read(), category))
    return data

# Load both splits as lists of (article_text, category) pairs
train_data, test_data = map(load_data_from_folder, (train_dir, test_dir))


In [None]:
train_data[800][0]

'Gerrard plays down European hopes\n\nSteven Gerrard has admitted that Liverpool have little chance of winning the Champions League this season.\n\nThe 24-year-old Reds skipper spoke out ahead of Tuesday\'s first leg at home to Bayer Leverkusen in the last 16, which he will miss through suspension. "Let\'s be realistic, there are some fantastic teams left in the Champions League," he told BBC Radio Five Live. "We are just going to try to stay in as long as possible but we realise that maybe it is not our year this year." Gerrard has made no secret of his desire to be involved in Europe\'s premier club competition.\n\nLast season he described qualification for the Champions League as the "be all and end all" - and rumours persist that he will leave Anfield if the Reds fail to secure a place in the competition. He has consistently been linked with a move away from Liverpool, with Chelsea the favourites to snap up the England midfielder. And Blues boss Jose Mourinho backed Gerrard\'s view

In [None]:
# Lower-case every article; the (text, label) pairing is left untouched
train_data_lowercased = [(article.lower(), category) for article, category in train_data]
test_data_lowercased = [(article.lower(), category) for article, category in test_data]

In [None]:
train_data_lowercased[800][0]

'gerrard plays down european hopes\n\nsteven gerrard has admitted that liverpool have little chance of winning the champions league this season.\n\nthe 24-year-old reds skipper spoke out ahead of tuesday\'s first leg at home to bayer leverkusen in the last 16, which he will miss through suspension. "let\'s be realistic, there are some fantastic teams left in the champions league," he told bbc radio five live. "we are just going to try to stay in as long as possible but we realise that maybe it is not our year this year." gerrard has made no secret of his desire to be involved in europe\'s premier club competition.\n\nlast season he described qualification for the champions league as the "be all and end all" - and rumours persist that he will leave anfield if the reds fail to secure a place in the competition. he has consistently been linked with a move away from liverpool, with chelsea the favourites to snap up the england midfielder. and blues boss jose mourinho backed gerrard\'s view

In [None]:
import nltk
# Tokenizer models used by word_tokenize. Recent NLTK releases (3.9+) load the
# 'punkt_tab' resource instead of the pickled 'punkt' one, so fetch both to
# work with whichever NLTK version is installed.
nltk.download('punkt')
nltk.download('punkt_tab')

from nltk.tokenize import word_tokenize

# Tokenize the lowercased data, keeping each article's label next to its tokens
train_tokens = [(word_tokenize(text), label) for text, label in train_data_lowercased]
test_tokens = [(word_tokenize(text), label) for text, label in test_data_lowercased]


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
train_tokens[800][0]

['gerrard',
 'plays',
 'down',
 'european',
 'hopes',
 'steven',
 'gerrard',
 'has',
 'admitted',
 'that',
 'liverpool',
 'have',
 'little',
 'chance',
 'of',
 'winning',
 'the',
 'champions',
 'league',
 'this',
 'season',
 '.',
 'the',
 '24-year-old',
 'reds',
 'skipper',
 'spoke',
 'out',
 'ahead',
 'of',
 'tuesday',
 "'s",
 'first',
 'leg',
 'at',
 'home',
 'to',
 'bayer',
 'leverkusen',
 'in',
 'the',
 'last',
 '16',
 ',',
 'which',
 'he',
 'will',
 'miss',
 'through',
 'suspension',
 '.',
 '``',
 'let',
 "'s",
 'be',
 'realistic',
 ',',
 'there',
 'are',
 'some',
 'fantastic',
 'teams',
 'left',
 'in',
 'the',
 'champions',
 'league',
 ',',
 "''",
 'he',
 'told',
 'bbc',
 'radio',
 'five',
 'live',
 '.',
 '``',
 'we',
 'are',
 'just',
 'going',
 'to',
 'try',
 'to',
 'stay',
 'in',
 'as',
 'long',
 'as',
 'possible',
 'but',
 'we',
 'realise',
 'that',
 'maybe',
 'it',
 'is',
 'not',
 'our',
 'year',
 'this',
 'year',
 '.',
 "''",
 'gerrard',
 'has',
 'made',
 'no',
 'secret',
 '

In [None]:
nltk.download('stopwords')

from nltk.corpus import stopwords

# English stopwords, held in a set for membership tests
stop_words = set(stopwords.words('english'))

def remove_stopwords(tokens):
    """Return, in order, the tokens that are not in the module-level ``stop_words`` set."""
    kept = []
    for word in tokens:
        if word in stop_words:
            continue
        kept.append(word)
    return kept

# Strip stopwords from every tokenized article, keeping each label
train_tokens_no_stopwords = [(remove_stopwords(words), category) for words, category in train_tokens]
test_tokens_no_stopwords = [(remove_stopwords(words), category) for words, category in test_tokens]


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
train_tokens_no_stopwords[800][0]

['gerrard',
 'plays',
 'european',
 'hopes',
 'steven',
 'gerrard',
 'admitted',
 'liverpool',
 'little',
 'chance',
 'winning',
 'champions',
 'league',
 'season',
 '.',
 '24-year-old',
 'reds',
 'skipper',
 'spoke',
 'ahead',
 'tuesday',
 "'s",
 'first',
 'leg',
 'home',
 'bayer',
 'leverkusen',
 'last',
 '16',
 ',',
 'miss',
 'suspension',
 '.',
 '``',
 'let',
 "'s",
 'realistic',
 ',',
 'fantastic',
 'teams',
 'left',
 'champions',
 'league',
 ',',
 "''",
 'told',
 'bbc',
 'radio',
 'five',
 'live',
 '.',
 '``',
 'going',
 'try',
 'stay',
 'long',
 'possible',
 'realise',
 'maybe',
 'year',
 'year',
 '.',
 "''",
 'gerrard',
 'made',
 'secret',
 'desire',
 'involved',
 'europe',
 "'s",
 'premier',
 'club',
 'competition',
 '.',
 'last',
 'season',
 'described',
 'qualification',
 'champions',
 'league',
 '``',
 'end',
 "''",
 '-',
 'rumours',
 'persist',
 'leave',
 'anfield',
 'reds',
 'fail',
 'secure',
 'place',
 'competition',
 '.',
 'consistently',
 'linked',
 'move',
 'away',
 

In [None]:
import re

# Function to remove special characters from a list of strings
def remove_special_characters(texts):
    """Delete every character that is not an ASCII letter or whitespace from each string.

    Args:
        texts: iterable of strings (in this notebook, the tokens of one article).

    Returns:
        list of str: the cleaned strings in their original order, with leading
        and trailing whitespace trimmed; strings that end up empty are dropped.
    """
    # Anything outside a-z, A-Z and whitespace is removed
    non_letter = re.compile(r'[^a-zA-Z\s]')
    stripped = (non_letter.sub('', text).strip() for text in texts)
    return [cleaned for cleaned in stripped if cleaned]

# Clean the token list of every article, keeping each label
train_data_f = [(remove_special_characters(tokens), category) for tokens, category in train_tokens_no_stopwords]
test_data_f = [(remove_special_characters(tokens), category) for tokens, category in test_tokens_no_stopwords]

In [None]:
train_data_f[800][1]

'sport'

In [None]:
# Collect the label of every training article, then keep the sorted unique names
a = [sample[1] for sample in train_data_f]
class_names = sorted(set(a))
class_names

['business', 'entertainment', 'politics', 'sport', 'tech']

In [None]:
# The same class names as a dict: label -> position in the sorted label list
class_names_dict = {
    name: position
    for position, name in enumerate(sorted({sample[1] for sample in train_data_f}))
}

class_names_dict

{'business': 0, 'entertainment': 1, 'politics': 2, 'sport': 3, 'tech': 4}

In [None]:
# Check the lengths
len(train_data_f), len(test_data_f)

(1777, 448)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim


# Create vocabulary and label mapping.
# The vocabulary spans the train AND test tokens, so every test token has an index.
all_tokens = [token for tokens, _ in train_data_f + test_data_f for token in tokens]
# Sort before enumerating: the iteration order of a set of strings changes between
# interpreter runs (hash randomization), which would hand each token a different
# index after every kernel restart and make saved weights useless with a rebuilt vocab.
vocab = {token: index for index, token in enumerate(sorted(set(all_tokens)))}
label_vocab = {label: index for index, label in enumerate(sorted(set(label for _, label in train_data_f)))}

# Convert tokenized text to lists of vocabulary indices (one list per training article)
numericalized_texts = [[vocab[token] for token in tokens] for tokens, _ in train_data_f]
numericalized_labels = [label_vocab[label] for _, label in train_data_f]

# Define the model
class Model(nn.Module):
    """Bag-of-embeddings classifier: mean-pooled token embeddings followed by an MLP.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: width of the two hidden linear layers.
        output_dim: number of output scores (one per class).
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        # mode='mean' averages the embeddings of all tokens in a bag
        self.embedding = nn.EmbeddingBag(vocab_size, embedding_dim, mode='mean')
        self.fc1 = nn.Linear(embedding_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc5 = nn.Linear(hidden_dim, output_dim)
        # Despite the fc* names these two are parameter-free: activation and dropout
        self.fc3 = nn.ReLU()
        self.fc4 = nn.Dropout(p=0.3)

    def forward(self, text):
        """Map a 2-D tensor of token indices (one bag per row) to per-class scores."""
        pooled = self.embedding(text)
        hidden = self.fc4(self.fc3(self.fc1(pooled)))
        hidden = self.fc4(self.fc3(self.fc2(hidden)))
        return self.fc5(hidden)

# Initialize the model: one embedding row per vocabulary token and one output
# score per label; the embedding size (100) and hidden size (128) are fixed here.
model = Model(vocab_size=len(vocab), embedding_dim=100, hidden_dim=128, output_dim=len(label_vocab))


In [None]:
len(vocab)

30239

In [None]:
len(label_vocab)

5

In [None]:
from tqdm.auto import tqdm

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

epochs = 10
num_train_samples = len(numericalized_texts)
# Training loop: one optimizer step per article (batch size 1)
for epoch in tqdm(range(epochs)):
    # Re-enable dropout every epoch; a previously run evaluation or prediction
    # cell leaves the model in eval mode.
    model.train()
    running_loss = 0.0
    # Visit the articles in a fresh random order each epoch. The loaded data is
    # grouped by category, and per-sample updates in that order would end every
    # epoch on a long run of a single class.
    for sample_index in torch.randperm(num_train_samples).tolist():
        text = torch.tensor(numericalized_texts[sample_index], dtype=torch.long)
        label = torch.tensor([numericalized_labels[sample_index]], dtype=torch.long)

        optimizer.zero_grad()
        # unsqueeze(0) turns the 1-D index list into a single bag (one row)
        output = model(text.unsqueeze(0))
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
    # Average over the samples actually iterated in this loop
    print(f"Epoch {epoch+1}, Loss: {running_loss / num_train_samples}")

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1, Loss: 0.6328815091410231
Epoch 2, Loss: 0.3493325250976081


In [None]:
# Test loop: accuracy of the trained model on the held-out articles
model.eval()

correct_predictions = 0
total_samples = len(test_data_f)

with torch.no_grad():
    for tokens, label in test_data_f:
        # Drop tokens without a vocabulary index (the same policy predict_topic
        # uses) instead of failing with KeyError.
        token_ids = [vocab[token] for token in tokens if token in vocab]
        if not token_ids:
            # Nothing to feed the model; the sample stays counted as a miss
            continue
        text = torch.tensor(token_ids, dtype=torch.long)
        output = model(text.unsqueeze(0))
        predicted_label = torch.argmax(output).item()
        true_label = label_vocab[label]

        if predicted_label == true_label:
            correct_predictions += 1

# An empty test set reports 0% rather than raising ZeroDivisionError
accuracy = correct_predictions / total_samples if total_samples else 0.0
print(f"Test Accuracy: {accuracy * 100:.2f}%")


In [None]:
import torch
import torch.nn.functional as F

def predict_topic(model, article, vocab, label_vocab):
    """Predict the topic label of a raw article string.

    The article is lower-cased, word-tokenized, stripped of stopwords and of
    special characters; tokens that are not keys of ``vocab`` are discarded.

    Args:
        model: the classifier to query; it is switched to eval mode.
        article (str): raw article text.
        vocab (dict): token -> integer index.
        label_vocab (dict): label name -> class index.

    Returns:
        The key of ``label_vocab`` whose value equals the arg-max of the model output.

    Raises:
        ValueError: if no token of the article is present in ``vocab``.
    """
    # Same preprocessing chain as the training data
    tokens = remove_special_characters(remove_stopwords(word_tokenize(article.lower())))

    # Keep only tokens the vocabulary knows and turn them into indices
    known_ids = [vocab[token] for token in tokens if token in vocab]
    numericalized_tokens = torch.tensor(known_ids, dtype=torch.long)

    if len(numericalized_tokens) == 0:
        raise ValueError("All words in the article are unknown to the model.")

    # Inference only: eval mode and no gradient tracking
    model.eval()
    with torch.no_grad():
        output = model(numericalized_tokens.unsqueeze(0))

    # Map the winning class index back to its label name
    predicted_class_index = torch.argmax(output).item()
    topics = list(label_vocab.keys())
    class_indices = list(label_vocab.values())
    return topics[class_indices.index(predicted_class_index)]


In [None]:

article = "DaimlerChrysler's 2004 sales rise\nUS-German carmaker DaimlerChrysler has sold 2.1% more cars in 2004 than in the previous year, as solid Chrysler sales offset a weak showing for Mercedes.\nSales totalled 3.9 million units worldwide during 2004, the company said at the Detroit Motor Show. A switch to new models hit luxury marque Mercedes-Benz, with sales down 3.1% at 1.06 million. Chrysler avoided the fate of US rivals Ford and General Motors, both of whom lost ground to Japanese firms. Its sales rose 3.5% to 2.7 million units.\nSimilarly on the up was the Smart brand of compact cars, with the division's sales jumping by 21.1% during 2004 to 136,000. The future of the brand - which is controlled by the Mercedes group within DaimlerChrysler - remains in question, however. Smart has consistently lost money since it started trading in 1998, and new model launches are now 'on hold', said Mercedes chief executive Eckhard Cordes. In Europe, the Smart will now go on sale through regular Mercedes dealerships as well as its own dealer network, Mr Cordes said."

# predict_topic's signature is (model, article, vocab, label_vocab). Keyword
# arguments keep the call correct regardless of position; passing vocab where
# the article is expected fails on article.lower().
predicted_topic = predict_topic(model=model, article=article, vocab=vocab, label_vocab=label_vocab)
print(f"Predicted Topic: {predicted_topic}")


In [None]:
# Save the model: only its state_dict (the learned parameters) is written,
# to 'model_weights.pth' in the current working directory.
torch.save(model.state_dict(), 'model_weights.pth')

In [None]:
import pickle

# Pickle both lookup tables into the working directory: vocab first, then label_vocab
for pickle_name, mapping in (('vocab_data.pkl', vocab), ('label_vocab_data.pkl', label_vocab)):
    with open(pickle_name, 'wb') as fp:
        pickle.dump(mapping, fp)
        print('dictionary saved successfully to file')