# **1.Data Loading & Preprocessing**

Download the BBC News Dataset.

In [25]:
import pandas as pd
df = pd.read_csv('/content/sample_data/bbc-news-data.csv', sep='\t')
print(df.head())


   category filename                              title  \
0  business  001.txt  Ad sales boost Time Warner profit   
1  business  002.txt   Dollar gains on Greenspan speech   
2  business  003.txt  Yukos unit buyer faces loan claim   
3  business  004.txt  High fuel prices hit BA's profits   
4  business  005.txt  Pernod takeover talk lifts Domecq   

                                             content  
0   Quarterly profits at US media giant TimeWarne...  
1   The dollar has hit its highest level against ...  
2   The owners of embattled Russian oil giant Yuk...  
3   British Airways has blamed high fuel prices f...  
4   Shares in UK drinks and food firm Allied Dome...  


In [26]:
df['text']=df['title']+df['content']

**Lowercase**

In [27]:
df['text'] = df['text'].str.lower()


**Remove** **punctuation**

In [28]:
import re
df['text'] = df['text'].apply(lambda x: re.sub(r'[^\w\s]', '', str(x)))


In [29]:
import nltk
nltk.download('stopwords')
stopwords_set = set(nltk.corpus.stopwords.words('english'))
df['text'] = df['text'].apply(
    lambda x: ' '.join([word for word in str(x).split() if word not in stopwords_set])
)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


**stemming**

In [30]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
df['text'] = df['text'].apply(
    lambda x: ' '.join([stemmer.stem(w) for w in str(x).split()])
)


**Lemmatization**

In [31]:
import spacy


nlp = spacy.load("en_core_web_sm")


def lemmatize_text(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc])

df['text'] = df['text'].apply(lemmatize_text)

print("Lemmatization applied using spaCy.")

Lemmatization applied using spaCy.


# **2.Baseline Model (Algorithms & Models)**

**Convert text to TF-IDF vectors.**

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['text'])
y = df['category']


**spliting data into train and test**

In [33]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


**Naive Bayes classifier.**

In [34]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train, y_train)


**Evaluate with accuracy, precision, recall, and confusion matrix**

In [35]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='weighted'))
print("Recall:", recall_score(y_test, y_pred, average='weighted'))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.9842696629213483
Precision: 0.9844374310554768
Recall: 0.9842696629213483
Confusion Matrix:
 [[ 99   0   2   0   1]
 [  1  75   1   0   0]
 [  0   0  83   0   1]
 [  0   0   0 102   0]
 [  1   0   0   0  79]]


# **3.NLP Upgrade**

Using the pretrained DistilBERT (Hugging Face) for classification
i would notice that i use pytorch because tensorflow got some problem do to the new version of it.
so teh Fine-tune on the dataset is all in one cell
to Compare results with the baseline model i has to TRAIN this model on a down-stream task to be able to use it for predictions and inference.

In [36]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased', num_labels=5
)


label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(df['category'])


train_indices, test_indices, y_train_encoded, y_test_encoded = train_test_split(
    range(len(df)), y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

train_encodings = tokenizer(list(df['text'].iloc[train_indices]), truncation=True, padding=True)
test_encodings = tokenizer(list(df['text'].iloc[test_indices]), truncation=True, padding=True)


train_input_ids = torch.tensor(train_encodings['input_ids'])
train_attention_mask = torch.tensor(train_encodings['attention_mask'])
train_labels = torch.tensor(y_train_encoded)

test_input_ids = torch.tensor(test_encodings['input_ids'])
test_attention_mask = torch.tensor(test_encodings['attention_mask'])
test_labels = torch.tensor(y_test_encoded)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [37]:
import tensorflow as tf

train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    y_train
)).batch(16)
test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    y_test
)).batch(16)

**Results comparison**

In [41]:
# Evaluate the BERT model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()

bert_preds = []
with torch.no_grad():
    for i in range(0, len(test_input_ids), 16): # Using batch size 16 as in dataset creation
        batch_input_ids = test_input_ids[i:i+16].to(device)
        batch_attention_mask = test_attention_mask[i:i+16].to(device)
        outputs = model(batch_input_ids, attention_mask=batch_attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        bert_preds.extend(preds)

# Decode BERT predictions
bert_predicted_categories = label_encoder.inverse_transform(bert_preds)

# Get Naive Bayes results (assuming they are available from previous execution)
nb_accuracy = accuracy_score(y_test, y_pred)
nb_precision = precision_score(y_test, y_pred, average='weighted')
nb_recall = recall_score(y_test, y_pred, average='weighted')
nb_confusion_matrix = confusion_matrix(y_test, y_pred)

# Evaluate BERT model
bert_accuracy = accuracy_score(y_test, bert_predicted_categories)
bert_precision = precision_score(y_test, bert_predicted_categories, average='weighted')
bert_recall = recall_score(y_test, bert_predicted_categories, average='weighted')
bert_confusion_matrix = confusion_matrix(y_test, bert_predicted_categories)


print("Naive Bayes Model Results:")
print("Accuracy:", nb_accuracy)
print("Precision:", nb_precision)
print("Recall:", nb_recall)
print("Confusion Matrix:\n", nb_confusion_matrix)

print("\nBERT Model Results:")
print("Accuracy:", bert_accuracy)
print("Precision:", bert_precision)
print("Recall:", bert_recall)
print("Confusion Matrix:\n", bert_confusion_matrix)

Naive Bayes Model Results:
Accuracy: 0.9842696629213483
Precision: 0.9844374310554768
Recall: 0.9842696629213483
Confusion Matrix:
 [[ 99   0   2   0   1]
 [  1  75   1   0   0]
 [  0   0  83   0   1]
 [  0   0   0 102   0]
 [  1   0   0   0  79]]

BERT Model Results:
Accuracy: 0.1797752808988764
Precision: 0.032319151622269914
Recall: 0.1797752808988764
Confusion Matrix:
 [[  0   0   0   0 102]
 [  0   0   0   0  77]
 [  0   0   0   0  84]
 [  0   0   0   0 102]
 [  0   0   0   0  80]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# **4.Deployment (Mini Version)**

this app would'nt be run in the cell bellow because it sould run in .py file and should be alone in an aproperiate interpeter .
but i put it here to make all clear to navigate the whole code.

In [40]:
import streamlit as st
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from sklearn.preprocessing import LabelEncoder

# Load the trained PyTorch model and tokenizer
# Make sure the model and tokenizer are loaded in the previous cells
# For this example, we'll assume 'model', 'tokenizer', and 'label_encoder' are available
# You might need to save and load them properly in a real application

# Assuming the model, tokenizer, and label_encoder are available from previous cells
if 'model' not in globals() or 'tokenizer' not in globals() or 'label_encoder' not in globals():
    st.error("Model, tokenizer, or label encoder not found. Please run the previous cells.")
else:
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval() # Set model to evaluation mode

    st.title("BBC News Article Classifier (PyTorch)")
    input_text = st.text_area("Enter your news article here:")

    if st.button('Classify'):
        if input_text:
            # Preprocess the input text
            encoded_input = tokenizer(input_text, truncation=True, padding=True, return_tensors='pt')
            input_ids = encoded_input['input_ids'].to(device)
            attention_mask = encoded_input['attention_mask'].to(device)

            # Get prediction from the model
            with torch.no_grad():
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predicted_class_idx = torch.argmax(logits, dim=1).item()

            # Decode the predicted class
            predicted_category = label_encoder.inverse_transform([predicted_class_idx])[0]

            st.write(f"Predicted Category: {predicted_category}")
        else:
            st.warning("Please enter some text to classify.")



# **5.Ethics Check**
Potential bias in training data:
If the data is all BBC news or a specific type of news, the model may tend to classify unfairly or ignore unrepresented topics.

Risks of misclassification:
It may confuse political and sports news, or classify serious news as entertainment, thereby distorting its message or misleading the recipient.

How to mitigate:

Use as diverse data as possible.

Monitor the model's performance and ensure the distribution of classes.

Clearly communicate the model's limitations and constraints to users in the app or README.