In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_20newsgroups
import seaborn as sns
import warnings
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import re
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV


In [4]:
warnings.filterwarnings("ignore")
nlp = spacy.load('en_core_web_sm')

In [5]:
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
df_newsgroups = pd.DataFrame({
    'Category': newsgroups.target,
    'Text': newsgroups.data    
})

df_newsgroups.head()

Unnamed: 0,Category,Text
0,10,\n\nI am sure some bashers of Pens fans are pr...
1,3,My brother is in the market for a high-perform...
2,17,\n\n\n\n\tFinally you said what you dream abou...
3,3,\nThink!\n\nIt's the SCSI card doing the DMA t...
4,4,1) I have an old Jasmine drive which I cann...


In [6]:
def clean_text(text):
    text = BeautifulSoup(text, "html.parser").get_text()

    text = re.sub(r'[^a-zA-Z\s]', '', text)

    text = text.lower()

    text = ' '.join([word for word in text.split() if word not in ENGLISH_STOP_WORDS])

    return text
df_newsgroups['cleaned_text'] = df_newsgroups['Text'].apply(clean_text)

In [7]:
def lemmatize_text(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])

df_newsgroups['lemmatized_text'] = df_newsgroups['cleaned_text'].apply(lemmatize_text)

print(df_newsgroups[['cleaned_text', 'lemmatized_text']].head())

                                        cleaned_text  \
0  sure bashers pens fans pretty confused lack ki...   
1  brother market highperformance video card supp...   
2  finally said dream mediterranean new area grea...   
3  think scsi card doing dma transfers disks scsi...   
4  old jasmine drive use new understanding upsate...   

                                     lemmatized_text  
0  sure basher pen fan pretty confused lack kind ...  
1  brother market highperformance video card supp...  
2  finally say dream mediterranean new area great...  
3  think scsi card dma transfer disk scsi card dm...  
4  old jasmine drive use new understanding upsate...  


In [10]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

vocab_size = 20000  
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(df_newsgroups['lemmatized_text'])

X_seq = tokenizer.texts_to_sequences(df_newsgroups['lemmatized_text'])

max_length = 200  
X_pad = pad_sequences(X_seq, maxlen=max_length)

label_encoder = LabelEncoder()
y_enc = label_encoder.fit_transform(df_newsgroups['Category'])


In [12]:
from sklearn.model_selection import train_test_split

X_train, X_temp, y_train, y_temp = train_test_split(X_pad, y_enc, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


In [18]:
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=10,  
    batch_size=32,  
    verbose=1
)


Epoch 1/10
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m221s[0m 496ms/step - accuracy: 0.0792 - loss: 2.8923 - val_accuracy: 0.1677 - val_loss: 2.3745
Epoch 2/10
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 348ms/step - accuracy: 0.1870 - loss: 2.3059 - val_accuracy: 0.2727 - val_loss: 2.1459
Epoch 3/10
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 245ms/step - accuracy: 0.3309 - loss: 1.8951 - val_accuracy: 0.3919 - val_loss: 1.7564
Epoch 4/10
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 187ms/step - accuracy: 0.4772 - loss: 1.4568 - val_accuracy: 0.5055 - val_loss: 1.5380
Epoch 5/10
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 184ms/step - accuracy: 0.5993 - loss: 1.1355 - val_accuracy: 0.5607 - val_loss: 1.4861
Epoch 6/10
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 186ms/step - accuracy: 0.6870 - loss: 0.9138 - val_accuracy: 0.5886 - val_loss: 1.4551
Epoch 7

In [24]:
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
    Bidirectional(LSTM(128, return_sequences=False)),
    Dropout(0.6),  # Increased from 0.5
    Dense(64, activation='relu'),
    Dropout(0.6),
    Dense(20, activation='softmax')
])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=10, 
    batch_size=32,  
    verbose=1
)

Epoch 1/10
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 188ms/step - accuracy: 0.0736 - loss: 2.9441 - val_accuracy: 0.2048 - val_loss: 2.2588
Epoch 2/10
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 184ms/step - accuracy: 0.2063 - loss: 2.2649 - val_accuracy: 0.3414 - val_loss: 1.9037
Epoch 3/10
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 183ms/step - accuracy: 0.3465 - loss: 1.7878 - val_accuracy: 0.4599 - val_loss: 1.6361
Epoch 4/10
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 182ms/step - accuracy: 0.4627 - loss: 1.4473 - val_accuracy: 0.5122 - val_loss: 1.5375
Epoch 5/10
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 183ms/step - accuracy: 0.5552 - loss: 1.2211 - val_accuracy: 0.5302 - val_loss: 1.4930
Epoch 6/10
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 182ms/step - accuracy: 0.6325 - loss: 1.0221 - val_accuracy: 0.5493 - val_loss: 1.5798
Epoch 7/10

In [25]:
from tensorflow.keras.regularizers import l2

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
    Bidirectional(LSTM(128, kernel_regularizer=l2(0.01), return_sequences=False)),  # L2 regularization
    Dropout(0.5),
    Dense(64, activation='relu', kernel_regularizer=l2(0.01)),
    Dropout(0.5),
    Dense(20, activation='softmax')
])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=10,  
    batch_size=32,  
    verbose=1
)


Epoch 1/10
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 193ms/step - accuracy: 0.0541 - loss: 4.0612 - val_accuracy: 0.0456 - val_loss: 2.9937
Epoch 2/10
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 188ms/step - accuracy: 0.0508 - loss: 2.9917 - val_accuracy: 0.0456 - val_loss: 2.9927
Epoch 3/10
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 186ms/step - accuracy: 0.0519 - loss: 2.9904 - val_accuracy: 0.0456 - val_loss: 2.9919
Epoch 4/10
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 186ms/step - accuracy: 0.0547 - loss: 2.9917 - val_accuracy: 0.0456 - val_loss: 2.9922
Epoch 5/10
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 187ms/step - accuracy: 0.0557 - loss: 2.9906 - val_accuracy: 0.0456 - val_loss: 2.9922
Epoch 6/10
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 186ms/step - accuracy: 0.0575 - loss: 2.9925 - val_accuracy: 0.0456 - val_loss: 2.9923
Epoch 7/10

In [28]:
import tensorflow as tf
from tensorflow.keras.layers import Layer

class Attention(Layer):
    def __init__(self, **kwargs):
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        self.W = self.add_weight(name="att_weight", shape=(input_shape[-1], 1), initializer="normal")
        self.b = self.add_weight(name="att_bias", shape=(1,), initializer="zeros")
        super(Attention, self).build(input_shape)

    def call(self, x):
        e = tf.nn.tanh(tf.matmul(x, self.W) + self.b)  
        a = tf.nn.softmax(e, axis=1)  
        output = x * a  
        return tf.reduce_sum(output, axis=1)  


In [32]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, Dropout

embedding_dim = 100
input_seq = Input(shape=(max_length,))

x = Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length)(input_seq)
x = Bidirectional(LSTM(128, return_sequences=True))(x)  
x = Attention()(x)  
x = Dropout(0.5)(x)
x = Dense(64, activation='relu')(x)
x = Dropout(0.5)(x)
output = Dense(20, activation='softmax')(x)

model = Model(inputs=input_seq, outputs=output)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=10, 
    batch_size=32, 
    verbose=1
)


Epoch 1/10
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 205ms/step - accuracy: 0.0627 - loss: 2.9528 - val_accuracy: 0.2264 - val_loss: 2.3968
Epoch 2/10
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 201ms/step - accuracy: 0.2408 - loss: 2.2789 - val_accuracy: 0.4468 - val_loss: 1.6539
Epoch 3/10
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 203ms/step - accuracy: 0.4748 - loss: 1.5105 - val_accuracy: 0.5617 - val_loss: 1.3494
Epoch 4/10
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 202ms/step - accuracy: 0.6261 - loss: 1.1081 - val_accuracy: 0.5865 - val_loss: 1.2733
Epoch 5/10
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 203ms/step - accuracy: 0.7055 - loss: 0.8737 - val_accuracy: 0.6031 - val_loss: 1.2970
Epoch 6/10
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 202ms/step - accuracy: 0.7816 - loss: 0.6884 - val_accuracy: 0.6314 - val_loss: 1.3306
Epoch 7/10