In [None]:
pip install tensorflow nltk scikit-learn

In [2]:
import numpy as np
import nltk
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Sample documents (sports vs technology)
documents = [
    "The football match was thrilling and intense!",
    "A new football player just broke a world record for goals.",
    "The basketball team celebrated after winning the final match.",
    "Baseball season is starting soon, and fans are excited.",
    "The soccer game was a nail-biting experience.",
    "Athletes are training hard for the upcoming tournament.",
    "The tennis match was fast-paced with many exciting rallies.",
    "The volleyball championship will be held next month.",
    "Cycling competitions are becoming more popular worldwide.",
    "Olympic athletes are preparing for the next games.",
    "The football team is aiming for the championship.",
    "Basketball players are working on their shooting skills.",
    "Tennis stars are competing in major tournaments.",
    "Soccer players are training in warm-weather camps.",
    "The sports equipment industry is booming.",
    
    "The new smartphone features amazing AI capabilities.",
    "Artificial intelligence is shaping the future of technology.",
    "Quantum computing is revolutionizing the tech industry.",
    "The tech conference introduced several new innovations.",
    "New advancements in 5G technology will change the world.",
    "Electric vehicles are becoming a popular choice among consumers.",
    "The tech startup just raised millions in funding for their app.",
    "Blockchain technology is being integrated into various industries.",
    "Virtual reality is becoming mainstream in entertainment.",
    "Wearable tech like smartwatches is growing rapidly.",
    "Cloud computing is transforming the IT industry.",
    "Artificial intelligence is used in self-driving cars.",
    "Blockchain is disrupting industries like finance and supply chain.",
    "Smart cities are using technology to improve infrastructure.",
    "The tech world is excited about the potential of quantum computing."
]

# Labels (1 for Sports, 0 for Technology)
labels = [1] * 15 + [0] * 15  # 15 Sports and 15 Technology labels

# Initialize the Tokenizer
#Tokenization and Padding: Using Tokenizer from Keras and proper padding for sequences.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(documents)

# Convert texts to sequences (tokens)
sequences = tokenizer.texts_to_sequences(documents)

# Pad the sequences so they all have the same length
X = pad_sequences(sequences, padding='post')

# Encode labels (0 for Technology, 1 for Sports)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Build the LSTM model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=50, input_length=X.shape[1]))  # Embedding layer
model.add(LSTM(64, return_sequences=False))  # LSTM layer

#Dropout for Overfitting: Added dropout layers to avoid overfitting during training.
model.add(Dropout(0.5))  # Dropout to prevent overfitting
model.add(Dense(1, activation='sigmoid'))  # Output layer (binary classification)

# Compile the model
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

# Train the model
#Training for More Epochs: We increased the number of epochs to 10 to improve the learning process.
model.fit(X_train, y_train, epochs=10, batch_size=2, validation_data=(X_test, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"LSTM Model Accuracy: {accuracy * 100:.2f}%")

# Example prediction
sample_text = "AI technology is transforming the way we live and work."
sample_sequence = tokenizer.texts_to_sequences([sample_text])
sample_padded = pad_sequences(sample_sequence, padding='post', maxlen=X.shape[1])

predicted_label = model.predict(sample_padded)
category = "Sports" if predicted_label >= 0.5 else "Technology"
print(f"\nThe document is categorized as: {category}")


Epoch 1/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 47ms/step - accuracy: 0.5862 - loss: 0.6908 - val_accuracy: 0.4000 - val_loss: 0.7001
Epoch 2/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.4611 - loss: 0.6891 - val_accuracy: 0.4000 - val_loss: 0.6992
Epoch 3/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.5649 - loss: 0.6784 - val_accuracy: 0.4000 - val_loss: 0.6999
Epoch 4/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.6798 - loss: 0.6570 - val_accuracy: 0.4000 - val_loss: 0.6940
Epoch 5/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.6817 - loss: 0.5880 - val_accuracy: 0.6000 - val_loss: 0.5950
Epoch 6/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 1.0000 - loss: 0.2012 - val_accuracy: 0.8000 - val_loss: 0.3761
Epoch 7/10
[1m10/10[0m [32m━━━━