In [3]:
#a. Basic Text Classification using Naive Bayes and Bag-of-Words

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import pandas as pd

df = pd.DataFrame({
     'text': ["Article about politics...", 
              "Sports news here...", 
              "Tech trends update...",
              "Healthcare tips for better living...",
              "New player signed in football league"],
     'category': ["politics", "sports", "technology", "health", "sports"]
})

X_train, X_test, y_train, y_test = train_test_split(df['text'], df['category'], test_size=0.2, random_state=42)

vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

clf = MultinomialNB()
clf.fit(X_train_counts, y_train)

y_pred = clf.predict(X_test_counts)
print(classification_report(y_test, y_pred))

sample = ["Vitamin D supplements may boost immune system."]
sample_counts = vectorizer.transform(sample)
pred = clf.predict(sample_counts)[0]
print("Prediction for sample:", pred)


              precision    recall  f1-score   support

      health       0.00      0.00      0.00       0.0
      sports       0.00      0.00      0.00       1.0

    accuracy                           0.00       1.0
   macro avg       0.00      0.00      0.00       1.0
weighted avg       0.00      0.00      0.00       1.0

Prediction for sample: health


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#  b. News Article Categorization using BERT 

In [1]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import TensorDataset, DataLoader
df=pd.read_csv('/kaggle/input/news-dataset/news_dataset.csv')
df.head()
# 2. Encode Labels
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['category'])

# 3. Initialize BERT Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# 4. Tokenize Text
def tokenize_text(texts, max_length=128):
    return tokenizer(
        texts.tolist(),
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

encoded_data = tokenize_text(df['text'])
# 5. Create Dataset (FIXED THIS LINE)
dataset = TensorDataset(
    encoded_data['input_ids'],
    encoded_data['attention_mask'],
    torch.tensor(df['label'].values)
)    
dataloader = DataLoader(dataset, batch_size=2)

# 6. Initialize BERT Model
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_encoder.classes_)
)

# 7. Simple Prediction Example
def predict(text, model, tokenizer, label_encoder):
    inputs = tokenizer(text, return_tensors='pt')
    with torch.no_grad():
        outputs = model(**inputs)
    pred = torch.argmax(outputs.logits).item()
    return label_encoder.inverse_transform([pred])[0]

# Test prediction
sample = "Last night's basketball game went into triple overtime."
print(f"Prediction: {predict(sample, model, tokenizer, label_encoder)}")

2025-10-13 16:28:14.392633: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1760372894.685590      37 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1760372894.764390      37 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/news-dataset/news_dataset.csv'

In [4]:
#c. Spam Detection in Email using LSTM Networks

In [8]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
df = pd.DataFrame({
    'text': [
        "Win a free iPhone now!", 
        "Meeting tomorrow at 9am", 
        "Lowest prices on medicines, buy today", 
        "Let's go for lunch", 
        "You won $1000 cash prize!"
    ],
    'label': [1,0,1,0,1]   # 1 = Spam, 0 = Not Spam
})
tok = Tokenizer(num_words=5000)
X = tok.texts_to_sequences(df['text'])
X = pad_sequences(X, maxlen=20)
y = df['label']

model = Sequential([
    Embedding(5000, 32, input_length=20),
    LSTM(32),
    Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X, y, epochs=5, verbose=0)
test = ["Congratulations! You've won a free trip"]
seq = pad_sequences(tok.texts_to_sequences(test), maxlen=20)
print("Prediction:", "Spam" if model.predict(seq)[0][0] > 0.5 else "Not Spam")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 246ms/step
Prediction: Spam
