In [1]:
import pandas as pd

# Toy corpus: three short messages, each tagged as 'spam' or 'ham'.
data = dict(
    text=[
        'Win a free iPhone now!',
        'Your bill is due tomorrow',
        'Claim your lottery prize',
    ],
    label=['spam', 'ham', 'spam'],
)

# One row per message; the dict keys become the column names.
df = pd.DataFrame.from_dict(data)
print(df)

                        text label
0     Win a free iPhone now!  spam
1  Your bill is due tomorrow   ham
2   Claim your lottery prize  spam


In [2]:
import re
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus only when it is not installed yet, so that
# re-running the notebook does not contact the download server (and print the
# download log) on every execution.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Stored as a set so the per-word membership test in preprocess_text is cheap.
stop_words = set(stopwords.words('english'))

def preprocess_text(text, stopword_set=None):
    """Normalise one message before vectorisation / tokenisation.

    The text is lower-cased, every non-word character (anything matched by
    the regex ``\\W``, i.e. not a letter, digit or underscore) is replaced by
    a space, stop words are dropped, and the remaining words are joined with
    single spaces.

    Args:
        text: The raw message. ``None`` and NaN/NA (what pandas hands over
            for missing cells) are treated as an empty message; any other
            non-string value is converted with ``str()``.
        stopword_set: Optional collection of lower-case words to remove.
            When omitted, the module-level ``stop_words`` set is used.

    Returns:
        str: The cleaned text; an empty string if nothing is left.
    """
    if stopword_set is None:
        stopword_set = stop_words

    if not isinstance(text, str):
        # Series.apply passes None/NaN for missing cells; calling .lower()
        # on those would raise AttributeError.
        if text is None or pd.isna(text):
            return ''
        text = str(text)

    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\W', ' ', text)  # Replace special characters with spaces
    # split() with no argument also collapses the runs of spaces created above.
    return ' '.join(word for word in text.split() if word not in stopword_set)

# Run every raw message through preprocess_text and keep the result in a new
# column, then show raw and cleaned text side by side for a visual check.
df['clean_text'] = df['text'].map(preprocess_text)
print(df.loc[:, ['text', 'clean_text']])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\navin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


                        text           clean_text
0     Win a free iPhone now!      win free iphone
1  Your bill is due tomorrow    bill due tomorrow
2   Claim your lottery prize  claim lottery prize


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Split data first. The vocabulary must be learned from the training rows
# only; fitting the vectorizer on the full corpus before splitting leaks
# held-out words into the feature space.
# NOTE: with the 3-row sample frame, test_size=0.2 leaves a single test
# message, so the reported accuracy can only be 0.0 or 1.0.
text_train, text_test, y_train, y_test = train_test_split(
    df['clean_text'], df['label'], test_size=0.2, random_state=42
)

# Convert text into numerical vectors: fit on the training text, then apply
# the same (frozen) vocabulary to the held-out text.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Full-corpus matrix under the training vocabulary, kept so that any code
# that still refers to `X` keeps working.
X = vectorizer.transform(df['clean_text'])

# Train Naïve Bayes model
model = MultinomialNB()
model.fit(X_train, y_train)

# Predict on test data
y_pred = model.predict(X_test)

# Accuracy
print("Model Accuracy:", accuracy_score(y_test, y_pred))


Model Accuracy: 0.0


In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenization: build the word index from the cleaned messages. The vocabulary
# is capped via num_words=5000 and "<OOV>" stands in for out-of-vocabulary words.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=5000)
tokenizer.fit_on_texts(df['clean_text'])

# Convert each message to a list of integer word ids, then pad at the end
# ('post') so every row has the same length of 10.
X_seq = tokenizer.texts_to_sequences(df['clean_text'])
X_padded = pad_sequences(sequences=X_seq, padding='post', maxlen=10)


In [5]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Define model: embedding -> two stacked LSTMs -> single sigmoid output.
# NOTE: this rebinds `model`, replacing the Naive Bayes classifier trained in
# the earlier cell.
model = Sequential([
    # 5000 matches the tokenizer's num_words cap. `input_length` is omitted:
    # it is deprecated in Keras 3 and the sequence length is taken from the
    # input data.
    Embedding(5000, 16),
    # return_sequences=True emits one vector per timestep, which the second
    # LSTM needs as its input.
    LSTM(32, return_sequences=True),
    LSTM(16),
    Dense(1, activation='sigmoid')
])

# Compile model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Encode the labels explicitly as integers. Series.replace on an object column
# relies on implicit downcasting (deprecated in pandas 2.2) and can leave an
# object-dtype result that Keras cannot convert to a tensor. With map(), an
# unexpected label becomes NaN and astype() then fails loudly.
y_binary = df['label'].map({'spam': 1, 'ham': 0}).astype('int32').to_numpy()

# Train model
model.fit(X_padded, y_binary, epochs=5, batch_size=2)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x1a2aedc5d90>