<a href="https://colab.research.google.com/github/mithun415/Deep-Learning-Project/blob/main/Project_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional, GlobalMaxPooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Location of the training CSV (presumably a mounted Google Drive path in Colab).
file_path = "/content/drive/MyDrive/Deep Learning Project/Project 3/train.csv"

# Parse the whole CSV into a DataFrame; later cells read its
# 'question_text' and 'target' columns.
data = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Preview the first five rows as a quick sanity check of the loaded columns.
data.head(n=5)

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [None]:
# Model input (raw question text) and the binary label, taken straight from
# the loaded DataFrame.
questions, target = data['question_text'], data['target']

In [None]:
# Build the word index from every question; num_words caps the vocabulary
# size used when texts are later converted to id sequences.
tokenizer = Tokenizer(num_words=15_000)
tokenizer.fit_on_texts(texts=questions)

In [None]:
# Encode each question as a list of integer word ids using the fitted tokenizer.
sequences = tokenizer.texts_to_sequences(texts=questions)

In [None]:
# Fixed length (in tokens) every encoded question is padded or truncated to.
max_sequence_length = 80

# 'pre' padding/truncating are the library defaults, spelled out here so the
# behaviour is visible: short questions are zero-padded at the front, long
# ones lose their leading tokens.
padded_sequences = pad_sequences(
    sequences,
    maxlen=max_sequence_length,
    padding='pre',
    truncating='pre',
)

In [None]:
# Hold out 20% of the padded sequences for testing (fixed seed for
# reproducibility). The labels are heavily imbalanced -- the evaluation report
# further down shows roughly 245k negatives vs. 16k positives in the test
# split -- so stratify on the label to keep the positive rate the same in the
# train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    target,
    test_size=0.2,
    random_state=42,
    stratify=target,
)

In [None]:
# Read the vocabulary cap from the fitted tokenizer (created with
# num_words=15000) instead of repeating the literal, so the embedding table
# cannot silently drift out of sync if the tokenizer's cap is changed.
vocab_size = tokenizer.num_words

# Binary text classifier:
#   token ids -> 128-d embeddings -> bidirectional LSTM (64 units per
#   direction, one output per timestep) -> max over time -> dropout ->
#   64-unit ReLU layer -> dropout -> single sigmoid unit.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128),
    Bidirectional(LSTM(64, return_sequences=True)),  # full sequence is needed by the pooling layer
    GlobalMaxPooling1D(),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])


In [None]:
# Training configuration: Adam optimizer, binary cross-entropy loss, and
# accuracy as the only reported metric (so evaluate() returns [loss, accuracy]).
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Stop once validation loss stops improving and roll back to the best epoch.
# In the recorded 4-epoch run val_loss was lowest after epoch 2 (0.1071) and
# rose afterwards (0.1082 at epoch 4), so always training all 4 epochs kept a
# slightly overfit model.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=1,
    restore_best_weights=True,
)

history = model.fit(
    X_train, y_train,
    epochs=4,  # upper bound; early stopping may end training sooner
    batch_size=64,
    # Validate on a 10% slice carved out of the training arrays rather than on
    # the test split, so X_test / y_test stay unseen until the final
    # evaluation and are not used to pick the stopping epoch.
    validation_split=0.1,
    callbacks=[early_stopping],
)

# Note: the original full 4-epoch run took 3+ hours.

Epoch 1/4
[1m16327/16327[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4274s[0m 261ms/step - accuracy: 0.9506 - loss: 0.1335 - val_accuracy: 0.9566 - val_loss: 0.1105
Epoch 2/4
[1m16327/16327[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4289s[0m 263ms/step - accuracy: 0.9583 - loss: 0.1062 - val_accuracy: 0.9579 - val_loss: 0.1071
Epoch 3/4
[1m16327/16327[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4353s[0m 267ms/step - accuracy: 0.9612 - loss: 0.0989 - val_accuracy: 0.9580 - val_loss: 0.1073
Epoch 4/4
[1m16327/16327[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4296s[0m 260ms/step - accuracy: 0.9640 - loss: 0.0922 - val_accuracy: 0.9574 - val_loss: 0.1082


In [None]:
# Score the trained model on the held-out split; with the compiled metrics
# this yields two values: the loss followed by accuracy.
loss, accuracy = model.evaluate(x=X_test, y=y_test)

print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")

[1m8164/8164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m283s[0m 35ms/step - accuracy: 0.9570 - loss: 0.1087
Test Loss: 0.10821221768856049, Test Accuracy: 0.9574159979820251


In [None]:
# Sigmoid outputs for the held-out questions.
test_probabilities = model.predict(X_test)

# Threshold at 0.5 to obtain hard 0/1 labels, then print per-class
# precision / recall / F1 against the true labels.
y_pred = (test_probabilities > 0.5).astype("int32")
report = classification_report(y_test, y_pred)
print(report)

[1m8164/8164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m275s[0m 34ms/step
              precision    recall  f1-score   support

           0       0.97      0.98      0.98    245369
           1       0.68      0.56      0.62     15856

    accuracy                           0.96    261225
   macro avg       0.83      0.77      0.80    261225
weighted avg       0.95      0.96      0.96    261225



In [None]:
# Persist the trained model to disk. NOTE(review): the ".h5" extension
# presumably selects Keras' HDF5 format -- confirm loaders expect this file.
model.save(filepath="quora_spam_detection_model.h5")



In [None]:
import pickle

# Serialize the fitted tokenizer alongside the model so inference code can
# reproduce the exact same word-to-id mapping.
with open("tokenizer.pickle", mode="wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

# Confirmation message (the model file itself is written by an earlier cell).
print("Model and tokenizer saved successfully.")

Model and tokenizer saved successfully.


In [None]:
# Smoke-test the trained model on two hand-written questions.
new_questions = [
    "Why does Quora moderation hate conservatives?",
    "How big is the blue whale?"
]

# Apply the same preprocessing as for training: tokenizer ids, then padding
# to max_sequence_length.
new_sequences = tokenizer.texts_to_sequences(new_questions)
new_padded_sequences = pad_sequences(new_sequences, maxlen=max_sequence_length)

# One sigmoid score per question.
predictions = model.predict(new_padded_sequences)

# Scores above 0.5 are reported as "Spam", everything else as "Not Spam".
for question, prediction in zip(new_questions, predictions):
    if prediction > 0.5:
        label = "Spam"
    else:
        label = "Not Spam"
    print(f"Question: {question}\nPrediction: {label}\n")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
Question: Why does Quora moderation hate conservatives?
Prediction: Spam

Question: How big is the blue whale?
Prediction: Not Spam

