# Quora Spam Detection - Deep Learning Project
This notebook builds a spam detection model for Quora questions using pre-trained GloVe word embeddings and an LSTM classifier.

In [None]:
# !pip install pandas numpy scikit-learn keras tensorflow

In [None]:
import pandas as pd
import numpy as np
import re
import os
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.initializers import Constant
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Prompt for a file upload when running on Google Colab. Outside Colab the
# `google.colab` package is not importable, so skip the upload and rely on the
# data files already being present in the working directory; `uploaded` is
# then an empty dict so the name is still defined.
try:
    from google.colab import files
except ImportError:
    uploaded = {}
else:
    uploaded = files.upload()

In [None]:
# Load the training data, discard rows with no question text, and make sure
# the question column holds plain Python strings.
df = (
    pd.read_csv('train.csv')
    .dropna(subset=['question_text'])
    .assign(question_text=lambda frame: frame['question_text'].astype(str))
)

In [None]:
def clean_text(text):
    """Normalise a question string before tokenisation.

    The text is lower-cased. Apostrophes are dropped so contractions remain a
    single token ("don't" -> "dont"). Every other character outside
    ``a-z``, ``0-9`` and whitespace is replaced by a space rather than deleted,
    so words joined only by punctuation ("spam,ham") are not fused into one
    token. Runs of whitespace are then collapsed to a single space and the
    result is stripped.

    Args:
        text: The raw question string.

    Returns:
        The cleaned string, containing only ``a-z``, ``0-9`` and single spaces.
    """
    text = text.lower()
    # Remove apostrophes without inserting a separator (keeps "dont", "whats").
    text = re.sub(r"['’]", "", text)
    # Any other punctuation/symbol acts as a word boundary.
    text = re.sub(r"[^a-z0-9\s]", " ", text)
    return re.sub(r"\s+", " ", text).strip()

# Store a cleaned copy of every question next to the raw text.
df['clean_text'] = df['question_text'].map(clean_text)

In [None]:
# Vocabulary cap and fixed sequence length shared by the tokenizer, the
# padding step and the model's embedding layer.
MAX_NUM_WORDS = 20000
MAX_SEQUENCE_LENGTH = 100

# Build the word -> integer index from the cleaned questions.
# NOTE(review): the tokenizer is fit on every row, before the train/validation
# split, so validation text contributes to the vocabulary.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
cleaned_questions = df['clean_text']
tokenizer.fit_on_texts(cleaned_questions)

# Encode each question as a list of word indices and bring all of them to
# MAX_SEQUENCE_LENGTH entries.
sequences = tokenizer.texts_to_sequences(cleaned_questions)
X = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Labels taken from the `target` column as a NumPy array.
y = df['target'].values

In [None]:
# Hold out 20% of the rows for validation with a fixed seed. Stratify on the
# label so the training and validation sets keep the same class ratio
# (presumably the spam class is a small minority — verify against the data).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Pre-trained GloVe vectors; the file glove.6B.100d.txt must be uploaded /
# present in the working directory.
glove_path = 'glove.6B.100d.txt'

# Parse the GloVe text file: on each line the first whitespace-separated field
# is the token and the remaining fields are its vector components.
embeddings_index = {}
with open(glove_path, encoding='utf8') as glove_file:
    for raw_line in glove_file:
        fields = raw_line.split()
        embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

embedding_dim = 100
word_index = tokenizer.word_index
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)

# Row i of the matrix holds the GloVe vector for the word whose tokenizer
# index is i. Words with no GloVe entry keep an all-zero row, and indices at
# or beyond MAX_NUM_WORDS are skipped.
embedding_matrix = np.zeros((num_words, embedding_dim))
for token, idx in word_index.items():
    if idx < MAX_NUM_WORDS:
        vector = embeddings_index.get(token)
        if vector is not None:
            embedding_matrix[idx] = vector

In [None]:
# Architecture: frozen embedding layer initialised from the GloVe matrix,
# followed by one 64-unit LSTM and a single sigmoid unit for the binary label.
model = Sequential([
    Embedding(
        num_words,
        embedding_dim,
        embeddings_initializer=Constant(embedding_matrix),
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=False,  # keep the pre-trained vectors fixed during training
    ),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
# Train for 4 epochs with batches of 128 samples, reporting metrics on the
# validation split alongside the training metrics.
model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=4,
    batch_size=128,
)

In [None]:
# Turn the sigmoid outputs into hard 0/1 labels using a 0.5 threshold.
val_probabilities = model.predict(X_val)
y_pred = (val_probabilities > 0.5).astype(int)

# Report the per-class metrics, then the confusion matrix, on the validation set.
print(classification_report(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))