# Stack Overflow Tag Predictor
Predict which of the 10 most frequent tags apply to a Stack Overflow question (multi-label classification) using an LSTM.

Might try transformers or GRU later — this is a basic prototype.

In [None]:
# Mount Google Drive (because dataset is big and already in Drive).
# Later cells read the zipped dataset from /content/drive/MyDrive and
# write the trained model back to the same location.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Extract the StackSample archive (uploaded to Drive by hand) into
# /content/stacksample, where the following cells expect the CSV files.
import zipfile

zip_path = '/content/drive/MyDrive/stacksample.zip'
with zipfile.ZipFile(zip_path) as archive:  # read mode is the default
    archive.extractall(path='/content/stacksample')

In [None]:
# Read the question and tag tables; Answers.csv is not used yet.
import pandas as pd

# Both files are read with the same options.
read_opts = {'encoding': 'latin1'}
questions = pd.read_csv('/content/stacksample/Questions.csv', **read_opts)
tags = pd.read_csv('/content/stacksample/Tags.csv', **read_opts)

In [None]:
# Restrict the problem to the 10 most frequent tags for now;
# the cutoff can be raised once the pipeline works end to end.
tag_frequency = tags['Tag'].value_counts()
top_tags = tag_frequency.nlargest(10).index.tolist()

# Keep only the (question, tag) rows whose tag is one of the selected ones.
keep_row = tags['Tag'].isin(top_tags)
tags = tags[keep_row]
print("Top tags selected:", top_tags)

In [None]:
# Attach tags to their questions. The inner join (the pandas default) yields
# one row per (question, tag) pair and drops questions with no matching tag row.
questions_tags = pd.merge(questions, tags, how='inner', on='Id')
# Force plain strings so every tag label has the same type downstream.
questions_tags['Tag'] = questions_tags['Tag'].astype(str)

In [None]:
# Collect each question's tags into a single list (the multi-label target).
tag_lists = (
    questions_tags
    .groupby('Id')['Tag']
    .apply(list)
)

# Select the question rows in the same Id order as tag_lists so that the
# texts and the labels stay aligned row for row.
questions_text = (
    questions
    .drop_duplicates(subset='Id')
    .set_index('Id')
    .loc[tag_lists.index]
)

In [None]:
# Basic text cleaning — may improve with more advanced preprocessing later
import re
def clean_text(text):
    """Return *text* lower-cased with HTML tags and non-letters blanked out.

    The input is passed through ``str()`` first, so non-string values are
    accepted. Each HTML tag and each character outside ``a-z``/``A-Z`` is
    replaced by a single space; whitespace is not collapsed.

    Note: digits and symbols are removed as well, so tokens such as
    ``c++`` and ``c#`` both end up as ``c``.
    """
    # Replace tags with a space rather than nothing: otherwise the words on
    # either side of a tag are glued together ("<p>foo</p><p>bar</p>" would
    # become "foobar" instead of two separate tokens).
    text = re.sub(r'<.*?>', ' ', str(text))
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    return text.lower()

# Build the model input from title + body.
# Missing values are replaced by '' first: string concatenation with NaN
# yields NaN, which clean_text() would stringify into the bogus token "nan".
questions_text['Text'] = (
    questions_text['Title'].fillna('') + ' ' + questions_text['Body'].fillna('')
).apply(clean_text)

In [None]:
# Encode each question's tag list as a multi-hot row vector.
from sklearn.preprocessing import MultiLabelBinarizer

# Passing classes= fixes the column order to the order of top_tags.
mlb = MultiLabelBinarizer(classes=top_tags)
mlb.fit(tag_lists)
y = mlb.transform(tag_lists)
print("Label shape:", y.shape)  # expected: (num_samples, 10)

In [None]:
# Turn the cleaned text into fixed-length integer sequences.
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

corpus = questions_text['Text']

# Vocabulary is capped via num_words when texts are converted to sequences.
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(corpus)

X_seq = tokenizer.texts_to_sequences(corpus)
# Pad/truncate at the front (the Keras defaults, spelled out here);
# might try maxlen=500 later.
X_pad = pad_sequences(X_seq, maxlen=300, padding='pre', truncating='pre')
print("Padded sequence shape:", X_pad.shape)

In [None]:
# Hold out 20% of the samples for validation (fixed seed for repeatability).
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X_pad,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Build the model — LSTM first
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Take the sizes from the objects prepared in the earlier cells instead of
# repeating the literals, so changing the tokenizer vocabulary, the padding
# length or the number of tags cannot leave the model out of sync.
vocab_size = tokenizer.num_words  # size of the embedding table
seq_len = X_pad.shape[1]          # padded sequence length
num_tags = y.shape[1]             # one output unit per tag

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=seq_len))
model.add(LSTM(64))  # could test GRU(64) here too (GRU would need importing)
# One independent sigmoid per tag: multi-label output.
model.add(Dense(num_tags, activation='sigmoid'))

# Ask for binary accuracy explicitly. With a multi-unit output and multi-hot
# targets, the generic 'accuracy' shortcut is resolved by Keras to categorical
# accuracy (an argmax comparison), which is not meaningful for multi-label.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['binary_accuracy'])
model.summary()

In [None]:
# Short 3-epoch run, just to check the pipeline quickly.
model.fit(
    X_train,
    y_train,
    batch_size=128,
    epochs=3,
    validation_data=(X_val, y_val),
)

In [None]:
# Save to Drive — in case we want to load and fine-tune later.
# NOTE(review): only the Keras model is written here; the fitted tokenizer and
# the tag order (top_tags / mlb) are not saved in the visible cells — confirm
# whether they should be persisted too before reusing the model elsewhere.
model.save('/content/drive/MyDrive/tag_predictor_model_v1.h5')

### Notes
- Can add pretrained GloVe embeddings later
- Might try transformers if this does well
- Could integrate question score or answer count as features too
