<a href="https://colab.research.google.com/github/mithun415/Deep-Learning-Project/blob/main/Draft_FINAL_Project_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

In [2]:
questions_path = '/content/drive/MyDrive/Deep Learning Project/Project 1/Questions.csv'
tags_path = '/content/drive/MyDrive/Deep Learning Project/Project 1/Tags.csv'

In [3]:
questions_df = pd.read_csv(questions_path, usecols=['Id', 'Title', 'Body'], encoding='latin-1')
tags_df = pd.read_csv(tags_path, usecols=['Id', 'Tag'], encoding='latin-1')

In [4]:
data = questions_df.merge(tags_df, on='Id', how='inner')

In [5]:
# Sample of the data to reduce memory usage because colab keeps crashing.
data = data.sample(frac=0.5, random_state=42).reset_index(drop=True)

In [6]:
data['text'] = data['Title'].fillna('') + " " + data['Body'].fillna('')

In [7]:
data['Tag'] = data['Tag'].astype(str).apply(lambda x: [x])

In [8]:
data['Tag']

Unnamed: 0,Tag
0,[missing-data]
1,[c]
2,[regex]
3,[symfony2]
4,[ajax]
...,...
1875492,[model-view-controller]
1875493,[android]
1875494,[c++]
1875495,[mysql]


In [9]:
data = data.groupby('Id').agg({
    'text': 'first',
    'Tag': lambda x: list(np.unique(x.astype(str).tolist()))  # Convert to list and handle single values
}).reset_index()

In [10]:
data

Unnamed: 0,Id,text,Tag
0,80,SQLStatement.execute() - multiple queries in o...,"[['actionscript-3'], ['air'], ['flex']]"
1,90,Good branching and merging tutorials for Torto...,"[['branch'], ['svn']]"
2,120,ASP.NET Site Maps <p>Has anyone got experience...,"[['sitemap'], ['sql']]"
3,180,Function for creating color wheels <p>This is ...,"[['color-space'], ['colors'], ['language-agnos..."
4,330,Should I use nested classes in this case? <p>I...,"[['class'], ['oop']]"
...,...,...,...
1040339,40143190,How to execute multiline python code from a ba...,[['multiline']]
1040340,40143300,Bigquery.Jobs.Insert - Resumable Upload? <p>Th...,[['google-bigquery']]
1040341,40143340,Obfuscating code in android studio <p>Under mi...,"[['android'], ['android-studio']]"
1040342,40143360,How to fire function after v-model change? <p>...,[['javascript']]


In [11]:
texts = data['text'].values
tags = data['Tag'].values

In [12]:
max_words = 10000
max_len = 100
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)

In [13]:
text_sequences = tokenizer.texts_to_sequences(texts)

In [14]:
text_padded = pad_sequences(text_sequences, maxlen=max_len, padding='post')

In [15]:
tag_counts = pd.Series(np.concatenate(tags)).value_counts()
tag_counts

Unnamed: 0,count
['javascript'],62183
['java'],57604
['c#'],50383
['php'],49076
['android'],45320
...,...
['invalid-pointer'],1
['cdonts'],1
['perl-hash'],1
['datatip'],1


In [16]:
top_tags = tag_counts.index[:10]  # Limit to top 10 tags to reduce RAM usage in colab
tags = [[tag for tag in tag_list if tag in top_tags] for tag_list in tags]

In [17]:
top_tags

Index(['['javascript']', '['java']', '['c#']', '['php']', '['android']',
       '['jquery']', '['python']', '['html']', '['c++']', '['ios']'],
      dtype='object')

In [18]:
mlb = MultiLabelBinarizer()
tags_encoded = mlb.fit_transform(tags)

In [19]:
text_padded = np.array(text_padded, dtype=np.int32)
tags_encoded = np.array(tags_encoded, dtype=np.float32)

In [20]:
X_train, X_test, y_train, y_test = train_test_split(text_padded, tags_encoded, test_size=0.2, random_state=42)

In [21]:
embedding_dim = 128
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(max_words, embedding_dim),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(tags_encoded.shape[1], activation='sigmoid')
])

model.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
    metrics=['accuracy']
)

In [22]:
# Train the model
batch_size = 64
epochs = 5

history = model.fit(
    X_train, y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(X_test, y_test)
)

# 8 hours+ runtime

Epoch 1/5
[1m13005/13005[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6156s[0m 473ms/step - accuracy: 0.2250 - loss: 0.1279 - val_accuracy: 0.3074 - val_loss: 0.0914
Epoch 2/5
[1m13005/13005[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6343s[0m 484ms/step - accuracy: 0.3065 - loss: 0.0918 - val_accuracy: 0.3136 - val_loss: 0.0893
Epoch 3/5
[1m13005/13005[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6395s[0m 492ms/step - accuracy: 0.3137 - loss: 0.0887 - val_accuracy: 0.3110 - val_loss: 0.0885
Epoch 4/5
[1m13005/13005[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6383s[0m 487ms/step - accuracy: 0.3161 - loss: 0.0869 - val_accuracy: 0.3095 - val_loss: 0.0884
Epoch 5/5
[1m13005/13005[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6321s[0m 486ms/step - accuracy: 0.3169 - loss: 0.0851 - val_accuracy: 0.3112 - val_loss: 0.0884


In [23]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

[1m6503/6503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m662s[0m 102ms/step - accuracy: 0.3105 - loss: 0.0884
Test Loss: 0.08840310573577881
Test Accuracy: 0.3111996352672577


In [24]:
import pickle

with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

with open('mlb.pkl', 'wb') as f:
    pickle.dump(mlb, f)

model.save('stackoverflow_lstm_model_edit.h5')



In [26]:
def predict_tags(texts):
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(sequences, maxlen=max_len, padding='post')
    predictions = model.predict(padded)
    predicted_tags_list = []
    for prediction in predictions:
        predicted_tags = [mlb.classes_[i] for i in range(len(prediction)) if prediction[i] > 0.1]
        predicted_tags_list.append(predicted_tags)
    return predicted_tags_list

In [29]:
sample_text = ["Create an encrypted ZIP file in Python", "Java ClassLoader dont load all classes in jar",
               "PHP LoginScript MySQL Checking results error"]
print("Predicted Tags:", predict_tags(sample_text))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
Predicted Tags: [["['python']"], ["['java']"], ["['php']"]]
