# ***Project Overview***

In this project, I am developing a machine learning service that automatically generates tags for StackOverflow questions. This task involves processing text data, classifying it, and using machine learning techniques to automatically label questions based on their content.

**Objective:** To predict tags for StackOverflow questions using their text content.

**Dataset for model training:**
Dataset with StackOverflow questions, available on Kaggle: https://www.kaggle.com/datasets/muhammedabdulazeem/500k-stackoverflow-questions/data. This dataset contains questions, answers, and tags associated with the questions. For this task, I only use questions and their associated tags.

**Input:** Text of a StackOverflow question (both title and body).

**Output:** List of tags most relevant to the question.

**Approach:**
1. **Data Exploration and Preprocessing:** I start with a dataset comprising 500k StackOverflow questions, each with associated tags. The preprocessing steps include cleaning text, tokenization, and transforming tags into a format suitable for multi-label classification.

2. **Model Training with Transfer Learning:** I fine-tune BERT, a pre-trained transformer model renowned for its effectiveness in NLP tasks, for the specific tag prediction task. This approach makes it possible to benefit from BERT's understanding of language nuances without the need for extensive computational resources.

3. **MLOps Integration:** The project incorporates MLOps practices to streamline the machine learning workflow. I use MLflow for experiment tracking, which allows logging parameters, metrics, and models. For a more comprehensive overview, I also use Neptune.ai, which offers advanced experiment tracking and visualization capabilities.

# **Loading data**

In [5]:
import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'stacksample:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F265%2F726723%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240310%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240310T122334Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D7378857a4ec1011323f7010c07ccaecccdf51791e2adeb1da6d85a2927fd82823f53ea1e9a9aab4b44b90ecc4ae37b520e66212ab21e92eaed5fa5681a7397432f96fdeb08e8e438e5eaea5a62d7c9a5df1bb8ecad586d02b65def5a597961e81fdfd753bade22e650fd9668d724a2822e346b438bd2ac6ca8474fe95cbd2353a80b35a4bc179b860148026976d23d9231842bbeb576cbec6a76ff58b83956bdf31448201943114ad0578dda48a0f5951782e8f247d4d3288e4d89bec94674330f44d788083ef253a29dfdea86bfc4791d857f04edc507150278110dd923e5833cb8ecc6a1c2f806b35895f2b3ca08951d51da3fee39e8b938f26a8b0b543838,500k-stackoverflow-questions:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F1232267%2F2056442%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240310%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240310T122334Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D7763511eb86c99a3e91033e90ed861fa944f7925f753eb4a6f05996258d83828e80962c407379a310d782887aa38acdfc21f1e3260dbe41902b9d88db310cb32c092d39ad2ce39946053ebb7ba0b7bb00f345f57d2bca21cf79f2f2d15aaf93b30f1ebfad57b267fae59f67768d6e79109e37e2d5b756baf19979ba414daf301374f352e605865a5dd125dddb09cec83de15672b0a95a7d84b10f1a9d5d2fe26c04bd871e24ba8c79436d31c4c7c69c5b9d4e9bc37fbd0df5f60d2aa26854545ab758692d701c9bf6065c1113a600dd3bb27dc407ae289f723c2376811daed553c56784bfdeb7e6d91989d0cdb55366e2fe7de55b2b1286fcd4086e3a1f235c6'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every archive listed in DATA_SOURCE_MAPPING and unpack it into
# KAGGLE_INPUT_PATH/<directory>. Each comma-separated entry has the form
# "<directory>:<percent-encoded download URL>".
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so a stray ':' in the URL part cannot
    # break the unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # Content-Length is optional; without it we can still download,
            # we just cannot draw a progress bar.
            content_length = fileres.headers['content-length']
            if content_length and content_length.isdigit():
                total_length = int(content_length)
            else:
                total_length = 0
            print(f'Downloading {directory}, {content_length or "unknown"} bytes compressed')

            dl = 0
            # Stream the response in CHUNK_SIZE pieces until read() returns b''.
            for data in iter(lambda: fileres.read(CHUNK_SIZE), b''):
                dl += len(data)
                tfile.write(data)
                if total_length:
                    done = min(50, int(50 * dl / total_length))
                    progress_bar = f"[{'=' * done}{' ' * (50 - done)}] "
                else:
                    progress_bar = ''
                sys.stdout.write(f"\r{progress_bar}{dl} bytes downloaded")
                sys.stdout.flush()

            # Push buffered bytes to the file and rewind so the archive
            # readers below see the complete download from its beginning.
            tfile.flush()
            tfile.seek(0)
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Read from the open handle instead of reopening by name, and
                # use a name that does not rebind the `tarfile` module.
                with tarfile.open(fileobj=tfile) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as err:
        # Best effort: report the failure and move on to the next source.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path} ({err})')
    except OSError as err:
        print(f'Failed to load {download_url} to path {destination_path} ({err})')

print('Data source import complete.')


Downloading stacksample, 1189724316 bytes compressed
Downloaded and uncompressed: stacksample
Downloading 500k-stackoverflow-questions, 50856690 bytes compressed
Downloaded and uncompressed: 500k-stackoverflow-questions
Data source import complete.


# **Data Exploration and Preprocessing**

I explore the dataset to understand its structure and then preprocess the text data to make it suitable for training a machine learning model.

In [6]:
import pandas as pd

# Load the questions table and print a preview plus a column summary.
questions_path = '/kaggle/input/500k-stackoverflow-questions/questions.csv'
df = pd.read_csv(questions_path)

print(df.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would only add a stray "None" line.
df.info()

                         date  \
0  2021-03-24 11:01:18.812726   
1  2021-03-24 11:01:18.814679   
2  2021-03-24 11:01:18.817728   
3  2021-03-24 11:01:18.818696   
4  2021-03-24 11:01:18.820720   

                                               links  \
0  /questions/66775243/how-to-display-jalali-date...   
1  /questions/66775242/automate-creating-of-sales...   
2  /questions/66775240/java-jar-error-for-spring-...   
3  /questions/66775238/not-able-to-run-unfoldingm...   
4  /questions/66775237/serverless-graphql-lambda-...   

                                           questions  \
0  How to display jalali date in to view in Codei...   
1     Automate creating of sales order in Zoho Books   
2          java jar error for spring boot applicaton   
3               Not able to run UnfoldingMap library   
4  Serverless Graphql Lambda hard to understand t...   

                                                tags        time  
0            php,codeigniter,date,gregorian-calendar  2 mins

Preprocessing steps include cleaning text data, tokenization, and splitting the data into training and testing sets.

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Fetch the NLTK resources needed for tokenisation and stop-word filtering.
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

# Cache for the English stop-word set. It is filled on first use so the list is
# requested from NLTK once rather than once per token.
_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    if 'english' not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORDS_CACHE['english']


def clean_text(text):
    """Normalise a question string and drop English stop words.

    The text is lower-cased, URLs and punctuation are removed, the remainder
    is tokenised with ``word_tokenize`` and the tokens that are not English
    stop words are joined back with single spaces.

    Args:
        text: The raw question text. Anything that is not a ``str`` (for
            example ``None`` or a pandas NaN from a missing cell) is treated
            as empty.

    Returns:
        The cleaned text as a single space-separated string; ``""`` for
        non-string input.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # NOTE(review): '\@w+' matches a literal "@w...", which looks like a typo
    # for '@\w+' (strip @mentions). Left unchanged so cleaned output stays the
    # same; confirm whether @-prefixed words should really be removed.
    text = re.sub(r'\@w+|\#','', text)
    text = re.sub(r'[^\w\s]','', text)
    text = text.strip()
    tokens = word_tokenize(text)
    # Set membership is O(1); the original re-evaluated
    # stopwords.words('english') and scanned the resulting list per token.
    stop_words = _english_stopwords()
    return " ".join(word for word in tokens if word not in stop_words)

# Clean every question and keep the result next to the raw text.
df['questions_cleaned'] = df['questions'].apply(clean_text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
# Process tags for multi-label binarization.
# The `tags` column holds comma-separated tag names (for example
# "php,codeigniter,date,gregorian-calendar"), so split on ',' and drop empty
# fragments. Missing values become an empty tag list.
df['tags_list'] = df['tags'].fillna('').apply(
    lambda raw_tags: [tag.strip() for tag in raw_tags.split(',') if tag.strip()]
)

# Fit the binarizer once, on the parsed lists. Fitting it on the raw strings
# would make it iterate over characters and learn one label per character.
mlb = MultiLabelBinarizer()
tags_encoded = mlb.fit_transform(df['tags_list'])

# Keep the label matrix in its own frame, aligned with df by index. Tag names
# such as "date" also exist as df columns, so joining the labels into df would
# clash with them, and the helper column `tags_list` must not end up in y.
# Column order equals mlb.classes_, which is what index-based decoding needs.
# NOTE(review): this matrix is dense (rows x distinct tags). If the tag
# vocabulary turns out to be large, consider restricting it to the most
# frequent tags via MultiLabelBinarizer(classes=...).
tags_df = pd.DataFrame(tags_encoded, columns=mlb.classes_, index=df.index)

# Splitting the dataset into training and testing set
X_train, X_test, y_train, y_test = train_test_split(
    df['questions_cleaned'], tags_df, test_size=0.2, random_state=42
)

# **Model Training with Transfer Learning**

I use a pre-trained BERT model from the Hugging Face library as a base model for transfer learning.

**Setting Up the Model:**

In [None]:
from transformers import BertTokenizer, TFBertModel, BertConfig
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input

# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert = TFBertModel.from_pretrained('bert-base-uncased')

# Prepare model for transfer learning: two integer inputs of fixed length 512
# (token ids and the matching attention mask).
input_ids = Input(shape=(512,), dtype='int32')
attention_masks = Input(shape=(512,), dtype='int32')

output = bert(input_ids, attention_mask=attention_masks)
# Second element of the BERT output; presumably the pooled sentence
# representation (see the TFBertModel documentation).
output = output[1]
# One sigmoid unit per label column of y_train: independent per-tag scores.
output = Dense(y_train.shape[1], activation='sigmoid')(output)

model = Model(inputs=[input_ids, attention_masks], outputs=output)

# Use an explicit per-label binary accuracy. The generic 'accuracy' shortcut is
# resolved by Keras from the tensor shapes and, for a multi-unit output, is not
# a per-label measure. The metric keeps the name 'accuracy' so the history keys
# 'accuracy' / 'val_accuracy' stay the same.
model.compile(
    optimizer=Adam(learning_rate=2e-5),
    loss='binary_crossentropy',
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy')],
)

**Preparing Data for BERT:**

In [None]:
def prepare_data_for_bert(texts, tokenizer, max_length=512):
    """Tokenise texts into fixed-length BERT inputs.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Tokenizer object; it is called once per text and must
            return a mapping with 'input_ids' and 'attention_mask'.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        A two-element list ``[input_ids, attention_masks]``; each element is
        the per-text tensors concatenated along axis 0. For empty input both
        are empty int32 tensors with ``max_length`` columns.
    """
    input_ids = []
    attention_masks = []

    for text in texts:
        # `padding`/`truncation` replace the deprecated `pad_to_max_length`
        # and make the truncation of over-long texts explicit.
        encoded = tokenizer(
            text,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='tf',
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

    if not input_ids:
        # tf.concat cannot concatenate an empty list.
        empty = tf.zeros((0, max_length), dtype=tf.int32)
        return [empty, empty]

    return [tf.concat(input_ids, 0), tf.concat(attention_masks, 0)]

# Encode the training split first, then the test split, with the same tokenizer.
(X_train_ids, X_train_masks), (X_test_ids, X_test_masks) = [
    prepare_data_for_bert(split_texts, tokenizer)
    for split_texts in (X_train, X_test)
]

**Model Training:**

In [None]:
# Fine-tune for 3 epochs with batches of 16. The test split doubles as the
# validation set, so the validation metrics are measured on X_test / y_test.
train_inputs = [X_train_ids, X_train_masks]
validation_inputs = [X_test_ids, X_test_masks]
history = model.fit(
    x=train_inputs,
    y=y_train,
    batch_size=16,
    epochs=3,
    validation_data=(validation_inputs, y_test),
)

# **MLOps Integration**

**MLflow for Experiment Tracking:**

In [None]:
import mlflow
from mlflow import log_metric, log_param, log_artifacts

mlflow.set_experiment('stackoverflow_tags_prediction')

with mlflow.start_run():
    # Parameters and metrics go first so they are recorded even if saving the
    # (large) model fails afterwards.
    log_param("epochs", len(history.history['loss']))
    log_param("batch_size", 16)

    # Log every tracked metric per epoch. The last logged value of "loss" and
    # "accuracy" equals the single final value that was logged before.
    for metric_name, epoch_values in history.history.items():
        for step, value in enumerate(epoch_values):
            log_metric(metric_name, float(value), step=step)

    # The model is passed positionally: `tf_model` is not a parameter name of
    # mlflow.tensorflow.log_model.
    mlflow.tensorflow.log_model(model, artifact_path='bert_model')

**Neptune.ai Integration:**

In [None]:
import neptune

import os

# The API token is a credential and must not live in the notebook. Read it
# from the environment instead. NOTE(review): the token that used to be
# hard-coded here should be revoked in Neptune, since it was committed.
neptune_api_token = os.environ.get("NEPTUNE_API_TOKEN")
if not neptune_api_token:
    raise RuntimeError(
        "Set the NEPTUNE_API_TOKEN environment variable before logging to Neptune."
    )

run = neptune.init_run(
    project="alishanoskova/lsml2",
    api_token=neptune_api_token,
)

try:
    # Log parameters
    run["parameters"] = {"epochs": 3, "batch_size": 16}

    # Log metrics: one value per epoch for each Keras history series.
    series_by_field = {
        "train/loss": history.history['loss'],
        "train/accuracy": history.history['accuracy'],
        "val/loss": history.history['val_loss'],
        "val/accuracy": history.history['val_accuracy'],
    }
    for field_name, epoch_values in series_by_field.items():
        for value in epoch_values:
            # append() is the current replacement for the deprecated log().
            run[field_name].append(value)
finally:
    # Always close the run, even if logging fails part-way through.
    run.stop()

_



**Making predictions and evaluating the model:**

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Score the test split with the trained model.
test_inputs = [X_test_ids, X_test_masks]
predictions = model.predict(test_inputs)

# Turn the per-tag scores into 0/1 decisions at a 0.5 cut-off.
preds_binary = (predictions > 0.5).astype(int)

# accuracy_score compares whole label rows.
accuracy = accuracy_score(y_test, preds_binary)
print(f'Accuracy: {accuracy}')

# Micro-averaged scores over all labels.
precision, recall, f1, _ = precision_recall_fscore_support(
    y_test,
    preds_binary,
    average='micro',
)
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-score: {f1}')

_

**Code for Serving:**

In [None]:
from flask import Flask, request, jsonify
import tensorflow as tf
from transformers import BertTokenizer
import numpy as np

# Flask application that exposes the tag predictor over HTTP.
app = Flask(__name__)

# Same pre-trained tokenizer name as used when preparing the training data.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# NOTE(review): model loading is commented out, so `model` (and `mlb`), which
# the /predict handler uses, must already exist in the running namespace.
# model = tf.keras.models.load_model('model')

def prepare_data_for_bert(text, tokenizer):
    """Tokenise one text into fixed-length (512) BERT inputs.

    Args:
        text: The string to encode.
        tokenizer: Tokenizer object; it is called with the text and must
            return a mapping with 'input_ids' and 'attention_mask'.

    Returns:
        A tuple ``(input_id, attention_mask)`` taken from the tokenizer output.
    """
    # `padding`/`truncation` replace the deprecated `pad_to_max_length` and
    # make the truncation of over-long requests explicit.
    encoded_dict = tokenizer(
        text,
        add_special_tokens=True,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='tf',
    )

    input_id = encoded_dict['input_ids']
    attention_mask = encoded_dict['attention_mask']
    return input_id, attention_mask

@app.route('/predict', methods=['POST'])
def predict():
    """Predict tags for the question text sent in a JSON POST body.

    Expects a JSON object with a string field ``text``. Relies on the
    module-level ``model``, ``mlb``, ``tokenizer`` and ``clean_text``.

    Returns:
        A JSON list with the names of all tags whose score exceeds 0.5, or a
        JSON error object with status 400 when ``text`` is missing, empty or
        not a string.
    """
    data = request.get_json(force=True)
    text = data.get('text') if isinstance(data, dict) else None
    if not isinstance(text, str) or not text.strip():
        error_body = {
            'error': "Request body must be a JSON object with a non-empty string field 'text'."
        }
        return jsonify(error_body), 400

    # The model is trained on text passed through clean_text, so apply the
    # same cleaning here to keep serving inputs consistent with training.
    input_id, attention_mask = prepare_data_for_bert(clean_text(text), tokenizer)
    predictions = model.predict([input_id, attention_mask])

    threshold = 0.5
    # A single text is scored, so only row 0 of the prediction matrix is used.
    tag_indexes = np.flatnonzero(predictions[0] > threshold)
    predicted_tags = [str(mlb.classes_[i]) for i in tag_indexes]
    return jsonify(predicted_tags)

if __name__ == '__main__':
    import os

    # Debug mode enables the interactive debugger, which lets a client run
    # arbitrary code on the server. Keep it off unless explicitly requested
    # through the FLASK_DEBUG environment variable.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').lower() in ('1', 'true', 'yes')
    app.run(debug=debug_enabled)