In [19]:
# 1. Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.svm import SVC
from sklearn.metrics import hamming_loss, f1_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neural_network import MLPClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import joblib  # For saving models

In [20]:
# 2. Load Dataset
df = pd.read_csv("dataset.csv")
df.head()

Unnamed: 0,report,type_blocker,type_regression,type_bug,type_documentation,type_enhancement,type_task,type_dependency_upgrade
0,The mention of Fix Super Stream Example in Doc...,0,0,1,1,0,0,0
1,It seems like you need a concise summary relat...,0,0,1,1,0,0,0
2,The issue AMQP 838 opened by Gary Russell invo...,0,0,1,1,0,0,0
3,I m unable to access external content directly...,0,0,1,1,0,0,0
4,In the discussion around AMQP 815 https jira s...,0,0,1,1,0,0,0


In [21]:
# 3. Preprocessing
labels = ['type_blocker', 'type_regression', 'type_bug', 'type_documentation', 'type_enhancement', 'type_task', 'type_dependency_upgrade']
X = df['report']
y = df[labels]

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer.fit_transform(X)

# Save the vectorizer
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

# Train-test split
X_train, X_temp, y_train, y_temp = train_test_split(X_tfidf, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [22]:
# 4. Logistic Regression (OvR)
log_reg = OneVsRestClassifier(LogisticRegression(max_iter=1000))
log_reg.fit(X_train, y_train)
y_pred_log = log_reg.predict(X_test)

# Save the Logistic Regression model
joblib.dump(log_reg, 'logistic_model.pkl')



['logistic_model.pkl']

In [23]:
# 5. SVM (OvR)
svm = OneVsRestClassifier(SVC(kernel='linear', probability=True))
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)

# Save the SVM model
joblib.dump(svm, 'svm_model.pkl')



['svm_model.pkl']

In [24]:
# 6. Perceptron (Online Learning)
perceptron = OneVsRestClassifier(Perceptron(max_iter=5))
perceptron.fit(X_train, y_train)
y_pred_perc = perceptron.predict(X_test)

# Save the Perceptron model
joblib.dump(perceptron, 'perceptron_model.pkl')



['perceptron_model.pkl']

In [25]:
# 7. Deep Neural Network
mlp = MLPClassifier(hidden_layer_sizes=(128, 64), activation='relu', max_iter=50)
mlp.fit(X_train.toarray(), y_train)
y_pred_mlp = mlp.predict(X_test.toarray())

# Save the DNN model
joblib.dump(mlp, 'mlp_model.pkl')



['mlp_model.pkl']

In [26]:
# 8. Evaluation Function
def evaluate_model(name, y_true, y_pred):
    print(f"=== {name} ===")
    print("Hamming Loss:", hamming_loss(y_true, y_pred))
    print("Micro-F1 Score:", f1_score(y_true, y_pred, average='micro'))
    print("Macro-F1 Score:", f1_score(y_true, y_pred, average='macro'))
    print()

# Evaluate all models
evaluate_model("Logistic Regression", y_test, y_pred_log)
evaluate_model("SVM", y_test, y_pred_svm)
evaluate_model("Perceptron", y_test, y_pred_perc)
evaluate_model("DNN", y_test, y_pred_mlp)

=== Logistic Regression ===
Hamming Loss: 0.11469780219780219
Micro-F1 Score: 0.7925465838509317
Macro-F1 Score: 0.36148536318457686

=== SVM ===
Hamming Loss: 0.09684065934065934
Micro-F1 Score: 0.8269938650306748
Macro-F1 Score: 0.527265767540367

=== Perceptron ===
Hamming Loss: 0.12156593406593406
Micro-F1 Score: 0.7929824561403509
Macro-F1 Score: 0.540804941683196

=== DNN ===
Hamming Loss: 0.11607142857142858
Micro-F1 Score: 0.7916152897657214
Macro-F1 Score: 0.4702937499608698



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [27]:
# 9. Precision@k (Top-3)
def precision_at_k(y_true, y_scores, k=3):
    precisions = []
    for i in range(len(y_true)):
        top_k = np.argsort(y_scores[i])[-k:]
        true_labels = np.where(y_true[i] == 1)[0]
        precision = len(set(top_k) & set(true_labels)) / k
        precisions.append(precision)
    return np.mean(precisions)

# Get predicted probabilities
y_score_log = log_reg.predict_proba(X_test)
y_score_mlp = mlp.predict_proba(X_test.toarray())

# Calculate Precision@3
print("Precision@3 (Logistic Regression):", precision_at_k(y_test.values, np.array(y_score_log), k=3))
print("Precision@3 (DNN):", precision_at_k(y_test.values, np.array(y_score_mlp), k=3))


Precision@3 (Logistic Regression): 0.6105769230769231
Precision@3 (DNN): 0.6330128205128205
