In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import hamming_loss, f1_score, precision_score
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import warnings
import joblib
warnings.filterwarnings("ignore")

# Step 1: Load and Preprocess the Dataset
def load_and_preprocess_data():
    # Load the dataset
    df = pd.read_csv("dataset.csv")

    # Check for missing values in labels
    label_cols = [col for col in df.columns if col.startswith("type_")]
    print("Missing values in labels:", df[label_cols].isnull().sum())
    df = df.dropna(subset=label_cols)  # Drop rows with missing labels

    # Extract features using TF-IDF with reduced features
    vectorizer = TfidfVectorizer(max_features=500, stop_words="english")
    X = vectorizer.fit_transform(df["report"]).toarray()

    # Scale the features
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    # Extract labels
    y = df[label_cols].values

    # Analyze label distribution
    print("\nLabel Distribution:")
    for i, col in enumerate(label_cols):
        print(f"{col}: {np.sum(y[:, i])} positive samples ({np.sum(y[:, i]) / len(y) * 100:.2f}%)")

    return X, y, label_cols, vectorizer, scaler

# Step 2: Split the Data
def split_data(X, y):
    X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.1765, random_state=42)  # 0.1765 of 85% = 15%
    print(f"\nTrain: {len(X_train)}, Validation: {len(X_val)}, Test: {len(X_test)}")
    return X_train, X_val, X_test, y_train, y_val, y_test

# Step 3: Define Evaluation Metrics
def evaluate_model(y_true, y_pred, label_cols, k=3):
    # Hamming Loss
    hamming = hamming_loss(y_true, y_pred)

    # Micro and Macro F1
    micro_f1 = f1_score(y_true, y_pred, average="micro")
    macro_f1 = f1_score(y_true, y_pred, average="macro")

    # Precision@k
    precision_k = 0
    for i in range(len(y_true)):
        true_labels = np.where(y_true[i] == 1)[0]
        pred_labels = np.argsort(y_pred[i])[::-1][:k]  # Top k predictions
        precision_k += len(set(true_labels).intersection(pred_labels)) / k
    precision_k /= len(y_true)

    print(f"Hamming Loss: {hamming:.4f}")
    print(f"Micro-F1: {micro_f1:.4f}")
    print(f"Macro-F1: {macro_f1:.4f}")
    print(f"Precision@{k}: {precision_k:.4f}")
    return hamming, micro_f1, macro_f1, precision_k

# Step 4: Logistic Regression
def train_logistic_regression(X_train, y_train, X_val, y_val, label_cols):
    print("\nTraining Logistic Regression...")
    param_grid = {"estimator__C": [0.1, 1, 10]}
    base_lr = LogisticRegression(max_iter=2000)
    model = OneVsRestClassifier(base_lr)
    grid_search = GridSearchCV(model, param_grid, cv=3, scoring="f1_micro", n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)

    print(f"Best C: {grid_search.best_params_['estimator__C']}")
    y_pred = grid_search.predict(X_val)
    print("Logistic Regression Validation Results:")
    evaluate_model(y_val, y_pred, label_cols)
    return grid_search.best_estimator_

# Step 5: SVM
def train_svm(X_train, y_train, X_val, y_val, label_cols):
    print("\nTraining SVM...")
    param_grid = {"estimator__C": [1]}  # Simplified to single value for speed
    base_svm = LinearSVC(max_iter=1000, random_state=42, dual="auto")
    model = OneVsRestClassifier(base_svm)
    grid_search = GridSearchCV(model, param_grid, cv=3, scoring="f1_micro", n_jobs=-1, verbose=1)

    print("Starting GridSearchCV...")
    grid_search.fit(X_train, y_train)
    print("GridSearchCV completed.")

    print(f"Best C: {grid_search.best_params_['estimator__C']}")
    y_pred = grid_search.predict(X_val)
    print("SVM Validation Results:")
    evaluate_model(y_val, y_pred, label_cols)
    return grid_search.best_estimator_


In [4]:
# Step 6: Custom Perceptron for Online Learning
class MultiLabelPerceptron:
    def __init__(self, n_features, n_labels, learning_rate=0.01):
        self.weights = np.zeros((n_labels, n_features))
        self.bias = np.zeros(n_labels)
        self.lr = learning_rate

    def predict(self, X):
        scores = X @ self.weights.T + self.bias
        return (scores > 0).astype(int)

    def update(self, x, y_true):
        y_pred = self.predict(x.reshape(1, -1))[0]
        for label in range(len(y_true)):
            if y_true[label] != y_pred[label]:
                self.weights[label] += self.lr * (y_true[label] - y_pred[label]) * x
                self.bias[label] += self.lr * (y_true[label] - y_pred[label])

def train_perceptron(X_train, y_train, X_val, y_val, label_cols):
    print("\nTraining Perceptron (Online Learning)...")
    learning_rates = [0.001, 0.01, 0.1]
    best_lr, best_f1 = None, 0
    best_model = None

    for lr in learning_rates:
        model = MultiLabelPerceptron(X_train.shape[1], y_train.shape[1], learning_rate=lr)
        # Online learning: Update after each sample
        for epoch in range(10):
            for i in range(len(X_train)):
                model.update(X_train[i], y_train[i])

        y_pred = model.predict(X_val)
        micro_f1 = f1_score(y_val, y_pred, average="micro")
        if micro_f1 > best_f1:
            best_f1 = micro_f1
            best_lr = lr
            best_model = model

    print(f"Best Learning Rate: {best_lr}")
    y_pred = best_model.predict(X_val)
    print("Perceptron Validation Results:")
    evaluate_model(y_val, y_pred, label_cols)
    return best_model

# Step 7: Deep Neural Network
class MultiLabelDNN(nn.Module):
    def __init__(self, input_dim, n_labels):
        super(MultiLabelDNN, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, n_labels),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.net(x)

def train_dnn(X_train, y_train, X_val, y_val, label_cols):
    print("\nTraining DNN...")
    # Convert to PyTorch tensors
    X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
    X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
    y_val_tensor = torch.tensor(y_val, dtype=torch.float32)

    # Initialize model
    model = MultiLabelDNN(X_train.shape[1], y_train.shape[1])
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Training loop
    batch_size = 32
    num_samples = X_train_tensor.shape[0]
    num_batches = (num_samples + batch_size - 1) // batch_size

    for epoch in range(50):
        model.train()
        for batch_idx in range(num_batches):
            start_idx = batch_idx * batch_size
            end_idx = min(start_idx + batch_size, num_samples)
            batch_X = X_train_tensor[start_idx:end_idx]
            batch_y = y_train_tensor[start_idx:end_idx]

            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Validation
        model.eval()
        with torch.no_grad():
            val_outputs = model(X_val_tensor)
            val_loss = criterion(val_outputs, y_val_tensor)
        if (epoch + 1) % 10 == 0:
            print(f"Epoch {epoch+1}, Validation Loss: {val_loss.item():.4f}")

    # Predict
    model.eval()
    with torch.no_grad():
        y_pred_probs = model(X_val_tensor).numpy()
        y_pred = (y_pred_probs > 0.5).astype(int)

    print("DNN Validation Results:")
    evaluate_model(y_val, y_pred, label_cols)
    return model

# Main Execution
if __name__ == "__main__":
    # Load and preprocess data
    X, y, label_cols, vectorizer, scaler = load_and_preprocess_data()

    # Split data
    X_train, X_val, X_test, y_train, y_val, y_test = split_data(X, y)

    # Train models
    lr_model = train_logistic_regression(X_train, y_train, X_val, y_val, label_cols)
    svm_model = train_svm(X_train, y_train, X_val, y_val, label_cols)
    perceptron_model = train_perceptron(X_train, y_train, X_val, y_val, label_cols)
    dnn_model = train_dnn(X_train, y_train, X_val, y_val, label_cols)

    # Evaluate on test set
    print("\n=== Test Set Evaluation ===")
    for name, model in [("Logistic Regression", lr_model), ("SVM", svm_model)]:
        print(f"\n{name}:")
        y_pred = model.predict(X_test)
        evaluate_model(y_test, y_pred, label_cols)

    print("\nPerceptron:")
    y_pred = perceptron_model.predict(X_test)
    evaluate_model(y_test, y_pred, label_cols)

    print("\nDNN:")
    X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
    dnn_model.eval()
    with torch.no_grad():
        y_pred_probs = dnn_model(X_test_tensor).numpy()
        y_pred = (y_pred_probs > 0.5).astype(int)
    evaluate_model(y_test, y_pred, label_cols)

    # Save Models
    joblib.dump(lr_model, "lr_defect_model.joblib")
    joblib.dump(svm_model, "svm_defect_model.joblib")
    joblib.dump(perceptron_model, "perceptron_defect_model.joblib")
    torch.save(dnn_model.state_dict(), "dnn_defect_model.pth")
    joblib.dump(vectorizer, "vectorizer.joblib")
    joblib.dump(scaler, "scaler.joblib")
    print("\nModels saved successfully for Part 2!")

Missing values in labels: type_blocker               0
type_regression            0
type_bug                   0
type_documentation         0
type_enhancement           0
type_task                  0
type_dependency_upgrade    0
dtype: int64

Label Distribution:
type_blocker: 134 positive samples (9.67%)
type_regression: 115 positive samples (8.30%)
type_bug: 676 positive samples (48.77%)
type_documentation: 1134 positive samples (81.82%)
type_enhancement: 676 positive samples (48.77%)
type_task: 0 positive samples (0.00%)
type_dependency_upgrade: 43 positive samples (3.10%)

Train: 970, Validation: 208, Test: 208

Training Logistic Regression...
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Best C: 0.1
Logistic Regression Validation Results:
Hamming Loss: 0.1339
Micro-F1: 0.7619
Macro-F1: 0.5407
Precision@3: 0.5080

Training SVM...
Starting GridSearchCV...
Fitting 3 folds for each of 1 candidates, totalling 3 fits
GridSearchCV completed.
Best C: 1
SVM Validation Results:
