In [1]:
!pip install pandas numpy scikit-learn lightgbm tensorflow matplotlib imbalanced-learn gensim
!pip install -U scikit-learn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE
import lightgbm as lgb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import LSTM, Bidirectional
import matplotlib.pyplot as plt
import warnings
from gensim.models import KeyedVectors
import time



In [2]:
#Loading dataset
dataset_path = 'Mozilla_bugs_data.csv'
data = pd.read_csv(dataset_path)

In [3]:
print("First few rows of the dataset:")
print(data.head())
print("\nColumn names:")
print(data.columns)

First few rows of the dataset:
   Bug ID                                            Product  \
0     384  Mozilla (when: 895615200, who: 31); Bugzilla (...   
1     540  Bugzilla (when: 904209063, who: 3794); Webtool...   
2     939  Mozilla (when: 907138582, who: 3964); Webtools...   
3     920  NGLayout (when: 906958277, who: 3960); Mozilla...   
4    1658  Mozilla (when: 911995846, who: 4137); NGLayout...   

                                           Component  \
0   (when: 895615200, who: 31); UI (when: 9045786...   
1  UI (when: 904209063, who: 3794); Bugzilla (whe...   
2  Windows FE (when: 907138582, who: 3964); XP Ut...   
3  Plug-ins (when: 906958277, who: 3960); Plugins...   
4  Windows FE (when: 911995846, who: 4137); Layou...   

                                          Short Desc  \
0  testing accented char ŕáâăäĺćçčéęëěíîďđńňóôőö÷...   
1  Editing of long descriptions is completely hor...   
2  "Save Link As" Problem (when: 907138582, who: ...   
3  Problems with Plugin

In [4]:
#Preprocessing
import re

def preprocess_text(text):
    text = text.lower()
    text = ''.join([char for char in text if char.isalnum() or char.isspace()])
    return text

data['Short Desc'] = data['Short Desc'].apply(preprocess_text)

In [5]:
print("\nData after preprocessing:")
print(data[['Short Desc', 'Severity']].head())


Data after preprocessing:
                                          Short Desc  \
0  testing accented char ŕáâăäĺćçčéęëěíîďđńňóôőöř...   
1  editing of long descriptions is completely hor...   
2  save link as problem when 907138582 who 3964 b...   
3  problems with plugins inside layers in navigat...   
4  ss graphic defects sporadic crashing at rsacor...   

                                            Severity  
0             enhancement (when: 895615200, who: 31)  
1  normal (when: 904209063, who: 3794); enhanceme...  
2                normal (when: 907138582, who: 3964)  
3                normal (when: 906958277, who: 3960)  
4                 major (when: 911995846, who: 4137)  


In [6]:
#Extracting the most severe label from the Severity column
def extract_severity(severity):
    labels = severity.split(';')
    cleaned_labels = [label.split(' ')[0].split('(')[0].strip().lower() for label in labels]
    severity_order = ['critical', 'blocker', 'major', 'normal', 'minor', 'trivial', 'enhancement']
    for label in severity_order:
        if label in cleaned_labels:
            return label
    return None

#Applying the extraction function
data['Severity'] = data['Severity'].apply(extract_severity)

print("\nExtracted severity labels:")
print(data['Severity'].unique())


Extracted severity labels:
['enhancement' 'normal' 'major' 'critical' 'minor' 'trivial' 'blocker'
 None]


In [7]:
#Map severity to binary classes
def map_severity(severity):
    if severity in ['major', 'critical', 'blocker']:
        return 1  #Severe
    elif severity in ['minor', 'trivial', 'normal', 'enhancement']:
        return 0  #Non-severe
    else:
        return None

data['Severity'] = data['Severity'].apply(map_severity)

data = data.dropna(subset=['Severity'])

In [8]:
print("\nUpdated severity label counts:")
print(data['Severity'].value_counts())


Updated severity label counts:
Severity
0.0    323280
1.0     71597
Name: count, dtype: int64


In [9]:
#Extracting the base priority label from the Priority column
def extract_priority(priority):
    return priority.split(' ')[0].strip()

data['Priority'] = data['Priority'].apply(extract_priority)

#Extracting base labels from Component and Product columns
def extract_base_labels(value):
    labels = value.split(';')
    cleaned_labels = [label.split(' ')[0].strip() for label in labels]
    return ','.join(cleaned_labels)

data['Component'] = data['Component'].apply(extract_base_labels)
data['Product'] = data['Product'].apply(extract_base_labels)

In [10]:
#One-hot encoding additional features
priority_features = pd.get_dummies(data['Priority'], drop_first=True, dtype=int).values
component_features = pd.get_dummies(data['Component'], drop_first=True, dtype=int).values
product_features = pd.get_dummies(data['Product'], drop_first=True, dtype=int).values

In [11]:
X = data['Short Desc']
y = data['Severity']

In [12]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

--2024-12-23 17:30:42--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2024-12-23 17:30:42--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2024-12-23 17:30:42--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove

In [13]:
#Loading pre-trained word embeddings(GloVe)
print("Loading pre-trained word embeddings...")
embedding_path = 'glove.6B.50d.txt'
embeddings_index = {}
with open(embedding_path, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print(f"Loaded {len(embeddings_index)} word vectors.")

Loading pre-trained word embeddings...
Loaded 400000 word vectors.


In [14]:
#Tokenizing and preparing embeddings matrix
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X_seq = tokenizer.texts_to_sequences(X)
vocab_size = len(tokenizer.word_index) + 1

max_length = 100
X_padded = pad_sequences(X_seq, maxlen=max_length, padding='post')

embedding_dim = 50
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
#Concatenating text features with one-hot encoded categorical features
categorical_features = np.hstack((priority_features, component_features, product_features))
X_combined = np.hstack((X_padded, categorical_features))

#Applying SMOTE on the combined feature set
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X_combined, y)

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=42)

In [17]:
print("\nSample tokenized and padded data:")
print(X_padded[:5])


Sample tokenized and padded data:
[[ 1319  4609  1127 93877     1 93878     2  1975     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0]
 [  852     9   283  3171    10   730  4315     1 93879     2  6597   852
      9   283  3171   815    10   730  4315     1 93880     2  2967   126
    363     4   426   283  1085     1 93881     2  3908   126   363     4
    426   283 54332     1 93882     2   853     0     0     0     0     0
      0     0     0     0     0     0     0     0 

In [18]:
#CNN Model for Feature Extraction with GloVe embeddings
warnings.filterwarnings("ignore", category=UserWarning, module="keras")
cnn_model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=False),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(16, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [19]:
start_time_cnn = time.time()
cnn_model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.2)
end_time_cnn = time.time()
cnn_training_time = end_time_cnn - start_time_cnn
print(f"Training time for CNN model: {cnn_training_time:.2f} seconds")

Epoch 1/5
[1m12932/12932[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m538s[0m 42ms/step - accuracy: 0.7118 - loss: 0.5442 - val_accuracy: 0.7838 - val_loss: 0.4420
Epoch 2/5
[1m12932/12932[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m536s[0m 41ms/step - accuracy: 0.7909 - loss: 0.4474 - val_accuracy: 0.7917 - val_loss: 0.4248
Epoch 3/5
[1m12932/12932[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m536s[0m 41ms/step - accuracy: 0.8061 - loss: 0.4238 - val_accuracy: 0.8066 - val_loss: 0.4042
Epoch 4/5
[1m12932/12932[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m541s[0m 42ms/step - accuracy: 0.8127 - loss: 0.4127 - val_accuracy: 0.8221 - val_loss: 0.3990
Epoch 5/5
[1m12932/12932[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m540s[0m 42ms/step - accuracy: 0.8167 - loss: 0.4064 - val_accuracy: 0.7961 - val_loss: 0.4085
Training time for CNN model: 2693.53 seconds


In [20]:
#Extracting features using CNN
start_time_cnn_features = time.time()
cnn_feature_extractor = Sequential(cnn_model.layers[:-1])
X_train_features = cnn_feature_extractor.predict(X_train)
X_test_features = cnn_feature_extractor.predict(X_test)
end_time_cnn_features = time.time()
cnn_feature_extraction_time = end_time_cnn_features - start_time_cnn_features
print(f"Feature extraction time using CNN: {cnn_feature_extraction_time:.2f} seconds")

[1m16164/16164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m219s[0m 14ms/step
[1m4041/4041[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 13ms/step
Feature extraction time using CNN: 280.55 seconds


In [21]:
#Training LightGBM Classifier with hyperparameter tuning
warnings.filterwarnings("ignore", category=FutureWarning)
lgb_params = {
    'boosting_type': ['gbdt'],
    'objective': ['binary'],
    'num_leaves': [31, 50],
    'learning_rate': [0.05, 0.1],
    'n_estimators': [100, 200]
}
lgb_model = lgb.LGBMClassifier(class_weight='balanced', verbose=-1)
start_time_lgb = time.time()
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=FutureWarning)
    clf = GridSearchCV(lgb_model, lgb_params, cv=3, scoring='f1', verbose=1)
    clf.fit(X_train_features, y_train)
end_time_lgb = time.time()
lgb_training_time = end_time_lgb - start_time_lgb
print(f"Training time for LightGBM model: {lgb_training_time:.2f} seconds")

print("Best parameters found by GridSearchCV:", clf.best_params_)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
Training time for LightGBM model: 59.12 seconds
Best parameters found by GridSearchCV: {'boosting_type': 'gbdt', 'learning_rate': 0.05, 'n_estimators': 100, 'num_leaves': 50, 'objective': 'binary'}


In [22]:
# Total training time
total_training_time = cnn_training_time + cnn_feature_extraction_time + lgb_training_time
print(f"Total training time for hybrid model: {total_training_time:.2f} seconds")

Total training time for hybrid model: 3033.20 seconds


In [23]:
# Predictions and evaluation
y_pred = clf.predict(X_test_features)
print("Classification Report:\n", classification_report(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

         0.0       0.79      0.88      0.84     64408
         1.0       0.87      0.77      0.82     64904

    accuracy                           0.83    129312
   macro avg       0.83      0.83      0.83    129312
weighted avg       0.83      0.83      0.83    129312
