In [1]:
!pip install pandas numpy scikit-learn lightgbm tensorflow matplotlib imbalanced-learn gensim
!pip install -U scikit-learn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE
import lightgbm as lgb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import LSTM, Bidirectional
import matplotlib.pyplot as plt
import warnings
from gensim.models import KeyedVectors
import time
from gensim.models import KeyedVectors



In [2]:
#Loading dataset
dataset_path = 'Eclipse_bugs_data.csv'
data = pd.read_csv(dataset_path)

In [3]:
print("First few rows of the dataset:")
print(data.head())
print("\nColumn names:")
print(data.columns)

First few rows of the dataset:
   Bug ID                                            Product  \
0    4228  JDT (when: 1002748091, who: 29); Platform (whe...   
1    3328  JDT (when: 1002747198, who: 46); Platform (whe...   
2    4948  JDT (when: 1002896719, who: 57); Platform (whe...   
3    4961  JDT (when: 1002928536, who: 35); Platform (whe...   
4    4984  JDT (when: 1003136936, who: 25); Platform (whe...   

                                           Component  \
0                     UI (when: 1002748091, who: 29)   
1              Resources (when: 1002747198, who: 46)   
2  UI (when: 1002896719, who: 57); SWT (when: 100...   
3  Core (when: 1002928536, who: 35); SWT (when: 1...   
4                     UI (when: 1003136936, who: 25)   

                                          Short Desc  \
0  Widget is disposed error (1GKCZQM) (when: 1002...   
1  Data loss when disk is full (1GIX0JL) (when: 1...   
2  Outline scrolls horizontally when selecting me...   
3  Preferences dialog d

In [4]:
#Preprocessing
import re

def preprocess_text(text):
    text = text.lower()
    text = ''.join([char for char in text if char.isalnum() or char.isspace()])
    return text

data['Short Desc'] = data['Short Desc'].apply(preprocess_text)

In [5]:
print("\nData after preprocessing:")
print(data[['Short Desc', 'Severity']].head())


Data after preprocessing:
                                          Short Desc  \
0  widget is disposed error 1gkczqm when 10027480...   
1  data loss when disk is full 1gix0jl when 10027...   
2  outline scrolls horizontally when selecting me...   
3  preferences dialog disappears when you click o...   
4  errors arent remved from task list when filter...   

                                            Severity  
0                 normal (when: 1002748091, who: 29)  
1  normal (when: 1002747198, who: 46); major (whe...  
2                 normal (when: 1002896719, who: 57)  
3                 normal (when: 1002928536, who: 35)  
4                 normal (when: 1003136936, who: 25)  


In [6]:
#Extracting the most severe label from the Severity column
def extract_severity(severity):
    labels = severity.split(';')
    cleaned_labels = [label.split(' ')[0].split('(')[0].strip().lower() for label in labels]
    severity_order = ['critical', 'blocker', 'major', 'normal', 'minor', 'trivial', 'enhancement']
    for label in severity_order:
        if label in cleaned_labels:
            return label
    return None

#Applying the extraction function
data['Severity'] = data['Severity'].apply(extract_severity)

print("\nExtracted severity labels:")
print(data['Severity'].unique())


Extracted severity labels:
['normal' 'major' 'enhancement' 'minor' 'trivial' 'critical' 'blocker']


In [7]:
#Map severity to binary classes
def map_severity(severity):
    if severity in ['major', 'critical', 'blocker']:
        return 1  #Severe
    elif severity in ['minor', 'trivial', 'normal', 'enhancement']:
        return 0  #Non-severe
    else:
        return None

data['Severity'] = data['Severity'].apply(map_severity)

data = data.dropna(subset=['Severity'])

In [8]:
print("\nUpdated severity label counts:")
print(data['Severity'].value_counts())


Updated severity label counts:
Severity
0    147424
1     18123
Name: count, dtype: int64


In [9]:
#Extracting the base priority label from the Priority column
def extract_priority(priority):
    return priority.split(' ')[0].strip()

data['Priority'] = data['Priority'].apply(extract_priority)

#Extracting base labels from Component and Product columns
def extract_base_labels(value):
    labels = value.split(';')
    cleaned_labels = [label.split(' ')[0].strip() for label in labels]
    return ','.join(cleaned_labels)

data['Component'] = data['Component'].apply(extract_base_labels)
data['Product'] = data['Product'].apply(extract_base_labels)

In [10]:
#One-hot encoding additional features
priority_features = pd.get_dummies(data['Priority'], drop_first=True, dtype=int).values
component_features = pd.get_dummies(data['Component'], drop_first=True, dtype=int).values
product_features = pd.get_dummies(data['Product'], drop_first=True, dtype=int).values

In [11]:
X = data['Short Desc']
y = data['Severity']

In [12]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

--2024-12-23 16:44:18--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2024-12-23 16:44:18--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2024-12-23 16:44:18--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove

In [14]:
#Loading pre-trained word embeddings(GloVe)
print("Loading pre-trained word embeddings...")
embedding_path = 'glove.6B.50d.txt'
embeddings_index = {}
with open(embedding_path, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print(f"Loaded {len(embeddings_index)} word vectors.")

Loading pre-trained word embeddings...
Loaded 400000 word vectors.


In [15]:
#Tokenizing and preparing embeddings matrix
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X_seq = tokenizer.texts_to_sequences(X)
vocab_size = len(tokenizer.word_index) + 1

max_length = 100
X_padded = pad_sequences(X_seq, maxlen=max_length, padding='post')

embedding_dim = 50
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [2]:
warnings.filterwarnings("ignore", category=FutureWarning)
#Concatenating text features with one-hot encoded categorical features
categorical_features = np.hstack((priority_features, component_features, product_features))
X_combined = np.hstack((X_padded, categorical_features))

#Applying SMOTE on the combined feature set
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X_combined, y)

NameError: name 'np' is not defined

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=42)

In [18]:
print("\nSample tokenized and padded data:")
print(X_padded[:5])


Sample tokenized and padded data:
[[  248    11   458    23 35504     1 35505     2   218     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0]
 [  658  1991     1  1778    11   718 21174     1 35506     2   439   207
    658  1991     1  1778    11   718 21174     1 35507     2   175     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0 

In [19]:
#CNN Model for Feature Extraction with GloVe embeddings
warnings.filterwarnings("ignore", category=UserWarning, module="keras")
cnn_model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=False),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(16, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [20]:
start_time_cnn = time.time()
cnn_model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.2)
end_time_cnn = time.time()
cnn_training_time = end_time_cnn - start_time_cnn
print(f"Training time for CNN model: {cnn_training_time:.2f} seconds")

Epoch 1/5
[1m5897/5897[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m129s[0m 22ms/step - accuracy: 0.7758 - loss: 0.4553 - val_accuracy: 0.8590 - val_loss: 0.3395
Epoch 2/5
[1m5897/5897[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m124s[0m 21ms/step - accuracy: 0.8463 - loss: 0.3498 - val_accuracy: 0.8550 - val_loss: 0.3274
Epoch 3/5
[1m5897/5897[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m124s[0m 21ms/step - accuracy: 0.8537 - loss: 0.3359 - val_accuracy: 0.8696 - val_loss: 0.3080
Epoch 4/5
[1m5897/5897[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m126s[0m 21ms/step - accuracy: 0.8626 - loss: 0.3218 - val_accuracy: 0.8635 - val_loss: 0.3098
Epoch 5/5
[1m5897/5897[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m126s[0m 21ms/step - accuracy: 0.8671 - loss: 0.3140 - val_accuracy: 0.8664 - val_loss: 0.3061
Training time for CNN model: 630.18 seconds


In [21]:
#Extracting features using CNN
start_time_cnn_features = time.time()
cnn_feature_extractor = Sequential(cnn_model.layers[:-1])
X_train_features = cnn_feature_extractor.predict(X_train)
X_test_features = cnn_feature_extractor.predict(X_test)
end_time_cnn_features = time.time()
cnn_feature_extraction_time = end_time_cnn_features - start_time_cnn_features
print(f"Feature extraction time using CNN: {cnn_feature_extraction_time:.2f} seconds")

[1m7372/7372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 7ms/step
[1m1843/1843[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 7ms/step
Feature extraction time using CNN: 65.08 seconds


In [22]:
#Training LightGBM Classifier with hyperparameter tuning
warnings.filterwarnings("ignore", category=FutureWarning)
lgb_params = {
    'boosting_type': ['gbdt'],
    'objective': ['binary'],
    'num_leaves': [31, 50],
    'learning_rate': [0.05, 0.1],
    'n_estimators': [100, 200]
}
lgb_model = lgb.LGBMClassifier(class_weight='balanced', verbose=-1)
start_time_lgb = time.time()
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=FutureWarning)
    clf = GridSearchCV(lgb_model, lgb_params, cv=3, scoring='f1', verbose=1)
    clf.fit(X_train_features, y_train)
end_time_lgb = time.time()
lgb_training_time = end_time_lgb - start_time_lgb
print(f"Training time for LightGBM model: {lgb_training_time:.2f} seconds")

print("Best parameters found by GridSearchCV:", clf.best_params_)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
Training time for LightGBM model: 43.99 seconds
Best parameters found by GridSearchCV: {'boosting_type': 'gbdt', 'learning_rate': 0.05, 'n_estimators': 200, 'num_leaves': 31, 'objective': 'binary'}


In [23]:
# Total training time
total_training_time = cnn_training_time + cnn_feature_extraction_time + lgb_training_time
print(f"Total training time for hybrid model: {total_training_time:.2f} seconds")

Total training time for hybrid model: 739.24 seconds


In [24]:
#Predictions and evaluation
y_pred = clf.predict(X_test_features)
print("Classification Report:\n", classification_report(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.94      0.88     29422
           1       0.93      0.81      0.87     29548

    accuracy                           0.88     58970
   macro avg       0.88      0.88      0.88     58970
weighted avg       0.88      0.88      0.88     58970
