# Investigating Bug Reports Resolution on Bugzilla

This Jupyter Notebook contains the code for reproducing the experiment used in the master's thesis.

### Dependencies

In [3]:
import sklearn

import numpy as np
import pandas as pd

# evaluate random forest algorithm for classification
from numpy import mean, std
from sklearn import metrics
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
import pickle
from numpy import std
from sklearn.model_selection import RepeatedStratifiedKFold

# plot
from matplotlib import pyplot as plt
import altair as alt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV

# models
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_score, confusion_matrix, accuracy_score, recall_score, f1_score, plot_confusion_matrix
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# data balancing
from imblearn.over_sampling import RandomOverSampler

#### Loading dataset

In [None]:
# Placeholder for the raw dataset. It is left as None here, so the
# "Data Processing" cells further down need it to be populated first.
data_raw = None
# The training and test splits are read from separate CSV files.
train = pd.read_csv('../data/train_balnced_no_test.csv')
test = pd.read_csv('../data/test_umbalanced.csv')

Removing some unwanted features

In [None]:
# The same columns are removed from both splits so their schemas stay aligned.
unwanted_columns = ['status_RESOLVED', 'status_VERIFIED', 'changes_status', 'changes_resolution']
train = train.drop(columns=unwanted_columns)
test = test.drop(columns=unwanted_columns)

In [None]:
len(test)

# Data Processing

In [None]:
# Work on a copy so that the raw frame is left untouched.
# NOTE(review): `data_raw` is assigned None in the loading cell, so this call
# fails unless `data_raw` is populated with a DataFrame beforehand.
data = pd.DataFrame.copy(data_raw)

In [None]:
# Fill the null text values with an empty string. The result is assigned back
# to the column: calling fillna(inplace=True) on a column selection works on
# an intermediate object and is not guaranteed to update `data` itself.
# NOTE(review): `tokenize` is not imported in the visible part of the
# notebook - confirm where it comes from before running this cell.
data['description'] = data['description'].fillna('')
data['total_words_desc'] = data['description'].apply(lambda text: len(list(tokenize(text))))

data['summary'] = data['summary'].fillna('')
data['total_words_summary'] = data['summary'].apply(lambda text: len(list(tokenize(text))))

# Remove the features that are not used further. The text columns are dropped
# only now, after their word counts have been computed above.
data.drop(
    columns=[
        "type",
        "flags",
        "assigned_to",
        "creator",
        "description",
        "summary",
        "id",
        "creation_time",
        "last_change_time",
    ],
    inplace=True,
)

# Target feature: keep it in its own single-column frame, then remove it from
# the feature table.
target_feature = data[["resolution"]]
data.drop("resolution", inplace=True, axis=1)

### Transforming non-numeric features in dummy features

In [None]:
data = pd.get_dummies(data)
data.head(5)

# Data Balancing

To balance the data, the following cells must be executed.
X_ros - data balanced
y_ros - labels

In [None]:
# Randomly oversample to balance the label distribution. The seed is fixed so
# that repeated runs of the notebook draw the same samples.
# NOTE(review): the processing cell above stores the target as 'resolution'
# in `target_feature`; no 'label' column is created in the visible cells -
# confirm which frame this cell is meant to run on.
ros = RandomOverSampler(random_state=42)
X = data.drop('label', axis=1)
X_ros, y_ros = ros.fit_resample(X, data['label'])

### Util
Functions

In [None]:
def get_data(path):
    """Load the train and test CSV files that live under ``path``.

    Args:
        path: String prefix that is concatenated directly with the file
            names, so it has to end with a path separator.

    Returns:
        A two-element list ``[train_frame, test_frame]``.
    """
    file_names = ("train.csv", "test.csv")
    return [pd.read_csv(path + file_name) for file_name in file_names]


def show_distribution_graph(df):
    """Return an Altair bar chart with the number of rows per ``label`` value.

    The counts table is built explicitly from the index and the values of
    ``value_counts()`` instead of renaming the columns produced by
    ``reset_index()``: those column names differ between pandas versions,
    which made the previous renaming silently produce the wrong columns.

    Args:
        df: DataFrame that contains a ``label`` column.

    Returns:
        An ``alt.Chart`` with one bar per label, sorted by descending count.
    """
    counts = df['label'].value_counts()
    distribution = pd.DataFrame({
        "Resolution": counts.index,
        "Total": counts.to_numpy(),
    })

    return alt.Chart(distribution).mark_bar().encode(
        x=alt.X('Resolution', sort='-y'),
        y='Total',
        color=alt.value("#ac97b4"),
    )


def model_confusion_matrix(model, x_test, y_test, classes):
    """Plot the confusion matrix of a fitted ``model`` on the test data.

    Uses ``ConfusionMatrixDisplay.from_estimator`` because the older
    ``plot_confusion_matrix`` helper is no longer available in current
    scikit-learn releases.

    Args:
        model: Fitted classifier.
        x_test: Test features.
        y_test: True test labels.
        classes: Labels used to index (and order) the matrix.

    Returns:
        The ``ConfusionMatrixDisplay`` produced by scikit-learn.
    """
    return ConfusionMatrixDisplay.from_estimator(
        model,
        x_test,
        y_test,
        labels=classes,
        cmap=plt.cm.Blues,
        xticks_rotation="vertical",
    )


def compute_metrics(pred, y_test):
    """Compute accuracy plus weighted precision, recall and F1.

    Args:
        pred: Predicted labels.
        y_test: True labels.

    Returns:
        A dict with two parallel lists: ``"Metrics"`` (the metric names) and
        ``"Scores"`` (the corresponding values, in the same order).
    """
    scores = [accuracy_score(y_test, pred)]
    # Precision, recall and F1 are all averaged with the 'weighted' strategy.
    for weighted_scorer in (precision_score, recall_score, f1_score):
        scores.append(weighted_scorer(y_test, pred, average='weighted'))

    return {"Metrics": ["Accuracy", "Precision", "Recall", "F1"], "Scores": scores}


def print_metrics(model_name, metrics):
    """Print every metric of a model as a percentage with two decimals.

    Args:
        model_name: Name shown in the header line.
        metrics: Dict with the parallel lists ``"Metrics"`` (names) and
            ``"Scores"`` (fractions), as returned by ``compute_metrics``.
    """
    print(f"{model_name} Metrics:\n")
    # Iterate over the pairs directly so any number of metrics is supported,
    # instead of assuming that there are exactly four.
    for metric_name, score in zip(metrics['Metrics'], metrics['Scores']):
        print(f"{metric_name} score is:\t{round(score * 100, 2)}%")
    print("\n")


def compute_metrics_per_class(pred, y_test):
    """Print the confusion matrix and the per-class classification report.

    Args:
        pred: Predicted labels.
        y_test: True labels.
    """
    matrix = metrics.confusion_matrix(y_test, pred)
    print(matrix)

    # Precision, recall and the other per-class metrics, with two decimals.
    report = metrics.classification_report(y_test, pred, digits=2)
    print(report)


def get_metrics(model, model_name, x_train, y_train, x_test, y_test, classes):
    """Fit ``model``, evaluate it on the test split and print the results.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        model_name: Name used in the printed header.
        x_train: Training features.
        y_train: Training labels.
        x_test: Test features.
        y_test: Test labels.
        classes: Not used by this function; kept so existing callers do not
            have to change.

    Returns:
        The list of scores in the order produced by ``compute_metrics``.
    """
    model.fit(x_train, y_train)
    pred = model.predict(x_test)

    scores = compute_metrics(pred, y_test)
    # print_metrics takes only the name and the metrics dict; passing the
    # model and the test data as well raised a TypeError.
    print_metrics(model_name, scores)
    compute_metrics_per_class(pred, y_test)

    return scores['Scores']


def get_metric_data(models, models_names):
    """Build a long-format table of metric scores for plotting.

    Args:
        models: Sequence with one entry per model; each entry holds the four
            scores (accuracy, precision, recall, F1) as fractions.
        models_names: Display name of each model, in the same order as
            ``models``.

    Returns:
        A DataFrame with the columns ``Metric``, ``Metric Score`` (scaled to
        percentages) and ``Model``, holding one row per (model, metric) pair.

    Raises:
        ValueError: If ``models`` is not a table with four scores per model,
            or if the number of names differs from the number of models.
    """
    metric_labels = ['Accuracy', 'Precision', 'Recall', 'F1 Score']

    scores = np.asarray(models, dtype=float)
    if scores.ndim != 2 or scores.shape[1] != len(metric_labels):
        raise ValueError(
            f"expected {len(metric_labels)} scores per model, got an array of shape {scores.shape}"
        )

    names = list(models_names)
    if len(names) != scores.shape[0]:
        raise ValueError(
            f"got {scores.shape[0]} models but {len(names)} model names"
        )

    return pd.DataFrame({
        "Metric": metric_labels * len(names),
        "Metric Score": (scores * 100).ravel().tolist(),
        # Repeat each model name once per metric so the rows line up.
        "Model": [name for name in names for _ in metric_labels],
    })


def plot_metric_graph(data):
    """Draw a grouped bar chart of the metric scores, one hue per model.

    Args:
        data: Long-format DataFrame with the columns ``Metric``,
            ``Metric Score`` and ``Model``.
    """
    grid = sns.catplot(
        x="Metric", y="Metric Score", hue="Model",
        data=data,
        kind="bar",
        ci="sd", alpha=.6, height=6,
    )
    # Scores are percentages, so pin the y axis to the 0-100 range.
    grid.set(ylim=(0, 100))
    grid.despine(left=True)


def get_params(model_name):
    """Return the hyper-parameter grid searched for ``model_name``.

    Args:
        model_name: ``"Naive Bayes"``, ``"Logistic Regression"`` or
            ``"Gradient Boosting"``; any other name receives the grid used
            for the tree-based models.

    Returns:
        A dict mapping parameter names to the candidate values.
    """
    if model_name == "Naive Bayes":
        # GaussianNB has no tree parameters; the tree grid used to be returned
        # for it, which the grid search rejects as invalid parameters.
        return {'var_smoothing': np.logspace(0, -9, num=10)}
    if model_name == "Logistic Regression":
        return { 'C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0, 100000.0] }
    if model_name == "Gradient Boosting":
        return {
            'n_estimators': [50, 100],
            'max_depth': [3, 8],
            'min_samples_split': [2, 5],
            'min_samples_leaf': [1, 5],
            'max_features': [None, 5],
            'subsample': [0.5, 1]
            }
    return { 'max_depth': [None, 15, 35, 50], 'max_leaf_nodes': [None, 250, 500, 750, 1000, 5000]}

def print_best_params(model_name, grid, folds):
    """Print the best score and the best hyper-parameters of a fitted search.

    Args:
        model_name: Name given to ``get_params`` when the grid was built; it
            selects which parameters are reported.
        grid: Fitted search object exposing ``best_score_`` and
            ``best_params_``.
        folds: Number of cross-validation folds, shown in the message.
    """
    best = grid.best_params_
    print(f'Hyperparams of {model_name}:\n')
    print(f'Got accuracy score of {grid.best_score_} in {folds}-fold')
    if model_name == "Naive Bayes":
        print(f'Best var smoothing: {best["var_smoothing"]}')
    elif model_name == "Logistic Regression":
        print(f'Best C: {best["C"]}')
    elif model_name == "Gradient Boosting":
        print(f'Best max depth: {best["max_depth"]}. Best number of estimators: {best["n_estimators"]}')
        print(f'Best min sample split: {best["min_samples_split"]}. Best min sample leaf: {best["min_samples_leaf"]}')
        print(f'Best max features: {best["max_features"]}. Best subsample: {best["subsample"]}')
    else:
        print(f'Best depth: {best["max_depth"]}. Best number of leafs: {best["max_leaf_nodes"]}')

def get_tuned_metrics(model, model_name, folds, x_train, y_train, x_test, y_test, classes):
    """Grid-search ``model`` and evaluate the refit search on the test split.

    Args:
        model: Estimator to tune.
        model_name: Name used to pick the parameter grid and in the output.
        folds: Number of cross-validation folds for the grid search.
        x_train: Training features.
        y_train: Training labels.
        x_test: Test features.
        y_test: Test labels.
        classes: Not used by this function; kept so existing callers do not
            have to change.

    Returns:
        A tuple ``(scores, pred)``: the list of scores in the order produced
        by ``compute_metrics`` and the predictions for ``x_test``.
    """
    params = get_params(model_name)

    grid = GridSearchCV(model, params, cv=folds)
    grid.fit(x_train, y_train)
    pred = grid.predict(x_test)

    print_best_params(model_name, grid, folds)
    scores = compute_metrics(pred, y_test)
    # print_metrics takes only the name and the metrics dict; passing the
    # search object and the test data as well raised a TypeError.
    print_metrics(model_name, scores)
    return scores['Scores'], pred

### Data analysis

Balanced train distribution

In [None]:
show_distribution_graph(train)

Test distribution

In [None]:
show_distribution_graph(test)

### Machine Learning Models

In [None]:
# Baseline estimators. The decision tree is seeded like the two ensembles so
# that re-running the notebook reproduces the same tree.
NB = GaussianNB()
LG = LogisticRegression(max_iter=5000)
DT = DecisionTreeClassifier(random_state=42)
RF = RandomForestClassifier(random_state=42)
GB = GradientBoostingClassifier(random_state=0)

Classification artifacts

In [None]:
# Separate each split into its feature matrix and its target column.
x_train, y_train = train.drop(columns='label'), train['label']
x_test, y_test = test.drop(columns='label'), test['label']
# Distinct labels present in the training split.
classes = train['label'].unique()

#### Gaussian Naive Bayes

In [None]:
NB_metrics = get_metrics(NB, "Naive Bayes", x_train, y_train, x_test, y_test, classes)

#### Logistic Regression

In [None]:
LG_metrics = get_metrics(LG, "Logistic Regression", x_train, y_train, x_test, y_test, classes)

#### Decision Tree

In [None]:
DT_metrics = get_metrics(DT, "Decision Tree", x_train, y_train, x_test, y_test, classes)

#### Random Forest

In [None]:
RF_metrics = get_metrics(RF, "Random Forest", x_train, y_train, x_test, y_test, classes)

#### Gradient Boosting

In [None]:
GB_metrics = get_metrics(GB, "Gradient Boosting", x_train, y_train, x_test, y_test, classes)

### Cross Validation

Cross validation model

In [None]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

Naive Bayes

In [None]:
# Accuracy of Naive Bayes over the folds of the repeated stratified CV.
n_scores_NB = cross_val_score(
    NB, x_train, y_train,
    scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise',
)
# Mean accuracy, with the standard deviation in parentheses.
print(f'Accuracy: {mean(n_scores_NB):.3f} ({std(n_scores_NB):.3f})')

Logistic Regression

In [None]:
# Accuracy of Logistic Regression over the folds of the repeated stratified CV.
n_scores_LG = cross_val_score(
    LG, x_train, y_train,
    scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise',
)
# Mean accuracy, with the standard deviation in parentheses.
print(f'Accuracy: {mean(n_scores_LG):.3f} ({std(n_scores_LG):.3f})')

Random forest
TODO: instantiate the model again and run it

In [None]:
# Accuracy of Random Forest over the folds of the repeated stratified CV.
n_scores_RF = cross_val_score(
    RF, x_train, y_train,
    scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise',
)
# Mean accuracy, with the standard deviation in parentheses.
print(f'Accuracy: {mean(n_scores_RF):.3f} ({std(n_scores_RF):.3f})')

Gradient Boosting

In [None]:
#took too much time
#n_scores_GB = cross_val_score(GB, x_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
#print('Accuracy: %.3f (%.3f)' % (mean(n_scores_GB), std(n_scores_GB)))

### Models Fine-tuning

Naive Bayes

In [None]:
NB_tuned_metrics, nb_pred = get_tuned_metrics(NB, "Naive Bayes", 10, x_train, y_train, x_test, y_test, classes)
# Persist the scores and the predictions. The context managers make sure that
# each file is flushed and closed once it has been written.
with open('./data/models/NB_tuned_metrics.pkl', 'wb') as metrics_file:
    pickle.dump(NB_tuned_metrics, metrics_file)
with open('./data/models/nb_pred.pkl', 'wb') as pred_file:
    pickle.dump(nb_pred, pred_file)

Logistic Regression

In [None]:
LG_tuned_metrics, lg_pred = get_tuned_metrics(LG, "Logistic Regression", 10, x_train, y_train, x_test, y_test, classes)
# Persist the scores and the predictions. The context managers make sure that
# each file is flushed and closed once it has been written.
with open('./data/models/LG_tuned_metrics.pkl', 'wb') as metrics_file:
    pickle.dump(LG_tuned_metrics, metrics_file)
with open('./data/models/lg_pred.pkl', 'wb') as pred_file:
    pickle.dump(lg_pred, pred_file)

Decision Tree

In [None]:
DT_tuned_metrics, dt_pred = get_tuned_metrics(DT, "Decision Tree", 10, x_train, y_train, x_test, y_test, classes)
# Persist the scores and the predictions. The context managers make sure that
# each file is flushed and closed once it has been written.
with open('./data/models/DT_tuned_metrics.pkl', 'wb') as metrics_file:
    pickle.dump(DT_tuned_metrics, metrics_file)
with open('./data/models/dt_pred.pkl', 'wb') as pred_file:
    pickle.dump(dt_pred, pred_file)

Random Forest

In [None]:
# The model name is spelled "Random Forest", matching the other cells.
RF_tuned_metrics, rf_pred = get_tuned_metrics(RF, "Random Forest", 10, x_train, y_train, x_test, y_test, classes)
# Persist the scores and the predictions. The context managers make sure that
# each file is flushed and closed once it has been written.
with open('./data/models/RF_tuned_metrics.pkl', 'wb') as metrics_file:
    pickle.dump(RF_tuned_metrics, metrics_file)
with open('./data/models/rf_pred.pkl', 'wb') as pred_file:
    pickle.dump(rf_pred, pred_file)

Gradient Boosting

In [None]:
GB_tuned_metrics, gb_pred = get_tuned_metrics(GB, 'Gradient Boosting', 10, x_train, y_train, x_test, y_test, classes)
# Persist the scores and the predictions. The context managers make sure that
# each file is flushed and closed once it has been written.
with open('./data/models/GB_tuned_metrics.pkl', 'wb') as metrics_file:
    pickle.dump(GB_tuned_metrics, metrics_file)
with open('./data/models/gb_pred.pkl', 'wb') as pred_file:
    pickle.dump(gb_pred, pred_file)

# Grouping Strategy

Configuring the different groupings (4, 3 and 2 classes)

**4 Classes**
- FIXED: FIXED
- INCOMPLETE: INCOMPLETE, WORKSFORME, and INVALID
- IGNORED: WONTFIX and DUPLICATE
- INACTIVE: INACTIVE and MOVED

**3 Classes**
- FIXED: FIXED
- INVALID: INVALID, WORKSFORME, DUPLICATE, and INCOMPLETE
- IGNORED: MOVED, WONTFIX, and INACTIVE

**2 Classes**
- FIXED: FIXED
- INCOMPLETE: INCOMPLETE, INACTIVE, WORKSFORME, INVALID, MOVED, DUPLICATE, WONTFIX

GROUPING

In [None]:
train_new_classes = train.copy()
test_new_classes = test.copy()

**4 Classes**

In [None]:
# Each entry maps a list of labels onto the group that replaces them.
four_class_groups = [
    (['MOVED', 'INACTIVE'], 'INACTIVE'),
    (['WONTFIX', 'DUPLICATE'], 'IGNORED'),
    (['INCOMPLETE', 'WORKSFORME', 'INVALID'], 'INCOMPLETE'),
]

# The same replacements are applied first to the train and then to the test frame.
for frame in (train_new_classes, test_new_classes):
    for members, group in four_class_groups:
        frame['label'] = frame['label'].replace(members, group)

**3 Classes**

In [None]:
# The grouping starts from the untouched labels in `train` / `test`, so this
# cell gives the same result whether or not another grouping cell ran first.
# The resolution is spelled 'DUPLICATE', as in the 4-class cell; the previous
# 'DUPLICATED' never matched any label.

#Train
train_new_classes['label'] = train['label'].replace(['INVALID', 'WORKSFORME', 'DUPLICATE', 'INCOMPLETE'], 'INVALID')
train_new_classes['label'] = train_new_classes['label'].replace(['MOVED', 'WONTFIX', 'INACTIVE'], 'IGNORED')

#Test
test_new_classes['label'] = test['label'].replace(['INVALID', 'WORKSFORME', 'DUPLICATE', 'INCOMPLETE'], 'INVALID')
test_new_classes['label'] = test_new_classes['label'].replace(['MOVED', 'WONTFIX', 'INACTIVE'], 'IGNORED')

**2 Classes**

In [None]:
# The grouping starts from the untouched labels in `train` / `test`, so this
# cell gives the same result whether or not another grouping cell ran first.
# The resolution is spelled 'DUPLICATE', as in the 4-class cell; the previous
# 'DUPLICATED' never matched any label.

#Train
train_new_classes['label'] = train['label'].replace(['INVALID', 'WORKSFORME', 'DUPLICATE', 'INCOMPLETE', 'MOVED', 'WONTFIX', 'INACTIVE'], 'INCOMPLETE')

#Test
test_new_classes['label'] = test['label'].replace(['INVALID', 'WORKSFORME', 'DUPLICATE', 'INCOMPLETE', 'MOVED', 'WONTFIX', 'INACTIVE'], 'INCOMPLETE')

Executing Random Forest with the new classes
get_metrics(model, model_name, x_train, y_train, x_test, y_test, classes)

In [None]:
# Separate the regrouped splits into feature matrices and target columns.
x_train_new_classes, y_train_new_classes = train_new_classes.drop(columns='label'), train_new_classes['label']
x_test_new_classes, y_test_new_classes = test_new_classes.drop(columns='label'), test_new_classes['label']
# Distinct grouped labels present in the training split.
classes_new_classes = train_new_classes['label'].unique()

In [None]:
RF_new_classes = RandomForestClassifier(random_state=42)
RF_metrics_new_classes = get_metrics(RF_new_classes, "Random Forest", x_train_new_classes, y_train_new_classes, x_test_new_classes, y_test_new_classes, classes_new_classes)

# Random Forest Analysis 

In [None]:
fit_result = RF.fit(x_train, y_train)
pred = RF.predict(x_test)

In [None]:
data_eda = test.copy()
data_eda['predicted'] = pred

To check whether the FIXED classifications were right, we defined:

In [None]:
def eval_classification(row):
    """Classify how a FIXED bug report was predicted.

    Args:
        row: Mapping (for example a DataFrame row) with the keys ``label``
            and ``predicted``.

    Returns:
        ``'right'`` when the true label is FIXED and the prediction matches,
        ``'wrong'`` when the true label is FIXED and the prediction differs,
        and ``'other'`` for every row whose true label is not FIXED.
    """
    label, predicted = row['label'], row['predicted']

    if label != 'FIXED':
        return 'other'
    return 'right' if label == predicted else 'wrong'

In [None]:
# Tag every row with the outcome computed by eval_classification.
data_eda['classification'] = data_eda.apply(eval_classification, axis=1)
data_eda.head()


In [None]:
# Skip the index when writing: the next cell reads the file back, and a saved
# index would come back as an extra unnamed column.
data_eda.to_csv('../data/data_eda.csv', index=False)

In [None]:
data_eda = pd.read_csv('../data/data_eda.csv')

In [None]:
sns.set(rc={"figure.figsize": (20.7, 12.27)})
sns.set_style('whitegrid')

# Keep the Axes itself in `ax`. Chaining .set() onto the boxplot call stored
# the return value of .set() in `ax` instead of the Axes.
ax = sns.boxplot(x='comment_count', y='classification', data=data_eda)
ax.set(xlim=(0, 60))

In [None]:
len(data_eda[data_eda['classification'] == 'wrong'])

INCOMPLETE

Data summary

- Tamanho dos comentários
- Descrição dos comentários
- Quantidade de palavras impactam?
- Qual a relação dos comentários com outras variáveis?
- Os classificados de forma correta têm mais mudanças(total_changes)?

In [None]:
data_eda[['label', 'total_users_commenting']].groupby('label').mean().plot.bar(y='total_users_commenting')

In [None]:
data_eda_plot = data_eda[['label', 'total_users_commenting']].groupby('label').median()
data_eda_plot.plot.bar(y='total_users_commenting')

In [None]:
data_eda.head()

In [None]:
# NOTE(review): `df` is not defined at notebook level in the visible cells (it
# only appears as a function parameter), so this cell fails as written.
# Presumably it expects a frame with 'median' and 'total_users_commenting'
# columns - confirm which frame was intended.
fig, ax = plt.subplots()
# Draw the 'median' column as a line with markers and the
# 'total_users_commenting' column as bars, both on the same axes.
df['median'].plot(ax=ax, marker='o', ls='-', color='#4C9A2A', alpha = 0.9)
df['total_users_commenting'].plot(kind='bar', ax=ax, alpha=0.7)

FEATURE: 

Label  | Mean | Median | R-Median | R-Mean | W-Median | W-Mean | 

In [None]:
def make_analysis_feature(df, feature):
    """Summarise ``feature`` per label for all, right and wrong predictions.

    The statistics are computed from ``df`` itself (not from a notebook
    global) and are combined by label, so a label that has no right or no
    wrong predictions gets NaN in the matching columns instead of shifting
    or breaking the table.

    Args:
        df: DataFrame with the columns ``label``, ``predicted`` and
            ``feature``.
        feature: Name of the numeric column to summarise.

    Returns:
        A DataFrame indexed by label with the columns ``wrong_median``,
        ``median``, ``right_median``, ``wrong_mean``, ``mean`` and
        ``right_mean``.
    """
    is_right = df['label'] == df['predicted']

    overall = df.groupby('label')[feature]
    right = df[is_right].groupby('label')[feature]
    wrong = df[~is_right].groupby('label')[feature]

    # Building the frame from Series aligns every statistic on the label index.
    return pd.DataFrame({
        'wrong_median': wrong.median(),
        'median': overall.median(),
        'right_median': right.median(),
        'wrong_mean': wrong.mean(),
        'mean': overall.mean(),
        'right_mean': right.mean(),
    })
    

In [None]:
def make_analysis_feature_mean(df, feature):
    """Summarise the mean of ``feature`` per label for all, right and wrong predictions.

    The statistics are computed from ``df`` itself (not from a notebook
    global) and are combined by label, so a label that has no right or no
    wrong predictions gets NaN in the matching columns. The overall mean is
    stored under ``mean``; it used to be stored under the misleading name
    ``median``.

    Args:
        df: DataFrame with the columns ``label``, ``predicted`` and
            ``feature``.
        feature: Name of the numeric column to summarise.

    Returns:
        A DataFrame indexed by label with the columns ``wrong_mean``,
        ``mean`` and ``right_mean``.
    """
    is_right = df['label'] == df['predicted']

    # Building the frame from Series aligns every statistic on the label index.
    return pd.DataFrame({
        'wrong_mean': df[~is_right].groupby('label')[feature].mean(),
        'mean': df.groupby('label')[feature].mean(),
        'right_mean': df[is_right].groupby('label')[feature].mean(),
    })

In [None]:
data_eda[(data_eda['label'] == 'INCOMPLETE')  &  (data_eda['predicted'] == 'WORKSFORME')]

##### Checking models classification by features

_total_words_desc_

In [None]:
make_analysis_feature(data_eda, 'total_words_desc')

_total_words_summary_

In [None]:
make_analysis_feature(data_eda, 'total_words_summary')

_total_changes_

In [None]:
make_analysis_feature(data_eda, 'total_changes')

_comment_count_

In [None]:
make_analysis_feature(data_eda, 'comment_count')

_total_users_commenting_

In [None]:
make_analysis_feature(data_eda, 'total_users_commenting')

_total_users_changes_

In [None]:
make_analysis_feature(data_eda, 'total_users_changes')

_total_comments_by_author_

In [None]:
make_analysis_feature(data_eda, 'total_comments_by_author')

_total_attachment_comments_

In [None]:
make_analysis_feature(data_eda, 'total_attachment_comments')

_severity_

In [None]:
make_analysis_feature(data_eda, 'severity')

## Naive Bayes Fine-tuning

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB

# Ten candidate smoothing values, log-spaced from 1e0 down to 1e-9.
param_grid_nb = {'var_smoothing': np.logspace(0, -9, num=10)}

param_grid_nb

# 10-fold grid search over the smoothing values, using all available cores.
nbModel_grid = GridSearchCV(
    estimator=GaussianNB(),
    param_grid=param_grid_nb,
    cv=10,
    n_jobs=-1,
    verbose=1,
)
nbModel_grid.fit(x_train, y_train)
print(nbModel_grid.best_estimator_)