In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to the imported
# project modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Move the working directory one level up; the relative 'data/...' paths read
# further down are resolved against the current working directory.
# NOTE(review): this cell is not idempotent — every re-run moves up another level.
%cd ..
%pwd

In [None]:
from experiments.data_utils import *
from experiments.plot_utils import *
from experiments.hyperparam_tuning import *

In [None]:
# Load the data used for the first experiments; the helper returns a 3-tuple
# that is unpacked as (X, y, features).
# presumably X / y are the model inputs / labels for the 'performance' target at
# commit level and `features` is the underlying feature table — verify in data_utils.
X, y, features = get_ml_data_traditional('bugbug', 'performance', 'commitlevel')

In [None]:
# Display the feature table for a quick visual inspection.
features

In [None]:
# Keep the column labels of the feature table; they are passed to ml_pipeline()
# below and end up as the feature labels of the SHAP plot.
feature_names = features.columns
feature_names

In [None]:
import xgboost
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import shap
import sklearn.metrics as metrics

In [None]:
# presumably loads SHAP's JavaScript support for interactive plots in this
# notebook — see the SHAP documentation for `initjs`.
shap.initjs()

In [None]:
def plot_important_features(clf, X, feature_names):
    """Show a SHAP summary plot limited to the ten top-ranked features.

    Args:
        clf: Fitted model that is handed to ``shap.TreeExplainer``.
        X: Samples for which the SHAP values are computed and plotted.
        feature_names: Labels for the columns of ``X`` shown on the plot.
    """
    shap_values = shap.TreeExplainer(clf).shap_values(X)
    shap.summary_plot(
        shap_values,
        X,
        feature_names=feature_names,
        max_display=10,
    )
    plt.show()

In [None]:
def ml_pipeline(X_train, X_test, y_train, y_test, feature_names, clf, plot_feature_importance=True):
    """Fit ``clf`` on the training split and report its quality on both splits.

    Prints a classification report for the training data (optionally followed
    by a SHAP feature-importance plot), then a classification report for the
    test data, and finally draws the precision/recall curve of the test split.

    Args:
        X_train: Training samples.
        X_test: Test samples.
        y_train: Training labels.
        y_test: Test labels.
        feature_names: Feature labels used by the SHAP plot.
        clf: Unfitted estimator offering ``fit`` and ``predict``; it is fitted
            in place.
        plot_feature_importance: When true, plot the SHAP summary for the
            training data.
    """
    clf.fit(X_train, y_train)

    # --- training split ---------------------------------------------------
    print("Train:")
    train_predictions = clf.predict(X_train)
    print(metrics.classification_report(y_train, train_predictions))
    if plot_feature_importance:
        plot_important_features(clf, X_train, feature_names)
    plt.show()

    # --- test split -------------------------------------------------------
    print("Test:")
    test_predictions = clf.predict(X_test)
    print(metrics.classification_report(y_test, test_predictions))
    plot_precision_recall_curve_with_f1(clf, X_test, y_test)

In [None]:
def make_clf():
    """Return a new, unfitted XGBoost classifier with the notebook's fixed settings."""
    settings = {
        'n_jobs': 4,
        'use_label_encoder': False,
        'eval_metric': 'logloss',
    }
    return xgboost.XGBClassifier(**settings)

# def make_clf():
#     from sklearn.linear_model import LogisticRegression
#     clf = LogisticRegression()
#     return clf

In [None]:
# Shuffled 90/10 split of the current X / y.
clf = make_clf()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, shuffle=True, random_state=0
)
ml_pipeline(X_train, X_test, y_train, y_test, feature_names=feature_names, clf=clf)

In [None]:
# Unshuffled 90/10 split: the row order is kept, so the test set is the last
# 10% of the rows (random_state has no effect without shuffling).
clf = make_clf()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, shuffle=False, random_state=0
)
ml_pipeline(X_train, X_test, y_train, y_test, feature_names=feature_names, clf=clf)

In [None]:
# difference not as pronounced with Logistic Regression
clf = default_pipeline()
clf.set_params(
    model=LogisticRegression(random_state=0, C=2),
    sampler=RandomOverSampler(random_state=0),
)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, shuffle=True, random_state=0
)
# SHAP plotting is switched off for this pipeline.
ml_pipeline(
    X_train, X_test, y_train, y_test,
    feature_names=feature_names,
    clf=clf,
    plot_feature_importance=False,
)

## Why?

In [None]:
# Read the labeling table and index it by a copy of the 'revision' column
# (the index is named 'index'; 'revision' stays available as a regular column).
labeling = (
    pd.read_csv('data/labeling/bugbug.csv')
    .assign(index=lambda frame: frame['revision'])
    .set_index('index')
)
labeling

In [None]:
# Distinct values of the 'bug_id' column of the labeling table.
bug_ids = set(labeling['bug_id'])

In [None]:
# Number of labeled revisions per bug id, restricted to rows with performance == 1
# and sorted by that count.
performance_rows = labeling.loc[labeling['performance'] == 1, ['revision', 'bug_id']]
bug_id_counts = performance_rows.groupby('bug_id').count().sort_values('revision')

# How many bugs have 1, 2, 3, ... revisions.
bug_id_counts.value_counts().sort_index()

### How many performance bugs have a single commit vs. multiple commits?

In [None]:
# Split the bug ids by the number of revisions counted for each of them.
commits_per_bug = bug_id_counts['revision']
single_commit_bug_ids = set(commits_per_bug[commits_per_bug == 1].index)
multi_commit_bug_ids = set(commits_per_bug[commits_per_bug > 1].index)

n_single = len(single_commit_bug_ids)
n_multi = len(multi_commit_bug_ids)
total = n_single + n_multi
print(f"#single = {n_single}, #multiple = {n_multi}, total = {total}")

### And how many commits?

In [None]:
# Revisions whose bug id falls into the single-commit / multi-commit group.
in_single_group = labeling['bug_id'].isin(single_commit_bug_ids)
in_multi_group = labeling['bug_id'].isin(multi_commit_bug_ids)
single_commit_bug_commits = labeling.loc[in_single_group, 'revision']
multi_commit_bug_commits = labeling.loc[in_multi_group, 'revision']

n_single = len(single_commit_bug_commits)
n_multi = len(multi_commit_bug_commits)
total = n_single + n_multi
print(f"#single = {n_single}, #multiple = {n_multi}, total = {total}")

In [None]:
from src.repo_miner import get_commit_log
commit_log = get_commit_log('data/repo_miner/commit_log.csv')
# labeling still carries its own 'revision' column, so the one from the commit
# log is dropped before the index-based inner join to avoid a column clash.
commit_log_without_revision = commit_log.drop(columns='revision')
commits = commit_log_without_revision.join(labeling, how='inner')
commits

In [None]:
# Example bug whose commits are inspected in the following cells.
bug_id = 1700052
commits.loc[commits['bug_id'] == bug_id]

In [None]:
# Developer-related features of (at most) the first five commits of `bug_id`.
# The commit id is attached with assign(), which works on a copy: the shared
# `features` frame is left untouched, so this cell has no hidden side effect on
# the feature table shown and used elsewhere.
df = (
    features
    .assign(id=commits['id'])  # aligned on the index, like a column assignment
    .loc[
        commits['bug_id'] == bug_id,
        [
            'id',
            'developer_age',
            'recent_developer_experience',
            'recent_backouts_developer',
        ],
    ]
    .iloc[:5]
)
# Human-readable labels for presentation.
df.index.name = 'revision hash'
df.columns = ['Commit Id', 'Developer Seniority', 'Recent Developer Experience', 'Recent Backouts Developer']
df

### Does XGBoost learn proxy for revision id?

### Does it just memorize the position of the labels?

In [None]:
# only take id
id_column = commits[['id']]
feature_names = id_column.columns

# Single-feature design matrix and integer labels taken from the joined table.
X = np.array(id_column)
y = np.array(commits['performance'], dtype='int')
print(f'{X.shape=}\n')

In [None]:
# Shuffled 90/10 split of the current X / y.
clf = make_clf()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, shuffle=True, random_state=0
)
ml_pipeline(X_train, X_test, y_train, y_test, feature_names=feature_names, clf=clf)

In [None]:
# Unshuffled 90/10 split: the row order is kept, so the test set is the last
# 10% of the rows (random_state has no effect without shuffling).
clf = make_clf()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, shuffle=False, random_state=0
)
ml_pipeline(X_train, X_test, y_train, y_test, feature_names=feature_names, clf=clf)

In [None]:
import matplotlib.pyplot as plt

# Plot where the positively labeled rows sit along the commit-id axis, once for
# the commit-level and once for the bug-level feature file.
labeling = pd.read_csv('data/labeling/bugbug.csv').set_index('revision')

target = 'performance'
# Inclusive range of commit ids shown in the plots.
id_window = (483000, 485000)

for kind in ['commitlevel', 'buglevel']:
    features = pd.read_csv(f'data/feature_extractor/features_{kind}.csv')

    if kind == 'buglevel':
        # labeling is based on bugnumber, that's why it is ok to index at
        # first revision of a commit group in case of kind=='buglevel'
        features['revision'] = features['first_revision']
        features['id'] = features['first_id']

    features = features.set_index('revision')
    features['target'] = labeling[target]  # works because index is revision hash

    subset = features[features['id'].between(*id_window)]
    positive_ids = subset.loc[subset['target'] == 1, 'id']

    # One marker per positive row, all drawn at the same height (y = 1).
    fig, ax = plt.subplots(figsize=(24, 4))
    ax.scatter(positive_ids, np.full(len(positive_ids), 1), s=1)
    ax.set_title(kind)
    ax.set_xlabel('commit id')
    plt.show()

## Now compare to buglevel

In [None]:
# Reload X, y and the feature table with the 'buglevel' variant of the data
# (replacing the values used so far) and refresh the feature labels to match.
X, y, features = get_ml_data_traditional('bugbug', 'performance', 'buglevel')
feature_names = features.columns

In [None]:
# Shuffled 90/10 split of the current X / y.
clf = make_clf()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, shuffle=True, random_state=0
)
ml_pipeline(X_train, X_test, y_train, y_test, feature_names=feature_names, clf=clf)

In [None]:
# Unshuffled 90/10 split: the row order is kept, so the test set is the last
# 10% of the rows (random_state has no effect without shuffling).
clf = make_clf()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, shuffle=False, random_state=0
)
ml_pipeline(X_train, X_test, y_train, y_test, feature_names=feature_names, clf=clf)