In [315]:
import pandas as pd
import numpy as np
import functools

In [316]:
df_szz = pd.read_csv("szz_data.csv")
df_eszz = pd.read_csv("eszz_data.csv")

In [317]:
df_szz.columns

Index(['sha', 'num_of_insertions', 'num_of_deletions', 'num_of_changed_files',
       'day_of_week', 'hour_of_commit', 'solve_time', 'resolution_time',
       'solve_res_diff', 'number_of_comments', 'summary', 'description',
       'components', 'affects_versions', 'comments', 'number_of_patches',
       'patch_size_mean', 'patch_size_variance', 'patch_size_rel_variance',
       'filepath_contains_test', 'label'],
      dtype='object')

In [318]:
df_szz.drop(['summary', 'description', 'comments'], axis=1, inplace=True)
df_eszz.drop(['summary', 'description', 'comments'], axis=1, inplace=True)

In [319]:
def func(groupby, x):
    if isinstance(x, list):
        agg = groupby[x]
        result = (agg[:,"count"] * agg[:,"mean"]).sum() / agg[:,"count"].sum()
        return result
#     print(groupby)
    return groupby[:,"mean"].mean()

def multi_value_target_encoding(df, by_labels, on_label, separator):
    dummy = df.copy()
    results = []
    for by in by_labels:
        dummy[by] = dummy[by].str.split(separator)
        groupby_count_mean = (dummy
           .set_index(on_label)[by]
           .apply(pd.Series)
           .stack()
           .reset_index(name=by)  # Reshape the data
           .groupby([by])
           .label
           .apply(lambda x: x.agg(['count', 'mean'])))
#         print(groupby_count_mean)

        my_func = functools.partial(func, groupby_count_mean)
        res = dummy[by].map(my_func).to_numpy()
        results.append(res)
    return np.vstack(results).T
        

In [336]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score

In [321]:
numeric_features = ['num_of_insertions', 'num_of_deletions', 'num_of_changed_files', 
                      'day_of_week', 'hour_of_commit', 'solve_time', 'resolution_time', 
                     'solve_res_diff', 'number_of_comments', 'number_of_patches',
                    'patch_size_mean', 'patch_size_variance', 'patch_size_rel_variance', 'filepath_contains_test']

numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median'))])

# categorical encoding: http://contrib.scikit-learn.org/categorical-encoding/

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features)],
    remainder='drop')

categorical_features = ['components', 'affects_versions']
processed_features = numeric_features + categorical_features

In [323]:
X_szz = df_szz.drop('label', axis=1)
y_szz = df_szz['label']
X_eszz = df_eszz.drop('label', axis=1)
y_eszz = df_eszz['label']

X_train_szz, X_test_szz, y_train_szz, y_test_szz = train_test_split(X_szz, y_szz, test_size=0.2, random_state=10)
X_train_eszz, X_test_eszz, y_train_eszz, y_test_eszz = train_test_split(X_eszz, y_eszz, test_size=0.2, random_state=10)

In [324]:
X_cat_szz = multi_value_target_encoding(df_szz, categorical_features, 'label', ';')
X_cat_eszz = multi_value_target_encoding(df_szz, categorical_features, 'label', ';')

X_train_szz_cat, X_test_szz_cat = train_test_split(X_cat_szz, test_size=0.2, random_state=10)
X_train_eszz_cat, X_test_eszz_cat = train_test_split(X_cat_eszz, test_size=0.2, random_state=10)

Transform data

In [325]:
X_train_szz_tr = preprocessor.fit_transform(X_train_szz, y_train_szz)

X_train_szz_tr = np.concatenate((X_train_szz_tr,X_train_szz_cat), axis=1)

X_test_szz_tr = preprocessor.fit_transform(X_test_szz, y_test_szz)

X_test_szz_tr = np.concatenate((X_test_szz_tr,X_test_szz_cat), axis=1)

X_train_eszz_tr = preprocessor.fit_transform(X_train_eszz, y_train_eszz)

X_train_eszz_tr = np.concatenate((X_train_eszz_tr,X_train_eszz_cat), axis=1)

X_test_eszz_tr = preprocessor.fit_transform(X_test_eszz, y_test_eszz)

X_test_eszz_tr = np.concatenate((X_test_eszz_tr,X_test_eszz_cat), axis=1)

In [326]:
def get_feature_importances(model):
    fetaure_importances = zip(processed_features, model.feature_importances_)
    fetaure_importances = sorted(fetaure_importances, key=lambda x: x[1], reverse=True)
    for feature_name, importance in fetaure_importances:
        print("%s: %f" % (feature_name, importance))

# SZZ

In [327]:
model_szz = RandomForestClassifier(n_estimators=200)
model_szz.fit(X_train_szz_tr, y_train_szz)
print("model score: %.3f" % model_szz.score(X_test_szz_tr, y_test_szz))

model score: 0.778


In [328]:
get_feature_importances(model_szz)

num_of_insertions: 0.141997
patch_size_mean: 0.095069
affects_versions: 0.094390
num_of_deletions: 0.075261
num_of_changed_files: 0.073641
components: 0.070215
solve_res_diff: 0.067461
solve_time: 0.061226
resolution_time: 0.060110
number_of_comments: 0.048913
patch_size_variance: 0.048135
patch_size_rel_variance: 0.044961
hour_of_commit: 0.044509
day_of_week: 0.029942
number_of_patches: 0.027305
filepath_contains_test: 0.016865


In [329]:
szz_predictions = model_szz.predict(X_test_szz_tr)

In [339]:
print("precision score: %.3f" % precision_score(y_test_szz, szz_predictions))
print("recall score: %.3f" % recall_score(y_test_szz, szz_predictions))

precision score: 0.726
recall score: 0.593


# E-SZZ

In [330]:
model_eszz = RandomForestClassifier(n_estimators=200)
model_eszz.fit(X_train_eszz_tr, y_train_eszz)
print("model score: %.3f" % model_eszz.score(X_test_szz_tr, y_test_eszz))

model score: 0.786


In [331]:
get_feature_importances(model_eszz)

num_of_insertions: 0.136673
affects_versions: 0.090028
patch_size_mean: 0.089885
solve_res_diff: 0.075027
components: 0.073613
num_of_deletions: 0.073431
num_of_changed_files: 0.073407
solve_time: 0.061638
resolution_time: 0.060735
number_of_comments: 0.050259
hour_of_commit: 0.048577
patch_size_variance: 0.047350
patch_size_rel_variance: 0.042885
day_of_week: 0.030720
number_of_patches: 0.029753
filepath_contains_test: 0.016018


In [332]:
eszz_predictions = model_eszz.predict(X_test_eszz_tr)

In [340]:
print("precision score: %.3f" % precision_score(y_test_eszz, eszz_predictions))
print("recall score: %.3f" % recall_score(y_test_eszz, eszz_predictions))

precision score: 0.726
recall score: 0.629


In [333]:
X_test_szz.shape

(2672, 17)

In [334]:
X_test_szz[(szz_predictions == 0) & (eszz_predictions == 1)]

Unnamed: 0,sha,num_of_insertions,num_of_deletions,num_of_changed_files,day_of_week,hour_of_commit,solve_time,resolution_time,solve_res_diff,number_of_comments,components,affects_versions,number_of_patches,patch_size_mean,patch_size_variance,patch_size_rel_variance,filepath_contains_test
8823,531652e681e57aafdcfd7929c603131eebf8890a,1,1,1,2,1,967883.0,967913.0,30.0,9,,2.1.0,1,958.000000,0.000000,0.000000,0
6239,11b0d85786cd58469d5662c3027e9389cff07710,29,29,10,2,23,1055705.0,1056242.0,537.0,16,Query Processor,4.0.0,2,9354.000000,220.000000,0.023519,0
2189,68125c64aec3650760ee1acbada7cf0e0cea5a7e,200,0,4,3,20,160120.0,160145.0,25.0,19,Types;UDF,0.12.0,2,8991.500000,3923.500000,0.436357,1
8515,789f11b98d74fb8225270977ea842012458f46e6,2,0,1,2,8,338521.0,338606.0,85.0,5,,2.1.0,1,886.000000,0.000000,0.000000,0
4035,c38940f354337a9e56b3502139b7fe63398a1f28,59,20,8,1,15,629360.0,629469.0,109.0,6,HiveServer2;Logging,1.1.0,1,20548.000000,0.000000,0.000000,1
2816,a8ef2147fad5aeaaf01279230da9c584db6a2337,1032,253,8,0,23,333741.0,333837.0,96.0,12,,4.0.0,4,67708.000000,307.995942,0.004549,1
5460,35278429d9677b0878a4523ed7b03a5016f81e1d,14,7,2,5,23,276557.0,278089.0,1532.0,6,Transactions,4.0.0,1,4934.000000,0.000000,0.000000,1
5522,bef6c9fd2e0fa6cafe3c3aa14d7521d46c5612c2,128,6,5,2,6,599788.0,599833.0,45.0,11,Transactions,4.0.0;3.2.0;3.1.2,3,13443.666667,271.928095,0.020227,1
12755,447a525aae5f485e86ba2b0249c8de7c0d992a34,1,1,1,1,4,82805.0,82824.0,19.0,4,Configuration;HBase Handler,0.8.0,1,451.000000,0.000000,0.000000,0
5653,8c6b5c66d8442517710c779cf7b3e9adbf7ce019,67,2,2,3,22,10255308.0,18449972.0,8194664.0,5,,4.0.0,1,6209.000000,0.000000,0.000000,1


In [335]:
X_test_szz[(szz_predictions == 1) & (eszz_predictions == 0)]

Unnamed: 0,sha,num_of_insertions,num_of_deletions,num_of_changed_files,day_of_week,hour_of_commit,solve_time,resolution_time,solve_res_diff,number_of_comments,components,affects_versions,number_of_patches,patch_size_mean,patch_size_variance,patch_size_rel_variance,filepath_contains_test
13282,370ead7395fc9194a3dc2d05afc4b3409f6b27da,266,263,15,2,22,189306.0,189384.0,78.0,3,Query Processor,0.3.0,1,5.484500e+04,0.000000e+00,0.000000,1
10327,bd4114d99a8f881b9ec8681e8453a3aefbf0fb27,1915,2,8,3,22,392493.0,433038.0,40545.0,15,Vectorization,1.0.0,2,8.964900e+04,9.360000e+02,0.010441,1
9038,df827b7d3feb7f109cab1ddd12b0cf134eb91283,84,52,3,1,15,4100645.0,4102266.0,1621.0,20,SQL,1.3.0;2.0.0,4,1.389000e+04,7.345563e+03,0.528838,1
450,b88a22e21c3c34daa7ff547be3afc225c631db98,222,31,4,3,18,145125198.0,145125213.0,15.0,19,UDF,1.2.0,5,8.863000e+03,4.499882e+03,0.507715,1
783,a9f25c0e7ad3f81a9f00f601947a161516e33f1b,756,44,22,5,0,1389153.0,1389213.0,60.0,18,,3.0.0,7,3.855371e+04,1.257339e+04,0.326127,1
1602,d42093438421537060113b5c7c58bc46b8490443,2544,1035,134,4,17,15186224.0,15186136.0,88.0,16,CLI,0.10.0;0.11.0,4,1.749545e+05,6.473764e+04,0.370026,1
1704,84ee6f13d8ed84fa2dc6b2ba49ada5c79bbaef5a,748,294,18,5,8,5494565.0,5494625.0,60.0,18,Query Processor,0.13.0,5,5.236220e+04,1.089542e+03,0.020808,0
2128,3228ba7c13ced90f8e845ea8f3ca1a46737ec1fe,6331,6342,99,2,3,2347321.0,2347404.0,83.0,19,Types,2.0.0,4,4.592248e+05,2.243007e+04,0.048843,1
8102,0a4b3d8ff673f6f6670293a7491873c229cb0f40,152,57,4,0,20,871337.0,871386.0,49.0,21,,2.3.0,5,1.706640e+04,4.410460e+03,0.258429,1
6692,3e930bfa3a870ea85e017b751a07e49dfed32f74,98,172,12,4,5,209794.0,209837.0,43.0,9,Transactions,3.0.0,3,3.589933e+04,1.579550e+03,0.043999,1
