In [168]:
import pandas as pd
import numpy as np
import functools

In [2]:
# Load the dataset; the path is resolved relative to the notebook's working directory.
SZZ_CSV = "szz_data.csv"
df = pd.read_csv(SZZ_CSV)

In [6]:
# Show the column names currently held by `df`.
# NOTE(review): the recorded listing has no 'summary', 'description' or 'comments'
# entries and this cell's execution count is higher than the drop cell's, so it was
# presumably captured after those columns were removed — confirm on a fresh run.
df.columns

Index(['sha', 'num_of_insertions', 'num_of_deletions', 'num_of_changed_files',
       'day_of_week', 'hour_of_commit', 'solve_time', 'resolution_time',
       'solve_res_diff', 'number_of_comments', 'components',
       'affects_versions', 'number_of_patches', 'patch_size_mean',
       'patch_size_variance', 'patch_size_rel_variance',
       'filepath_contains_test', 'label'],
      dtype='object')

In [4]:
# Remove the free-text columns. Rebinding instead of `inplace=True`, together with
# errors='ignore', keeps this cell safe to re-run: a second execution no longer
# raises KeyError because the columns are already gone.
df = df.drop(columns=['summary', 'description', 'comments'], errors='ignore')

In [252]:
def func(groupby, x):
    """Encode one row's categories from per-category target statistics.

    Parameters
    ----------
    groupby : pandas.Series
        Series with a two-level index ``(category, statistic)`` that holds the
        ``"count"`` and ``"mean"`` of the target for every category.
    x : list or any
        The categories of one row. Anything that is not a list (for example a
        missing value) is treated as "no categories".

    Returns
    -------
    float
        For a list: the mean of the listed categories' target means, weighted by
        each category's count. Otherwise: the unweighted mean of all category means.
    """
    if not isinstance(x, list):
        # No categories on this row: fall back to the average over all categories.
        return groupby[:, "mean"].mean()

    stats = groupby[x]
    counts = stats[:, "count"]
    return (counts * stats[:, "mean"]).sum() / counts.sum()


def multi_value_target_encoding(df, by_labels, on_label, separator):
    """Target-encode columns whose cells hold several separator-joined categories.

    For every column in ``by_labels`` each cell is split on ``separator``, the
    count and mean of ``on_label`` are computed per individual category, and each
    row is replaced by the count-weighted mean of its categories' target means
    (see ``func``). ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns in ``by_labels`` and the ``on_label`` column.
    by_labels : list of str
        Names of the multi-valued string columns to encode.
    on_label : str
        Name of the target column the statistics are computed on.
    separator : str
        Delimiter between categories inside a cell.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(df), len(by_labels))``, one encoded column per entry
        of ``by_labels`` in the same order.
    """
    encoded_columns = []
    for by in by_labels:
        # One list of categories per row; missing cells stay missing.
        tokens = df[by].str.split(separator)

        # Long format: one (category, target) row per category occurrence.
        exploded = df[[on_label]].assign(**{by: tokens}).explode(by)

        # Index the target by `on_label` rather than a hard-coded `.label`
        # attribute so that any target column name works.
        groupby_count_mean = (
            exploded
            .groupby(by)[on_label]
            .apply(lambda labels: labels.agg(['count', 'mean']))
        )

        encode_row = functools.partial(func, groupby_count_mean)
        encoded_columns.append(tokens.map(encode_row).to_numpy())
    return np.vstack(encoded_columns).T
        

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [250]:
# Columns passed to the numeric branch of the preprocessor.
numeric_features = [
    'num_of_insertions',
    'num_of_deletions',
    'num_of_changed_files',
    'day_of_week',
    'hour_of_commit',
    'solve_time',
    'resolution_time',
    'solve_res_diff',
    'number_of_comments',
    'number_of_patches',
    'patch_size_mean',
    'patch_size_variance',
    'patch_size_rel_variance',
    'filepath_contains_test',
]

# Multi-valued string columns; these are target-encoded separately from the preprocessor.
# categorical encoding: http://contrib.scikit-learn.org/categorical-encoding/
categorical_features = ['components', 'affects_versions']

# Numeric branch: fill missing values with the column median (no further steps).
median_imputer = SimpleImputer(strategy='median')
numeric_transformer = Pipeline(steps=[('imputer', median_imputer)])

# Only the numeric columns go through the transformer; every other column is dropped.
preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer, numeric_features)],
    remainder='drop',
)

# Feature names in the column order of the final design matrix: numeric first, then categorical.
processed_features = [*numeric_features, *categorical_features]

In [253]:
# Target vector.
y = df['label']

# Numeric part of the design matrix: the preprocessor is fitted on every column except the target.
X_numeric = preprocessor.fit_transform(df.drop(columns='label'), y)

# Categorical part: one target-encoded column per multi-valued feature.
# NOTE(review): the encoding is computed from the labels of all rows in `df`, so any
# split made afterwards evaluates on rows whose labels already influenced the
# features — the held-out score may be optimistic.
X_categorical = multi_value_target_encoding(df, categorical_features, 'label', ';')

# Final design matrix: numeric columns first, then the encoded categorical columns.
X = np.concatenate((X_numeric, X_categorical), axis=1)

In [256]:
# Fixed seed shared by the split and the forest so that re-running the cell
# reproduces the same partition, the same model and the same printed score.
RANDOM_STATE = 42

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE)

model = RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE)
model.fit(X_train, y_train)
print("model score: %.3f" % model.score(X_test, y_test))

model score: 0.780


In [257]:
def get_feature_importances(model, feature_names=None):
    """Print ``name: importance`` for every feature, most important first.

    Parameters
    ----------
    model : object
        Fitted estimator exposing a ``feature_importances_`` sequence.
    feature_names : sequence of str, optional
        Names of the columns the model was trained on, in column order.
        Defaults to the notebook-level ``processed_features`` list.

    Raises
    ------
    ValueError
        If the number of names differs from the number of importances. Pairing
        them with ``zip`` would otherwise silently truncate and mislabel features.
    """
    if feature_names is None:
        feature_names = processed_features
    feature_names = list(feature_names)
    importances = model.feature_importances_

    if len(feature_names) != len(importances):
        raise ValueError(
            "got %d feature names for %d importances"
            % (len(feature_names), len(importances))
        )

    ranked = sorted(zip(feature_names, importances),
                    key=lambda pair: pair[1], reverse=True)
    for feature_name, importance in ranked:
        print("%s: %f" % (feature_name, importance))

In [258]:
# Print the fitted forest's feature importances, sorted from largest to smallest.
get_feature_importances(model)

num_of_insertions: 0.143263
affects_versions: 0.095665
patch_size_mean: 0.089580
num_of_changed_files: 0.079608
num_of_deletions: 0.072670
components: 0.068645
solve_res_diff: 0.067557
solve_time: 0.061444
resolution_time: 0.059784
number_of_comments: 0.049103
patch_size_variance: 0.046281
patch_size_rel_variance: 0.045633
hour_of_commit: 0.044846
number_of_patches: 0.030357
day_of_week: 0.029128
filepath_contains_test: 0.016436
