In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
from algo_pre_processing import ProjectsData
import pandas as pd

from sklearn.ensemble import GradientBoostingClassifier
import numpy as np
from sklearn.model_selection import KFold
import warnings
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC


# Silence every Python warning for the rest of the session. This also hides
# deprecation and convergence warnings emitted by the libraries used below.
warnings.filterwarnings("ignore")

In the first section, you will load the projects' data. After that, you can proceed to run the statistical feature experiment and the embedding-based features experiment.

Since we used 2 batches of data, you will find `embedding_features1` and `embedding_features2` (or `statistical_features1` and `statistical_features2`) representing the two batches we used. However, if you are running an experiment with only one batch of data, you can choose to use only one of them. Simply assign the final dataset (`embedding_features` or `statistical_features`) as the dataset you created.

## Load projects' data

Change the following paths according to your implementation:
- ProjectsData Object creation:
    - `MCW_path`: the results of the label extraction part - the MCW run.
    - `projects_path`: the path of the arff files as defined in section 2 of label extraction, you can use either `arff_source` folder or `arff_dest` folder.
    - `features_path`: the path of the features you would like to use, for the statistical meta-features we used the output of `statistical meta features.py` and for the embedding based meta-features we used the output of the `embedding java files` folder instructions. 
    - `algo_matlab_results`: the results of the label extraction part - the MATLAB algorithms output.
    - `to_use`: the folder numbers you want to use, according to the number of groups defined in `random_groups.py`.
- `get_data` function: pass `label_method='mode'` to label by majority voting.

In [None]:
# Graph-embedding meta-features: one ProjectsData loader per batch, labelled with
# label_method='mode'. Both batches read the same graph_embedding.csv feature file.
projects_data = ProjectsData(
    MCW_path='data/MCW/batch1',
    projects_path='data/Projects_Arff_batch1',
    features_path='meta_model_features/embeddings/graph_embedding.csv',
    algo_matlab_results='data/Results_Rest_batch1',
    to_use=list(range(1, 51)),
)
embedding_features1 = projects_data.get_data(label_method='mode')
# The algorithm names and score-column names are taken from the batch-1 loader.
algo_list = projects_data.get_algo_list()
algo_scores = projects_data.get_algo_scores()

projects_data2 = ProjectsData(
    MCW_path='data/MCW/batch2',
    projects_path='data/Projects_Arff_batch2',
    features_path='meta_model_features/embeddings/graph_embedding.csv',
    algo_matlab_results='data/Results_Rest_batch2',
    to_use=list(range(1, 100)),
)
embedding_features2 = projects_data2.get_data(label_method='mode')

# Stack the two batches row-wise (the original row labels of each batch are kept).
embedding_features = pd.concat([embedding_features1, embedding_features2])

In [None]:
# Code (text) embedding meta-features: one ProjectsData loader per batch, labelled
# with label_method='max'. Each batch has its own base_buggy.csv feature file.
projects_data = ProjectsData(
    MCW_path='data/MCW/batch1',
    projects_path='data/Projects_Arff_batch1',
    features_path='meta_model_features/embeddings/first_batch/base_buggy.csv',
    algo_matlab_results='data/Results_Rest_batch1',
    to_use=list(range(1, 51)),
)
code_embedding_features1 = projects_data.get_data(label_method='max')
# The algorithm names and score-column names are taken from the batch-1 loader.
algo_list = projects_data.get_algo_list()
algo_scores = projects_data.get_algo_scores()

projects_data2 = ProjectsData(
    MCW_path='data/MCW/batch2',
    projects_path='data/Projects_Arff_batch2',
    features_path='meta_model_features/embeddings/second_batch/base_buggy.csv',
    algo_matlab_results='data/Results_Rest_batch2',
    to_use=list(range(1, 100)),
)
code_embedding_features2 = projects_data2.get_data(label_method='max')

# Stack the two batches row-wise (the original row labels of each batch are kept).
code_embedding_features = pd.concat([code_embedding_features1, code_embedding_features2])

In [None]:
# Statistical meta-features: one ProjectsData loader per batch, labelled with
# label_method='max'. Each batch has its own features CSV.
projects_data = ProjectsData(
    MCW_path='data/MCW/batch1',
    projects_path='data/Projects_Arff_batch1',
    features_path='meta_model_features/statistical_features/features_first_batch.csv',
    algo_matlab_results='data/Results_Rest_batch1',
    to_use=list(range(1, 51)),
)
statistical_features1 = projects_data.get_data(label_method='max')
# The algorithm names and score-column names are taken from the batch-1 loader.
algo_list = projects_data.get_algo_list()
algo_scores = projects_data.get_algo_scores()

projects_data2 = ProjectsData(
    MCW_path='data/MCW/batch2',
    projects_path='data/Projects_Arff_batch2',
    features_path='meta_model_features/statistical_features/features_second_batch.csv',
    algo_matlab_results='data/Results_Rest_batch2',
    to_use=list(range(1, 100)),
)
statistical_features2 = projects_data2.get_data(label_method='max')

# Stack the two batches row-wise (the original row labels of each batch are kept).
statistical_features = pd.concat([statistical_features1, statistical_features2])

In [None]:
# Keep only the statistical rows whose 'pv' also appears in the embedding table, so
# both experiments run on the same projects. The inner merge leaves a helper 'pv2'
# column on the result.
pv_embedding = embedding_features[['pv']].rename(columns={'pv': 'pv2'})
statistical_features = projects_data.get_mode_value(
    # NOTE(review): get_mode_value is defined in algo_pre_processing; presumably it
    # adds the mode-based label column - confirm against that module.
    statistical_features.merge(pv_embedding, left_on='pv', right_on='pv2')
)

In [None]:
from sklearn.tree import DecisionTreeClassifier

def results_processing(algo_list, joined, y_test, preds, score_columns=None):
    """Attach per-algorithm scores to the meta-model predictions of one test fold.

    Parameters
    ----------
    algo_list : list of str
        Names of the candidate algorithms. For every name ``a`` the columns
        ``a``, ``a_precision``, ``a_recall`` and ``a_accuracy`` are read.
    joined : pandas.DataFrame
        Frame holding ``best_algo`` and the score columns. It is joined to
        ``y_test`` on the index, so it must share ``y_test``'s row labels.
    y_test : pandas.Series
        True labels of the test fold.
    preds : array-like
        Meta-model predictions, in the same row order as ``y_test``.
    score_columns : list of str, optional
        Score columns to keep from ``joined``. Defaults to the notebook-level
        ``algo_scores`` list, which is what the function always used.

    Returns
    -------
    pandas.DataFrame
        One row per test sample with ``prediction``, ``best_algo``, the score
        columns, ``comb*`` (scores of the predicted algorithm), ``best`` (score
        of the labelled best algorithm) and ``<algo>_error`` = ``best - <algo>``
        for every algorithm and for ``comb``.
    """
    if score_columns is None:
        # Backward-compatible fallback to the list loaded together with the data.
        score_columns = algo_scores

    # Predictions next to the true label, then the scores joined on the index.
    # lsuffix keeps y_test's own column apart from joined's 'best_algo'.
    new_df = pd.DataFrame(y_test)
    new_df['prediction'] = preds
    with_res = new_df.join(joined, lsuffix="l")
    with_res = with_res[['prediction', 'best_algo'] + list(score_columns)].copy()

    for metric in ["", "_precision", "_recall", "_accuracy"]:
        # 'comb' = the score actually obtained when running the predicted algorithm.
        comb_col = 'comb' + metric
        with_res[comb_col] = None
        for algo in algo_list:
            chosen = with_res['prediction'] == algo
            # A single .loc call writes into with_res itself (chained
            # `df[col].loc[mask] = ...` may write to a temporary copy).
            # .to_numpy() makes the assignment positional, so duplicate row
            # labels cannot break index alignment.
            with_res.loc[chosen, comb_col] = with_res.loc[chosen, algo + metric].to_numpy()
        with_res[comb_col] = with_res[comb_col].astype(float)

    # 'best' = the score of the algorithm labelled as best for that row.
    with_res['best'] = None
    for algo in algo_list:
        is_best = with_res['best_algo'] == algo
        with_res.loc[is_best, 'best'] = with_res.loc[is_best, algo].to_numpy()
    with_res['best'] = with_res['best'].astype(float)

    # Gap between the best score and each algorithm (and the meta-model's 'comb').
    for algo in algo_list + ['comb']:
        with_res[algo + "_error"] = (with_res['best'] - with_res[algo]).astype(float)
    return with_res
# Seed shared by the estimators below and by SMOTE in the experiment cells.
random_st = 14

# Meta-classifier registry: key -> (unfitted estimator, GridSearchCV parameter grid).
# The key is selected through META_CLASSIFIER in the experiment sections.
grid_dictionary = {
    'RF': (RandomForestClassifier(random_state=random_st),
           {'n_estimators': [20, 50, 100, 150],
            'max_features': ['sqrt', 'log2'],
            'max_depth': [2, 3, 4],
            # "log_loss" as a tree criterion needs scikit-learn >= 1.1.
            'criterion': ['gini', 'entropy', "log_loss"]}),
    # The 'XGboost' key is kept for compatibility; the estimator is scikit-learn's
    # GradientBoostingClassifier, not the xgboost package.
    'XGboost': (GradientBoostingClassifier(random_state=random_st),
                # "log_loss" is the current name of the former "deviance" loss.
                # The old spelling was removed from scikit-learn and makes every
                # fit fail there. The rest of this file already requires a
                # release that accepts "log_loss".
                {"loss": ["log_loss"],
                 "learning_rate": [0.05, 0.1],
                 "min_samples_split": [0.28, 0.4],
                 "min_samples_leaf": [0.1, 0.2],
                 "max_depth": [2, 3, 4, 5, 6],
                 "max_features": ["sqrt"],
                 # "squared_error" is the current name of the former "mse" criterion.
                 "criterion": ["friedman_mse", "squared_error"],
                 "subsample": [0.5, 0.8],
                 "n_estimators": [50, 70, 100],
                 "warm_start": [True, False]}),
    'LR': (LogisticRegression(random_state=random_st),
           {'penalty': ['l1', 'l2'],
            'C': np.logspace(-3, 3, 7),
            # liblinear is used with both the l1 and l2 penalties listed above.
            'solver': ['liblinear']}),
    'SVM': (SVC(),
            {'C': [0.1, 1, 10, 100, 1000],
             'gamma': [1, 0.1, 0.01],
             'kernel': ['rbf']}),
    # NOTE(review): this estimator is seeded with 0 rather than random_st; kept as-is.
    'DT': (DecisionTreeClassifier(random_state=0),
           {'criterion': ['gini', 'entropy', "log_loss"],
            'max_depth': [2, 3, 4, 5, 6],
            'min_samples_split': [2, 3, 4],
            'min_impurity_decrease': [0.01, 0.1, 0.5]}),
}

## CALL-GRAPH Embedding based features experiment

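1. Select the meta-classifier to use by setting `META_CLASSIFIER` to one of the `grid_dictionary` keys: `RF` (Random Forest), `XGboost` (gradient boosting, implemented with scikit-learn's `GradientBoostingClassifier`), `LR` (Logistic Regression), `SVM` (Support Vector Machine) or `DT` (Decision Tree).
2. Select the number of folds for the cross-validation by setting `NUM_OF_FOLDS`.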

In [None]:
# Key into grid_dictionary: selects the estimator and the hyper-parameter grid
# that the experiment loop hands to GridSearchCV.
META_CLASSIFIER = 'XGboost'
# Intended number of outer cross-validation folds (see the KFold construction
# in the experiment cell below).
NUM_OF_FOLDS = 5

In [None]:
# Split 'pv' on "_" to obtain temporary project / version keys, keep only the
# project-versions listed in pv_above_1000.csv, then drop the temporary keys again.
pv_parts = embedding_features.pv.str.split("_", expand=True)
embedding_features[['project', 'version']] = pv_parts[[0, 1]]
pv_to_keep = pd.read_csv('call_graph_data_sources/pv_above_1000.csv')[['project', 'version']]
embedding_features = (
    embedding_features
    .merge(pv_to_keep, on=['project', 'version'])
    .drop(columns=['project', 'version'])
)
# Rows whose label is 'Dycom' or 'LT' are excluded from this experiment.
embedding_features = embedding_features[~embedding_features['best_algo'].isin(['Dycom', 'LT'])]

In [None]:
# Features: everything except the project id, the algorithm score columns and the
# two label columns; missing values become 0. Target: the 'best_algo' label.
X = embedding_features.drop(
    columns=['pv', *algo_scores, 'best_algo', 'mode_algo']
).fillna(0)
y = embedding_features['best_algo']

In [None]:
from sklearn.model_selection import StratifiedGroupKFold  # NOTE(review): imported here, but the split below uses plain KFold

# Call-graph embedding experiment.
# For every outer fold: oversample the training part with SMOTE, grid-search the
# selected meta-classifier on the oversampled data, predict the held-out part and
# collect the scores of the predicted ('comb') and of every individual algorithm.
acc_list_embd, f1_list = [], []
# Per-fold means, one list per reported column.
mean_results_embd = {'best':[],'MCW': [], 'MCW_precision': [], 'MCW_recall': [], 'MCW_accuracy': [], 'comb': [], 'comb_precision': [], 'comb_recall': [],
 'comb_accuracy': [], 'TPTL': [], 'TPTL_precision': [], 'TPTL_recall': [], 'TPTL_accuracy': [], 'TCA_rnd': [], 'TCA_rnd_precision': [],
 'TCA_rnd_recall': [], 'TCA_rnd_accuracy': [], 'LT': [], 'LT_precision': [], 'LT_recall': [], 'LT_accuracy': [], 'Dycom': [], 'Dycom_precision': [],
 'Dycom_recall': [], 'Dycom_accuracy': [], 'TDS': [], 'TDS_precision': [], 'TDS_recall': [], 'TDS_accuracy': []}

# Per-fold mean of (best - algo) for every algorithm and for 'comb'.
error_results = {'MCW': [], 'comb': [], 'TPTL': [], 'TCA_rnd': [], 'LT': [], 'Dycom': [], 'TDS': []}
# Outer CV follows the NUM_OF_FOLDS / random_st settings instead of repeating
# their values as literals (same split as before with the default 5 / 14).
ss = KFold(n_splits=NUM_OF_FOLDS, shuffle=True, random_state=random_st)

fold = 0
for train_index, test_index in ss.split(X, y):
    fold += 1
    x_train, y_train = X.iloc[train_index], y.iloc[train_index]
    print(y_train.value_counts())

    x_test, y_test = X.iloc[test_index], y.iloc[test_index]
    print(y_test.value_counts())

    # Oversample the training part only; the test part stays untouched.
    sm = SMOTE(random_state=random_st, k_neighbors=2)
    X_samp, y_samp = sm.fit_resample(x_train, y_train)
    print("Oversampling is finished....")
    # building the model
    rfc = grid_dictionary[META_CLASSIFIER][0]
    param_grid = grid_dictionary[META_CLASSIFIER][1]

    # NOTE(review): the inner 5-fold search runs on already-oversampled data, so
    # synthetic neighbours of a sample can land in its validation split.
    CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5)
    CV_rfc.fit(X_samp, y_samp)

    preds = CV_rfc.best_estimator_.predict(x_test)
    print(CV_rfc.best_estimator_)
    acc_list_embd.append(accuracy_score(y_test, preds))
    f1_list.append(f1_score(y_test, preds, average='micro'))

    ####### RESULTS ANALYSIS #######
    with_res = results_processing(algo_list, embedding_features, y_test, preds)

    for algo in algo_list + ['comb']:
        mean_results_embd[algo].append(np.mean(with_res[algo]))
        mean_results_embd[algo + "_precision"].append(np.mean(with_res[algo + "_precision"]))
        mean_results_embd[algo + "_recall"].append(np.mean(with_res[algo + "_recall"]))
        mean_results_embd[algo + "_accuracy"].append(np.mean(with_res[algo + "_accuracy"]))
        error_results[algo].append(np.mean(with_res[algo + "_error"]))
    mean_results_embd['best'].append(np.mean(with_res['best']))
    # One CSV per fold; a later cell concatenates them.
    with_res.to_csv(f"Results_meta_model/current_results/embedding_result_new_data_{fold}.csv")

print('random_state:', random_st)
print("MCW f1: ", np.mean(mean_results_embd['MCW']), ", MCW acc: ", np.mean(mean_results_embd['MCW_accuracy']),
      ", MCW precision: ", np.mean(mean_results_embd['MCW_precision']), ", MCW recall: ",
      np.mean(mean_results_embd['MCW_recall']))
print("comb f1: ", np.mean(mean_results_embd['comb']), ", comb acc: ", np.mean(mean_results_embd['comb_accuracy']),
      ", comb precision: ", np.mean(mean_results_embd['comb_precision']), ", comb recall: ",
      np.mean(mean_results_embd['comb_recall']))
print("best: ", np.mean(mean_results_embd['best']))
print("accuracy meta model list: " , acc_list_embd)

In [None]:
# Gather the per-fold result files of the call-graph experiment into one CSV.
# The number of files follows NUM_OF_FOLDS rather than a hard-coded 5.
res_unioned = [
    pd.read_csv(f'Results_meta_model/current_results/embedding_result_new_data_{i}.csv')
    for i in range(1, NUM_OF_FOLDS + 1)
]
res_pd = pd.concat(res_unioned)
res_pd.to_csv('exp_call_graph_best_new_data2.csv')

## Text Embedding based features experiment

In [None]:
# Keep only the code-embedding rows whose 'pv' also appears in the (already
# filtered) call-graph embedding table. The inner merge adds a helper 'pv2' column.
pv_embedding = embedding_features[['pv']].rename(columns={'pv': 'pv2'})
code_embedding_features = code_embedding_features.merge(
    pv_embedding,
    left_on='pv',
    right_on='pv2',
)


In [None]:
# Features: everything except the two project-id columns, the algorithm score
# columns and the label; missing values become 0. Target: the 'best_algo' label.
X = code_embedding_features.drop(
    columns=['pv', 'pv2', *algo_scores, 'best_algo']
).fillna(0)
y = code_embedding_features['best_algo']

In [None]:
# Label distribution over the distinct rows of the code-embedding table.
(
    code_embedding_features
    .drop_duplicates()
    ['best_algo']
    .value_counts()
)

In [None]:
# Text (code) embedding experiment.
# For every outer fold: oversample the training part with SMOTE, grid-search the
# selected meta-classifier on the oversampled data, predict the held-out part and
# collect the scores of the predicted ('comb') and of every individual algorithm.
acc_list_embd, f1_list = [], []
# Per-fold means, one list per reported column.
mean_results_embd = {'best':[],'MCW': [], 'MCW_precision': [], 'MCW_recall': [], 'MCW_accuracy': [], 'comb': [], 'comb_precision': [], 'comb_recall': [],
 'comb_accuracy': [], 'TPTL': [], 'TPTL_precision': [], 'TPTL_recall': [], 'TPTL_accuracy': [], 'TCA_rnd': [], 'TCA_rnd_precision': [],
 'TCA_rnd_recall': [], 'TCA_rnd_accuracy': [], 'LT': [], 'LT_precision': [], 'LT_recall': [], 'LT_accuracy': [], 'Dycom': [], 'Dycom_precision': [],
 'Dycom_recall': [], 'Dycom_accuracy': [], 'TDS': [], 'TDS_precision': [], 'TDS_recall': [], 'TDS_accuracy': []}

# Per-fold mean of (best - algo) for every algorithm and for 'comb'.
error_results = {'MCW': [], 'comb': [], 'TPTL': [], 'TCA_rnd': [], 'LT': [], 'Dycom': [], 'TDS': []}
# Outer CV follows the NUM_OF_FOLDS / random_st settings instead of repeating
# their values as literals (same split as before with the default 5 / 14).
ss = KFold(n_splits=NUM_OF_FOLDS, shuffle=True, random_state=random_st)

fold = 0
for train_index, test_index in ss.split(X, y):
    fold += 1
    x_train, y_train = X.iloc[train_index], y.iloc[train_index]
    x_test, y_test = X.iloc[test_index], y.iloc[test_index]

    # Oversample the training part only; the test part stays untouched.
    sm = SMOTE(random_state=random_st, k_neighbors=2)
    X_samp, y_samp = sm.fit_resample(x_train, y_train)
    print("Oversampling is finished....")
    # building the model
    rfc = grid_dictionary[META_CLASSIFIER][0]
    param_grid = grid_dictionary[META_CLASSIFIER][1]

    # NOTE(review): the inner 5-fold search runs on already-oversampled data, so
    # synthetic neighbours of a sample can land in its validation split.
    CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5)
    CV_rfc.fit(X_samp, y_samp)

    preds = CV_rfc.best_estimator_.predict(x_test)
    print(CV_rfc.best_estimator_)
    acc_list_embd.append(accuracy_score(y_test, preds))
    f1_list.append(f1_score(y_test, preds, average='micro'))

    ####### RESULTS ANALYSIS #######
    # X and y come from code_embedding_features, so the scores must be joined from
    # that same frame. results_processing joins on the row index, and the index of
    # embedding_features refers to different rows.
    with_res = results_processing(algo_list, code_embedding_features, y_test, preds)

    for algo in algo_list + ['comb']:
        mean_results_embd[algo].append(np.mean(with_res[algo]))
        mean_results_embd[algo + "_precision"].append(np.mean(with_res[algo + "_precision"]))
        mean_results_embd[algo + "_recall"].append(np.mean(with_res[algo + "_recall"]))
        mean_results_embd[algo + "_accuracy"].append(np.mean(with_res[algo + "_accuracy"]))
        error_results[algo].append(np.mean(with_res[algo + "_error"]))
    mean_results_embd['best'].append(np.mean(with_res['best']))
    # One CSV per fold; a later cell concatenates them.
    with_res.to_csv(f"Results_meta_model/current_results/text_embedding_result_new_data_{fold}2.csv")

print('random_state:', random_st)
print("MCW f1: ", np.mean(mean_results_embd['MCW']), ", MCW acc: ", np.mean(mean_results_embd['MCW_accuracy']),
      ", MCW precision: ", np.mean(mean_results_embd['MCW_precision']), ", MCW recall: ",
      np.mean(mean_results_embd['MCW_recall']))
print("comb f1: ", np.mean(mean_results_embd['comb']), ", comb acc: ", np.mean(mean_results_embd['comb_accuracy']),
      ", comb precision: ", np.mean(mean_results_embd['comb_precision']), ", comb recall: ",
      np.mean(mean_results_embd['comb_recall']))
print("best: ", np.mean(mean_results_embd['best']))
print("accuracy meta model list: " , acc_list_embd)

In [None]:
# Gather the per-fold result files of the text-embedding experiment into one CSV.
# The number of files follows NUM_OF_FOLDS rather than a hard-coded 5.
res_unioned = [
    pd.read_csv(f'Results_meta_model/current_results/text_embedding_result_new_data_{i}2.csv')
    for i in range(1, NUM_OF_FOLDS + 1)
]
res_pd = pd.concat(res_unioned)
res_pd.to_csv('exp_text_embedding_best_new_data2.csv')

## Statistical features experiment

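1. Select the meta-classifier to use by setting `META_CLASSIFIER` to one of the `grid_dictionary` keys: `RF` (Random Forest), `XGboost` (gradient boosting, implemented with scikit-learn's `GradientBoostingClassifier`), `LR` (Logistic Regression), `SVM` (Support Vector Machine) or `DT` (Decision Tree).
2. Select the number of folds for the cross-validation by setting `NUM_OF_FOLDS`.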

In [None]:
# Key into grid_dictionary: selects the estimator and the hyper-parameter grid
# that the experiment loop hands to GridSearchCV.
META_CLASSIFIER = 'XGboost'
# Number of outer cross-validation folds; passed to KFold(n_splits=...) in the
# statistical experiment cell below.
NUM_OF_FOLDS = 5

In [None]:
# Restrict the statistical table to the projects that survived the embedding
# filtering, then drop exact duplicate rows. statistical_features already carries
# a 'pv2' column from the earlier alignment cell, so this merge yields the
# suffixed 'pv2_x' / 'pv2_y' columns that the next cell drops.
pv_embedding = embedding_features[['pv']].rename(columns={'pv': 'pv2'})
statistical_features = (
    statistical_features
    .merge(pv_embedding, left_on='pv', right_on='pv2')
    .drop_duplicates()
)

In [None]:
# Features: everything except the project id, the algorithm score columns, the
# label columns, the 'Unnamed: 0_*' statistics and the two merge helper columns;
# missing values become 0. Target: the 'best_algo' label.
X = statistical_features.drop(
    columns=[
        'pv',
        *algo_scores,
        'best_algo',
        'Unnamed: 0_std',
        'Unnamed: 0_avg',
        'Unnamed: 0_min',
        'Unnamed: 0_skew',
        'mode_algo',
        'pv2_x',
        'pv2_y',
    ]
).fillna(0)
y = statistical_features['best_algo']

In [None]:
# Statistical-features experiment.
# For every outer fold: oversample the training part with SMOTE, grid-search the
# selected meta-classifier on the oversampled data, predict the held-out part and
# collect the scores of the predicted ('comb') and of every individual algorithm.
acc_list_stat, f1_list = [], []
# Per-fold means, one list per reported column.
mean_results_stat = {'best':[],'MCW': [], 'MCW_precision': [], 'MCW_recall': [], 'MCW_accuracy': [], 'comb': [], 'comb_precision': [], 'comb_recall': [],
 'comb_accuracy': [], 'TPTL': [], 'TPTL_precision': [], 'TPTL_recall': [], 'TPTL_accuracy': [], 'TCA_rnd': [], 'TCA_rnd_precision': [],
 'TCA_rnd_recall': [], 'TCA_rnd_accuracy': [], 'LT': [], 'LT_precision': [], 'LT_recall': [], 'LT_accuracy': [], 'Dycom': [], 'Dycom_precision': [],
 'Dycom_recall': [], 'Dycom_accuracy': [], 'TDS': [], 'TDS_precision': [], 'TDS_recall': [], 'TDS_accuracy': []}

# Per-fold mean of (best - algo) for every algorithm and for 'comb'.
error_results = {'MCW': [], 'comb': [], 'TPTL': [], 'TCA_rnd': [], 'LT': [], 'Dycom': [], 'TDS': []}
# Use the shared seed instead of repeating its value as a literal.
ss = KFold(n_splits=NUM_OF_FOLDS, shuffle=True, random_state=random_st)

fold = 0
for train_index, test_index in ss.split(X, y):
    fold += 1
    x_train, y_train = X.iloc[train_index], y.iloc[train_index]
    x_test, y_test = X.iloc[test_index], y.iloc[test_index]
    # Oversample the training part only; the test part stays untouched.
    sm = SMOTE(random_state=random_st, k_neighbors=1)
    X_samp, y_samp = sm.fit_resample(x_train, y_train)
    print("Oversampling is finished....")
    # building the model
    rfc = grid_dictionary[META_CLASSIFIER][0]
    param_grid = grid_dictionary[META_CLASSIFIER][1]

    # NOTE(review): the inner 5-fold search runs on already-oversampled data, so
    # synthetic neighbours of a sample can land in its validation split.
    CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5)
    CV_rfc.fit(X_samp, y_samp)

    preds = CV_rfc.best_estimator_.predict(x_test)
    print(CV_rfc.best_estimator_)
    acc_list_stat.append(accuracy_score(y_test, preds))
    f1_list.append(f1_score(y_test, preds, average='micro'))

    ####### RESULTS ANALYSIS #######
    with_res = results_processing(algo_list, statistical_features, y_test, preds)

    # Iterate over the algorithms results_processing produced columns for, as the
    # other experiment loops do, instead of a separately hard-coded name list.
    for algo in algo_list + ['comb']:
        mean_results_stat[algo].append(np.mean(with_res[algo]))
        mean_results_stat[algo + "_precision"].append(np.mean(with_res[algo + "_precision"]))
        mean_results_stat[algo + "_recall"].append(np.mean(with_res[algo + "_recall"]))
        mean_results_stat[algo + "_accuracy"].append(np.mean(with_res[algo + "_accuracy"]))
        error_results[algo].append(np.mean(with_res[algo + "_error"]))
    mean_results_stat['best'].append(np.mean(with_res['best']))
    # One CSV per fold; a later cell concatenates them.
    with_res.to_csv(f"Results_meta_model/current_results/statistical_result_new_data_{fold}.csv")

print('random_state:', random_st)
print("MCW f1: ", np.mean(mean_results_stat['MCW']), ", MCW acc: ", np.mean(mean_results_stat['MCW_accuracy']),
      ", MCW precision: ", np.mean(mean_results_stat['MCW_precision']), ", MCW recall: ",
      np.mean(mean_results_stat['MCW_recall']))
print("comb f1: ", np.mean(mean_results_stat['comb']), ", comb acc: ", np.mean(mean_results_stat['comb_accuracy']),
      ", comb precision: ", np.mean(mean_results_stat['comb_precision']), ", comb recall: ",
      np.mean(mean_results_stat['comb_recall']))
print("best: ", np.mean(mean_results_stat['best']))
print("accuracy meta model list: " , acc_list_stat)

In [None]:
# Gather the per-fold result files of the statistical experiment into one CSV.
# The experiment writes NUM_OF_FOLDS files, so read exactly that many instead of
# a hard-coded five.
res_unioned = [
    pd.read_csv(f'Results_meta_model/current_results/statistical_result_new_data_{i}.csv')
    for i in range(1, NUM_OF_FOLDS + 1)
]
res_pd = pd.concat(res_unioned)
res_pd.to_csv('exp_statistical_best_new_data2.csv')