### Import required modules

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn import svm
import ntpath
import itertools
import os
import warnings
import csv

In [2]:
warnings.filterwarnings('ignore') 

### List of classifiers to choose from

In [3]:
# Candidate classifiers; any one of them can be passed to run_predictions().
# Estimators that draw random numbers while fitting get a fixed random_state
# so that repeated runs produce the same fitted models.
rf = RandomForestClassifier(n_estimators=100, random_state=0)
gnb = GaussianNB() # Gaussian Naive Bayes
dt = tree.DecisionTreeClassifier(random_state=0)
neigh = KNeighborsClassifier(n_neighbors=3)
lr = LogisticRegression(random_state=0)
mlp = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1) # Multi-layer Perceptron
# Disabled on purpose. If enabled, keep the name `svc`: assigning the instance
# to `svm` would replace the imported sklearn `svm` module.
#svc = svm.SVC() # Support Vector Machines

### Reusable predictions function

In [9]:
def _load_metrics_and_labels(csv_path, n_features, unused_columns):
    """Read one CSV table and split it into a feature matrix and a label vector."""
    table = pd.read_csv(csv_path, sep=',')

    # Use the `columns=` keyword: passing the axis positionally
    # (`drop("javafile", 1)`) is rejected by recent pandas releases.
    table = table.drop(columns=list(unused_columns))

    values = table.to_numpy(dtype='float32')

    # Metrics (X) are the first `n_features` columns, the label (y) is the next one.
    return values[:, :n_features], values[:, n_features]


def run_predictions(train_file, test_file, classifier, n_features=16,
                    unused_columns=("javafile", "classfile")):
    """Fit ``classifier`` on one CSV table and evaluate it on another.

    Both files are read with pandas, the columns listed in
    ``unused_columns`` are dropped, and the remaining columns are converted
    to a float32 array. Columns ``0 .. n_features - 1`` of that array are
    used as features and column ``n_features`` as the label.

    Parameters
    ----------
    train_file, test_file : str
        Paths of the comma-separated training and test tables.
    classifier : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fitted
        in place, so a previously fitted instance is refitted.
    n_features : int, default 16
        Number of leading feature columns left after dropping
        ``unused_columns``.
    unused_columns : iterable of str, default ("javafile", "classfile")
        Column names removed from both tables before conversion. A missing
        name raises ``KeyError`` (pandas default).

    Returns
    -------
    report
        Result of ``metrics.classification_report(y_test, prediction)``.
    accuracy
        Result of ``metrics.accuracy_score(y_test, prediction)``.
    classifier
        The same object that was passed in, now fitted on the training table.
    """
    X_train, y_train = _load_metrics_and_labels(train_file, n_features, unused_columns)
    X_test, y_test = _load_metrics_and_labels(test_file, n_features, unused_columns)

    ## Fit on the training table
    classifier.fit(X_train, y_train)

    ## Predict on the test table and compute the performance results
    prediction = classifier.predict(X_test)
    report = metrics.classification_report(y_test, prediction)
    accuracy = metrics.accuracy_score(y_test, prediction)

    return report, accuracy, classifier

# Prediction using projects - related to RQ1

In [18]:
# Collect the per-project CSV tables that sit directly inside PROJECTS_DIR.
# Only the first os.walk() entry (the folder itself) is used: later entries
# describe sub-directories, whose files are not located at
# PROJECTS_DIR + "/" + name. A missing folder yields an empty list.
PROJECTS_DIR = '../data/projects_tables'
top_level_files = next(os.walk(PROJECTS_DIR), (PROJECTS_DIR, [], []))[2]

# Keep CSV tables only and sort them: the listing order returned by the
# operating system is not guaranteed.
projects = sorted(
    PROJECTS_DIR + "/" + name
    for name in top_level_files
    if name.lower().endswith('.csv')
)

In [19]:
# Sanity check: show the project tables that will take part in the runs.
print(str(projects))

['../data/projects_tables/activiti_BB.csv', '../data/projects_tables/agrona_US.csv', '../data/projects_tables/androidmaps_US.csv', '../data/projects_tables/antennapod_MB.csv', '../data/projects_tables/blaze_LB.csv', '../data/projects_tables/commons_UB.csv', '../data/projects_tables/daqeclipse_LB.csv', '../data/projects_tables/dependencycheck_US.csv', '../data/projects_tables/ebean_LB.csv', '../data/projects_tables/exoplayer_MB.csv', '../data/projects_tables/facebooksdk_BB.csv', '../data/projects_tables/flowable_LB.csv', '../data/projects_tables/googleauth_LS.csv', '../data/projects_tables/gsyvideoplayer_MS.csv', '../data/projects_tables/jabref_UB.csv', '../data/projects_tables/javacv_MS.csv', '../data/projects_tables/jbpm_BB.csv', '../data/projects_tables/jmock_LS.csv', '../data/projects_tables/k9_UB.csv', '../data/projects_tables/metadata_MS.csv', '../data/projects_tables/mindustry_MB.csv', '../data/projects_tables/modernmt_BS.csv', '../data/projects_tables/osmdroid_MB.csv', '../data/

In [20]:
# Start a fresh accuracy table for the project-vs-project runs: mode 'w'
# truncates any previous file and only the header row is written here.
# The output folder is created first because open() does not create
# missing parent directories.
os.makedirs('../results', exist_ok=True)
with open('../results/projects_accuracy_results.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['Predictor', 'Target', 'Accuracy'])

In [22]:
# RQ1: train on each project and test on every other project.
#
# Both result files are opened once in 'w' mode, so re-running this cell
# replaces earlier results instead of appending duplicates to them.
os.makedirs('../results', exist_ok=True)
with open('../results/projects_classification_reports.txt', 'w') as reports_file, \
        open('../results/projects_accuracy_results.csv', 'w', newline='') as accuracy_file:
    writer = csv.writer(accuracy_file)
    writer.writerow(['Predictor', 'Target', 'Accuracy'])

    # Ordered pairs of distinct projects, in the same order as two nested loops.
    for tr, tst in itertools.permutations(projects, 2):
        report, accuracy, classifier = run_predictions(tr, tst, rf)

        reports_file.write(f"Classification report for {classifier} trained on {ntpath.basename(tr)} and tested on {ntpath.basename(tst)}:\n\n"
                           f"{report}\n")
        writer.writerow([ntpath.basename(tr), ntpath.basename(tst), accuracy])

        # Push each result to disk right away so partial results survive an
        # interrupted run and progress can be followed from outside.
        reports_file.flush()
        accuracy_file.flush()

print("All predictions for projects were executed successfully!")

All predictions for projects were executed successfully!


# Prediction using combinations of projects (diversity sets) - related to RQ2

In [12]:
# Collect the combination (diversity set) CSV tables that sit directly inside
# COMBINATIONS_DIR. Only the first os.walk() entry (the folder itself) is
# used: later entries describe sub-directories, whose files are not located
# at COMBINATIONS_DIR + "/" + name. A missing folder yields an empty list.
COMBINATIONS_DIR = '../data/combinations_tables'
combination_files = next(os.walk(COMBINATIONS_DIR), (COMBINATIONS_DIR, [], []))[2]

# Keep CSV tables only and sort them: the listing order returned by the
# operating system is not guaranteed.
diversity_sets = sorted(
    COMBINATIONS_DIR + "/" + name
    for name in combination_files
    if name.lower().endswith('.csv')
)

In [13]:
# Sanity check: show the combination tables that will be used for training.
print(str(diversity_sets))

['../data/combinations_tables/diverse1.csv', '../data/combinations_tables/diverse2.csv', '../data/combinations_tables/diverse3.csv', '../data/combinations_tables/diverse4.csv', '../data/combinations_tables/diverse5.csv', '../data/combinations_tables/diverse6.csv', '../data/combinations_tables/diverse7.csv', '../data/combinations_tables/diverse8.csv', '../data/combinations_tables/nondiverse_BB.csv', '../data/combinations_tables/nondiverse_BL.csv', '../data/combinations_tables/nondiverse_BM.csv', '../data/combinations_tables/nondiverse_BU.csv', '../data/combinations_tables/nondiverse_SB.csv', '../data/combinations_tables/nondiverse_SL.csv', '../data/combinations_tables/nondiverse_SM.csv', '../data/combinations_tables/nondiverse_SU.csv']


In [14]:
# Project tables used as test targets for the models trained on the
# combination sets.
testing_projects = [
    '../data/projects_tables/' + table_name
    for table_name in (
        'facebooksdk_BB.csv',
        'tessera_BS.csv',
        'zxing_MB.csv',
        'javacv_MS.csv',
        'flowable_LB.csv',
        'googleauth_LS.csv',
        'k9_UB.csv',
        'pipeline_US.csv',
    )
]

In [8]:
# Start a fresh accuracy table for the combination runs: mode 'w' truncates
# any previous file and only the header row is written here.
# The output folder is created first because open() does not create
# missing parent directories.
os.makedirs('../results', exist_ok=True)
with open('../results/combinations_accuracy_results.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['Predictor', 'Target', 'Accuracy'])

In [16]:
# RQ2: train on each combination table and test on every selected project.
#
# Both result files are opened once in 'w' mode, so re-running this cell
# replaces earlier results instead of appending duplicates to them.
os.makedirs('../results', exist_ok=True)
with open('../results/combinations_classification_reports.txt', 'w') as reports_file, \
        open('../results/combinations_accuracy_results.csv', 'w', newline='') as accuracy_file:
    writer = csv.writer(accuracy_file)
    writer.writerow(['Predictor', 'Target', 'Accuracy'])

    for tr in diversity_sets:
        for tst in testing_projects:
            report, accuracy, classifier = run_predictions(tr, tst, rf)

            reports_file.write(f"Classification report for {classifier} trained on {ntpath.basename(tr)} and tested on {ntpath.basename(tst)}:\n\n"
                               f"{report}\n")
            writer.writerow([ntpath.basename(tr), ntpath.basename(tst), accuracy])

            # Push each result to disk right away so partial results survive
            # an interrupted run and progress can be followed from outside.
            reports_file.flush()
            accuracy_file.flush()

print("All predictions of diversity sets were executed successfully!")

All predictions of diversity sets were executed successfully!
