In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn import svm
import ntpath
import itertools
import os
import warnings
import csv

In [2]:
warnings.filterwarnings('ignore')

In [3]:
# Candidate classifiers; the experiment cell below selects one of them by name.
# Every estimator that accepts a seed gets a fixed `random_state`, so that
# Restart Kernel -> Run All reproduces the same predictions and accuracies.
rf = RandomForestClassifier(n_estimators=100, random_state=0)
gnb = GaussianNB()  # Gaussian Naive Bayes
dt = tree.DecisionTreeClassifier(random_state=0)
neigh = KNeighborsClassifier(n_neighbors=3)
lr = LogisticRegression(random_state=0)
mlp = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)  # Multi-layer Perceptron
# Support Vector Machines (disabled). If this is re-enabled, keep the distinct
# name `svc`: assigning the estimator to `svm` would shadow the imported
# `sklearn.svm` module and break any later use of it.
#svc = svm.SVC()

In [4]:
# Collect the path of every project CSV found under ../data.
projects = []
for dirpath, dirnames, filenames in os.walk('../data'):
    # Prune hidden directories (e.g. Jupyter's .ipynb_checkpoints) in place so
    # os.walk does not descend into them and pick up stale copies of datasets.
    dirnames[:] = [d for d in dirnames if not d.startswith('.')]
    for filename in filenames:
        # Only CSV files are datasets; anything else would break pd.read_csv
        # or the column handling in the experiment cell.
        if filename.lower().endswith('.csv'):
            # Join with the directory actually being walked: a fixed
            # "../data/" prefix yields wrong paths for nested files.
            projects.append(os.path.join(dirpath, filename))

# os.walk makes no ordering guarantee; sort so every run visits the projects
# (and therefore writes the result rows) in the same order.
projects.sort()

In [5]:
# Echo the discovered dataset paths so the input list can be checked by eye.
print(projects)

['../data/activiti_BB.csv', '../data/agrona_US.csv', '../data/androidmaps_US.csv', '../data/antennapod_MB.csv', '../data/blaze_LB.csv', '../data/commons_UB.csv', '../data/daqeclipse_LB.csv', '../data/dependencycheck_US.csv', '../data/ebean_LB.csv', '../data/exoplayer_MB.csv', '../data/facebooksdk_BB.csv', '../data/flowable_LB.csv', '../data/googleauth_LS.csv', '../data/gsyvideoplayer_MS.csv', '../data/jabref_UB.csv', '../data/javacv_MS.csv', '../data/jbpm_BB.csv', '../data/jmock_LS.csv', '../data/k9_UB.csv', '../data/metadata_MS.csv', '../data/mindustry_MB.csv', '../data/modernmt_BS.csv', '../data/osmdroid_MB.csv', '../data/phonograph_MS.csv', '../data/pipeline_US.csv', '../data/qlexpress_BS.csv', '../data/rxjava_LB.csv', '../data/seedstack_BS.csv', '../data/sentinel_UB.csv', '../data/snowflake_US.csv', '../data/spark_BB.csv', '../data/springdock_LS.csv', '../data/stroom_BB.csv', '../data/sudachi_BS.csv', '../data/talon_UB.csv', '../data/tessera_BS.csv', '../data/thumbnailator_LS.csv',

In [6]:
# Start a fresh accuracy table: mode 'w' truncates any previous file and the
# header row is written once; the experiment cell appends one row per pairing.
# Create the output directory first so the notebook also runs on a fresh
# checkout where ../results does not exist yet.
os.makedirs('../results', exist_ok=True)

with open('../results/accuracy_results.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['Predictor', 'Target', 'Accuracy'])

In [7]:
# Cross-project prediction: train on one project, test on every other project,
# and record a classification report plus the accuracy for each ordered pair.

FEATURE_COUNT = 16  # columns 0..15 are the metrics (X); column 16 is the label (y)
UNUSED_COLUMNS = ["javafile", "classfile"]


def load_project(path):
    """Read one project CSV and split it into metrics and labels.

    Parameters
    ----------
    path : str
        Location of the project's CSV file.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        ``X`` holds the first ``FEATURE_COUNT`` columns and ``y`` the column
        right after them, both as float32, taken after the identifier
        columns listed in ``UNUSED_COLUMNS`` have been removed.

    Raises
    ------
    KeyError
        If one of the ``UNUSED_COLUMNS`` is missing from the file.
    """
    # `columns=` replaces the positional axis argument (`drop(name, 1)`),
    # which pandas deprecated in 1.3 and rejects with a TypeError since 2.0.
    frame = pd.read_csv(path, sep=',').drop(columns=UNUSED_COLUMNS)
    values = np.array(frame, dtype='float32')
    return values[:, :FEATURE_COUNT], values[:, FEATURE_COUNT]


# Parse every CSV exactly once instead of re-reading it for each pairing.
datasets = {path: load_project(path) for path in projects}

classifier = rf  # CHOOSE FROM THE LIST ABOVE

# Open both result files once (append mode) rather than once per pairing.
with open('../results/classification_reports.txt', 'a') as reports_file, \
        open('../results/accuracy_results.csv', 'a', newline='') as accuracy_file:
    writer = csv.writer(accuracy_file)

    for train_path, (X_train, y_train) in datasets.items():
        ## Get project name for the result rows
        train_file = ntpath.basename(train_path)

        ## The fitted model depends only on the training project, so fit it
        ## once here and reuse it for every target project.
        classifier.fit(X_train, y_train)

        for test_path, (X_test, y_test) in datasets.items():
            # A project is never evaluated on itself.
            if test_path == train_path:
                continue

            test_file = ntpath.basename(test_path)

            ## Predict on test set and save the performance results
            prediction = classifier.predict(X_test)
            report = metrics.classification_report(y_test, prediction)
            accuracy = metrics.accuracy_score(y_test, prediction)

            reports_file.write(
                f"Classification report for {classifier} trained on {train_file} and tested on {test_file}:\n\n"
                f"{report}\n")
            writer.writerow([train_file, test_file, accuracy])

print("All predictions were executed successfully!")

All predictions were executed successfully!
