In [19]:
# Name of the project to analyse. It is used as the sub-directory of "input/"
# from which the class-metrics dataset is read. To switch projects, comment out
# the active assignment and uncomment one of the alternatives.
#project_name = "."
project_name = "jfreechart"
#project_name = "argouml"
#project_name = "kafka"

In [20]:
import pandas as pd

def load_data(file_path):
    """Read a tab-separated file into a DataFrame.

    Parameters
    ----------
    file_path
        Location of the tab-delimited dataset; forwarded unchanged to
        ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    return pd.read_csv(file_path, sep='\t')

def remove_empty_rows(data):
    """Discard every row that has a missing value in the ``Name`` column.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing a ``Name`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the rows whose ``Name`` is present.
    """
    required_columns = ['Name']
    return data.dropna(subset=required_columns)

def remove_unnamed_columns(data):
    """Keep only the columns whose label does not begin with ``Unnamed``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to filter.

    Returns
    -------
    pandas.DataFrame
        A frame restricted to the columns that pass the filter.
    """
    # Negative look-ahead anchored at the start of the label: a label matches
    # only when it is NOT prefixed by "Unnamed".
    keep_pattern = '^(?!Unnamed.*)'
    return data.filter(regex=keep_pattern)

def remove_nan_columns(data):
    """Drop the columns that contain no values at all.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to clean.

    Returns
    -------
    pandas.DataFrame
        A new frame without the columns in which every entry is missing;
        columns that are only partially missing are kept.
    """
    return data.dropna(axis='columns', how='all')

def remove_package_rows(data):
    """Remove the rows whose ``QualifiedName`` contains the ``<Package>`` marker.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing a string-valued ``QualifiedName`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame without the package rows. Rows whose ``QualifiedName``
        is missing are kept, since they do not carry the marker.
    """
    # na=False yields a pure boolean mask, so the negation below cannot fail on
    # missing names; regex=False because the marker is a literal, not a pattern.
    is_package = data['QualifiedName'].str.contains("<Package>", regex=False, na=False)
    return data[~is_package]

def convert_ordinal_to_numerical(data):
    """Replace the ordinal rating labels with their numeric rank.

    The columns ``Complexity``, ``Coupling``, ``Size`` and
    ``Lack of Cohesion`` are converted using the scale
    ``low`` = 1, ``low-medium`` = 2, ``medium-high`` = 3, ``high`` = 4,
    ``very-high`` = 5. Values outside the scale are left as they are.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing the four rating columns.

    Returns
    -------
    pandas.DataFrame
        A converted copy; the frame passed in is not modified.

    Raises
    ------
    KeyError
        If one of the rating columns is missing.
    """
    ordinal_mapping = {
        'low': 1,
        'low-medium': 2,
        'medium-high': 3,
        'high': 4,
        'very-high': 5
    }
    columns_to_convert = ['Complexity', 'Coupling', 'Size', 'Lack of Cohesion']

    # Work on a copy: assigning into the argument would silently alter the
    # caller's frame and triggers SettingWithCopyWarning when that frame is a
    # filtered slice of another one.
    converted = data.copy()
    for column in columns_to_convert:
        # infer_objects() makes the object -> numeric conversion explicit
        # instead of relying on replace() downcasting the result implicitly.
        converted[column] = converted[column].replace(ordinal_mapping).infer_objects()
    return converted

# Locate the metrics file of the selected project and run every cleaning step,
# in order, as a single pipeline.
dataset_file = f"input/{project_name}/class-metrics-dataset.csv"
dataset = (
    load_data(dataset_file)
    .pipe(remove_empty_rows)
    .pipe(remove_unnamed_columns)
    .pipe(remove_nan_columns)
    .pipe(remove_package_rows)
    .pipe(convert_ordinal_to_numerical)
)

print(dataset.head())

                         QualifiedName                 Name  Complexity   
2           org.jfree.chart.ChartColor           ChartColor           2  \
3         org.jfree.chart.ChartElement         ChartElement           1   
4  org.jfree.chart.ChartElementVisitor  ChartElementVisitor           1   
5         org.jfree.chart.ChartFactory         ChartFactory           5   
6           org.jfree.chart.ChartHints           ChartHints           1   

   Coupling  Size  Lack of Cohesion   CBO    RFC  SRFC  DIT  ...  NOF  NOSF   
2         1     2                 1   0.0    2.0   0.0  2.0  ...  0.0  24.0  \
3         1     1                 1   1.0    1.0   0.0  1.0  ...  0.0   0.0   
4         1     1                 1   1.0    1.0   0.0  1.0  ...  0.0   0.0   
5         5     3                 3  81.0  232.0  84.0  1.0  ...  0.0   1.0   
6         1     1                 1   0.0    1.0   0.0  1.0  ...  0.0   2.0   

   NOM  NOSM  NORM  LCOM   LCAM   LTCC  ATFD   SI  
2  1.0   1.0   0.0   0

In [21]:
def label_problematic_class(row, problematic_class_threshold=2, highly_problematic_class_threshold=3):
    """Classify one row by the mean of its four ordinal ratings.

    Parameters
    ----------
    row : mapping
        Must provide numeric ``Complexity``, ``Coupling``, ``Size`` and
        ``Lack of Cohesion`` entries.
    problematic_class_threshold : number, default 2
        The mean must be strictly greater than this to be labelled 1.
    highly_problematic_class_threshold : number, default 3
        The mean must be strictly greater than this to be labelled 2.

    Returns
    -------
    int
        2 (highly problematic), 1 (problematic) or 0 (neither).
    """
    rating_columns = ('Complexity', 'Coupling', 'Size', 'Lack of Cohesion')
    mean_rating = sum(row[name] for name in rating_columns) / 4

    # The stricter threshold is tested first, so it takes precedence.
    if mean_rating > highly_problematic_class_threshold:
        return 2
    if mean_rating > problematic_class_threshold:
        return 1
    return 0

# Attach the label column (0 = not problematic, 1 = problematic,
# 2 = highly problematic), computed row by row.
dataset['Problematic'] = dataset.apply(label_problematic_class, axis='columns')

# Tally the dataset size and how many rows fall into each problematic category.
num_dataset_rows = len(dataset)
num_problematic_classes = int((dataset['Problematic'] == 1).sum())
num_highly_problematic_classes = int((dataset['Problematic'] == 2).sum())

print(f'Number of instances in the dataset: {num_dataset_rows}')
print(f'Number of Problematic Classes in the dataset: {num_problematic_classes}')
print(f'Number of Highly Problematic Classes in the dataset: {num_highly_problematic_classes}')

Number of instances in the dataset: 633
Number of Problematic Classes in the dataset: 84
Number of Highly Problematic Classes in the dataset: 87


In [22]:
def remove_labeling_columns(data):
    """Drop ``Name`` and the four ordinal rating columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing ``Name``, ``Complexity``, ``Coupling``, ``Size``
        and ``Lack of Cohesion``.

    Returns
    -------
    pandas.DataFrame
        A new frame without those five columns.

    Raises
    ------
    KeyError
        If any of the five columns is absent.
    """
    labeling_columns = ['Name', 'Complexity', 'Coupling', 'Size', 'Lack of Cohesion']
    return data.drop(columns=labeling_columns)

# Drop 'Name' and the four ordinal rating columns, then preview the result.
# NOTE: this rebinds `dataset`, so running the cell a second time raises
# KeyError because the columns are already gone.
dataset = remove_labeling_columns(dataset)
print(dataset.head())

                         QualifiedName   CBO    RFC  SRFC  DIT  NOC    WMC   
2           org.jfree.chart.ChartColor   0.0    2.0   0.0  2.0  0.0    2.0  \
3         org.jfree.chart.ChartElement   1.0    1.0   0.0  1.0  7.0    1.0   
4  org.jfree.chart.ChartElementVisitor   1.0    1.0   0.0  1.0  0.0    1.0   
5         org.jfree.chart.ChartFactory  81.0  232.0  84.0  1.0  0.0  137.0   
6           org.jfree.chart.ChartHints   0.0    1.0   0.0  1.0  0.0    1.0   

     LOC  CMLOC  NOF  NOSF  NOM  NOSM  NORM  LCOM   LCAM   LTCC  ATFD   SI   
2   64.0   39.0  0.0  24.0  1.0   1.0   0.0   0.0  0.250  1.000   0.0  0.0  \
3    2.0    1.0  0.0   0.0  1.0   0.0   0.0   0.0  0.000  0.000   0.0  0.0   
4    2.0    1.0  0.0   0.0  1.0   0.0   0.0   0.0  0.000  0.000   0.0  0.0   
5  782.0  780.0  0.0   1.0  0.0  50.0   0.0   0.0  0.799  0.569   3.0  0.0   
6   17.0    1.0  0.0   2.0  1.0   0.0   0.0   0.0  0.000  0.000   0.0  0.0   

   Problematic  
2            0  
3            0  
4          

In [23]:
def preprocess_data(data):
    """Split a labelled table into model inputs and target.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing ``QualifiedName``, ``Problematic`` and the feature
        columns.

    Returns
    -------
    tuple
        ``(X, y)``: ``X`` is ``data`` without the ``QualifiedName`` and
        ``Problematic`` columns, ``y`` is the ``Problematic`` column.
    """
    non_feature_columns = ['QualifiedName', 'Problematic']
    features = data.drop(columns=non_feature_columns)
    target = data['Problematic']
    return features, target

# Separate the feature columns (X) from the 'Problematic' target column (y).
X, y = preprocess_data(dataset)

In [24]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

def train_decision_tree(X, y, test_size=0.3, random_state=42):
    """Fit a decision tree on a train split and return it with the held-out split.

    Parameters
    ----------
    X
        Feature table.
    y
        Target labels aligned with ``X``.
    test_size : float, default 0.3
        Share of the samples reserved for testing; passed to
        ``train_test_split``.
    random_state : int, default 42
        Seed used for both the split and the classifier so that repeated
        runs give the same model and the same scores.

    Returns
    -------
    tuple
        ``(clf, X_test, y_test)``: the fitted classifier and the held-out
        features and labels.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    # Seed the tree as well: without it the fitted model can differ between
    # runs even though the train/test split itself is fixed.
    clf = DecisionTreeClassifier(random_state=random_state)
    clf.fit(X_train, y_train)
    return clf, X_test, y_test

# Fit the tree on the training split; keep the held-out split for evaluation.
clf, X_test, y_test = train_decision_tree(X, y)


In [25]:
def make_predictions(clf, X_test):
    """Return the labels ``clf`` predicts for ``X_test``.

    Parameters
    ----------
    clf
        Fitted estimator exposing a ``predict`` method.
    X_test
        Samples to classify.

    Returns
    -------
    Whatever ``clf.predict(X_test)`` returns.
    """
    return clf.predict(X_test)

# Predict a label for every sample of the held-out test split.
y_pred = make_predictions(clf, X_test)

In [26]:
from sklearn.metrics import accuracy_score

# Report the accuracy on the held-out split, formatted to four decimals.
print(f'Test Accuracy: {accuracy_score(y_test, y_pred):.4f}')

Test Accuracy: 0.9158


In [27]:
from sklearn.metrics import classification_report, confusion_matrix

def evaluate_results(y_test, y_pred):
    """Print the confusion matrix and the classification report.

    Parameters
    ----------
    y_test
        True labels.
    y_pred
        Predicted labels, aligned with ``y_test``.

    Returns
    -------
    None
        The function only writes to standard output.
    """
    print("Confusion Matrix:")
    matrix = confusion_matrix(y_test, y_pred)
    print(matrix)

    print("\nClassification Report:")
    report = classification_report(y_test, y_pred)
    print(report)

# Print the confusion matrix and classification report for the held-out predictions.
evaluate_results(y_test, y_pred)

Confusion Matrix:
[[122   9   0]
 [  4  19   1]
 [  1   1  33]]

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.93      0.95       131
           1       0.66      0.79      0.72        24
           2       0.97      0.94      0.96        35

    accuracy                           0.92       190
   macro avg       0.86      0.89      0.87       190
weighted avg       0.92      0.92      0.92       190

