In [11]:
import pandas as pd

def load_data(file_path):
    data = pd.read_csv(file_path, sep='\t')
    return data

def remove_empty_rows(data):
    data = data.dropna(subset=['Name'])
    return data

def remove_unnamed_columns(data):
    data = data.filter(regex='^(?!Unnamed.*)')
    return data

def remove_nan_columns(data):
    data = data.dropna(axis=1, how='all')
    return data

def convert_ordinal_to_numerical(data):
    ordinal_mapping = {
        'low': 1,
        'low-medium': 2,
        'medium-high': 3,
        'high': 4,
        'very-high': 5
    }
    columns_to_convert = ['Complexity', 'Coupling', 'Size', 'Lack of Cohesion']
    for column in columns_to_convert:
        data[column] = data[column].replace(ordinal_mapping)
    return data

#project_name = "jfreechart"
project_name = "argouml"
dataset_file = "input/" + project_name + "/package-metrics-dataset.csv"
dataset = load_data(dataset_file)
dataset = remove_empty_rows(dataset)
dataset = remove_unnamed_columns(dataset)
dataset = remove_nan_columns(dataset)
dataset = convert_ordinal_to_numerical(dataset)

print(dataset.head())

                             QualifiedName                            Name   
1           <Package>org.argouml.activity2           org.argouml.activity2  \
2   <Package>org.argouml.activity2.diagram   org.argouml.activity2.diagram   
3         <Package>org.argouml.application         org.argouml.application   
4     <Package>org.argouml.application.api     org.argouml.application.api   
5  <Package>org.argouml.application.events  org.argouml.application.events   

   Complexity  Coupling  Size  Lack of Cohesion  #(C&I)    #C   #I    LOC   
1           1         1     1                 1     3.0   3.0  0.0   39.0  \
2           1         4     3                 1    30.0  26.0  4.0  847.0   
3           1         1     2                 1     6.0   6.0  0.0  476.0   
4           1         1     2                 1     7.0   2.0  5.0  170.0   
5           1         2     3                 1    16.0   9.0  7.0  353.0   

     AC    EC    Abs    Ins     ND    WMC  
1   0.0   2.0  0.000  1.

In [12]:
def label_problematic_component(row, problematic_class_threshold=3, highly_problematic_class_threshold=4):
    problematic_score = (row['Complexity'] + row['Coupling'] + row['Size'] + row['Lack of Cohesion']) / 4
    if problematic_score >= highly_problematic_class_threshold:
        return 2
    elif problematic_score >= problematic_class_threshold:
        return 1
    else:
        return 0

# Label the dataset
dataset['Problematic'] = dataset.apply(label_problematic_component, axis=1)

# Count the number of Problematic Classes
num_dataset_rows = dataset.shape[0]
num_problematic_classes = dataset[dataset['Problematic'] == 1].shape[0]
num_highly_problematic_classes = dataset[dataset['Problematic'] == 2].shape[0]
print(f'Number of instances in the dataset: {num_dataset_rows}')
print(f'Number of Problematic Components in the dataset: {num_problematic_classes}')
print(f'Number of Highly Problematic Components in the dataset: {num_highly_problematic_classes}')

Number of instances in the dataset: 93
Number of Problematic Components in the dataset: 14
Number of Highly Problematic Components in the dataset: 1


In [13]:
def remove_labeling_columns(data):
    data = data.drop(['QualifiedName', 'Complexity', 'Coupling', 'Size', 'Lack of Cohesion'], axis=1)
    return data

dataset = remove_labeling_columns(dataset)
print(dataset.head())

                             Name  #(C&I)    #C   #I    LOC    AC    EC   
1           org.argouml.activity2     3.0   3.0  0.0   39.0   0.0   2.0  \
2   org.argouml.activity2.diagram    30.0  26.0  4.0  847.0   1.0  23.0   
3         org.argouml.application     6.0   6.0  0.0  476.0   0.0   4.0   
4     org.argouml.application.api     7.0   2.0  5.0  170.0  91.0   2.0   
5  org.argouml.application.events    16.0   9.0  7.0  353.0  41.0   7.0   

     Abs    Ins     ND    WMC  Problematic  
1  0.000  1.000  0.000   12.0            0  
2  0.133  0.958  0.091  187.0            0  
3  0.000  1.000  0.000   97.0            0  
4  0.857  0.022  0.121   31.0            0  
5  0.500  0.146  0.354  121.0            0  


In [14]:
def preprocess_data(data):
    X = data.drop(['Name', 'Problematic'], axis=1)
    y = data['Problematic']
    return X, y

X, y = preprocess_data(dataset)

In [15]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

def train_decision_tree(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    return clf, X_test, y_test

clf, X_test, y_test = train_decision_tree(X, y)


In [16]:
def make_predictions(clf, X_test):
    y_pred = clf.predict(X_test)
    return y_pred

y_pred = make_predictions(clf, X_test)

In [17]:
from sklearn.metrics import classification_report, confusion_matrix

def evaluate_results(y_test, y_pred):
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

evaluate_results(y_test, y_pred)

Confusion Matrix:
[[16  1  0]
 [ 0  1  0]
 [ 0  1  0]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.94      0.97        17
           1       0.33      1.00      0.50         1
           2       0.00      0.00      0.00         1

    accuracy                           0.89        19
   macro avg       0.44      0.65      0.49        19
weighted avg       0.91      0.89      0.89        19



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
