In [112]:
#project_name = "."
#project_name = "jfreechart"
#project_name = "argouml"
project_name = "weka"

In [113]:
import pandas as pd

def load_data(file_path):
    data = pd.read_csv(file_path, sep='\t')
    return data

def remove_empty_rows(data):
    data = data.dropna(subset=['Name'])
    return data

def remove_unnamed_columns(data):
    data = data.filter(regex='^(?!Unnamed.*)')
    return data

def remove_nan_columns(data):
    data = data.dropna(axis=1, how='all')
    return data

def convert_ordinal_to_numerical(data):
    ordinal_mapping = {
        'low': 1,
        'low-medium': 2,
        'medium-high': 3,
        'high': 4,
        'very-high': 5
    }
    columns_to_convert = ['Complexity', 'Coupling', 'Size', 'Lack of Cohesion']
    for column in columns_to_convert:
        data[column] = data[column].replace(ordinal_mapping)
    return data

dataset_file = "input/" + project_name + "/package-metrics-dataset.csv"
dataset = load_data(dataset_file)
dataset = remove_empty_rows(dataset)
dataset = remove_unnamed_columns(dataset)
dataset = remove_nan_columns(dataset)
dataset = convert_ordinal_to_numerical(dataset)

print(dataset.head())

                      QualifiedName                     Name  Complexity   
1                     <Package>weka                     weka           1  \
2        <Package>weka.associations        weka.associations           4   
3  <Package>weka.attributeSelection  weka.attributeSelection           4   
4         <Package>weka.classifiers         weka.classifiers           3   
5   <Package>weka.classifiers.bayes   weka.classifiers.bayes           2   

   Coupling  Size  Lack of Cohesion  #(C&I)    #C   #I     LOC     AC    EC   
1         1     1                 1     2.0   2.0  0.0   204.0    1.0   1.0  \
2         3     3                 1    29.0  26.0  3.0  3917.0    8.0  19.0   
3         4     4                 1    31.0  25.0  6.0  6166.0    8.0  21.0   
4         4     3                 1    27.0  20.0  7.0  3186.0  114.0  24.0   
5         2     3                 1     7.0   7.0  0.0  1640.0   37.0   6.0   

     Abs    Ins     ND     WMC  
1  0.000  0.500  0.500    74.0  
2 

In [114]:
def label_problematic_component(row, problematic_class_threshold=3, highly_problematic_class_threshold=4):
    if row['Complexity'] >= problematic_class_threshold and \
        row['Coupling'] >= problematic_class_threshold or \
        row['Lack of Cohesion'] >= problematic_class_threshold:
        return 1
    elif row['Complexity'] >= highly_problematic_class_threshold or \
        row['Coupling'] >= highly_problematic_class_threshold or \
        row['Lack of Cohesion'] >= highly_problematic_class_threshold or \
        row['Size'] >= highly_problematic_class_threshold:
        return 1
    else:
        return 0
    
# Label the dataset
dataset['Problematic'] = dataset.apply(label_problematic_component, axis=1)

# Count the number of Problematic Classes
num_dataset_rows = dataset.shape[0]
num_problematic_classes = dataset[dataset['Problematic'] == 1].shape[0]
print(f'Number of instances in the dataset: {num_dataset_rows}')
print(f'Number of Problematic Components in the dataset: {num_problematic_classes}')

Number of instances in the dataset: 89
Number of Problematic Components in the dataset: 30


In [115]:
def remove_labeling_columns(data):
    data = data.drop(['QualifiedName', 'Complexity', 'Coupling', 'Size', 'Lack of Cohesion'], axis=1)
    return data

dataset = remove_labeling_columns(dataset)
print(dataset.head())

                      Name  #(C&I)    #C   #I     LOC     AC    EC    Abs   
1                     weka     2.0   2.0  0.0   204.0    1.0   1.0  0.000  \
2        weka.associations    29.0  26.0  3.0  3917.0    8.0  19.0  0.276   
3  weka.attributeSelection    31.0  25.0  6.0  6166.0    8.0  21.0  0.387   
4         weka.classifiers    27.0  20.0  7.0  3186.0  114.0  24.0  0.704   
5   weka.classifiers.bayes     7.0   7.0  0.0  1640.0   37.0   6.0  0.000   

     Ins     ND     WMC  Problematic  
1  0.500  0.500    74.0            0  
2  0.704  0.020  1061.0            1  
3  0.724  0.111  1681.0            1  
4  0.174  0.122   877.0            1  
5  0.140  0.860   448.0            0  


In [116]:
def preprocess_data(data):
    X = data.drop(['Name', 'Problematic'], axis=1)
    y = data['Problematic']
    return X, y

X, y = preprocess_data(dataset)

In [117]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

def train_decision_tree(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    return clf, X_test, y_test

clf, X_test, y_test = train_decision_tree(X, y)


In [118]:
def make_predictions(clf, X_test):
    y_pred = clf.predict(X_test)
    return y_pred

y_pred = make_predictions(clf, X_test)

In [119]:
from sklearn.metrics import accuracy_score

print(f'Test Accuracy: {accuracy_score(y_test, y_pred):.4f}')

Test Accuracy: 0.8889


In [120]:
from sklearn.metrics import classification_report, confusion_matrix

def evaluate_results(y_test, y_pred):
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

evaluate_results(y_test, y_pred)

Confusion Matrix:
[[18  0]
 [ 3  6]]

Classification Report:
              precision    recall  f1-score   support

           0       0.86      1.00      0.92        18
           1       1.00      0.67      0.80         9

    accuracy                           0.89        27
   macro avg       0.93      0.83      0.86        27
weighted avg       0.90      0.89      0.88        27

