In [1]:
import pandas as pd

def load_data(file_path):
    """Read a tab-separated file into a DataFrame.

    Args:
        file_path: Location of the tab-delimited file, passed straight to
            ``pandas.read_csv``.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    return pd.read_csv(file_path, sep="\t")

def remove_empty_rows(data):
    """Drop every row whose ``Name`` cell is missing.

    Args:
        data: DataFrame containing a ``Name`` column.

    Returns:
        A new DataFrame holding only the rows with a non-missing ``Name``;
        the original row labels are kept.
    """
    required_columns = ['Name']
    return data.dropna(axis=0, subset=required_columns)

def remove_unnamed_columns(data):
    """Keep only the columns whose label does not start with ``Unnamed``.

    Args:
        data: DataFrame to filter.

    Returns:
        A DataFrame restricted to the columns that pass the filter.
    """
    # Negative lookahead anchored at the start: a label matches unless it
    # begins with the text "Unnamed".
    keep_pattern = '^(?!Unnamed.*)'
    return data.filter(regex=keep_pattern, axis=1)

def remove_nan_columns(data):
    """Drop the columns that contain nothing but missing values.

    Args:
        data: DataFrame to prune.

    Returns:
        A new DataFrame without the all-missing columns; columns with at
        least one present value are kept.
    """
    return data.dropna(axis='columns', how='all')

def convert_ordinal_to_numerical(data):
    """Translate the ordinal rating columns into integer scores.

    The columns ``Complexity``, ``Coupling``, ``Size`` and
    ``Lack of Cohesion`` are converted with the scale
    ``low`` = 1 ... ``very-high`` = 5. Any cell whose value is not one of
    the five known labels is left exactly as it was.

    The conversion is performed on a copy, so the DataFrame passed in is
    not modified.

    Args:
        data: DataFrame containing the four rating columns.

    Returns:
        A new DataFrame with the rating columns converted.

    Raises:
        KeyError: If one of the four rating columns is absent.
    """
    ordinal_mapping = {
        'low': 1,
        'low-medium': 2,
        'medium-high': 3,
        'high': 4,
        'very-high': 5
    }
    columns_to_convert = ['Complexity', 'Coupling', 'Size', 'Lack of Cohesion']

    # Work on a copy: assigning columns on the argument would silently
    # change the caller's frame (and warns when that frame is itself the
    # result of an earlier dropna/filter step).
    converted = data.copy()
    for column in columns_to_convert:
        # Element-wise lookup with the original value as fallback keeps
        # unmapped cells untouched and does not depend on `replace`
        # inferring a numeric dtype afterwards.
        converted[column] = converted[column].map(
            lambda value: ordinal_mapping.get(value, value)
        )
    return converted

# Project whose package metrics are analysed ("argouml" is the alternative).
project_name = "jfreechart"
dataset_file = f"input/{project_name}/package-metrics-dataset.csv"

# Load the raw metrics, then run the cleaning steps in order.
dataset = (
    load_data(dataset_file)
    .pipe(remove_empty_rows)
    .pipe(remove_unnamed_columns)
    .pipe(remove_nan_columns)
    .pipe(convert_ordinal_to_numerical)
)

print(dataset.head())

                          QualifiedName                         Name   
1              <Package>org.jfree.chart              org.jfree.chart  \
2  <Package>org.jfree.chart.annotations  org.jfree.chart.annotations   
3          <Package>org.jfree.chart.api          org.jfree.chart.api   
4         <Package>org.jfree.chart.axis         org.jfree.chart.axis   
5        <Package>org.jfree.chart.block        org.jfree.chart.block   

   Complexity  Coupling  Size  Lack of Cohesion  #(C&I)    #C   #I     LOC   
1           3         1     3                 1    13.0   9.0  4.0  2441.0  \
2           2         3     3                 1    20.0  16.0  4.0  1929.0   
3           1         1     3                 1    14.0  13.0  1.0   418.0   
4           4         4     5                 1    46.0  44.0  2.0  7056.0   
5           2         3     3                 1    22.0  17.0  5.0  1723.0   

      AC    EC    Abs    Ins     ND     WMC  
1   64.0   5.0  0.462  0.072  0.466   576.0  
2    6

In [2]:
def label_god_component(row, complexity_threshold=2, size_threshold=2):
    """Return 1 when a row exceeds both thresholds, otherwise 0.

    Args:
        row: Mapping-like object (for example a DataFrame row) providing
            ``Complexity`` and ``Size`` values.
        complexity_threshold: ``Complexity`` must be strictly greater than
            this value.
        size_threshold: ``Size`` must be strictly greater than this value.

    Returns:
        ``1`` if both values are strictly above their thresholds, else ``0``.
    """
    # `Size` is only looked at once `Complexity` has passed its threshold.
    if not (row['Complexity'] > complexity_threshold):
        return 0
    if not (row['Size'] > size_threshold):
        return 0
    return 1

# Attach the 0/1 God Component label to every row of the dataset.
dataset['is_god_component'] = dataset.apply(label_god_component, axis=1)

# Report the dataset size and how many rows were flagged.
num_dataset_rows = len(dataset)
num_god_components = dataset['is_god_component'].sum()
print(
    f'Number of instances in the dataset: {num_dataset_rows}',
    f'Number of God Components in the dataset: {num_god_components}',
    sep='\n',
)

Number of instances in the dataset: 42
Number of God Components in the dataset: 11


In [3]:
def remove_labeling_columns(data):
    """Drop ``QualifiedName`` and the four ordinal rating columns.

    Args:
        data: DataFrame containing ``QualifiedName``, ``Complexity``,
            ``Coupling``, ``Size`` and ``Lack of Cohesion``.

    Returns:
        A new DataFrame without those five columns.

    Raises:
        KeyError: If any of the five columns is absent.
    """
    labeling_columns = ['QualifiedName', 'Complexity', 'Coupling', 'Size', 'Lack of Cohesion']
    return data.drop(columns=labeling_columns)

# Drop the columns the label was derived from, then preview the result.
dataset = dataset.pipe(remove_labeling_columns)
print(dataset.head(5))

                          Name  #(C&I)    #C   #I     LOC     AC    EC    Abs   
1              org.jfree.chart    13.0   9.0  4.0  2441.0   64.0   5.0  0.462  \
2  org.jfree.chart.annotations    20.0  16.0  4.0  1929.0    6.0  20.0  0.300   
3          org.jfree.chart.api    14.0  13.0  1.0   418.0  215.0   3.0  0.071   
4         org.jfree.chart.axis    46.0  44.0  2.0  7056.0   91.0  32.0  0.152   
5        org.jfree.chart.block    22.0  17.0  5.0  1723.0   19.0  18.0  0.227   

     Ins     ND     WMC  is_god_component  
1  0.072  0.466   576.0                 1  
2  0.769  0.069   457.0                 0  
3  0.014  0.915   105.0                 0  
4  0.260  0.588  1828.0                 1  
5  0.486  0.287   485.0                 0  


In [4]:
def preprocess_data(data):
    """Separate the feature columns from the ``is_god_component`` label.

    Args:
        data: DataFrame containing ``Name`` and ``is_god_component``
            alongside the feature columns.

    Returns:
        A tuple ``(X, y)``: ``X`` is ``data`` without the ``Name`` and
        ``is_god_component`` columns, ``y`` is the ``is_god_component``
        column.
    """
    non_feature_columns = ['Name', 'is_god_component']
    features = data.drop(columns=non_feature_columns)
    target = data['is_god_component']
    return features, target

# Split the labelled dataset into the feature frame X and the label column y.
X, y = preprocess_data(dataset)

In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

def train_decision_tree(X, y, test_size=0.2, random_state=42):
    """Split the data and fit a decision tree on the training portion.

    The same seed is used for the train/test split and for the classifier
    so that repeated runs on the same data produce the same model.

    Args:
        X: Feature data accepted by ``train_test_split``.
        y: Labels aligned with ``X``.
        test_size: Share of the data held out for testing (default ``0.2``).
        random_state: Seed passed to both the split and the classifier
            (default ``42``).

    Returns:
        A tuple ``(clf, X_test, y_test)``: the fitted classifier and the
        held-out features and labels.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    # Seed the tree too: scikit-learn permutes the candidate features at
    # every split, so an unseeded tree can differ between runs even when
    # the training data is identical.
    clf = DecisionTreeClassifier(random_state=random_state)
    clf.fit(X_train, y_train)
    return clf, X_test, y_test

# Fit the classifier and keep the held-out split returned alongside it.
clf, X_test, y_test = train_decision_tree(X, y)


In [6]:
def make_predictions(clf, X_test):
    """Return the classifier's predictions for the given features.

    Args:
        clf: Object exposing a ``predict`` method.
        X_test: Features handed to ``clf.predict``.

    Returns:
        Whatever ``clf.predict(X_test)`` returns.
    """
    return clf.predict(X_test)

# Predict labels for the held-out test features.
y_pred = make_predictions(clf, X_test)

In [7]:
from sklearn.metrics import classification_report, confusion_matrix

def evaluate_results(y_test, y_pred):
    """Print the confusion matrix and the classification report.

    Args:
        y_test: True labels.
        y_pred: Predicted labels, aligned with ``y_test``.

    Returns:
        None. Both reports are written to standard output, each preceded
        by its heading.
    """
    report_sections = (
        ("Confusion Matrix:", confusion_matrix),
        ("\nClassification Report:", classification_report),
    )
    # Print each heading before computing its metric, so the output order
    # is heading, result, heading, result.
    for heading, metric in report_sections:
        print(heading)
        print(metric(y_test, y_pred))

# Print the confusion matrix and classification report for the test predictions.
evaluate_results(y_test, y_pred)

Confusion Matrix:
[[7 0]
 [0 2]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       1.00      1.00      1.00         2

    accuracy                           1.00         9
   macro avg       1.00      1.00      1.00         9
weighted avg       1.00      1.00      1.00         9

