In [16]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,classification_report
from sklearn.model_selection import KFold
from sklearn.svm import SVC


import os
import pickle

In [None]:
data=pd.read_csv("DPD_Att_Classifier_256.csv")
data.head()

Unnamed: 0,Directory,File,DesignPattern,Feature_0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,...,Feature_246,Feature_247,Feature_248,Feature_249,Feature_250,Feature_251,Feature_252,Feature_253,Feature_254,Feature_255
0,111,AbstractFactory,AbstractFactory,0.009923,0.019961,-0.013562,-0.003069,-0.01323,0.04897,0.021512,...,-0.051777,0.097458,-0.009427,0.027184,-0.011889,-0.074672,0.146362,0.034088,0.118662,-0.033793
1,111,DPAbstractFactory,Unknown,0.025731,-0.041754,-0.094777,0.015,-0.090856,-0.008057,-0.041296,...,-0.112172,0.047929,-0.042468,-0.011076,-0.084124,-0.026756,0.148384,0.083692,0.105897,-0.024621
2,111,FactoryNotebookGamer,AbstractFactory,0.011921,0.001784,-0.058958,0.012461,-0.050621,0.017335,0.022669,...,-0.050007,0.069064,-0.048668,0.040862,0.088394,-0.042939,0.092905,0.0749,0.085188,-0.103398
3,111,FactoryNotebookOfficeW,Unknown,0.028296,-0.004623,-0.085723,0.020339,-0.030802,0.018229,-0.014785,...,-0.018946,0.079698,-0.030226,0.004979,0.08209,-0.078173,0.135302,0.115543,0.121336,-0.057344
4,111,FactoryProvider,AbstractFactory,0.012418,0.035557,-0.071139,-0.038619,-0.06249,-0.015598,0.011185,...,-0.066771,0.020901,-0.00059,-0.00211,0.001909,0.011913,0.184227,0.125253,0.07769,-0.030622


In [18]:
data["DesignPattern"].unique()

array(['AbstractFactory', 'Unknown', nan, 'Facade', 'Adapter', 'Memento',
       'Singleton', 'Proxy', 'Prototype', 'Observer', 'Decorator',
       'FactoryMethod', 'Builder', 'Visitor', 'Strategy'], dtype=object)

In [19]:
data["DesignPattern"].value_counts()

DesignPattern
Prototype          127
Observer           127
Strategy           127
Memento            107
Adapter            106
Decorator          105
Singleton          101
Builder            101
FactoryMethod      100
Facade              99
Visitor             98
AbstractFactory     97
Proxy               96
Unknown             75
Name: count, dtype: int64

In [20]:
data['DesignPattern'].fillna('Unknown', inplace=True)
data["DesignPattern"].value_counts()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['DesignPattern'].fillna('Unknown', inplace=True)


DesignPattern
Unknown            163
Prototype          127
Observer           127
Strategy           127
Memento            107
Adapter            106
Decorator          105
Singleton          101
Builder            101
FactoryMethod      100
Facade              99
Visitor             98
AbstractFactory     97
Proxy               96
Name: count, dtype: int64

In [21]:
unknown_class = data[data['DesignPattern'] == 'Unknown']
sampled_unknown = unknown_class.sample(n=130, random_state=42)
other_classes = data[data['DesignPattern'] != 'Unknown']
data = pd.concat([sampled_unknown, other_classes])
data["DesignPattern"].value_counts()

DesignPattern
Unknown            130
Prototype          127
Observer           127
Strategy           127
Memento            107
Adapter            106
Decorator          105
Singleton          101
Builder            101
FactoryMethod      100
Facade              99
Visitor             98
AbstractFactory     97
Proxy               96
Name: count, dtype: int64

In [22]:
label_encoder = LabelEncoder()
data['pattern_encoded'] = label_encoder.fit_transform(data['DesignPattern'])


In [23]:
y = data['pattern_encoded']
X = data.drop(['DesignPattern', 'pattern_encoded','Directory','File'], axis=1)  # Features

In [None]:

# Assuming X and y are pandas DataFrames or Series
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]


In [25]:

# Define folder for saving models
save_folder = "cross_valid_model_256"
os.makedirs(save_folder, exist_ok=True)

kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Initialize different classifiers
classifiers = {
    'Support Vector Machine': SVC(),
    'Logistic Regression': LogisticRegression(),
    'Multi-layer Perceptron': MLPClassifier(max_iter=1000),
}

# Initialize dictionaries to store evaluation metrics
accuracy_scores = {}
precision_scores = {}
recall_scores = {}
f1_scores = {}

# Perform cross-validation
for name, classifier in classifiers.items():
    accuracy_scores[name] = []
    precision_scores[name] = []
    recall_scores[name] = []
    f1_scores[name] = []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)

        # Calculate and store metrics for each fold
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='macro')
        recall = recall_score(y_test, y_pred, average='macro')
        f1 = f1_score(y_test, y_pred, average='macro')

        accuracy_scores[name].append(accuracy)
        precision_scores[name].append(precision)
        recall_scores[name].append(recall)
        f1_scores[name].append(f1)

    # Calculate average metrics across folds
    avg_accuracy = sum(accuracy_scores[name]) / len(accuracy_scores[name])
    avg_precision = sum(precision_scores[name]) / len(precision_scores[name])
    avg_recall = sum(recall_scores[name]) / len(recall_scores[name])
    avg_f1 = sum(f1_scores[name]) / len(f1_scores[name])

    print(f'{name} Metrics (Average Across Folds):')
    print(f'Average Accuracy: {avg_accuracy:.2f}')
    print(f'Average Precision (Macro): {avg_precision:.2f}')
    print(f'Average Recall (Macro): {avg_recall:.2f}')
    print(f'Average F1-Score (Macro): {avg_f1:.2f}')
    print('-' * 30)

    # Save the trained classifier using pickle
    model_path = os.path.join(save_folder, f"{name.replace(' ', '_')}.pkl")
    with open(model_path, 'wb') as model_file:
        pickle.dump(classifier, model_file)

print(f"Trained models have been saved to '{save_folder}'")


Support Vector Machine Metrics (Average Across Folds):
Average Accuracy: 0.86
Average Precision (Macro): 0.86
Average Recall (Macro): 0.86
Average F1-Score (Macro): 0.86
------------------------------
Logistic Regression Metrics (Average Across Folds):
Average Accuracy: 0.81
Average Precision (Macro): 0.82
Average Recall (Macro): 0.82
Average F1-Score (Macro): 0.81
------------------------------
Multi-layer Perceptron Metrics (Average Across Folds):
Average Accuracy: 0.84
Average Precision (Macro): 0.85
Average Recall (Macro): 0.84
Average F1-Score (Macro): 0.84
------------------------------
Trained models have been saved to 'cross_valid_model_256'


In [27]:
# Train and evaluate each classifier
for name, classifier in classifiers.items():
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    
    # Print the classification report
    report = classification_report(y_test, y_pred, target_names= label_encoder.classes_)
    print(f'{name} Classification Report:\n{report}')
    print('-' * 30)

Support Vector Machine Classification Report:
                 precision    recall  f1-score   support

AbstractFactory       0.75      1.00      0.86        18
        Adapter       0.89      0.94      0.92        18
        Builder       0.60      1.00      0.75        15
      Decorator       0.82      0.88      0.85        16
         Facade       0.91      0.83      0.87        24
  FactoryMethod       0.80      0.40      0.53        20
        Memento       1.00      0.88      0.94        26
       Observer       0.90      1.00      0.95        27
      Prototype       0.71      0.87      0.78        23
          Proxy       0.88      1.00      0.94        22
      Singleton       1.00      0.70      0.82        20
       Strategy       0.84      0.87      0.85        30
        Unknown       0.73      0.55      0.63        20
        Visitor       1.00      0.84      0.91        25

       accuracy                           0.84       304
      macro avg       0.85      0.84    