In [4]:
import sqlite3
import json
import pandas as pd
import glob
from datetime import datetime

# Make Database 

Create the database and the tables that store the results parsed from my logs.

In [54]:
# Open (or create) the SQLite file that holds every registry table.
conn = sqlite3.connect("results.db")
cursor = conn.cursor()

# nd_model_registry, layer_by_layer_model_registry and all_trees_model_registry
# share one column layout; only the table name differs, so their DDL comes from
# a single template.
nd_style_table_ddl = """
CREATE TABLE IF NOT EXISTS {table} (
    name TEXT NOT NULL,
    models TEXT NOT NULL,
    dataset TEXT NOT NULL,
    kfold INTEGER,
    nd_structure TEXT NOT NULL,
    model_structure TEXT NOT NULL,
    accuracy_score REAL,
    run_time_seconds REAL,
    inner_kfolds INTEGER NOT NULL,
    run_timestamp TEXT,
    notes TEXT,
    PRIMARY KEY (name, models, dataset, kfold, inner_kfolds)
    )
"""

# Competitor results have no ND structure columns and a single `model` column.
competitor_table_ddl = """
CREATE TABLE IF NOT EXISTS competitor_model_registry (
    name TEXT NOT NULL,
    model TEXT NOT NULL,
    dataset TEXT NOT NULL,
    kfold INTEGER,
    accuracy REAL,
    run_time_seconds REAL,
    run_timestamp TEXT,
    notes TEXT,
    PRIMARY KEY (name, model, dataset, kfold)
)
"""

# Creation order: nd, competitor, layer-by-layer, all-trees.
table_ddl_statements = [
    nd_style_table_ddl.format(table="nd_model_registry"),
    competitor_table_ddl,
    nd_style_table_ddl.format(table="layer_by_layer_model_registry"),
    nd_style_table_ddl.format(table="all_trees_model_registry"),
]
for table_ddl in table_ddl_statements:
    cursor.execute(table_ddl)

conn.commit()

In [84]:
# Destructive maintenance step: drops all_trees_model_registry and every row in it.
# It is opt-in so that running the notebook top to bottom does not remove the
# table that the table-creation cell has just created (insert_all_trees would
# then fail with "no such table"). Set the flag to True, run this cell, then
# re-run the table-creation cell to get an empty table back.
RESET_ALL_TREES_TABLE = False

if RESET_ALL_TREES_TABLE:
    # IF EXISTS keeps the cell re-runnable when the table is already gone.
    cursor.execute("""
    DROP TABLE IF EXISTS all_trees_model_registry
    """)

    conn.commit()

In [148]:
# Delete all rows from nd_model_registry (disabled: uncomment to wipe the table's contents)
# cursor.execute("""
# DELETE FROM nd_model_registry
# """)

# conn.commit()

In [6]:
def insert_dichotomies(cursor, all_dichotomies):
    """Upsert result rows into ``nd_model_registry``.

    Args:
        cursor: sqlite3 cursor used to run the statement; committing is left
            to the caller.
        all_dichotomies: iterable of 11-item rows in column order
            (name, models, dataset, kfold, nd_structure, model_structure,
            accuracy_score, run_time_seconds, inner_kfolds, run_timestamp, notes).

    When a row with the same (name, models, dataset, kfold, inner_kfolds) key
    already exists, it is overwritten only if the incoming ``run_timestamp``
    compares greater than the stored one; otherwise the stored row is kept.
    """
    upsert_sql = """
        INSERT INTO nd_model_registry (
                        name,
                        models,
                        dataset,
                        kfold,
                        nd_structure,
                        model_structure,
                        accuracy_score,
                        run_time_seconds,
                        inner_kfolds,
                        run_timestamp,
                        notes)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    ON CONFLICT(name, models, dataset, kfold, inner_kfolds)
    DO UPDATE SET
        nd_structure = excluded.nd_structure,
        model_structure = excluded.model_structure,
        accuracy_score = excluded.accuracy_score,
        run_time_seconds = excluded.run_time_seconds,
        run_timestamp = excluded.run_timestamp,
        notes = excluded.notes
    WHERE excluded.run_timestamp > nd_model_registry.run_timestamp;
    """
    cursor.executemany(upsert_sql, all_dichotomies)

def insert_competitors(cursor, all_competitors):
    """Upsert result rows into ``competitor_model_registry``.

    Args:
        cursor: sqlite3 cursor used to run the statement; committing is left
            to the caller.
        all_competitors: iterable of 8-item rows in column order
            (name, model, dataset, kfold, accuracy, run_time_seconds,
            run_timestamp, notes).

    When a row with the same (name, model, dataset, kfold) key already exists,
    it is overwritten only if the incoming ``run_timestamp`` compares greater
    than the stored one; otherwise the stored row is kept.
    """
    upsert_sql = """
        INSERT INTO competitor_model_registry (
            name,
            model,
            dataset,
            kfold,
            accuracy,
            run_time_seconds,
            run_timestamp,
            notes)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
    ON CONFLICT(name, model, dataset, kfold)
    DO UPDATE SET
        accuracy=excluded.accuracy,
        run_time_seconds=excluded.run_time_seconds,
        run_timestamp = excluded.run_timestamp,
        notes = excluded.notes
    WHERE excluded.run_timestamp > competitor_model_registry.run_timestamp;
    """
    cursor.executemany(upsert_sql, all_competitors)

def insert_layer_by_layer(cursor, all_dichotomies):
    """Upsert result rows into ``layer_by_layer_model_registry``.

    Args:
        cursor: sqlite3 cursor used to run the statement; committing is left
            to the caller.
        all_dichotomies: iterable of 11-item rows in column order
            (name, models, dataset, kfold, nd_structure, model_structure,
            accuracy_score, run_time_seconds, inner_kfolds, run_timestamp, notes).

    When a row with the same (name, models, dataset, kfold, inner_kfolds) key
    already exists, it is overwritten only if the incoming ``run_timestamp``
    compares greater than the stored one; otherwise the stored row is kept.
    """
    upsert_sql = """
        INSERT INTO layer_by_layer_model_registry (
                        name,
                        models,
                        dataset,
                        kfold,
                        nd_structure,
                        model_structure,
                        accuracy_score,
                        run_time_seconds,
                        inner_kfolds,
                        run_timestamp,
                        notes)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    ON CONFLICT(name, models, dataset, kfold, inner_kfolds)
    DO UPDATE SET
        nd_structure = excluded.nd_structure,
        model_structure = excluded.model_structure,
        accuracy_score = excluded.accuracy_score,
        run_time_seconds = excluded.run_time_seconds,
        run_timestamp = excluded.run_timestamp,
        notes = excluded.notes
    WHERE excluded.run_timestamp > layer_by_layer_model_registry.run_timestamp;
    """
    cursor.executemany(upsert_sql, all_dichotomies)

def insert_all_trees(cursor, all_dichotomies):
    """Upsert result rows into ``all_trees_model_registry``.

    Args:
        cursor: sqlite3 cursor used to run the statement; committing is left
            to the caller.
        all_dichotomies: iterable of 11-item rows in column order
            (name, models, dataset, kfold, nd_structure, model_structure,
            accuracy_score, run_time_seconds, inner_kfolds, run_timestamp, notes).

    When a row with the same (name, models, dataset, kfold, inner_kfolds) key
    already exists, it is overwritten only if the incoming ``run_timestamp``
    compares greater than the stored one; otherwise the stored row is kept.
    """
    upsert_sql = """
        INSERT INTO all_trees_model_registry (
                        name,
                        models,
                        dataset,
                        kfold,
                        nd_structure,
                        model_structure,
                        accuracy_score,
                        run_time_seconds,
                        inner_kfolds,
                        run_timestamp,
                        notes)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    ON CONFLICT(name, models, dataset, kfold, inner_kfolds)
    DO UPDATE SET
        nd_structure = excluded.nd_structure,
        model_structure = excluded.model_structure,
        accuracy_score = excluded.accuracy_score,
        run_time_seconds = excluded.run_time_seconds,
        run_timestamp = excluded.run_timestamp,
        notes = excluded.notes
    WHERE excluded.run_timestamp > all_trees_model_registry.run_timestamp;
    """
    cursor.executemany(upsert_sql, all_dichotomies)

# Fill ND table results

There are 3 methods for finding NDs: 

1. ND-Stepwise Method
2. All Layer Traversal 
3. All Trees


1. ND-Stepwise Method

In [29]:
# Datasets whose logs are parsed in this section.
# 'letter_recognition' is deliberately left out of the first group.
first_dataset_group = [
    'car_evaluation',
    'mfeat-factors',
    'mfeat-fouriers',
    'mfeat-karhunen',
    'mfeat-morphological',
    'mfeat-pixel',
    'mfeat-zernlike',
    'optdigits',
    'pageblocks',
    'handwritten_digits',
    'satimage',
    'image_segment',
    'beans_data',
]

second_dataset_group = [
    "wine_quality",
    "academic_dropout",
    "maternal_health_risk",
    "rt_iot",
    "land_mines",
]

all_datasets = first_dataset_group + second_dataset_group

all_datasets

['car_evaluation',
 'mfeat-factors',
 'mfeat-fouriers',
 'mfeat-karhunen',
 'mfeat-morphological',
 'mfeat-pixel',
 'mfeat-zernlike',
 'optdigits',
 'pageblocks',
 'handwritten_digits',
 'satimage',
 'image_segment',
 'beans_data',
 'wine_quality',
 'academic_dropout',
 'maternal_health_risk',
 'rt_iot',
 'land_mines']

In [30]:
# Model combination whose logs are parsed by the next cell.
# Other values this cell has been switched to: "mlp", "svm".
models = "logisticregression_xgboostgpu_svm_knnhyper"
name_add = ""
includes_inner_kfolds = 0
notes = "Using 0.3 split as the test set in inner."

# The log folder is named after the combination with commas instead of underscores.
models_folder = models.replace('_', ',')
log_path = fr"C:\Users\maxdi\OneDrive\Documents\uni_honours_docs\getting_paper_ready\my models\{models_folder}\logs\outter_larger_split\*"
all_files = glob.glob(log_path)
if not all_files:
    print("FAILED: no models founds - check the models and log_path.")

In [32]:
# Parse the ND-stepwise logs of every dataset and upsert the per-fold results
# into nd_model_registry.
#
# For each dataset: read every matching log file (one JSON record per line),
# sort the records by timestamp, then look for the
# "Model diagram saved here: models/plot_" marker records. The values for a
# fold are read from records at fixed offsets around the marker. Rows are only
# written once the fold-5 marker has been seen.
to_be_added = []  # not read in this cell
new_kfold = 0
for dataset in all_datasets:
    found_flag = False
    data = []
    insert_data = []
    # print(f"Starting dataset: {dataset}")
    my_file = [file for file in all_files if dataset in file]
    new_kfold = 0

    for log_file_name in my_file:
        with open(log_file_name, 'r') as file:
            for line in file:
                try:
                    data.append(json.loads(line))
                except json.JSONDecodeError as e:
                    # Undecodable lines are reported and skipped.
                    print(f"Error decoding JSON: {e}")

    data = sorted(data, key=lambda x: datetime.strptime(x["timestamp"], "%Y-%m-%d %H:%M:%S"))

    for line_num, line in enumerate(data):
        message = line['message']
        # NOTE(review): an earlier check compared ("inner_kfolds" in message)
        # with includes_inner_kfolds but took no action on the result; it was a
        # no-op and is omitted. Confirm whether a real filter was intended.
        if dataset not in message:
            continue
        if models.lower() not in message.lower():
            continue
        if "Model diagram saved here: models/plot_" not in message:
            continue

        # Offsets read around the marker: -7 run time, -6 ND structure,
        # -5 models used, +1 accuracy. Without this guard a marker near the
        # start would silently read from the END of the list (negative index)
        # and a marker on the last record would raise IndexError.
        if line_num < 7 or line_num + 1 >= len(data):
            print(f"Skipping incomplete log entry for {dataset} at record {line_num}")
            continue

        name = name_add + dataset + "_" + models
        # The fold number is the single character just before the dataset name.
        kfold_num = message.find(dataset) - 1 - includes_inner_kfolds
        new_kfold = message[kfold_num]
        kfold = message[kfold_num]
        models_used = data[line_num-5]['message']
        nd_structure = data[line_num-6]['message']
        run_time_log = data[line_num-7]['message']
        # Keep only digits and dots from the run-time message.
        run_time = ''.join(c for c in run_time_log if c.isdigit() or c == '.').strip(".")
        accuracy = data[line_num+1]['message'].split(" ")[-1]
        timestamp = line["timestamp"]
        insert_string = [name, models, dataset, kfold, nd_structure, models_used, accuracy, run_time, includes_inner_kfolds, str(timestamp), notes]
        insert_data.append(insert_string)
        if str(kfold) == "5":
            print(f"✅ Found all fold scores for {dataset}!")
            insert_dichotomies(cursor, insert_data)
            found_flag = True
            break
    if not found_flag:
        print(f"❌ Failed to find all of dataset {dataset}")
    conn.commit()

Starting dataset: car_evaluation
✅ Found all fold scores!
Starting dataset: mfeat-factors
✅ Found all fold scores!
Starting dataset: mfeat-fouriers
✅ Found all fold scores!
Starting dataset: mfeat-karhunen
✅ Found all fold scores!
Starting dataset: mfeat-morphological
✅ Found all fold scores!
Starting dataset: mfeat-pixel
✅ Found all fold scores!
Starting dataset: mfeat-zernlike
✅ Found all fold scores!
Starting dataset: optdigits
✅ Found all fold scores!
Starting dataset: pageblocks
✅ Found all fold scores!
Starting dataset: handwritten_digits
✅ Found all fold scores!
Starting dataset: satimage
✅ Found all fold scores!
Starting dataset: image_segment
✅ Found all fold scores!
Starting dataset: beans_data
✅ Found all fold scores!
Starting dataset: wine_quality
✅ Found all fold scores!
Starting dataset: academic_dropout
✅ Found all fold scores!
Starting dataset: maternal_health_risk
✅ Found all fold scores!
Starting dataset: rt_iot
❌ Failed to find all of dataset rt_iot
Starting dataset:

2. All Layer Traversal


In [15]:
# Datasets whose logs are parsed in this section.
# 'letter_recognition' is deliberately left out of the first group.
first_dataset_group = [
    'car_evaluation',
    'mfeat-factors',
    'mfeat-fouriers',
    'mfeat-karhunen',
    'mfeat-morphological',
    'mfeat-pixel',
    'mfeat-zernlike',
    'optdigits',
    'pageblocks',
    'handwritten_digits',
    'satimage',
    'image_segment',
    'beans_data',
]

second_dataset_group = [
    "wine_quality",
    "academic_dropout",
    "maternal_health_risk",
    "rt_iot",
    "land_mines",
]

all_datasets = first_dataset_group + second_dataset_group

In [16]:
# Model combination whose logs are parsed by the next cell.
# Other value this cell has been switched to: "logisticregression_xgboostgpu_svm_knnhyper".
models = "svm"
# NOTE(review): this prefix has no trailing underscore, so stored names look
# like "layer_by_layercar_evaluation_svm". It is left as is because `name` is
# part of the table's primary key - confirm before changing.
name_add = "layer_by_layer"
includes_inner_kfolds = 0
notes = "Have iterated through every layer of the model."

# The log folder is named after the combination with commas instead of underscores.
models_folder = models.replace('_', ',')
log_path = fr"C:\Users\maxdi\OneDrive\Documents\uni_honours_docs\getting_paper_ready\my models\{models_folder}\logs\all_layers\*"
all_files = glob.glob(log_path)
if not all_files:
    print("FAILED: no models founds - check the models and log_path.")

In [17]:
# Parse the all-layer-traversal logs of every dataset and upsert the per-fold
# results into layer_by_layer_model_registry.
#
# For each dataset: read every matching log file (one JSON record per line),
# sort the records by timestamp, then look for the
# "Model diagram saved here: models/plot_" marker records. The values for a
# fold are read from records at fixed offsets around the marker. Rows are only
# written once the fold-5 marker has been seen.
to_be_added = []  # not read in this cell
new_kfold = 0
for dataset in all_datasets:
    found_flag = False
    data = []
    insert_data = []
    # print(f"Starting dataset: {dataset}")
    my_file = [file for file in all_files if dataset in file]
    new_kfold = 0

    for log_file_name in my_file:
        with open(log_file_name, 'r') as file:
            for line in file:
                try:
                    data.append(json.loads(line))
                except json.JSONDecodeError as e:
                    # Undecodable lines are reported and skipped.
                    print(f"Error decoding JSON: {e}")

    data = sorted(data, key=lambda x: datetime.strptime(x["timestamp"], "%Y-%m-%d %H:%M:%S"))

    for line_num, line in enumerate(data):
        message = line['message']
        # NOTE(review): an earlier check compared ("inner_kfolds" in message)
        # with includes_inner_kfolds but took no action on the result; it was a
        # no-op and is omitted. Confirm whether a real filter was intended.
        if dataset not in message:
            continue
        if models.lower() not in message.lower():
            continue
        if "Model diagram saved here: models/plot_" not in message:
            continue

        # Offsets read around the marker: -7 run time, -6 ND structure,
        # -5 models used, +1 accuracy. Without this guard a marker near the
        # start would silently read from the END of the list (negative index)
        # and a marker on the last record would raise IndexError.
        if line_num < 7 or line_num + 1 >= len(data):
            print(f"Skipping incomplete log entry for {dataset} at record {line_num}")
            continue

        name = name_add + dataset + "_" + models
        # The fold number is the single character just before the dataset name.
        kfold_num = message.find(dataset) - 1 - includes_inner_kfolds
        new_kfold = message[kfold_num]
        kfold = message[kfold_num]
        models_used = data[line_num-5]['message']
        nd_structure = data[line_num-6]['message']
        run_time_log = data[line_num-7]['message']
        # Keep only digits and dots from the run-time message.
        run_time = ''.join(c for c in run_time_log if c.isdigit() or c == '.').strip(".")
        accuracy = data[line_num+1]['message'].split(" ")[-1]
        timestamp = line["timestamp"]
        insert_string = [name, models, dataset, kfold, nd_structure, models_used, accuracy, run_time, includes_inner_kfolds, str(timestamp), notes]
        insert_data.append(insert_string)
        if str(kfold) == "5":
            print(f"✅ Found all fold scores for {dataset}!")
            insert_layer_by_layer(cursor, insert_data)
            found_flag = True
            break

    if not found_flag:
        # new_kfold holds the last fold character seen (0 when none was found).
        print(f"❌ Failed to find all of dataset {dataset}. Found {new_kfold} folds.")

    conn.commit()

✅ Found all fold scores for car_evaluation!
✅ Found all fold scores for mfeat-factors!
✅ Found all fold scores for mfeat-fouriers!
✅ Found all fold scores for mfeat-karhunen!
✅ Found all fold scores for mfeat-morphological!
✅ Found all fold scores for mfeat-pixel!
✅ Found all fold scores for mfeat-zernlike!
✅ Found all fold scores for optdigits!
✅ Found all fold scores for pageblocks!
✅ Found all fold scores for handwritten_digits!
✅ Found all fold scores for satimage!
✅ Found all fold scores for image_segment!
✅ Found all fold scores for beans_data!
✅ Found all fold scores for wine_quality!
✅ Found all fold scores for academic_dropout!
✅ Found all fold scores for maternal_health_risk!
❌ Failed to find all of dataset rt_iot. Found 0 folds.
✅ Found all fold scores for land_mines!


3. All Trees


In [59]:
# Datasets whose logs are parsed in this section.
# 'letter_recognition' is deliberately left out of the first group.
first_dataset_group = [
    'car_evaluation',
    'mfeat-factors',
    'mfeat-fouriers',
    'mfeat-karhunen',
    'mfeat-morphological',
    'mfeat-pixel',
    'mfeat-zernlike',
    'optdigits',
    'pageblocks',
    'handwritten_digits',
    'satimage',
    'image_segment',
    'beans_data',
]

second_dataset_group = [
    "wine_quality",
    "academic_dropout",
    "maternal_health_risk",
    "rt_iot",
    "land_mines",
]

all_datasets = first_dataset_group + second_dataset_group

In [63]:
# Model combination whose logs are parsed by the next cell.
# Alternative: models = "logisticregression_xgboostgpu_svm_knnhyper"
models = "svm"
name_add = "all_trees_"
includes_inner_kfolds = 0
notes = "Run all possible trees"

# The log folder is named after the combination with commas instead of underscores.
models_folder = models.replace('_', ',')
log_path = fr"C:\Users\maxdi\OneDrive\Documents\uni_honours_docs\getting_paper_ready\my models\{models_folder}\logs\all_trees\*"
all_files = glob.glob(log_path)
if not all_files:
    print("FAILED: no models founds - check the models and log_path.")

In [64]:
# Parse the all-trees logs of every dataset and upsert the per-fold results
# into all_trees_model_registry.
#
# For each dataset: read every matching log file (one JSON record per line),
# sort the records by timestamp, then look for the
# "Model diagram saved here: models/plot_" marker records. The values for a
# fold are read from records at fixed offsets around the marker. Rows are only
# written once the fold-5 marker has been seen.
to_be_added = []  # not read in this cell
new_kfold = 0
for dataset in all_datasets:
    found_flag = False
    data = []
    insert_data = []
    # print(f"Starting dataset: {dataset}")
    my_file = [file for file in all_files if dataset in file]
    new_kfold = 0

    for log_file_name in my_file:
        with open(log_file_name, 'r') as file:
            for line in file:
                try:
                    data.append(json.loads(line))
                except json.JSONDecodeError as e:
                    # Undecodable lines are reported and skipped.
                    print(f"Error decoding JSON: {e}")

    data = sorted(data, key=lambda x: datetime.strptime(x["timestamp"], "%Y-%m-%d %H:%M:%S"))

    for line_num, line in enumerate(data):
        message = line['message']
        # NOTE(review): an earlier check compared ("inner_kfolds" in message)
        # with includes_inner_kfolds but took no action on the result; it was a
        # no-op and is omitted. Confirm whether a real filter was intended.
        if dataset not in message:
            continue
        if models.lower() not in message.lower():
            continue
        if "Model diagram saved here: models/plot_" not in message:
            continue

        # Offsets read around the marker: -7 run time, -6 ND structure,
        # -5 models used, +1 accuracy. Without this guard a marker near the
        # start would silently read from the END of the list (negative index)
        # and a marker on the last record would raise IndexError.
        if line_num < 7 or line_num + 1 >= len(data):
            print(f"Skipping incomplete log entry for {dataset} at record {line_num}")
            continue

        name = name_add + dataset + "_" + models
        # In these logs the fold character sits one position further left of
        # the dataset name than in the other two parsers (extra "- 1").
        kfold_num = message.find(dataset) - 1 - includes_inner_kfolds - 1
        new_kfold = message[kfold_num]
        kfold = message[kfold_num]
        models_used = data[line_num-5]['message']
        nd_structure = data[line_num-6]['message']
        run_time_log = data[line_num-7]['message']
        # Keep only digits and dots from the run-time message.
        run_time = ''.join(c for c in run_time_log if c.isdigit() or c == '.').strip(".")
        accuracy = data[line_num+1]['message'].split(" ")[-1]
        timestamp = line["timestamp"]
        insert_string = [name, models, dataset, kfold, nd_structure, models_used, accuracy, run_time, includes_inner_kfolds, str(timestamp), notes]
        insert_data.append(insert_string)
        if str(kfold) == "5":
            print(f"✅ Found all fold scores for {dataset}!")
            insert_all_trees(cursor, insert_data)
            found_flag = True
            break
    if not found_flag:
        print(f"❌ FAILED to find all of dataset {dataset}")
    conn.commit()

✅ Found all fold scores for car_evaluation!
❌ FAILED to find all of dataset mfeat-factors
❌ FAILED to find all of dataset mfeat-fouriers
❌ FAILED to find all of dataset mfeat-karhunen
❌ FAILED to find all of dataset mfeat-morphological
❌ FAILED to find all of dataset mfeat-pixel
❌ FAILED to find all of dataset mfeat-zernlike
❌ FAILED to find all of dataset optdigits
❌ FAILED to find all of dataset pageblocks
❌ FAILED to find all of dataset handwritten_digits
❌ FAILED to find all of dataset satimage
❌ FAILED to find all of dataset image_segment
❌ FAILED to find all of dataset beans_data
❌ FAILED to find all of dataset wine_quality
✅ Found all fold scores for academic_dropout!
✅ Found all fold scores for maternal_health_risk!
❌ FAILED to find all of dataset rt_iot
❌ FAILED to find all of dataset land_mines


# Fill Competitor

In [39]:
# Competitor columns kept from the spreadsheet.
competitor_models = [
    'Multinomial', 'Random Forest', 'KNN', 'SVM OVO', 'SVM OVR',
    'LDA', 'Xgboost OVO', 'Xgboost OVR', 'Multilayer Perceptron',
]
file_path = r"C:\Users\maxdi\OneDrive\Documents\uni_honours_docs\getting_paper_ready\Outer_kfolds_results_for_paper.xlsx"

# Read one sheet per metric; the "Accuracy" / "Timing" column is renamed to
# "name" so both frames share the same key column.
df_accuracy = pd.read_excel(file_path, sheet_name='Competitors Accuracy')
df_accuracy = df_accuracy.rename(columns={"Accuracy": "name"})
df_timing = pd.read_excel(file_path, sheet_name='Competitors Timing')
df_timing = df_timing.rename(columns={"Timing": "name"})

# Keep only the rows whose name contains "_fold", and only the competitor columns.
kept_columns = ["name"] + competitor_models
accuracy_is_fold_row = df_accuracy['name'].str.contains("_fold")
df_accuracy = df_accuracy[accuracy_is_fold_row][kept_columns]
timing_is_fold_row = df_timing['name'].str.contains("_fold")
df_timing = df_timing[timing_is_fold_row][kept_columns]

df_accuracy

Unnamed: 0,name,Multinomial,Random Forest,KNN,SVM OVO,SVM OVR,LDA,Xgboost OVO,Xgboost OVR,Multilayer Perceptron
1,letter_fold1,0.767000,0.958500,0.954750,0.953750,0.924750,0.693000,0.947250,0.962750,0.948750
2,letter_fold2,0.779000,0.961250,0.958500,0.958000,0.930500,0.710250,0.951000,0.965250,0.952500
3,letter_fold3,0.769500,0.968000,0.959250,0.959250,0.933250,0.697000,0.950500,0.966750,0.955250
4,letter_fold4,0.773750,0.961500,0.956750,0.953000,0.926000,0.706000,0.947750,0.964250,0.951500
5,letter_fold5,0.778000,0.968000,0.956750,0.958750,0.929500,0.701000,0.948250,0.967500,0.955750
...,...,...,...,...,...,...,...,...,...,...
79,car-evaluation_fold1,0.907514,0.923981,0.901734,0.973988,0.956647,0.898844,0.991329,0.994220,0.994220
80,car-evaluation_fold2,0.927746,0.915503,0.895954,0.976879,0.947977,0.916185,0.991329,0.985549,0.991329
81,car-evaluation_fold3,0.916185,0.934607,0.878613,0.982659,0.950867,0.875723,0.997110,0.997110,0.994220
82,car-evaluation_fold4,0.907246,0.930566,0.907246,0.956522,0.942029,0.886957,0.991304,0.988406,1.000000


In [40]:
# Melt dataframes to long format
accuracy_long = df_accuracy.melt(id_vars=["name"], var_name="model", value_name="accuracy")
timing_long = df_timing.melt(id_vars=["name"], var_name="model", value_name="run_time_seconds")

# Merge the two dataframes on 'name' and 'dataset'
combined = pd.merge(accuracy_long, timing_long, on=["name", "model"])

# Extract the kfold number from the 'name' column
combined["kfold"] = combined["name"].str.extract(r"fold(\d+)").astype(int)
combined["dataset"] = combined["name"].str.replace(r"_fold\d+", "", regex=True)
combined["name"] = combined["name"].str.replace(r"_fold\d+", "", regex=True) + "_" + combined["model"]
combined["notes"] = ""
combined["run_timestamp"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

# Final schema
combined = combined[["name", "model", "dataset", "kfold", "accuracy", "run_time_seconds", "run_timestamp", "notes"]]

In [195]:
# Turn the frame into a list of row lists and upsert it into competitor_model_registry.
all_competitors_data = combined.values.tolist()
insert_competitors(cursor, all_competitors_data)
conn.commit()


If you want to rerun a competitor model or add another competitor, use the code below:

In [58]:
# Running competitor model over all datasets

from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_curve, ConfusionMatrixDisplay, auc, roc_auc_score, f1_score, confusion_matrix

from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
import time
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm

from sklearn.multiclass import OneVsOneClassifier
from sklearn.multiclass import OneVsRestClassifier
import xgboost as xgb

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=69)

# REMEMEBR TO CHANGE MODEL NAME
# ❌ ❌ ❌ ❌ ❌ ❌ ❌ ❌ ❌ ❌ ❌ ❌ ❌ ❌ ❌ ❌ ❌ ❌ ❌ ❌ ❌ ❌ 
model_name = "Multilayer Perceptron"
# ❌ ❌ ❌ ❌ ❌ ❌ ❌ ❌ ❌ ❌ ❌ ❌ ❌ ❌ ❌ ❌ ❌ ❌ ❌ ❌ ❌ ❌ 

notes = ""

files = [
    'letter_recognition.csv',
    'mfeat-factors.csv',
    'mfeat-fouriers.csv',
    'mfeat-karhunen.csv',
    'mfeat-morphological.csv',
    'mfeat-pixel.csv',
    'mfeat-zernlike.csv',
    'optdigits.csv',
    'pageblocks.csv',
    'handwritten_digits.csv',
    'satimage.csv',
    'image_segment.csv',
    'beans_data.csv',
    'car_evaluation.csv',
]
all_datasets = [
    'letter_recognition',
    'mfeat-factors',
    'mfeat-fouriers',
    'mfeat-karhunen',
    'mfeat-morphological',
    'mfeat-pixel',
    'mfeat-zernlike',
    'optdigits',
    'pageblocks',
    'handwritten_digits',
    'satimage',
    'image_segment',
    'beans_data',
    'car_evaluation',
]

# New files
files = [
    "wine_quality.csv",
    "academic_dropout.csv",
    "maternal_health_risk.csv",
    "rt_iot.csv",
    "land_mines.csv"
]

all_datasets = [
    "wine_quality",
    "academic_dropout",
    "maternal_health_risk",
    "rt_iot",
    "land_mines"
]

for index, file in enumerate(files):
    dataset = file
    dataset_location = "../data/" + dataset

    df = pd.read_csv(dataset_location)
    df.drop(df.columns[0], axis=1, inplace=True)
    Y = df['Y']
    df_x = df.drop('Y', axis=1)
    categories = tuple(df['Y'].unique())
    accuracy_all = []
    time_all = []
    insert_data = []

    for fold, (train_index, test_index) in enumerate(cv.split(df_x, Y)):
        start = time.perf_counter()
        X_train, X_test = df_x.iloc[train_index], df_x.iloc[test_index]
        y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]

        #CHANGE MODEL HERE TO WHATEVER YOU NEED
        categories = tuple(Y.unique())
        model = make_pipeline(StandardScaler(), MLPClassifier(max_iter = 400))
        # model = OneVsOneClassifier(model)

        # model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
        # model = LinearDiscriminantAnalysis()

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        print(f'Accuracy of {fold+1}: {accuracy:.5f}')
        accuracy_all.append(accuracy)
        # print(classification_report(y_test, y_pred))
        timer = round(time.perf_counter()-start,3)
        # print(f"Time of {fold+1}: {timer}")
        time_all.append(timer)
        row_of_data = [all_datasets[index] + "_" + model_name, model_name, all_datasets[index], fold+1, accuracy, timer, datetime.now().strftime("%Y-%m-%d %H:%M:%S"), notes]
        insert_data.append(row_of_data)

    insert_competitors(cursor, insert_data)
    print(insert_data)
    print(file)
    print(accuracy_all)
    print(time_all)

conn.commit()



Accuracy of 1: 0.57615




Accuracy of 2: 0.57077
Accuracy of 3: 0.57968




Accuracy of 4: 0.56736




Accuracy of 5: 0.57506
[['wine_quality_Multilayer Perceptron', 'Multilayer Perceptron', 'wine_quality', 1, 0.5761538461538461, 9.256, '2025-02-10 15:33:52', ''], ['wine_quality_Multilayer Perceptron', 'Multilayer Perceptron', 'wine_quality', 2, 0.5707692307692308, 8.397, '2025-02-10 15:34:00', ''], ['wine_quality_Multilayer Perceptron', 'Multilayer Perceptron', 'wine_quality', 3, 0.5796766743648961, 8.169, '2025-02-10 15:34:08', ''], ['wine_quality_Multilayer Perceptron', 'Multilayer Perceptron', 'wine_quality', 4, 0.567359507313318, 9.403, '2025-02-10 15:34:17', ''], ['wine_quality_Multilayer Perceptron', 'Multilayer Perceptron', 'wine_quality', 5, 0.5750577367205543, 9.224, '2025-02-10 15:34:27', '']]
wine_quality.csv
[0.5761538461538461, 0.5707692307692308, 0.5796766743648961, 0.567359507313318, 0.5750577367205543]
[9.256, 8.397, 8.169, 9.403, 9.224]




Accuracy of 1: 0.74011




Accuracy of 2: 0.74915




Accuracy of 3: 0.72429




Accuracy of 4: 0.74689




Accuracy of 5: 0.70249
[['academic_dropout_Multilayer Perceptron', 'Multilayer Perceptron', 'academic_dropout', 1, 0.7401129943502824, 10.675, '2025-02-10 15:34:37', ''], ['academic_dropout_Multilayer Perceptron', 'Multilayer Perceptron', 'academic_dropout', 2, 0.7491525423728813, 10.488, '2025-02-10 15:34:48', ''], ['academic_dropout_Multilayer Perceptron', 'Multilayer Perceptron', 'academic_dropout', 3, 0.7242937853107345, 10.839, '2025-02-10 15:34:59', ''], ['academic_dropout_Multilayer Perceptron', 'Multilayer Perceptron', 'academic_dropout', 4, 0.7468926553672316, 10.492, '2025-02-10 15:35:09', ''], ['academic_dropout_Multilayer Perceptron', 'Multilayer Perceptron', 'academic_dropout', 5, 0.7024886877828054, 10.421, '2025-02-10 15:35:20', '']]
academic_dropout.csv
[0.7401129943502824, 0.7491525423728813, 0.7242937853107345, 0.7468926553672316, 0.7024886877828054]
[10.675, 10.488, 10.839, 10.492, 10.421]
Accuracy of 1: 0.73892




Accuracy of 2: 0.69951
Accuracy of 3: 0.66502




Accuracy of 4: 0.68966
Accuracy of 5: 0.66337
[['maternal_health_risk_Multilayer Perceptron', 'Multilayer Perceptron', 'maternal_health_risk', 1, 0.7389162561576355, 1.556, '2025-02-10 15:35:21', ''], ['maternal_health_risk_Multilayer Perceptron', 'Multilayer Perceptron', 'maternal_health_risk', 2, 0.6995073891625616, 1.315, '2025-02-10 15:35:23', ''], ['maternal_health_risk_Multilayer Perceptron', 'Multilayer Perceptron', 'maternal_health_risk', 3, 0.6650246305418719, 1.15, '2025-02-10 15:35:24', ''], ['maternal_health_risk_Multilayer Perceptron', 'Multilayer Perceptron', 'maternal_health_risk', 4, 0.6896551724137931, 1.589, '2025-02-10 15:35:25', ''], ['maternal_health_risk_Multilayer Perceptron', 'Multilayer Perceptron', 'maternal_health_risk', 5, 0.6633663366336634, 1.157, '2025-02-10 15:35:27', '']]
maternal_health_risk.csv
[0.7389162561576355, 0.6995073891625616, 0.6650246305418719, 0.6896551724137931, 0.6633663366336634]
[1.556, 1.315, 1.15, 1.589, 1.157]
Accuracy of 1: 0.99622




Accuracy of 1: 0.47059




Accuracy of 2: 0.51471




Accuracy of 3: 0.51471




Accuracy of 4: 0.52239
Accuracy of 5: 0.53731
[['land_mines_Multilayer Perceptron', 'Multilayer Perceptron', 'land_mines', 1, 0.47058823529411764, 0.612, '2025-02-10 15:42:41', ''], ['land_mines_Multilayer Perceptron', 'Multilayer Perceptron', 'land_mines', 2, 0.5147058823529411, 0.558, '2025-02-10 15:42:42', ''], ['land_mines_Multilayer Perceptron', 'Multilayer Perceptron', 'land_mines', 3, 0.5147058823529411, 0.548, '2025-02-10 15:42:42', ''], ['land_mines_Multilayer Perceptron', 'Multilayer Perceptron', 'land_mines', 4, 0.5223880597014925, 0.577, '2025-02-10 15:42:43', ''], ['land_mines_Multilayer Perceptron', 'Multilayer Perceptron', 'land_mines', 5, 0.5373134328358209, 0.544, '2025-02-10 15:42:43', '']]
land_mines.csv
[0.47058823529411764, 0.5147058823529411, 0.5147058823529411, 0.5223880597014925, 0.5373134328358209]
[0.612, 0.558, 0.548, 0.577, 0.544]




In [55]:
# Upsert and print whatever the most recent competitor run left in
# insert_data / file / accuracy_all / time_all, then commit.
insert_competitors(cursor, insert_data)
for last_run_value in (insert_data, file, accuracy_all, time_all):
    print(last_run_value)

conn.commit()

[['rt_iot_Xgboost OVR', 'Xgboost OVR', 'rt_iot', 1, 0.9982537361923327, 29.499, '2025-02-10 13:08:27', ''], ['rt_iot_Xgboost OVR', 'Xgboost OVR', 'rt_iot', 2, 0.9985786224821313, 29.877, '2025-02-10 13:08:57', ''], ['rt_iot_Xgboost OVR', 'Xgboost OVR', 'rt_iot', 3, 0.9986191771920562, 30.373, '2025-02-10 13:09:28', ''], ['rt_iot_Xgboost OVR', 'Xgboost OVR', 'rt_iot', 4, 0.998822239369695, 29.904, '2025-02-10 13:09:58', ''], ['rt_iot_Xgboost OVR', 'Xgboost OVR', 'rt_iot', 5, 0.9984161150144174, 29.529, '2025-02-10 13:10:27', '']]
rt_iot.csv
[0.9982537361923327, 0.9985786224821313, 0.9986191771920562, 0.998822239369695, 0.9984161150144174]
[29.499, 29.877, 30.373, 29.904, 29.529]


In [197]:
# Rename dataset labels in competitor_model_registry with a single
# UPDATE ... CASE statement. Datasets that are not listed keep their current
# value through the ELSE branch.
# The SQL is held here as a plain string literal; as written, this cell does
# not pass it to cursor.execute().
# NOTE(review): presumably the target labels mirror the names used in the nd
# registries (e.g. 'mfeat-zernlike', 'mfeat-fouriers') - confirm those
# spellings are intentional.
"""
UPDATE competitor_model_registry
SET dataset = CASE
    WHEN dataset = 'car-evaluation' THEN 'car_evaluation'
    WHEN dataset = 'Beans' THEN 'beans_data'
    WHEN dataset = 'mfeat-zernike' THEN 'mfeat-zernlike'
    WHEN dataset = 'page-blocks' THEN 'pageblocks'
    WHEN dataset = 'pendigits' THEN 'handwritten_digits'
    WHEN dataset = 'segment' THEN 'image_segment'
    WHEN dataset = 'mfeat-fourier' THEN 'mfeat-fouriers'
    -- Add more conditions as needed
    ELSE dataset -- Keeps the value unchanged for rows not matching any condition
END;

"""

<sqlite3.Cursor at 0x1ae1b7b52c0>

In [198]:
# Release the database connection held in `conn`.
conn.close()

# Compare results

Now I can compare the performance of all my models against each other.

In [None]:
# We won or lost
#
# Reference SQL (kept as a string literal; this cell does not execute it).
#   top_nds    - per (dataset, kfold), the highest accuracy_score among
#                nd_model_registry rows whose name starts with 'new_split'.
#   best_comps - per (dataset, kfold), the highest accuracy in
#                competitor_model_registry.
# The two are joined on dataset and kfold and labelled 'DRAW' when the
# accuracies differ by less than 0.001, 'WE WON' when the nd accuracy is
# higher, and 'WE LOST' otherwise.
# The non-aggregated columns (name, model, ...) rely on SQLite's bare-column
# rule for MAX(): they are taken from the row that holds the maximum.
# The nd name column is aliased best_nd_name so it no longer shares the
# best_nd alias with the nd accuracy column.
"""
WITH top_nds as (
	SELECT 
		name,
		models,
		dataset,
		kfold,
		nd_structure,
		model_structure,
		MAX(accuracy_score) as best_accuracy,
		run_time_seconds,
		inner_kfolds
	FROM
		nd_model_registry
    WHERE name LIKE 'new_split%'
	GROUP BY dataset, kfold
),
best_comps as (
	SELECT
		model,
		dataset,
		kfold,
		MAX(accuracy) as best_accuracy,
		run_time_seconds
	FROM 
		competitor_model_registry
	GROUP BY dataset, kfold
)
SELECT 
	best_comps.dataset,
	best_comps.kfold, 
	best_comps.model as best_competitor, 
	best_comps.best_accuracy as best_comp,
	top_nds.best_accuracy as best_nd,
	top_nds.name as best_nd_name,
	CASE 
		WHEN abs(top_nds.best_accuracy - best_comps.best_accuracy) < 0.001 THEN 'DRAW'
		WHEN top_nds.best_accuracy > best_comps.best_accuracy THEN 'WE WON'
		ELSE 'WE LOST'
	END AS who_won
FROM 
	best_comps
INNER JOIN top_nds ON 
	best_comps.dataset = top_nds.dataset
	AND best_comps.kfold = top_nds.kfold

"""

In [None]:
# TOP RANKED PER KFOLD
#
# Reference SQL (kept as a string literal; this cell does not execute it).
#   best_nd_model    - per (dataset, kfold), the highest accuracy_score in
#                      nd_model_registry, tagged who_ran = 'nd'.
#   competitors      - every competitor_model_registry row, tagged
#                      who_ran = 'competitor'.
#   all_together_now - the two sets stacked with UNION ALL.
#   all_with_rank    - RANK() by accuracy (descending) within each
#                      (dataset, kfold).
# Only the 'nd' rows are returned, so `rank` shows where the best nd model
# placed among the competitors for that fold.
# NOTE(review): `name not like 'z'` has no wildcard, so it only excludes a
# name that is exactly 'z' - confirm whether 'z%' was intended.
"""
WITH best_nd_model AS (
	SELECT 
		'nd' as who_ran,
		dataset,
		kfold,
		name AS name,
		models AS models,
		MAX(accuracy_score) AS accuracy
	FROM nd_model_registry
	WHERE name not like 'z'
    GROUP BY dataset, kfold
    
),
competitors AS (
	SELECT 
		'competitor' as who_ran,
		dataset,
		kfold,
		name AS name,
		model AS models,
		accuracy
	FROM competitor_model_registry
),
all_together_now AS (
	SELECT 
		*
	FROM 
		best_nd_model nd
	UNION ALL 
	SELECT 
		* 
	FROM competitors
),
all_with_rank AS (
	SELECT 
		who_ran,
		dataset,
		kfold,
		name,
		models,
		accuracy,
		RANK() OVER (PARTITION BY dataset, kfold ORDER BY accuracy DESC) AS rank
	FROM 
		all_together_now
)
SELECT 
	* 
FROM 
	all_with_rank
WHERE 
	who_ran = 'nd'
"""

In [None]:
# TOP RANKED PER DATASET
#
# Reference SQL (kept as a string literal; this cell does not execute it).
# Reads from `average_per_dataset`, which is not defined in this part of the
# notebook.
# NOTE(review): presumably it holds one averaged accuracy per
# (dataset, name, model), with comp_type = 'mine' for nd runs and 'comp' for
# competitors - confirm against its definition.
#   best_nd_model - per dataset, the highest average_accuracy among the
#                   comp_type = 'mine' rows.
#   competitors   - every comp_type = 'comp' row.
# Both sets are stacked, ranked by accuracy (descending) within each dataset,
# and only the 'nd' rows are returned with their rank.
"""
WITH best_nd_model AS (
	SELECT 
		'nd' as who_ran,
		dataset,
		name AS name,
		model AS models,
		MAX(average_accuracy) AS accuracy
	FROM average_per_dataset
	WHERE 
		comp_type = 'mine' 
	GROUP BY dataset
),
competitors AS (
	SELECT 
		'competitor' as who_ran,
		dataset,
		name AS name,
		model AS models,
		average_accuracy AS accuracy
	FROM average_per_dataset
    WHERE comp_type = 'comp'
),
all_together_now AS (
	SELECT 
		*
	FROM 
		best_nd_model nd
	UNION ALL 
	SELECT 
		* 
	FROM competitors
),
all_with_rank AS (
	SELECT 
		who_ran,
		dataset,
		name,
		models,
		accuracy,
		RANK() OVER (PARTITION BY dataset ORDER BY accuracy DESC) AS rank
	FROM 
		all_together_now
)
SELECT 
	dataset,
	rank,
	name,
	models,
	accuracy
FROM 
	all_with_rank
WHERE 
	who_ran = 'nd'
"""

In [None]:
# LR VS MULTINOMIAL
#
# Reference SQL (kept as a string literal; this cell does not execute it).
# Same shape as the per-dataset ranking query, narrowed to one pairing:
#   best_nd_model - per dataset, the highest average_accuracy among
#                   comp_type = 'mine' rows with model = 'logisticregression'.
#   competitors   - comp_type = 'comp' rows with model = 'Multinomial'.
# Both sets are stacked, ranked by accuracy (descending) within each dataset,
# and only the 'nd' rows are returned with their rank.
# NOTE(review): `average_per_dataset` is not defined in this part of the
# notebook - confirm its columns against its definition.

"""
WITH best_nd_model AS (
	SELECT 
		'nd' as who_ran,
		dataset,
		name AS name,
		model AS models,
		MAX(average_accuracy) AS accuracy
	FROM average_per_dataset
	WHERE 
		comp_type = 'mine' 
		and model = 'logisticregression'
	GROUP BY dataset
),
competitors AS (
	SELECT 
		'competitor' as who_ran,
		dataset,
		name AS name,
		model AS models,
		average_accuracy AS accuracy
	FROM average_per_dataset
    WHERE comp_type = 'comp'
	and model = 'Multinomial'
),
all_together_now AS (
	SELECT
		*
	FROM 
		best_nd_model nd
	UNION ALL
	SELECT 
		* 
	FROM competitors
),
all_with_rank AS (
	SELECT 
		who_ran,
		dataset,
		name,
		models,
		accuracy,
		RANK() OVER (PARTITION BY dataset ORDER BY accuracy DESC) AS rank
	FROM 
		all_together_now
)
SELECT 
	dataset,
	rank,
	name,
	models,
	accuracy
FROM 
	all_with_rank
WHERE 
	who_ran = 'nd'
"""

In [60]:
import sqlite3

# (Re)create `aggregate_view`, a UNION ALL over the three registry tables.
# The tables are declared with the same columns in the same order, which is
# what makes the positional `SELECT *` union valid.
tables = ["nd_model_registry", "layer_by_layer_model_registry", "all_trees_model_registry"]

# Build the view body from `tables` so that adding a registry only means
# extending the list above.
union_sql = "\nUNION ALL\n".join(f"SELECT * FROM {table}" for table in tables)
query = f"CREATE VIEW aggregate_view AS\n{union_sql};"

conn = sqlite3.connect("results.db")
try:
    cursor = conn.cursor()
    cursor.execute("DROP VIEW IF EXISTS aggregate_view")  # Ensure no duplicate views
    cursor.execute(query)
    # Commit explicitly so the view is persisted regardless of the
    # connection's transaction mode.
    conn.commit()
finally:
    # Always release the database file, even if the DDL above fails.
    conn.close()


In [119]:
# Hosted db

# pip install sqlitecloud

import os

import sqlitecloud

# The SQLite Cloud connection string embeds an API key, so it is read from the
# environment rather than stored in the notebook. Expected form:
#   sqlitecloud://<host>:<port>/results.db?apikey=<key>
# NOTE(review): the key that used to be hard-coded in this cell should be
# treated as exposed and rotated.
connection_string = os.environ.get("SQLITECLOUD_CONNECTION_STRING")
if not connection_string:
    raise RuntimeError(
        "Set the SQLITECLOUD_CONNECTION_STRING environment variable to the "
        "SQLite Cloud connection string before running this cell."
    )

# Open the connection to SQLite Cloud
conn = sqlitecloud.connect(connection_string)
try:
    cursor = conn.execute('SELECT * FROM aggregate_view')
    result = cursor.fetchall()
finally:
    # Close the remote connection even if the query fails.
    conn.close()

print(result)

[('car_evaluation_logisticregression_xgboostgpu_svm_knnhyper', 'logisticregression_xgboostgpu_svm_knnhyper', 'car_evaluation', 1, '[((3,), (2, 0, 1)), ((2,), (0, 1)), ((0,), (1,))]', "['logisticregression', 'xgboostgpu', 'logisticregression']", 0.913294797687861, 1126.497, 0, '2025-01-20 14:37:48', ''), ('car_evaluation_logisticregression_xgboostgpu_svm_knnhyper', 'logisticregression_xgboostgpu_svm_knnhyper', 'car_evaluation', 2, '[((3,), (2, 0, 1)), ((2,), (0, 1)), ((0,), (1,))]', "['xgboostgpu', 'xgboostgpu', 'xgboostgpu']", 0.982658959537572, 2131.257, 0, '2025-01-20 14:40:03', ''), ('car_evaluation_logisticregression_xgboostgpu_svm_knnhyper', 'logisticregression_xgboostgpu_svm_knnhyper', 'car_evaluation', 3, '[((2,), (0, 3, 1)), ((0,), (3, 1)), ((3,), (1,))]', "['xgboostgpu', 'knnhyper', 'xgboostgpu']", 0.985549132947977, 3108.88, 0, '2025-01-20 14:41:57', ''), ('car_evaluation_logisticregression_xgboostgpu_svm_knnhyper', 'logisticregression_xgboostgpu_svm_knnhyper', 'car_evaluatio

In [None]:
# Pull rank all into excel for Adriano
#
# Runs every SQL script in sql_scripts/ (except the "select_model_rank" one)
# against the results database, writes each result set to its own sheet of a
# single workbook, then reopens the workbook to bold the header row and size
# the columns to their content.
import sqlite3
import pandas as pd
import glob
from pathlib import Path
from openpyxl import load_workbook
from openpyxl.utils import get_column_letter
from openpyxl.styles import Font

# Define database and output file
db_path = "results.db"  # Change this to your SQLite database
output_file = "rank_results_for_adriano.xlsx"

# Define the special file and its parameter sets
# NOTE(review): this path is never read below, and its directory
# ("/sql_script/") differs from the "sql_scripts/" folder that is globbed -
# confirm whether it is still needed.
special_file = "/sql_script/select_model_rank_per_dataset.txt"  # Change as needed

# Find all SQL script files
sql_files = glob.glob("sql_scripts/*.txt")  # Change path if needed
print(sql_files)

# The "select_model_rank" script is skipped here.
sql_files = [file for file in sql_files if "select_model_rank" not in file]

pd.set_option("display.max_columns", None)  # Show all columns
pd.set_option("display.width", 200)  # Increase width for better formatting

# Connect to SQLite
conn = sqlite3.connect(db_path)
try:
    # Create an Excel writer
    with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
        for file in sql_files:
            with open(file, "r", encoding="utf-8") as f:
                sql_query = f.read()

            # No parameters are substituted at present; the script runs as read.
            query_with_params = sql_query
            try:
                df = pd.read_sql_query(query_with_params, conn)
                # Path.stem drops the directory and the ".txt" suffix on every
                # platform; splitting on "\\" only worked for Windows paths and
                # left a "/" in the sheet name elsewhere.
                sheet_name = Path(file).stem[:31]  # Excel sheets have a 31-char limit
                df.to_excel(writer, sheet_name=sheet_name, index=False)
                print(f"Saved results from {file} to {sheet_name}")
            except Exception as e:
                # Best effort: report the failing script and carry on with the rest.
                print(f"Error executing {file}: {e}")
finally:
    # Close database connection, even if writing the workbook fails.
    conn.close()
print(f"Results saved in {output_file}")

wb = load_workbook(output_file)

for sheet in wb.sheetnames:
    ws = wb[sheet]

    # Apply bold font to headers
    for col in range(1, ws.max_column + 1):
        ws[f"{get_column_letter(col)}1"].font = Font(bold=True)

    # Auto-adjust column widths
    for col in ws.columns:
        col_letter = get_column_letter(col[0].column)  # Get column letter
        # Longest rendered value in the column; empty cells count as the text "None".
        max_length = max((len(str(cell.value)) for cell in col), default=0)
        ws.column_dimensions[col_letter].width = max_length + 2  # Adjust width

wb.save(output_file)
print(f"Formatted results saved in {output_file}")


['sql_scripts\\best_nd_rank_per_dataset.txt', 'sql_scripts\\best_nd_rank_per_kfold.txt', 'sql_scripts\\dataset_info.txt', 'sql_scripts\\logisticregression_vs_multinomial.txt', 'sql_scripts\\nd_knn_vs_knn.txt', 'sql_scripts\\select_model_rank_per_dataset.txt', 'sql_scripts\\svm_vs_OVO_OVR.txt', 'sql_scripts\\xgboost_vs_OVO_OVR.txt']
Saved results from sql_scripts\best_nd_rank_per_dataset.txt to best_nd_rank_per_dataset
Saved results from sql_scripts\best_nd_rank_per_kfold.txt to best_nd_rank_per_kfold
Saved results from sql_scripts\dataset_info.txt to dataset_info
Saved results from sql_scripts\logisticregression_vs_multinomial.txt to logisticregression_vs_multinomi
Saved results from sql_scripts\nd_knn_vs_knn.txt to nd_knn_vs_knn
Saved results from sql_scripts\svm_vs_OVO_OVR.txt to svm_vs_OVO_OVR
Saved results from sql_scripts\xgboost_vs_OVO_OVR.txt to xgboost_vs_OVO_OVR
Results saved in rank_results_for_adriano.xlsx


Exception ignored in: <function ZipFile.__del__ at 0x0000016F927FF520>
Traceback (most recent call last):
  File "C:\Users\maxdi\AppData\Local\Programs\Python\Python310\lib\zipfile.py", line 1808, in __del__
    self.close()
  File "C:\Users\maxdi\AppData\Local\Programs\Python\Python310\lib\zipfile.py", line 1825, in close
    self.fp.seek(self.start_dir)
ValueError: seek of closed file
Exception ignored in: <function ZipFile.__del__ at 0x0000016F927FF520>
Traceback (most recent call last):
  File "C:\Users\maxdi\AppData\Local\Programs\Python\Python310\lib\zipfile.py", line 1808, in __del__
    self.close()
  File "C:\Users\maxdi\AppData\Local\Programs\Python\Python310\lib\zipfile.py", line 1825, in close
    self.fp.seek(self.start_dir)
ValueError: seek of closed file
Exception ignored in: <function ZipFile.__del__ at 0x0000016F927FF520>
Traceback (most recent call last):
  File "C:\Users\maxdi\AppData\Local\Programs\Python\Python310\lib\zipfile.py", line 1808, in __del__
    self.clo

Formatted results saved in rank_results_for_adriano.xlsx
