In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import mlflow
import mlflow.sklearn
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, ConfusionMatrixDisplay, confusion_matrix
from matplotlib import pyplot as plt
from CNN import test_resnet50

In [None]:
def generate_file_pairs(base_path="data/splitted_csv",
                        test_sets=5, variations=['0.00', '0.25', '0.50', '0.75', '1.00'], train_val_pairs=5):
    """Build the (train, validation, test) CSV path triples for every test set.

    Parameters
    ----------
    base_path : str
        Directory prefix placed in front of every generated file name.
        It is used exactly as given (joined with '/').
    test_sets : int
        Number of test sets; indices 0 .. test_sets - 1 are generated.
    variations : sequence of str
        Variation tags embedded in the train/validation file names.
        The sequence is only read, never modified.
    train_val_pairs : int
        Number of train/validation pairs generated per variation.

    Returns
    -------
    dict
        Maps 'test_set_<i>' to a list of (train_path, val_path, test_path)
        tuples, ordered by variation first and pair index second. Every
        tuple of a given test set shares the same test path.
    """
    all_triples = {}

    for test_set in range(test_sets):
        # One test file per test set, reused by all of its train/val pairs.
        test_filename = f'{base_path}/m_f_ca_nc_test_{test_set}.csv'

        all_triples[f'test_set_{test_set}'] = [
            (
                f'{base_path}/m_f_ca_nc_train_{test_set}_{variation}_{pair}.csv',
                f'{base_path}/m_f_ca_nc_val_{test_set}_{variation}_{pair}.csv',
                test_filename,
            )
            for variation in variations
            for pair in range(train_val_pairs)
        ]

    return all_triples

# Example usage: build the triples with the function's default arguments.
# Note that all_file_triples is reassigned in the last cell (with an explicit
# base_path) before it is passed to train_and_evaluate_model.
all_file_triples = generate_file_pairs()

In [None]:
# Send MLflow tracking data to the server at this address. The URI string must
# match the server address exactly (no stray spaces or trailing slashes).
mlflow.set_tracking_uri("http://127.0.0.1:5002")


# Select the experiment that the runs started below are logged under.
mlflow.set_experiment("CNN_Tester")

def load_dataset(filepath):
    """Read one split CSV and return image ids together with binary cancer labels.

    Parameters
    ----------
    filepath : str or path-like
        CSV file that contains at least the columns ``img_id`` and
        ``diagnostic``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``X`` holds the single column ``img_id``; ``y`` holds the single
        column ``is_cancerous`` (1 when the diagnostic matches one of the
        codes below, otherwise 0).
    """
    malignant_codes = ['SCC', 'BCC', 'MEL']

    def is_cancerous(condition):
        # Substring test: True as soon as any code occurs in the diagnostic.
        for code in malignant_codes:
            if code in condition:
                return True
        return False

    frame = pd.read_csv(filepath)
    diagnosis_flags = frame['diagnostic'].apply(is_cancerous)
    frame['is_cancerous'] = diagnosis_flags.astype(int)

    # Column names left here as a reference list; none of them is returned:
    #   pigment_network_coverage, blue_veil_pixels, globules_count,
    #   streaks_irregularity, irregular_pigmentation_coverage,
    #   regression_pixels, compactness_x, avg_red_channel, avg_green_channel,
    #   avg_blue_channel, multicolor_rate, asymmetry, average_hue,
    #   average_saturation, average_value, mean_asymmetry, best_asymmetry,
    #   worst_asymmetry, red_var, green_var, blue_var, hue_var, sat_var,
    #   val_var, dom_hue, dom_sat, dom_val, compactness_y, convexity,
    #   F1, F2, F3, F10, F11, F12

    X = frame[["img_id"]]          # image identifiers only
    y = frame[["is_cancerous"]]    # binary labels, kept as a one-column frame
    return X, y


def _score_predictions(y_true, y_pred):
    """Compute, once each, the classification metrics logged for a run.

    Returns a dict keyed by the metric names used in MLflow, in logging order.
    """
    # NOTE(review): y_pred is passed to roc_auc_score exactly as returned by
    # test_resnet50. If these are hard class labels rather than scores, the
    # AUC describes a single operating point only — confirm what is intended.
    return {
        "ROC": roc_auc_score(y_true, y_pred),
        "Accuracy": accuracy_score(y_true, y_pred),
        "Recall": recall_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred),
        "F1-score": f1_score(y_true, y_pred),
    }


def train_and_evaluate_model(
    file_triples,
    img_path="/Users/niko/Documents/uni/6. semester/bsc/Project/BSc-Project/pad-ufes/images",
):
    """Run the CNN on every (train, val, test) triple and record the results.

    For each triple an MLflow run named ``cnn_<n>`` is started,
    ``test_resnet50`` is called with the three CSV paths and ``img_path``,
    the resulting predictions are scored against the labels of the test CSV,
    the metrics are logged to MLflow and the returned model is logged too.

    Parameters
    ----------
    file_triples : dict
        Maps a test-set key to an iterable of
        (train_file, val_file, test_file) path tuples.
    img_path : str
        Image directory handed to ``test_resnet50``. Defaults to the path
        that was previously hard-coded in the function body.

    Returns
    -------
    pandas.DataFrame
        One row per run with the columns ``variation``, ``accuracy`` and
        ``ROC/AUC score``. The same table is written to
        'model_training_results_cnn.csv' in the working directory.
    """
    results = []
    # Labels per test CSV: the same test file is shared by many triples, so
    # it is read once instead of once per run.
    test_labels = {}
    count = 0

    for triples in file_triples.values():
        for train_file, val_file, test_file in triples:
            # Only the test labels are needed here; the validation CSV is
            # consumed by test_resnet50 itself via its path.
            if test_file not in test_labels:
                _, test_labels[test_file] = load_dataset(test_file)
            y_test = test_labels[test_file]
            count += 1

            with mlflow.start_run(run_name=f"cnn_{count}"):
                print('before cnn')

                y_pred, cnn = test_resnet50(train_file, val_file, test_file, img_path)

                print('after cnn')

                scores = _score_predictions(y_test, y_pred)
                for name, value in scores.items():
                    mlflow.log_metric(name, value)

                # The variation tag is the second-to-last '_'-separated field
                # of the train file name (..._<test_set>_<variation>_<pair>.csv).
                variation = train_file.split('_')[-2]
                results.append({
                    "variation": variation,
                    "accuracy": scores["Accuracy"],
                    "ROC/AUC score": scores["ROC"],
                })

                # NOTE(review): the model comes from test_resnet50; if it is
                # not a scikit-learn estimator, a framework-specific MLflow
                # flavor is probably the better choice — confirm.
                mlflow.sklearn.log_model(cnn, "cnn")

    # Convert results to DataFrame and save to CSV
    results_df = pd.DataFrame(results)
    results_df.to_csv('model_training_results_cnn.csv', index=False)
    return results_df

In [None]:
# Absolute, machine-specific location of the split CSV files; change this
# path when running the notebook on another machine.
base_path = "/Users/niko/Documents/uni/6. semester/bsc/Project/BSc-Project/data/splitted_csv"
# Build the path triples, passing the CSV directory explicitly.
all_file_triples = generate_file_pairs(base_path=base_path)
# Run the CNN over every triple. `results` is the DataFrame returned by
# train_and_evaluate_model, which also writes it to
# 'model_training_results_cnn.csv'.
results = train_and_evaluate_model(all_file_triples)
