In [11]:
import mlflow
import pandas as pd
import numpy as np
import mlflow.sklearn
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, ConfusionMatrixDisplay, confusion_matrix
from sklearn.model_selection import PredefinedSplit

In [12]:
def generate_file_pairs(base_path="data/LR_splitted_csv",
                        test_sets=5, variations=['0.00', '0.25', '0.50', '0.75', '1.00'], train_val_pairs=5):
    """Build the (train, validation, test) CSV path triples for every test set.

    Args:
        base_path: Directory prefix placed in front of every generated file name.
        test_sets: Number of test sets; indices ``0 .. test_sets - 1`` are used.
        variations: Variation labels inserted verbatim into the train/val file
            names. The default list is only iterated, never mutated.
        train_val_pairs: Number of train/validation pairs per variation.

    Returns:
        dict mapping ``'test_set_<i>'`` to a list of
        ``(train_filename, val_filename, test_filename)`` tuples, ordered by
        variation first and pair index second. Every triple of a test set
        shares that test set's single test file.
    """
    # The caller-supplied base_path is honoured as-is; it must not be replaced
    # by a machine-specific location inside this function.
    all_triples = {}

    for test_set in range(test_sets):
        test_filename = f'{base_path}/m_f_ca_nc_test_{test_set}.csv'

        all_triples[f'test_set_{test_set}'] = [
            (
                f'{base_path}/m_f_ca_nc_train_{test_set}_{variation}_{pair}.csv',
                f'{base_path}/m_f_ca_nc_val_{test_set}_{variation}_{pair}.csv',
                test_filename,
            )
            for variation in variations
            for pair in range(train_val_pairs)
        ]

    return all_triples

# Example usage: build the test-set -> file-triple mapping with the default arguments.
all_file_triples = generate_file_pairs()

# Optional sanity check (uncomment to list every generated triple per test set):
# for test_set, triples in all_file_triples.items():
#     print(f'{test_set}:')
#     for triple in triples:
#         print(f'  Train file: {triple[0]}, Val file: {triple[1]}, Test file: {triple[2]}')
#     # Add a `break` here to only show the first test set for brevity


In [13]:
# Point MLflow at the tracking server on this local address.
# NOTE(review): assumes a server is already listening on port 5001 — confirm before running.
mlflow.set_tracking_uri("http://127.0.0.1:5001")  # Ensure no extra spaces or slashes beyond this


# Select the experiment that the runs started below are recorded under.
mlflow.set_experiment("Andreas_Superseje_Logistic_Regression_Experiments")

def load_dataset(filepath):
    """Read one split CSV and separate it into model inputs and target.

    Args:
        filepath: Path of the CSV file to read with ``pd.read_csv``.

    Returns:
        ``(X, y)`` where ``X`` is a DataFrame holding the selected feature
        columns (in the order listed below) and ``y`` is the
        ``'is_cancerous'`` column as a Series.
    """
    # Features fed to the model. Other candidate columns noted for this data:
    # "pigment_network_coverage", "blue_veil_pixels", "globules_count",
    # "irregular_pigmentation_coverage", "avg_red_channel", "avg_green_channel",
    # "avg_blue_channel", "asymmetry", "average_hue", "average_saturation",
    # "average_value", "best_asymmetry", "worst_asymmetry", "red_var",
    # "green_var", "blue_var", "hue_var", "val_var", "dom_sat", "dom_val",
    # "convexity", "F1", "F3", "F10", "F12"
    feature_columns = [
        "age",
        "F2",
        "compactness_x",
        "compactness_y",
        "regression_pixels",
        "F11",
        "dom_hue",
        "sat_var",
        "mean_asymmetry",
        "streaks_irregularity",
        "multicolor_rate",
    ]

    frame = pd.read_csv(filepath)

    # The label column is read straight from the file; it is not derived here.
    features = frame[feature_columns]
    labels = frame['is_cancerous']
    return features, labels


def _build_grid_search(n_train, n_val):
    """Create the scaler + logistic-regression grid search over one fixed train/validation split."""
    pipe = Pipeline(steps=[("scaler", StandardScaler()), ("logistic", LogisticRegression())])

    # PredefinedSplit convention: -1 rows are always used for training,
    # rows marked 0 form the single validation fold.
    split_index = [-1] * n_train + [0] * n_val
    pds = PredefinedSplit(test_fold=split_index)

    # NOTE(review): with penalty=None the C value has no effect, so those
    # lbfgs candidates are fitted once per C value with identical results.
    parameters = [
        {"logistic__solver": ['liblinear'],
         "logistic__penalty": ["l1", "l2"],
         "logistic__fit_intercept": [True, False],
         "logistic__C": [0.01, 0.05, 0.1, 0.5, 1, 2, 5],
         "logistic__class_weight": ["balanced", None],
         "logistic__max_iter": [50, 100, 500, 1000]},
        {"logistic__solver": ['lbfgs'],
         "logistic__penalty": ["l2", None],
         "logistic__fit_intercept": [True, False],
         "logistic__C": [0.01, 0.05, 0.1, 0.5, 1, 2, 5],
         "logistic__class_weight": ["balanced", None],
         "logistic__max_iter": [50, 100, 500, 1000]},
    ]

    return GridSearchCV(estimator=pipe,
                        param_grid=parameters,
                        cv=pds,
                        scoring='accuracy')


def train_and_evaluate_model(file_triples):
    """Grid-search a logistic regression for every file triple and evaluate it on the test file.

    For each ``(train, val, test)`` triple one MLflow run is started. The
    hyper-parameters are selected on the validation file (single predefined
    fold), the search object's default refit then trains the best candidate on
    train + validation, and that model is scored on the test file.

    Args:
        file_triples: Mapping of test-set name to a list of
            ``(train_file, val_file, test_file)`` paths, as produced by
            ``generate_file_pairs``.

    Returns:
        DataFrame with one row per triple and the columns ``variation``,
        ``accuracy`` (test set), ``best_params``, ``best_score`` (validation
        accuracy of the selected candidate) and ``roc_auc`` (test set). The
        same table is written to ``model_training_results_lr.csv``.
    """
    results = []
    count = 0
    for triples in file_triples.values():
        for train_file, val_file, test_file in triples:
            count += 1

            X_train, y_train = load_dataset(train_file)
            X_val, y_val = load_dataset(val_file)
            X_test, y_test = load_dataset(test_file)

            with mlflow.start_run(run_name=f"logistic_regression_{count}"):
                # Keep DataFrames (not bare arrays) so the pipeline is fitted
                # with the same feature names it later receives in predict().
                combined_X = pd.concat([X_train, X_val], ignore_index=True)
                combined_y = pd.concat([y_train, y_val], ignore_index=True)

                search = _build_grid_search(len(X_train), len(X_val))
                search.fit(combined_X, combined_y)

                y_pred = search.predict(X_test)
                # ROC AUC needs a continuous score; column 1 is the probability
                # of the greater class label, which roc_auc_score treats as positive.
                y_score = search.predict_proba(X_test)[:, 1]

                accuracy = accuracy_score(y_test, y_pred)
                roc_auc = roc_auc_score(y_test, y_score)
                best_params = search.best_params_

                mlflow.log_params(best_params)
                # Validation score of the chosen candidate. Logged under its own
                # name so it cannot be confused with the test-set "Accuracy"
                # metric below, which differs from "accuracy" only by case.
                mlflow.log_metric("val_accuracy", search.best_score_)

                print("Accuracy for all groups:", accuracy)
                print("Best gridsearch score", search.best_score_)
                print(
                    f"Penalty {best_params['logistic__penalty']}, "
                    f"Solver {best_params['logistic__solver']}, "
                    f"Fit intercept {best_params['logistic__fit_intercept']}, "
                    f"C {best_params['logistic__C']}, "
                    f"Class weight {best_params['logistic__class_weight']}, "
                    f"Max iterations {best_params['logistic__max_iter']}"
                )

                # TODO: should this block be moved so that it sits directly
                # below the grid search?
                test_metrics = {
                    "ROC": roc_auc,
                    "Accuracy": accuracy,
                    "Recall": recall_score(y_test, y_pred),
                    "Precision": precision_score(y_test, y_pred),
                    "F1-score": f1_score(y_test, y_pred),
                }
                for name, value in test_metrics.items():
                    mlflow.log_metric(name, value)

                # The variation label is the second-to-last '_'-separated field
                # of the train file name (..._<test_set>_<variation>_<pair>.csv).
                variation = train_file.split('_')[-2]
                results.append({
                    "variation": variation,
                    "accuracy": accuracy,
                    "best_params": best_params,
                    "best_score": search.best_score_,
                    "roc_auc": roc_auc,
                })

                # Log model and metrics
                mlflow.sklearn.log_model(search, "Logistic_regression_model")

    # Convert results to DataFrame and save to CSV
    results_df = pd.DataFrame(results)
    results_df.to_csv('model_training_results_lr.csv', index=False)
    return results_df

In [14]:
# NOTE(review): absolute, machine-specific location of the split CSV files;
# adjust this path before running the notebook anywhere else.
base_path = r"C:\Users\andre\dev\BSc-Project\BSc-Project\data\LR_splitted_csv"
all_file_triples = generate_file_pairs(base_path=base_path)
# Runs the full grid search for every triple; `results` is the returned DataFrame.
results = train_and_evaluate_model(all_file_triples)

# To inspect the outcome, display the DataFrame (e.g. `results.head()`);
# its rows are not (train_file, val_file, accuracy) tuples, so it cannot be
# unpacked that way in a loop.




Accuracy for all groups: 0.67
Best gridsearch score 0.7948717948717948
Penalty None,                 Solver lbfgs,                 Fit intercept True,                 C 0.01,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.67
Best gridsearch score 0.7796610169491526
Penalty l2,                 Solver lbfgs,                 Fit intercept True,                 C 2,                 Class weight None,                 Max iterations 50




Accuracy for all groups: 0.67
Best gridsearch score 0.7692307692307693
Penalty l2,                 Solver liblinear,                 Fit intercept False,                 C 2,                 Class weight None,                 Max iterations 50




Accuracy for all groups: 0.67
Best gridsearch score 0.7350427350427351
Penalty l2,                 Solver liblinear,                 Fit intercept True,                 C 0.5,                 Class weight None,                 Max iterations 50




Accuracy for all groups: 0.67
Best gridsearch score 0.717948717948718
Penalty None,                 Solver lbfgs,                 Fit intercept False,                 C 0.01,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.7
Best gridsearch score 0.7863247863247863
Penalty l2,                 Solver liblinear,                 Fit intercept True,                 C 0.5,                 Class weight None,                 Max iterations 50




Accuracy for all groups: 0.72
Best gridsearch score 0.7777777777777778
Penalty l2,                 Solver liblinear,                 Fit intercept False,                 C 0.1,                 Class weight None,                 Max iterations 50




Accuracy for all groups: 0.68
Best gridsearch score 0.7008547008547008
Penalty l2,                 Solver liblinear,                 Fit intercept True,                 C 0.01,                 Class weight None,                 Max iterations 50




Accuracy for all groups: 0.69
Best gridsearch score 0.8050847457627118
Penalty l1,                 Solver liblinear,                 Fit intercept True,                 C 5,                 Class weight None,                 Max iterations 50




Accuracy for all groups: 0.7
Best gridsearch score 0.8290598290598291
Penalty l2,                 Solver liblinear,                 Fit intercept True,                 C 0.05,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.68
Best gridsearch score 0.7711864406779662
Penalty l1,                 Solver liblinear,                 Fit intercept True,                 C 0.01,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.67
Best gridsearch score 0.8205128205128205
Penalty l1,                 Solver liblinear,                 Fit intercept True,                 C 0.05,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.66
Best gridsearch score 0.7948717948717948
Penalty l2,                 Solver liblinear,                 Fit intercept True,                 C 0.1,                 Class weight None,                 Max iterations 50




Accuracy for all groups: 0.69
Best gridsearch score 0.7350427350427351
Penalty l2,                 Solver liblinear,                 Fit intercept True,                 C 0.01,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.67
Best gridsearch score 0.7692307692307693
Penalty l1,                 Solver liblinear,                 Fit intercept True,                 C 1,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.69
Best gridsearch score 0.6949152542372882
Penalty l2,                 Solver liblinear,                 Fit intercept True,                 C 0.01,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.71
Best gridsearch score 0.7008547008547008
Penalty l1,                 Solver liblinear,                 Fit intercept True,                 C 0.05,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.73
Best gridsearch score 0.8205128205128205
Penalty l1,                 Solver liblinear,                 Fit intercept False,                 C 0.5,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.69
Best gridsearch score 0.7948717948717948
Penalty l2,                 Solver liblinear,                 Fit intercept True,                 C 0.05,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.71
Best gridsearch score 0.7350427350427351
Penalty l1,                 Solver liblinear,                 Fit intercept True,                 C 1,                 Class weight None,                 Max iterations 50




Accuracy for all groups: 0.71
Best gridsearch score 0.7435897435897436
Penalty l2,                 Solver liblinear,                 Fit intercept True,                 C 5,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.73
Best gridsearch score 0.7350427350427351
Penalty l2,                 Solver liblinear,                 Fit intercept False,                 C 0.05,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.71
Best gridsearch score 0.7542372881355932
Penalty l2,                 Solver liblinear,                 Fit intercept True,                 C 0.5,                 Class weight None,                 Max iterations 50




Accuracy for all groups: 0.7
Best gridsearch score 0.717948717948718
Penalty l2,                 Solver liblinear,                 Fit intercept True,                 C 0.01,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.7
Best gridsearch score 0.6581196581196581
Penalty l2,                 Solver liblinear,                 Fit intercept True,                 C 0.01,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.63
Best gridsearch score 0.7692307692307693
Penalty l2,                 Solver liblinear,                 Fit intercept True,                 C 0.01,                 Class weight None,                 Max iterations 50




Accuracy for all groups: 0.63
Best gridsearch score 0.6779661016949152
Penalty l2,                 Solver liblinear,                 Fit intercept True,                 C 0.01,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.64
Best gridsearch score 0.7350427350427351
Penalty None,                 Solver lbfgs,                 Fit intercept False,                 C 0.01,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.65
Best gridsearch score 0.7777777777777778
Penalty l1,                 Solver liblinear,                 Fit intercept True,                 C 0.5,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.62
Best gridsearch score 0.7521367521367521
Penalty l2,                 Solver liblinear,                 Fit intercept False,                 C 0.01,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.6
Best gridsearch score 0.8050847457627118
Penalty l1,                 Solver liblinear,                 Fit intercept True,                 C 0.05,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.6
Best gridsearch score 0.7350427350427351
Penalty l1,                 Solver liblinear,                 Fit intercept True,                 C 0.05,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.58
Best gridsearch score 0.6923076923076923
Penalty l2,                 Solver liblinear,                 Fit intercept False,                 C 0.05,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.6
Best gridsearch score 0.6837606837606838
Penalty l2,                 Solver liblinear,                 Fit intercept True,                 C 0.01,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.58
Best gridsearch score 0.6752136752136753
Penalty l2,                 Solver liblinear,                 Fit intercept False,                 C 0.01,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.58
Best gridsearch score 0.7435897435897436
Penalty l2,                 Solver liblinear,                 Fit intercept True,                 C 0.01,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.58
Best gridsearch score 0.7692307692307693
Penalty l2,                 Solver liblinear,                 Fit intercept True,                 C 0.01,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.58
Best gridsearch score 0.7435897435897436
Penalty l2,                 Solver liblinear,                 Fit intercept True,                 C 0.01,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.59
Best gridsearch score 0.7435897435897436
Penalty l1,                 Solver liblinear,                 Fit intercept True,                 C 0.1,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.56
Best gridsearch score 0.711864406779661
Penalty l2,                 Solver liblinear,                 Fit intercept False,                 C 0.1,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.61
Best gridsearch score 0.7692307692307693
Penalty l2,                 Solver lbfgs,                 Fit intercept True,                 C 0.01,                 Class weight None,                 Max iterations 50




Accuracy for all groups: 0.66
Best gridsearch score 0.7692307692307693
Penalty l1,                 Solver liblinear,                 Fit intercept True,                 C 0.1,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.6
Best gridsearch score 0.7264957264957265
Penalty l2,                 Solver liblinear,                 Fit intercept True,                 C 0.05,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.6
Best gridsearch score 0.7457627118644068
Penalty l2,                 Solver liblinear,                 Fit intercept True,                 C 0.05,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.61
Best gridsearch score 0.7264957264957265
Penalty l2,                 Solver liblinear,                 Fit intercept True,                 C 0.01,                 Class weight None,                 Max iterations 50




Accuracy for all groups: 0.63
Best gridsearch score 0.7372881355932204
Penalty l2,                 Solver liblinear,                 Fit intercept False,                 C 0.01,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.62
Best gridsearch score 0.6666666666666666
Penalty l2,                 Solver liblinear,                 Fit intercept True,                 C 2,                 Class weight None,                 Max iterations 50




Accuracy for all groups: 0.63
Best gridsearch score 0.7264957264957265
Penalty l1,                 Solver liblinear,                 Fit intercept False,                 C 0.5,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.63
Best gridsearch score 0.7094017094017094
Penalty l1,                 Solver liblinear,                 Fit intercept True,                 C 0.05,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.62
Best gridsearch score 0.7692307692307693
Penalty l2,                 Solver lbfgs,                 Fit intercept True,                 C 0.05,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.76
Best gridsearch score 0.7008547008547008
Penalty l1,                 Solver liblinear,                 Fit intercept True,                 C 0.5,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.76
Best gridsearch score 0.7542372881355932
Penalty l2,                 Solver liblinear,                 Fit intercept True,                 C 0.01,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.76
Best gridsearch score 0.7777777777777778
Penalty l1,                 Solver liblinear,                 Fit intercept True,                 C 0.5,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.76
Best gridsearch score 0.6923076923076923
Penalty l1,                 Solver liblinear,                 Fit intercept True,                 C 0.5,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.76
Best gridsearch score 0.6837606837606838
Penalty l2,                 Solver liblinear,                 Fit intercept True,                 C 0.01,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.75
Best gridsearch score 0.7435897435897436
Penalty l2,                 Solver liblinear,                 Fit intercept True,                 C 0.1,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.74
Best gridsearch score 0.7264957264957265
Penalty l1,                 Solver liblinear,                 Fit intercept False,                 C 0.5,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.74
Best gridsearch score 0.7435897435897436
Penalty l1,                 Solver liblinear,                 Fit intercept True,                 C 0.1,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.74
Best gridsearch score 0.7435897435897436
Penalty l1,                 Solver liblinear,                 Fit intercept True,                 C 0.1,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.75
Best gridsearch score 0.7542372881355932
Penalty l1,                 Solver liblinear,                 Fit intercept True,                 C 2,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.71
Best gridsearch score 0.7203389830508474
Penalty l2,                 Solver liblinear,                 Fit intercept False,                 C 0.05,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.68
Best gridsearch score 0.7350427350427351
Penalty l2,                 Solver lbfgs,                 Fit intercept True,                 C 0.1,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.72
Best gridsearch score 0.7606837606837606
Penalty l2,                 Solver liblinear,                 Fit intercept False,                 C 0.01,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.71
Best gridsearch score 0.6666666666666666
Penalty l2,                 Solver liblinear,                 Fit intercept False,                 C 0.1,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.69
Best gridsearch score 0.811965811965812
Penalty l1,                 Solver liblinear,                 Fit intercept True,                 C 0.5,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.71
Best gridsearch score 0.7372881355932204
Penalty l1,                 Solver liblinear,                 Fit intercept True,                 C 0.1,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.73
Best gridsearch score 0.7521367521367521
Penalty l2,                 Solver liblinear,                 Fit intercept True,                 C 0.01,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.73
Best gridsearch score 0.6923076923076923
Penalty l2,                 Solver liblinear,                 Fit intercept True,                 C 0.01,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.71
Best gridsearch score 0.7435897435897436
Penalty l1,                 Solver liblinear,                 Fit intercept True,                 C 0.1,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.72
Best gridsearch score 0.7521367521367521
Penalty None,                 Solver lbfgs,                 Fit intercept False,                 C 0.01,                 Class weight None,                 Max iterations 50




Accuracy for all groups: 0.7
Best gridsearch score 0.7350427350427351
Penalty l2,                 Solver liblinear,                 Fit intercept False,                 C 0.05,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.69
Best gridsearch score 0.7350427350427351
Penalty l1,                 Solver liblinear,                 Fit intercept True,                 C 0.5,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.68
Best gridsearch score 0.7966101694915254
Penalty l1,                 Solver liblinear,                 Fit intercept True,                 C 0.1,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.69
Best gridsearch score 0.6752136752136753
Penalty l2,                 Solver liblinear,                 Fit intercept True,                 C 0.5,                 Class weight None,                 Max iterations 50




Accuracy for all groups: 0.73
Best gridsearch score 0.6410256410256411
Penalty l2,                 Solver liblinear,                 Fit intercept True,                 C 0.01,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.76
Best gridsearch score 0.7542372881355932
Penalty l2,                 Solver liblinear,                 Fit intercept False,                 C 0.1,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.74
Best gridsearch score 0.6752136752136753
Penalty l2,                 Solver liblinear,                 Fit intercept True,                 C 0.5,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.72
Best gridsearch score 0.6752136752136753
Penalty l2,                 Solver liblinear,                 Fit intercept False,                 C 0.01,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.77
Best gridsearch score 0.7264957264957265
Penalty l2,                 Solver lbfgs,                 Fit intercept True,                 C 0.05,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.74
Best gridsearch score 0.7863247863247863
Penalty l2,                 Solver liblinear,                 Fit intercept True,                 C 1,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.68
Best gridsearch score 0.7542372881355932
Penalty l2,                 Solver lbfgs,                 Fit intercept True,                 C 0.01,                 Class weight None,                 Max iterations 50




Accuracy for all groups: 0.7
Best gridsearch score 0.7692307692307693
Penalty l2,                 Solver liblinear,                 Fit intercept True,                 C 0.1,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.68
Best gridsearch score 0.6837606837606838
Penalty l1,                 Solver liblinear,                 Fit intercept True,                 C 0.5,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.67
Best gridsearch score 0.6752136752136753
Penalty l1,                 Solver liblinear,                 Fit intercept True,                 C 0.1,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.69
Best gridsearch score 0.7264957264957265
Penalty l1,                 Solver liblinear,                 Fit intercept True,                 C 0.05,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.72
Best gridsearch score 0.6666666666666666
Penalty l1,                 Solver liblinear,                 Fit intercept True,                 C 5,                 Class weight None,                 Max iterations 50




Accuracy for all groups: 0.71
Best gridsearch score 0.7203389830508474
Penalty l2,                 Solver liblinear,                 Fit intercept False,                 C 0.01,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.72
Best gridsearch score 0.717948717948718
Penalty l2,                 Solver liblinear,                 Fit intercept True,                 C 5,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.73
Best gridsearch score 0.7008547008547008
Penalty l2,                 Solver liblinear,                 Fit intercept True,                 C 0.5,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.73
Best gridsearch score 0.6923076923076923
Penalty l2,                 Solver liblinear,                 Fit intercept True,                 C 0.1,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.74
Best gridsearch score 0.7350427350427351
Penalty l2,                 Solver liblinear,                 Fit intercept False,                 C 0.05,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.74
Best gridsearch score 0.7457627118644068
Penalty l1,                 Solver liblinear,                 Fit intercept True,                 C 0.5,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.75
Best gridsearch score 0.7264957264957265
Penalty None,                 Solver lbfgs,                 Fit intercept False,                 C 0.01,                 Class weight None,                 Max iterations 50




Accuracy for all groups: 0.72
Best gridsearch score 0.7606837606837606
Penalty l2,                 Solver liblinear,                 Fit intercept True,                 C 0.01,                 Class weight None,                 Max iterations 50




Accuracy for all groups: 0.74
Best gridsearch score 0.6837606837606838
Penalty l2,                 Solver liblinear,                 Fit intercept True,                 C 0.05,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.78
Best gridsearch score 0.717948717948718
Penalty l2,                 Solver liblinear,                 Fit intercept False,                 C 0.1,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.75
Best gridsearch score 0.6666666666666666
Penalty l2,                 Solver liblinear,                 Fit intercept True,                 C 0.05,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.74
Best gridsearch score 0.7692307692307693
Penalty l2,                 Solver liblinear,                 Fit intercept True,                 C 0.01,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.76
Best gridsearch score 0.7094017094017094
Penalty l1,                 Solver liblinear,                 Fit intercept True,                 C 0.1,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.75
Best gridsearch score 0.652542372881356
Penalty l2,                 Solver liblinear,                 Fit intercept False,                 C 0.01,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.73
Best gridsearch score 0.7606837606837606
Penalty l2,                 Solver liblinear,                 Fit intercept True,                 C 0.05,                 Class weight None,                 Max iterations 50




Accuracy for all groups: 0.73
Best gridsearch score 0.7521367521367521
Penalty l1,                 Solver liblinear,                 Fit intercept True,                 C 0.1,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.74
Best gridsearch score 0.7372881355932204
Penalty l2,                 Solver liblinear,                 Fit intercept True,                 C 2,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.72
Best gridsearch score 0.7008547008547008
Penalty l1,                 Solver liblinear,                 Fit intercept True,                 C 0.05,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.74
Best gridsearch score 0.7521367521367521
Penalty l2,                 Solver liblinear,                 Fit intercept True,                 C 0.01,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.76
Best gridsearch score 0.7008547008547008
Penalty l1,                 Solver liblinear,                 Fit intercept True,                 C 0.5,                 Class weight None,                 Max iterations 50




Accuracy for all groups: 0.76
Best gridsearch score 0.7948717948717948
Penalty l2,                 Solver liblinear,                 Fit intercept False,                 C 1,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.75
Best gridsearch score 0.6752136752136753
Penalty l2,                 Solver liblinear,                 Fit intercept False,                 C 0.1,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.76
Best gridsearch score 0.7435897435897436
Penalty l1,                 Solver liblinear,                 Fit intercept True,                 C 0.5,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.76
Best gridsearch score 0.6949152542372882
Penalty l1,                 Solver liblinear,                 Fit intercept True,                 C 0.5,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.72
Best gridsearch score 0.7521367521367521
Penalty l2,                 Solver liblinear,                 Fit intercept False,                 C 0.5,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.67
Best gridsearch score 0.6949152542372882
Penalty l1,                 Solver liblinear,                 Fit intercept True,                 C 0.1,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.64
Best gridsearch score 0.7008547008547008
Penalty l1,                 Solver liblinear,                 Fit intercept True,                 C 0.05,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.68
Best gridsearch score 0.6837606837606838
Penalty l2,                 Solver liblinear,                 Fit intercept True,                 C 0.01,                 Class weight None,                 Max iterations 50




Accuracy for all groups: 0.73
Best gridsearch score 0.7863247863247863
Penalty l2,                 Solver liblinear,                 Fit intercept True,                 C 1,                 Class weight None,                 Max iterations 50




Accuracy for all groups: 0.6
Best gridsearch score 0.6410256410256411
Penalty l1,                 Solver liblinear,                 Fit intercept True,                 C 0.05,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.6
Best gridsearch score 0.635593220338983
Penalty l1,                 Solver liblinear,                 Fit intercept True,                 C 0.05,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.66
Best gridsearch score 0.7435897435897436
Penalty l2,                 Solver liblinear,                 Fit intercept True,                 C 0.01,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.66
Best gridsearch score 0.7094017094017094
Penalty l2,                 Solver liblinear,                 Fit intercept True,                 C 0.5,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.67
Best gridsearch score 0.717948717948718
Penalty l2,                 Solver liblinear,                 Fit intercept False,                 C 0.05,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.69
Best gridsearch score 0.7777777777777778
Penalty l2,                 Solver liblinear,                 Fit intercept True,                 C 0.5,                 Class weight None,                 Max iterations 50




Accuracy for all groups: 0.67
Best gridsearch score 0.7692307692307693
Penalty l2,                 Solver liblinear,                 Fit intercept True,                 C 0.05,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.63
Best gridsearch score 0.7008547008547008
Penalty l1,                 Solver liblinear,                 Fit intercept True,                 C 0.05,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.65
Best gridsearch score 0.7350427350427351
Penalty l2,                 Solver liblinear,                 Fit intercept True,                 C 0.01,                 Class weight balanced,                 Max iterations 50




Accuracy for all groups: 0.69
Best gridsearch score 0.6949152542372882
Penalty l2,                 Solver liblinear,                 Fit intercept True,                 C 0.1,                 Class weight balanced,                 Max iterations 50


In [15]:
%%script echo plz
# Don't look at this, please.

from lr_final.py import logisticregression

def load_dataset(filepath):
    """Read a CSV file and split it into a feature frame and a label series.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is a DataFrame holding the columns
        ``'diameter_2'`` and ``'diameter_1'`` (in that order) and ``y`` is the
        ``'diagnostic'`` column as a Series.
    """
    frame = pd.read_csv(filepath)

    # Features and label are selected by column name, not by position.
    feature_columns = ['diameter_2', 'diameter_1']
    label_column = 'diagnostic'

    return frame[feature_columns], frame[label_column]

# Train and evaluate logistic regression model
def train_and_evaluate_lr(file_pairs):
    """Fit one logistic-regression model per train/validation file pair.

    Parameters
    ----------
    file_pairs : dict
        Maps a test-set label to a list of tuples whose first two items are
        the training CSV path and the validation CSV path. Any further items
        (for example the test-file path of a train/val/test triple) are
        ignored.

    Returns
    -------
    list of tuple
        One ``(train_file, val_file, accuracy)`` entry per input tuple, in
        iteration order, where ``accuracy`` is the accuracy score of the
        fitted model on the validation set.
    """
    results = []

    # The dictionary keys (test-set labels) are not needed for the evaluation.
    for entries in file_pairs.values():
        # Star-unpacking accepts both (train, val) pairs and
        # (train, val, test) triples; strict two-name unpacking raised
        # ValueError on triples.
        for train_file, val_file, *_ in entries:
            # Load training and validation sets
            X_train, y_train = load_dataset(train_file)
            X_val, y_val = load_dataset(val_file)

            # Initialize and train the logistic regression model
            model = LogisticRegression(max_iter=1000)
            model.fit(X_train, y_train)

            # Score the fitted model on the validation set
            y_pred = model.predict(X_val)
            accuracy = accuracy_score(y_val, y_pred)

            results.append((train_file, val_file, accuracy))

    return results

# Example usage: build the file list, fit one model per train/validation pair,
# and print the validation accuracy of each.
# NOTE(review): absolute, user-specific path — presumably only valid on the
# original author's machine; consider a path relative to the repository.
base_path = "/Users/regitzesydendal/Documents/GitHub/BSc-Project/data/splitted_csv"
all_file_pairs = generate_file_pairs(base_path=base_path)
results = train_and_evaluate_lr(all_file_pairs)

# Print results: each entry is a (train_file, val_file, accuracy) tuple.
for train_file, val_file, accuracy in results:
    print(f"Train file: {train_file}, Val file: {val_file}, Accuracy: {accuracy}")


Couldn't find program: 'echo'


In [16]:
%%script echo old code

def generate_file_pairs(base_path="/BSc-Project/data/LR_splitted_csv",
                        test_sets=5, variations=['0.00', '0.25', '0.50', '0.75', '1.00'], train_val_pairs=5):
    """Build the train/validation CSV file names for every test set.

    Parameters
    ----------
    base_path : str
        Directory prefix placed in front of every generated file name.
    test_sets : int
        Number of test sets; indices ``0 .. test_sets - 1`` are generated.
    variations : iterable of str
        Variation labels embedded in the file names.
    train_val_pairs : int
        Number of train/validation pairs generated per variation.

    Returns
    -------
    dict
        Maps ``'test_set_<i>'`` to a list of ``(train_filename, val_filename)``
        tuples, ordered by variation and then by pair index.
    """
    pairs_by_test_set = {}

    for set_idx in range(test_sets):
        # Echo the index of the test set currently being generated.
        print(set_idx)

        pairs_by_test_set[f'test_set_{set_idx}'] = [
            (
                f'{base_path}/m_f_ca_nc_train_{set_idx}_{ratio}_{split_idx}.csv',
                f'{base_path}/m_f_ca_nc_val_{set_idx}_{ratio}_{split_idx}.csv',
            )
            for ratio in variations
            for split_idx in range(train_val_pairs)
        ]

    return pairs_by_test_set

# Example usage: generate the file-name pairs with the default arguments.
all_file_pairs = generate_file_pairs()

# Example output for test set 0: print every (train, val) pair of the first
# dictionary entry only.
for test_set, pairs in all_file_pairs.items():
    print(f'{test_set}:')
    for pair in pairs:
        # pair[0] is the training file name, pair[1] the validation file name.
        print(f'  Train file: {pair[0]}, Val file: {pair[1]}')
    # Break to only show the first test set for brevity
    break

Couldn't find program: 'echo'
