# Get results from Undersampling with multiple seeds

In [2]:
import pandas as pd
import glob
import os

# Combine the per-seed result files into one CSV, then reduce them to one
# averaged row per seed and save that as a second CSV.

# Shared prefix of every file this cell reads or writes, so the three paths
# derived from it cannot drift apart.
RESULTS_PREFIX = './results/Mix_BreastCancer.m.bal_pool_8708_UnderBeforeSplit'

# Define the path pattern for the input files
input_pattern = f'{RESULTS_PREFIX}_seed*.csv'


def _seed_from_path(file_path):
    """Return the integer seed encoded in a ``..._seed<N>.csv`` path.

    The seed is the text between the last occurrence of 'seed' and the next
    '.'. Raises ValueError if that text is not an integer.
    """
    return int(file_path.split('seed')[-1].split('.')[0])


def _first_mode(values):
    """Return the first mode of ``values``, or None if there is no mode."""
    modes = values.mode()  # computed once per group
    return modes.iloc[0] if not modes.empty else None


# glob returns matches in arbitrary, filesystem-dependent order. Sorting by
# seed keeps the row order of the combined file (and any later index-based
# lookup such as idxmin) identical from run to run.
seed_files = sorted(glob.glob(input_pattern), key=_seed_from_path)
if not seed_files:
    # Fail with the pattern in the message instead of pandas' generic
    # "No objects to concatenate" error.
    raise FileNotFoundError(f"No input files match pattern: {input_pattern}")

# List to store all dataframes
all_dfs = []

# Read each file and tag its rows with the seed it came from
for file_path in seed_files:
    seed = _seed_from_path(file_path)
    df = pd.read_csv(file_path)
    df['SEED'] = seed
    all_dfs.append(df)

# Combine all dataframes
combined_df = pd.concat(all_dfs, ignore_index=True)

# Save the combined dataframe to a CSV file
output_file = f'{RESULTS_PREFIX}_combined.csv'
combined_df.to_csv(output_file, index=False)

print(f"Combined data saved to {output_file}")
print(f"Total number of rows: {len(combined_df)}")
print(f"Number of unique seeds: {combined_df['SEED'].nunique()}")

# Split the columns by dtype: numeric columns are averaged per seed,
# non-numeric columns are summarised by their most frequent value.
# SEED is the grouping key, so it is left out of the averaged columns.
non_numeric_cols = combined_df.select_dtypes(exclude=['number']).columns.tolist()
numeric_cols = [col for col in combined_df.columns if col not in non_numeric_cols and col != 'SEED']

# Group by SEED and calculate mean for numeric columns
avg_df = combined_df.groupby('SEED')[numeric_cols].mean().reset_index()

# Add back any non-numeric columns (except 'SEED') with mode values
for col in non_numeric_cols:
    if col != 'SEED':
        mode_values = combined_df.groupby('SEED')[col].agg(_first_mode)
        avg_df = avg_df.merge(mode_values, on='SEED')

# Reorder columns: SEED first, then the remaining columns in original order
avg_df = avg_df[['SEED'] + [col for col in combined_df.columns if col != 'SEED']]

# Save the averaged dataframe to a CSV file
avg_output_file = f'{RESULTS_PREFIX}_averaged.csv'
avg_df.to_csv(avg_output_file, index=False)

print(f"\nAveraged data saved to {avg_output_file}")
print(f"Number of rows in averaged data: {len(avg_df)}")


Combined data saved to ./results/Mix_BreastCancer.m.bal_pool_8708_UnderBeforeSplit_combined.csv
Total number of rows: 150
Number of unique seeds: 50

Averaged data saved to ./results/Mix_BreastCancer.m.bal_pool_8708_UnderBeforeSplit_averaged.csv
Number of rows in averaged data: 50


In [3]:
import pandas as pd
import numpy as np

# Summarise the MLP column of the per-seed averages, then look up the row
# of the combined (un-averaged) data whose MLP value is nearest to that mean.

# Load the per-seed averages
avg_df = pd.read_csv('./results/Mix_BreastCancer.m.bal_pool_8708_UnderBeforeSplit_averaged.csv')

# Mean and standard deviation of MLP across the averaged rows
mlp_scores = avg_df['MLP']
mlp_avg, mlp_std = mlp_scores.mean(), mlp_scores.std()

print(f"MLP Average: {mlp_avg}")
print(f"MLP Standard Deviation: {mlp_std}")

# Load the combined data (every row from every seed file)
combined_df = pd.read_csv('./results/Mix_BreastCancer.m.bal_pool_8708_UnderBeforeSplit_combined.csv')

# Absolute distance of each row's MLP value from the mean; idxmin gives the
# index label of the smallest distance (the first one if several tie).
mlp_distance = (combined_df['MLP'] - mlp_avg).abs()
closest_row = combined_df.loc[mlp_distance.idxmin()]

print("\nColumns in the dataset:")
print(", ".join(combined_df.columns))

print("\nRow with MLP value closest to average:")
print(closest_row)

MLP Average: 0.9745888125492513
MLP Standard Deviation: 0.007884168908041625

Columns in the dataset:
NB, KNN, LDA, SVM linear, SVM, LR, MLP, DT, RF, XGB, GB, AdaB, Bagging, Dataset, folds, SEED

Row with MLP value closest to average:
NB            0.853059
KNN           0.513741
LDA           0.695922
SVM linear     0.95789
SVM           0.960106
LR            0.959663
MLP           0.974734
DT            0.661791
RF            0.876995
XGB           0.704344
GB            0.780585
AdaB          0.739805
Bagging       0.742908
Dataset          Class
folds                3
SEED                43
Name: 112, dtype: object
