# notebook for downloading, creating and preprocessing data

***
Imporrting packages 
***

In [42]:
import os
import glob

from tqdm import tqdm

import numpy as np
import pandas as pd

from ucimlrepo import fetch_ucirepo 
from sklearn.datasets import fetch_openml

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

***
Helper function for getting info about dataset
***

In [43]:
def variable_summary(df, max_uniques=20):
    summary = []
    for col in df.columns:
        dtype = df[col].dtype
        n_unique = df[col].nunique(dropna=True)

        info = {
            "variable": col,
            "dtype": str(dtype),
            "n_unique": n_unique,
        }

        if dtype == "object" or isinstance(dtype, pd.CategoricalDtype):
            uniques = df[col].dropna().unique()
            info["unique_values"] = uniques[:max_uniques]
            info["truncated"] = len(uniques) > max_uniques
        else:
            info["unique_values"] = None
            info["truncated"] = False

        summary.append(info)

    return pd.DataFrame(summary)

***
Function for downloading specific UCI dataset
***

In [44]:
def download_uci_datasets(name, uci_id, min_c, maj_c):
    print(f"Fetching {name} (UCI ID: {uci_id})...")
    try:
        dataset = fetch_ucirepo(id=uci_id)
        X = dataset.data.features
        y = dataset.data.targets

        print("\n--------------------------------------------------")
        if isinstance(X, pd.DataFrame): print(variable_summary(X))
        if isinstance(y, pd.DataFrame): print(variable_summary(y)) 
        else: print(y.unique())
        
        if isinstance(y, pd.DataFrame):
            y = y.iloc[:, 0]
        
        
        if maj_c == "All other":
            mask = pd.Series([True] * len(y), index=y.index)
        else:
            all_classes = (min_c + maj_c)
            mask = y.astype(str).isin([str(c) for c in all_classes])

        X_sub = X[mask].copy()
        y_sub = y[mask].copy()
        min_vals = [str(c) for c in (min_c if isinstance(min_c, list) else [min_c])]
        binary_target = np.where(y_sub.astype(str).isin(min_vals), 1, -1)
        
        n_min = np.sum(binary_target == 1)
        n_maj = np.sum(binary_target == -1)
        ratio = n_maj / n_min if n_min > 0 else 0.0
        
        print(f"-> {name} Processed:")
        print(f" {min_c}  Minority Samples (+1): {n_min}")
        print(f" {maj_c}  Majority Samples (-1): {n_maj}")
        print(f"   Imbalance Ratio (Maj/Min): {ratio:.2f}")
        print("--------------------------------------------------\n")


        
        return (X_sub, pd.Series(binary_target, name='target'))
    except Exception as e:
        print(f"Error fetching {name}: {e}")

***
Function for downloading datasets using OpenML, which are unavaliable on UCI 
***

In [45]:
def download_openml_datasets(name, data_id, min_c, maj_c):
    print(f"Fetching {name} (OpenML ID: {data_id})...")
    try:
        X, y = fetch_openml(data_id=data_id, return_X_y=True, as_frame=True)

        print("\n--------------------------------------------------")
        if isinstance(X, pd.DataFrame): print(variable_summary(X))
        if isinstance(y, pd.DataFrame): print(variable_summary(y)) 
        else: print(y.unique())
        
        if maj_c == "All other":
            mask = pd.Series([True] * len(y), index=y.index)
        else:
            all_classes = (min_c + maj_c)
            mask = y.astype(str).isin([str(c) for c in all_classes])
        X_sub = X[mask].copy()
        y_sub = y[mask].copy()
        min_vals = [str(c) for c in (min_c if isinstance(min_c, list) else [min_c])]
        binary_target = np.where(y_sub.astype(str).isin(min_vals), 1, -1)

        n_min = np.sum(binary_target == 1)
        n_maj = np.sum(binary_target == -1)
        ratio = n_maj / n_min if n_min > 0 else 0.0
        
        print(f"-> {name} Processed:")
        print(f" {min_c}  Minority Samples (+1): {n_min}")
        print(f" {maj_c}  Majority Samples (-1): {n_maj}")
        print(f"   Imbalance Ratio (Maj/Min): {ratio:.2f}")
        print("--------------------------------------------------\n")
        
        return (X_sub, pd.Series(binary_target, name='target'))
    except Exception as e:
        print(f"Error fetching {name}: {e}")

***
Function for downloading all datasets from the RE_SC
***

In [46]:
def download_datasets():
    datasets = {}

    # --- 1. Abalone (UCI) ---
    # Min: 18 | Maj: 9
    datasets['abalone9-18'] = download_uci_datasets(
        name='abalone9-18', 
        uci_id=1, 
        min_c=[18], 
        maj_c=[9]
    )

    ## --- 2. Breast Tissue (OpenML) ---
    ## Min: car, fad | Maj: All other
    #datasets['breast'] = download_openml_datasets(
    #    name='breast', 
    #    data_id=1479, 
    #    min_c=['car', 'fad'], 
    #    maj_c="All other"
    #)

    # --- 3-10. Ecoli (UCI) ---
    datasets['ecoli-01vs235'] = download_uci_datasets(
        name='ecoli-01vs235', 
        uci_id=39, 
        min_c= ['imS', 'imL', 'om'], # in paper below
        maj_c=['cp', 'im'],          # in paper above
    )
    
    datasets['ecoli-01vs5'] = download_uci_datasets(
        name='ecoli-01vs5', 
        uci_id=39, 
        min_c=['om'],       # in paper below
        maj_c=['cp', 'im'], # in paper above
    )
    
    datasets['ecoli-0147vs56'] = download_uci_datasets(
        name='ecoli-0147vs56', 
        uci_id=39, 
        min_c=['om', 'omL'],                 # in paper below
        maj_c=['cp', 'im', 'imU', 'pp'],     # in paper above
    )
    
    datasets['ecoli-0234vs5'] = download_uci_datasets(
        name='ecoli-0234vs5', 
        uci_id=39, 
        min_c=['om'],                        # in paper below
        maj_c=['cp', 'imS', 'imL', 'imU'],   # in paper above
    )
    
    datasets['ecoli-046vs5'] = download_uci_datasets(
        name='ecoli-046vs5', 
        uci_id=39, 
        min_c=['om'],               # in paper below
        maj_c=['cp', 'imU', 'omL'], # in paper above
    )
    
    datasets['ecoli-067vs5'] = download_uci_datasets(
        name='ecoli-067vs5', 
        uci_id=39, 
        min_c=['om'],              # in paper below
        maj_c=['cp', 'omL', 'pp']  # in paper above
    )
    
    datasets['ecoli2'] = download_uci_datasets(
        name='ecoli2', 
        uci_id=39, 
        min_c=['pp'], 
        maj_c="All other"
    )
    
    datasets['ecoli3'] = download_uci_datasets(
        name='ecoli3', 
        uci_id=39, 
        min_c=['imU'], 
        maj_c="All other"
    )

    # --- 11-14. Glass (UCI) ---
    #datasets['glass0123vs456'] = download_uci_datasets(
    #    name='glass0123vs456', 
    #    uci_id=42, 
    #    min_c=[0, 1, 2, 3], 
    #    maj_c="All other"
    #)
    #
    #datasets['glass0'] = download_uci_datasets(
    #    name='glass0', 
    #    uci_id=42, 
    #    min_c=[0], 
    #    maj_c="All other"
    #)
    #
    #datasets['glass1'] = download_uci_datasets(
    #    name='glass1', 
    #    uci_id=42, 
    #    min_c=[1], 
    #    maj_c="All other"
    #)
    
    datasets['glass6'] = download_uci_datasets(
        name='glass6', 
        uci_id=42, 
        min_c=[6], 
        maj_c="All other"
    )

    # --- 15. Haberman (UCI) ---
    datasets['haberman'] = download_uci_datasets(
        name='haberman', 
        uci_id=43, 
        min_c=[2], 
        maj_c=[1]
    )

    # --- 16. Iris (UCI) ---
    datasets['iris0'] = download_uci_datasets(
        name='iris0', 
        uci_id=53, 
        min_c=['Iris-setosa'], 
        maj_c="All other"
    )

    ## --- 17. Leaf (OpenML) ---
    ## Min: 1 | Maj: All other (Using ID 1491)
    #datasets['leaf'] = download_openml_datasets(
    #    name='leaf', 
    #    data_id=1491, 
    #    min_c=[1,2,3,4,5,6,7], 
    #    maj_c="All other"
    #)
#
    ## --- 18-19. Thyroid (OpenML) ---
    ## Using ID 1515
    #datasets['new-thyroid1'] = download_openml_datasets(
    #    name='new-thyroid1', 
    #    data_id=1515, 
    #    min_c=[2], 
    #    maj_c="All other"
    #)
    #
    #datasets['new-thyroid2'] = download_openml_datasets(
    #    name='new-thyroid2', 
    #    data_id=1515, 
    #    min_c=[3], 
    #    maj_c="All other"
    #)

    # --- 20. Page Blocks (UCI) ---
    # Min: 3 | Maj: 2, 5 (Matches count 28 vs 444)
    datasets['page-blocks-13vs4'] = download_uci_datasets(
        name='page-blocks-13vs4', 
        uci_id=78, 
        min_c=[4], 
        maj_c=[1, 3]
    )

    ## --- 21. Parkinsons (UCI) ---
    #datasets['parkinsons'] = download_uci_datasets(
    #    name='parkinsons', 
    #    uci_id=174, 
    #    min_c=[0], 
    #    maj_c=[1]
    #)

    ## --- 22. Seeds (OpenML) ---
    ## Min: 1 | Maj: All other (Using ID 1499)
    #datasets['seeds'] = download_openml_datasets(
    #    name='seeds', 
    #    data_id=1499, 
    #    min_c=['Kama'], 
    #    maj_c=['Rosa', 'Canadian']
    #)
#
    ## --- 23. Shuttle (UCI) ---
    ## Min: 1 | Maj: 4
    #datasets['shuttle-0vs4'] = download_uci_datasets(
    #    name='shuttle-0vs4', 
    #    uci_id=148, 
    #    min_c=['positive'], 
    #    maj_c=['negative']
    #)

    # --- 24. SPECT (UCI) ---
    datasets['spect'] = download_uci_datasets(
        name='spect', 
        uci_id=95, 
        min_c=[0], # in paper below
        maj_c=[1]  # in paper above
    )

   ## --- 25. Vertebral (UCI) ---
   #datasets['vertebral'] = download_uci_datasets(
   #    name='vertebral', 
   #    uci_id=212, 
   #    min_c=['NO'], 
   #    maj_c=['AB']
   #)
#
   ## --- 26. WPBC (UCI) ---
   #datasets['wpbc'] = download_uci_datasets(
   #    name='wpbc', 
   #    uci_id=17, 
   #    min_c=['N'], 
   #    maj_c=['R']
   #)

    # --- 27-28. Yeast (UCI) ---
    datasets['yeast-1vs7'] = download_uci_datasets(
        name='yeast-1vs7', 
        uci_id=110, 
        min_c=['VAC'], 
        maj_c=['NUC']
    )
    
    datasets['yeast-2vs4'] = download_uci_datasets(
        name='yeast-2vs4', 
        uci_id=110, 
        min_c=['ME2'], # in paper below
        maj_c=['CYT']  # in paper above
    )

    return datasets

***
Function for creating own datasets with various types and number of variables  
***

In [47]:
def create_synthetic_overlap(n_samples=1000, n_features=1, weights=(0.9, 0.1), density_type='high'):
    rng = np.random.RandomState(42)
    n_min = int(n_samples * weights[1])
    n_maj = n_samples - n_min
    
    # Generate Minority Class (centered at 0)
    X_min = rng.normal(loc=0.0, scale=1.0, size=(n_min, n_features))
    y_min = np.ones(n_min)
    
    # Generate Majority Class based on density type
    if density_type == 'high':
        # Type 1: High density in overlap
        X_maj = rng.normal(loc=1.0, scale=1.0, size=(n_maj, n_features))
        
    elif density_type == 'sparse':
        # Type 2: Overlap exists but majority is sparse there
        n_maj_overlap = int(n_maj * 0.1) # Only 10% in overlap
        n_maj_far = n_maj - n_maj_overlap
        
        X_maj_overlap = rng.normal(loc=0.0, scale=1.0, size=(n_maj_overlap, n_features))
        X_maj_far = rng.normal(loc=5.0, scale=1.0, size=(n_maj_far, n_features))
        X_maj = np.vstack([X_maj_overlap, X_maj_far])
        
    y_maj = -1 * np.ones(n_maj)
    
    # Stack the arrays
    X_array = np.vstack([X_min, X_maj])
    y_array = np.hstack([y_min, y_maj])
    
    # Create feature names
    feat_names = [f"feat_{i}" for i in range(n_features)]

    X_df = pd.DataFrame(X_array, columns=feat_names)
    
    return (X_df, pd.Series(y_array, name='target'))

***
Function for saving specific dataset in .csv
***

In [48]:
def save_dataset_csv(X, y, name, folder_path="../data"):
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
    
    df_combined = X.copy()
    target_col_name = 'target' 
    df_combined[target_col_name] = y.values
 
    file_path = os.path.join(folder_path, f"{name}.csv")
    df_combined.to_csv(file_path, index=False)
    print(f"Saved: {file_path}")

***
Function for loading specific dataset from .csv
***

In [49]:
def load_dataset_csv(file_path):
    df = pd.read_csv(file_path)
    target_col_name = 'target'
    
    if target_col_name not in df.columns:
        raise ValueError(f"Column '{target_col_name}' not found in {file_path}")
    
    y = df[target_col_name]
    X = df.drop(columns=[target_col_name])
    y = y.astype(int)
    
    return X, y

***
Function for loading datasets from folder 
***

In [50]:
def load_datasets_csv(folder_path="../data"):
    datasets_dict = {}
    
    search_path = os.path.join(folder_path, "*.csv")
    files = glob.glob(search_path)
    
    print(f"--- Loading datasets from '{folder_path}/' ---")
    for file_path in tqdm(files):
        filename = os.path.basename(file_path)
        name = os.path.splitext(filename)[0]
        
        try:
            X, y = load_dataset_csv(file_path)
            datasets_dict[name] = (X, y)
            print(f"Loaded: {name} | Shape: {X.shape}")
        except Exception as e:
            print(f"Error loading {name}: {e}")
            
    return datasets_dict

***
Function for choosing only numerical attributes
***

In [51]:
def drop_categorical_columns(X):
    X_numeric = X.select_dtypes(include=['number'])
    
    dropped_count = X.shape[1] - X_numeric.shape[1]
    if dropped_count > 0:
        print(f"-> Dropped {dropped_count} categorical/object columns.")
    
    return X_numeric

***
Function for normilizing the data
***

In [52]:
def preprocess_and_normalize(X):
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)

    num_cols = X.select_dtypes(include="number").columns
    cat_cols = X.columns.difference(num_cols)

    imputer = SimpleImputer(strategy="mean")
    scaler = StandardScaler()

    X_num = imputer.fit_transform(X[num_cols])
    X_num = scaler.fit_transform(X_num)

    X_num_df = pd.DataFrame(X_num, columns=num_cols, index=X.index)

    X_cat_df = X[cat_cols].copy()

    X_processed = pd.concat([X_num_df, X_cat_df], axis=1)

    X_processed = X_processed[X.columns]

    return X_processed

***
Downloading datasets from UCI or MlOpen
***

In [53]:
uci_mlopen_datasets = download_datasets()

Fetching abalone9-18 (UCI ID: 1)...

--------------------------------------------------
         variable    dtype  n_unique unique_values  truncated
0             Sex   object         3     [M, F, I]      False
1          Length  float64       134          None      False
2        Diameter  float64       111          None      False
3          Height  float64        51          None      False
4    Whole_weight  float64      2429          None      False
5  Shucked_weight  float64      1515          None      False
6  Viscera_weight  float64       880          None      False
7    Shell_weight  float64       926          None      False
  variable  dtype  n_unique unique_values  truncated
0    Rings  int64        28          None      False
-> abalone9-18 Processed:
 [18]  Minority Samples (+1): 42
 [9]  Majority Samples (-1): 689
   Imbalance Ratio (Maj/Min): 16.40
--------------------------------------------------

Fetching ecoli-01vs235 (UCI ID: 39)...

----------------------------

***
Creating Synthetic Datasets
***

In [54]:
synthetic_datasets = {}

synthetic_datasets['synth_type1_1d_v1'] = create_synthetic_overlap(n_features=1, weights=(0.9, 0.1), density_type='high')
synthetic_datasets['synth_type1_1d_v2'] = create_synthetic_overlap(n_features=1, weights=(0.94, 0.06), density_type='high')

synthetic_datasets['synth_type1_2d_v1'] = create_synthetic_overlap(n_features=2, weights=(0.81, 0.19), density_type='high')
synthetic_datasets['synth_type1_2d_v2'] = create_synthetic_overlap(n_features=2, weights=(0.9, 0.1), density_type='high')

synthetic_datasets['synth_type1_3d_v1'] = create_synthetic_overlap(n_features=3, weights=(0.75, 0.25), density_type='high')
synthetic_datasets['synth_type1_3d_v2'] = create_synthetic_overlap(n_features=3, weights=(0.95, 0.05), density_type='high')

synthetic_datasets['synth_type2_2d'] = create_synthetic_overlap(n_features=2, density_type='sparse')

***
Saving UCI datasets
***

In [55]:
for key, (X, y) in tqdm(uci_mlopen_datasets.items()):
    save_dataset_csv(X=X, y=y, name=f"{key}_raw", folder_path='../data/uci_ml_raw')

100%|█████████████████████████████████████████████████████████████████████████████████| 16/16 [00:00<00:00, 356.11it/s]

Saved: ../data/uci_ml\abalone9-18_raw.csv
Saved: ../data/uci_ml\ecoli-01vs235_raw.csv
Saved: ../data/uci_ml\ecoli-01vs5_raw.csv
Saved: ../data/uci_ml\ecoli-0147vs56_raw.csv
Saved: ../data/uci_ml\ecoli-0234vs5_raw.csv
Saved: ../data/uci_ml\ecoli-046vs5_raw.csv
Saved: ../data/uci_ml\ecoli-067vs5_raw.csv
Saved: ../data/uci_ml\ecoli2_raw.csv
Saved: ../data/uci_ml\ecoli3_raw.csv
Saved: ../data/uci_ml\glass6_raw.csv
Saved: ../data/uci_ml\haberman_raw.csv
Saved: ../data/uci_ml\iris0_raw.csv
Saved: ../data/uci_ml\page-blocks-13vs4_raw.csv
Saved: ../data/uci_ml\spect_raw.csv
Saved: ../data/uci_ml\yeast-1vs7_raw.csv
Saved: ../data/uci_ml\yeast-2vs4_raw.csv





***
Saving synthetic datasets
***

In [56]:
for key, (X, y) in tqdm(synthetic_datasets.items()):
    save_dataset_csv(X=X, y=y, name=f"{key}", folder_path='../data/synthetic')

100%|███████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 251.38it/s]

Saved: ../data/synthetic\synth_type1_1d_v1.csv
Saved: ../data/synthetic\synth_type1_1d_v2.csv
Saved: ../data/synthetic\synth_type1_2d_v1.csv
Saved: ../data/synthetic\synth_type1_2d_v2.csv
Saved: ../data/synthetic\synth_type1_3d_v1.csv
Saved: ../data/synthetic\synth_type1_3d_v2.csv
Saved: ../data/synthetic\synth_type2_2d.csv





***
Deleting categorical columns from datasets
***

In [57]:
uci_mlopen_datasets_numerical = {}
for key, (X, y) in tqdm(uci_mlopen_datasets.items()):
    uci_mlopen_datasets_numerical[key] = (drop_categorical_columns(X), y)

100%|████████████████████████████████████████████████████████████████████████████████| 16/16 [00:00<00:00, 7995.81it/s]

-> Dropped 1 categorical/object columns.





***
Normilizing datasets
***

In [61]:
uci_mlopen_datasets_numerical_normilized = {}
for key, (X, y) in tqdm(uci_mlopen_datasets_numerical.items()):
    uci_mlopen_datasets_numerical_normilized[key] = (preprocess_and_normalize(X), y)

100%|█████████████████████████████████████████████████████████████████████████████████| 16/16 [00:00<00:00, 382.54it/s]


***
Saving preprocessed datasets
***

In [62]:
for key, (X, y) in tqdm(uci_mlopen_datasets_numerical_normilized.items()):
    save_dataset_csv(X=X, y=y, name=f"{key}_processed", folder_path='../data/uci_ml_processed')

100%|█████████████████████████████████████████████████████████████████████████████████| 16/16 [00:00<00:00, 127.55it/s]

Saved: ../data/uci_ml_processed\abalone9-18_processed.csv
Saved: ../data/uci_ml_processed\ecoli-01vs235_processed.csv
Saved: ../data/uci_ml_processed\ecoli-01vs5_processed.csv
Saved: ../data/uci_ml_processed\ecoli-0147vs56_processed.csv
Saved: ../data/uci_ml_processed\ecoli-0234vs5_processed.csv
Saved: ../data/uci_ml_processed\ecoli-046vs5_processed.csv
Saved: ../data/uci_ml_processed\ecoli-067vs5_processed.csv
Saved: ../data/uci_ml_processed\ecoli2_processed.csv
Saved: ../data/uci_ml_processed\ecoli3_processed.csv
Saved: ../data/uci_ml_processed\glass6_processed.csv
Saved: ../data/uci_ml_processed\haberman_processed.csv
Saved: ../data/uci_ml_processed\iris0_processed.csv
Saved: ../data/uci_ml_processed\page-blocks-13vs4_processed.csv
Saved: ../data/uci_ml_processed\spect_processed.csv
Saved: ../data/uci_ml_processed\yeast-1vs7_processed.csv
Saved: ../data/uci_ml_processed\yeast-2vs4_processed.csv





***
Load Datasets
***

In [64]:
senthetic_datasets = load_datasets_csv('../data/synthetic')
uci_mlopen_processed_datasets = load_datasets_csv('../data/uci_ml_processed')

--- Loading datasets from '../data/synthetic/' ---


100%|███████████████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 611.92it/s]


Loaded: synth_type1_1d_v1 | Shape: (1000, 1)
Loaded: synth_type1_1d_v2 | Shape: (1000, 1)
Loaded: synth_type1_2d_v1 | Shape: (1000, 2)
Loaded: synth_type1_2d_v2 | Shape: (1000, 2)
Loaded: synth_type1_3d_v1 | Shape: (1000, 3)
Loaded: synth_type1_3d_v2 | Shape: (1000, 3)
Loaded: synth_type2_2d | Shape: (1000, 2)
Loaded: synth_type2_2d_example | Shape: (1000, 2)
--- Loading datasets from '../data/uci_ml_processed/' ---


100%|█████████████████████████████████████████████████████████████████████████████████| 16/16 [00:00<00:00, 466.56it/s]

Loaded: abalone9-18_processed | Shape: (731, 7)
Loaded: ecoli-0147vs56_processed | Shape: (332, 7)
Loaded: ecoli-01vs235_processed | Shape: (244, 7)
Loaded: ecoli-01vs5_processed | Shape: (240, 7)
Loaded: ecoli-0234vs5_processed | Shape: (202, 7)
Loaded: ecoli-046vs5_processed | Shape: (203, 7)
Loaded: ecoli-067vs5_processed | Shape: (220, 7)
Loaded: ecoli2_processed | Shape: (336, 7)
Loaded: ecoli3_processed | Shape: (336, 7)
Loaded: glass6_processed | Shape: (214, 9)
Loaded: haberman_processed | Shape: (306, 3)
Loaded: iris0_processed | Shape: (150, 4)
Loaded: page-blocks-13vs4_processed | Shape: (5029, 10)
Loaded: spect_processed | Shape: (267, 22)
Loaded: yeast-1vs7_processed | Shape: (459, 8)
Loaded: yeast-2vs4_processed | Shape: (514, 8)



