In [8]:
import pandas as pd
#from datasets import load_dataset
from factory import create_full_data_loader
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
import numpy as np
from tabulate import tabulate

In [9]:
# Build a loader for the "housing" dataset and pull its data once, so the
# returned objects can be inspected in the following cells.
data_loader = create_full_data_loader(
    "housing",
    test_size=0.2,
    normalize_features="mean_std",
    return_extra_info=True,
    encode_categorical=False,
)
# load_data() yields three values here: features, target and extra info.
X, y, extra_info = data_loader.load_data()

In [10]:
# Show the target returned by load_data() above as this cell's output.
y

0        4.526
1        3.585
2        3.521
3        3.413
4        3.422
         ...  
20635    0.781
20636    0.771
20637    0.923
20638    0.847
20639    0.894
Name: target, Length: 20640, dtype: float64

In [11]:
# Column order of the summary table (one row per successfully loaded dataset).
SUMMARY_COLUMNS = ['Dataset', 'Target Type', 'Total Samples',
                   'Num Classes', "Pos. Class Rate", 'Total Features', 'Num Categorical', 'Num Numerical']


def summarize_dataset(dataset_name, X, y):
    """Compute one row of the dataset summary table.

    Parameters
    ----------
    dataset_name : str
        Name stored in the 'Dataset' column.
    X : pandas.DataFrame
        Feature table; rows are samples, columns are features.
    y : pandas.Series
        Target values, one per row of ``X``.

    Returns
    -------
    dict
        Mapping with exactly the keys in ``SUMMARY_COLUMNS``. 'Num Classes'
        is None for regression targets; 'Pos. Class Rate' is None unless the
        target is binary with labels 0/1.
    """
    total_samples, total_features = X.shape

    # Decide from the dtype rather than from the Python type of the first
    # element: this also covers int32/bool/object/categorical labels and does
    # not fail on an empty target. Float targets are treated as regression.
    is_classification = not pd.api.types.is_float_dtype(y)
    target_type = 'Classification' if is_classification else 'Regression'
    num_classes = y.nunique() if is_classification else None

    # 'category' columns are categorical too, not only 'object' ones.
    num_categorical = X.select_dtypes(include=['object', 'category']).shape[1]
    num_numerical = X.select_dtypes(include=['number']).shape[1]

    # The share of label 1 is only meaningful for 0/1-encoded binary targets;
    # for other encodings (e.g. 1/2 or strings) no rate is reported.
    if num_classes == 2 and y.isin([0, 1]).all():
        class_freq = round(float((y == 1).mean()), 4)
    else:
        class_freq = None

    return {
        'Dataset': dataset_name,
        'Target Type': target_type,
        'Total Samples': total_samples,
        'Num Classes': num_classes,
        "Pos. Class Rate": class_freq,
        'Total Features': total_features,
        'Num Categorical': num_categorical,
        'Num Numerical': num_numerical,
    }


# Datasets to summarise
loaders = [
    "creditcard",
    "iris",
    "titanic",
    "bufix",
    "breastcancer",
    "housing",
    "ageconditions",
    "adult",
    "heloc",
    "covertype",
    "diabetes",
]

# Collect plain records and build the DataFrame once after the loop instead of
# concatenating onto a growing (initially empty) frame on every iteration.
summary_rows = []

for dataset_name in loaders:
    try:
        data_loader = create_full_data_loader(dataset_name, test_size=0.2, normalize_features="mean_std", return_extra_info=True, encode_categorical=False)
        X, y, extra_info = data_loader.load_data()

        summary_rows.append(summarize_dataset(dataset_name, X, y))

        print("SUCCESS", dataset_name)
    except Exception as e:
        # Best effort: a dataset that cannot be loaded or summarised is
        # reported and skipped so the remaining datasets are still processed.
        print("FAIL", dataset_name)
        print(repr(e))

# dtype=object keeps integer counts as ints and missing entries as None
# instead of coercing mixed columns to float/NaN.
summary_data = pd.DataFrame(summary_rows, columns=SUMMARY_COLUMNS, dtype=object)

# Print aggregated summary
print("\nAggregated Dataset Summary:")
print(summary_data)


SUCCESS creditcard
SUCCESS iris
titanic df len: 891
survived
0    549
1    342
Name: count, dtype: int64
SUCCESS titanic
FAIL bufix
ValueError('Invalid dataset: bufix')
SUCCESS breastcancer
SUCCESS housing
SUCCESS ageconditions
SUCCESS adult
SUCCESS heloc
covertype df 281011
SUCCESS covertype
[0, 687, 1374, 2061, 2748, 3435, 4122, 4809, 5496, 6183, 6870]
[0, 676, 1352, 2028, 2704, 3380, 4056, 4732, 5408, 6084, 6760]
[0, 1156, 2312, 3468, 4624, 5780, 6936, 8092, 9248, 10404, 11560]
SUCCESS diabetes

Aggregated Dataset Summary:
         Dataset     Target Type Total Samples Num Classes Pos. Class Rate  \
0     creditcard  Classification        284807           2          0.0017   
1           iris  Classification           150           3            None   
2        titanic  Classification           891           2          0.3838   
3   breastcancer  Classification           569           2          0.6274   
4        housing      Regression         20640        None            None   


In [12]:
# Persist the summary table next to the notebook; the row index is not written.
summary_data.to_csv(path_or_buf="summary_data.csv", index=False)

In [13]:
# Show the full summary table as this cell's output.
summary_data

Unnamed: 0,Dataset,Target Type,Total Samples,Num Classes,Pos. Class Rate,Total Features,Num Categorical,Num Numerical
0,creditcard,Classification,284807,2.0,0.0017,30,0,30
1,iris,Classification,150,3.0,,4,0,4
2,titanic,Classification,891,2.0,0.3838,12,5,7
3,breastcancer,Classification,569,2.0,0.6274,30,0,30
4,housing,Regression,20640,,,10,0,10
5,ageconditions,Classification,622,2.0,0.1736,56,0,56
6,adult,Classification,32561,2.0,0.2408,15,8,7
7,heloc,Classification,10459,2.0,0.5219,24,0,24
8,covertype,Classification,281011,7.0,,54,0,54
9,diabetes,Classification,101766,2.0,0.1116,47,39,8


In [14]:
# Render the summary as a plain-text grid. Note that the "fancy_grid" format
# draws its borders with Unicode box characters, not pure ASCII.
ascii_table = tabulate(
    summary_data,
    headers='keys',
    tablefmt='fancy_grid',
)

# Preview the rendered table
print(ascii_table)


╒════╤═══════════════╤════════════════╤═════════════════╤═══════════════╤═══════════════════╤══════════════════╤═══════════════════╤═════════════════╕
│    │ Dataset       │ Target Type    │   Total Samples │   Num Classes │   Pos. Class Rate │   Total Features │   Num Categorical │   Num Numerical │
╞════╪═══════════════╪════════════════╪═════════════════╪═══════════════╪═══════════════════╪══════════════════╪═══════════════════╪═════════════════╡
│  0 │ creditcard    │ Classification │          284807 │             2 │            0.0017 │               30 │                 0 │              30 │
├────┼───────────────┼────────────────┼─────────────────┼───────────────┼───────────────────┼──────────────────┼───────────────────┼─────────────────┤
│  1 │ iris          │ Classification │             150 │             3 │                   │                4 │                 0 │               4 │
├────┼───────────────┼────────────────┼─────────────────┼───────────────┼───────────────────┼─