In [None]:
pip install openml

Collecting openml
  Downloading openml-0.15.1-py3-none-any.whl.metadata (10 kB)
Collecting liac-arff>=2.4.0 (from openml)
  Downloading liac-arff-2.5.0.tar.gz (13 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting xmltodict (from openml)
  Downloading xmltodict-0.14.2-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting minio (from openml)
  Downloading minio-7.2.15-py3-none-any.whl.metadata (6.7 kB)
Collecting pycryptodome (from minio->openml)
  Downloading pycryptodome-3.22.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Downloading openml-0.15.1-py3-none-any.whl (160 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m160.4/160.4 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading minio-7.2.15-py3-none-any.whl (95 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.1/95.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading xmltodict-0.14.2-py2.py3-none-any.whl (10.0 kB)
Downloading pycr

In [None]:
import shutil
import os

folder_path = "/content/data/processed"

if os.path.exists(folder_path):
    shutil.rmtree(folder_path)
    print(f" Deleted folder: {folder_path}")
else:
    print(f" Folder not found: {folder_path}")


⚠️ Folder not found: /content/data/processed


In [None]:
import openml
import pandas as pd

# List all datasets
datasets = openml.datasets.list_datasets(output_format='dataframe')

# Filter for classification datasets with ~20 raw features and manageable size
filtered = datasets[
    (datasets['NumberOfFeatures'] <= 21) &
    (datasets['NumberOfFeatures'] > 8) & # Allow some wiggle room
    (datasets['NumberOfClasses'].notna()) &
    (datasets['NumberOfInstances'] > 3000) &
    (datasets['NumberOfInstances'] <= 5000)
]

# Display key info
id_list = filtered['did'].tolist()
print(f"Selected {len(id_list)} datasets:")
print(id_list)

KeyboardInterrupt: 

In [None]:
import os

os.makedirs('/content/data/train', exist_ok=True)
os.makedirs('/content/data/test', exist_ok=True)

In [None]:
import openml
import pandas as pd
# import category_encoders as ce
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

class OpenMLPipeline:
    def __init__(self, dataset_id, test_size=0.2, random_state=42):
        self.dataset_id = dataset_id
        self.test_size = test_size
        self.random_state = random_state
        self.dataset = None
        self.features = None
        self.labels = None
        self.preprocessor = None

    def fetch_data(self):
        print(f"Fetching dataset {self.dataset_id} from OpenML...")
        dataset = openml.datasets.get_dataset(self.dataset_id)
        X, y, _, _ = dataset.get_data(target=dataset.default_target_attribute)
        self.features = X
        self.labels = y

        # Convert target to numerical if it's not
        if self.labels.dtype == 'object' or self.labels.dtype.name == 'category' or self.labels.dtype.name == 'bool':
          print("Target is categorical, encoding to numerical...")
          self.labels = LabelEncoder().fit_transform(self.labels)

        print("fetch data:", X.shape)
        return X, y

    def preprocess_data(self):
        if self.features is None or self.labels is None:
            raise ValueError("Dataset not loaded. Call fetch_data() first.")

        numeric_features = self.features.select_dtypes(include=['int64', 'float64', 'uint8']).columns
        categorical_features = self.features.select_dtypes(include=['object', 'category', 'bool']).columns

        if len(numeric_features) + len(categorical_features) != self.features.shape[1]:
            print("Some features are neither categorized as numeric nor categorical. Skipping this dataset.")
            return None, None

        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ])

        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ])

        self.preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features)
            ]
        )

        X_processed = self.preprocessor.fit_transform(self.features)
        # print("Feature Data Types:\n", self.features.dtypes)
        # print("Original features:", list(self.features.columns))
        # print("Numeric features used:", list(numeric_features))
        # print("Categorical features used:", list(categorical_features))
        # print("Preprocessed Data Shape:", X_processed.shape)

        if isinstance(X_processed, pd.DataFrame):
            X_processed.columns = self.preprocessor.get_feature_names_out()
        else:
            X_processed = pd.DataFrame(X_processed, columns=self.preprocessor.get_feature_names_out())
        return X_processed, self.labels

    def split_data(self, X, y):
        return train_test_split(X, y, test_size=self.test_size, random_state=self.random_state)

    def save_to_csv(self, X, y, X_train, X_test, y_train, y_test):
        # df_full = pd.DataFrame(X)
        # df_full['target'] = y.values
        # df_full.to_csv(f"processed_dataset_{self.dataset_id}.csv", index=False)

        df_train = pd.DataFrame(X_train)
        df_train['target'] = y_train
        df_train.to_csv(f"/content/data/train/train_dataset_{self.dataset_id}.csv", index=False)

        df_test = pd.DataFrame(X_test)
        df_test['target'] = y_test
        df_test.to_csv(f"/content/data/test/test_dataset_{self.dataset_id}.csv", index=False)

        print(f"Processed datasets saved: processed_dataset_{self.dataset_id}.csv, train_dataset_{self.dataset_id}.csv, test_dataset_{self.dataset_id}.csv")

    def run_pipeline(self):
        self.fetch_data()
        X_processed, y = self.preprocess_data()
        X_train, X_test, y_train, y_test = self.split_data(X_processed, y)
        self.save_to_csv(X_processed, y, X_train, X_test, y_train, y_test)
        print("Data pipeline completed.")
        return X_train, X_test, y_train, y_test

if __name__ == "__main__":
    pipeline = OpenMLPipeline(dataset_id=46517)  # Example dataset ID
    X_train, X_test, y_train, y_test = pipeline.run_pipeline()
    print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")


Fetching dataset 46517 from OpenML...
Target is categorical, encoding to numerical...
fetch data: (5960, 20)
Processed datasets saved: processed_dataset_46517.csv, train_dataset_46517.csv, test_dataset_46517.csv
Data pipeline completed.
Train shape: (4768, 20), Test shape: (1192, 20)


In [None]:
import openml
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

class OpenMLPipeline:
    def __init__(self, dataset_id):
        self.dataset_id = dataset_id
        self.dataset = None
        self.features = None
        self.labels = None
        self.preprocessor = None

    def fetch_data(self):
        print(f"Fetching dataset {self.dataset_id} from OpenML...")
        dataset = openml.datasets.get_dataset(self.dataset_id)
        X, y, _, _ = dataset.get_data(target=dataset.default_target_attribute)
        self.features = X
        self.labels = y

        if self.labels.dtype in ['object', 'bool'] or self.labels.dtype.name == 'category':
            print("Target is categorical, encoding to numerical...")
            self.labels = LabelEncoder().fit_transform(self.labels)

        print("Data shape:", X.shape)
        return self.features, self.labels

    def preprocess_data(self):
        if self.features is None or self.labels is None:
            raise ValueError("Dataset not loaded. Call fetch_data() first.")

        numeric_features = self.features.select_dtypes(include=['int64', 'float64', 'uint8']).columns
        categorical_features = self.features.select_dtypes(include=['object', 'category', 'bool']).columns

        # Warn about columns being skipped
        other_columns = self.features.columns.difference(numeric_features.union(categorical_features))
        if len(other_columns) > 0:
            print(f"Warning: Skipping unrecognized feature types: {list(other_columns)}")

        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ])

        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ])

        self.preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features)
            ]
        )

        X_processed = self.preprocessor.fit_transform(self.features)

        X_processed = pd.DataFrame(X_processed, columns=self.preprocessor.get_feature_names_out())
        return X_processed, self.labels

    def save_to_csv(self, X, y):
        os.makedirs("/content/data/processed", exist_ok=True)

        df_full = pd.DataFrame(X)
        df_full['target'] = y
        df_full.to_csv(f"/content/data/processed/processed_dataset_{self.dataset_id}.csv", index=False)

        print(f"Processed dataset saved to processed_dataset_{self.dataset_id}.csv")

    def run_pipeline(self):
      self.fetch_data()
      X_processed, y = self.preprocess_data()

      if X_processed.shape[1] == 20:
          self.save_to_csv(X_processed, y)
          print(f"Saved dataset {self.dataset_id} with shape {X_processed.shape}")
          return X_processed, y
      else:
          print(f"Skipped dataset {self.dataset_id}: final dimension = {X_processed.shape[1]} (expected 20)")
          return None, None

if __name__ == "__main__":
    pipeline = OpenMLPipeline(dataset_id=1042)  # Example dataset ID
    X_processed, y = pipeline.run_pipeline()



Fetching dataset 1042 from OpenML...
Target is categorical, encoding to numerical...
Data shape: (3468, 784)
❌ Skipped dataset 1042: final dimension = 784 (expected 20)


In [None]:
for id in id_list:
    try:
        print(f"Running pipeline on dataset ID: {id}")
        pipeline = OpenMLPipeline(dataset_id=id)
        X_processed, y = pipeline.run_pipeline()
        print(f"Shape: {X_processed.shape}, Target length: {len(y)}\n")
    except Exception as e:
        print(f"Failed on dataset {id}: {e}\n")

Running pipeline on dataset ID: 30
Fetching dataset 30 from OpenML...
Target is categorical, encoding to numerical...
Data shape: (5473, 10)
❌ Skipped dataset 30: final dimension = 10 (expected 20)
Failed on dataset 30: 'NoneType' object has no attribute 'shape'

Running pipeline on dataset ID: 189
Fetching dataset 189 from OpenML...
Data shape: (8192, 8)
❌ Skipped dataset 189: final dimension = 8 (expected 20)
Failed on dataset 189: 'NoneType' object has no attribute 'shape'

Running pipeline on dataset ID: 225
Fetching dataset 225 from OpenML...
Data shape: (8192, 8)
❌ Skipped dataset 225: final dimension = 8 (expected 20)
Failed on dataset 225: 'NoneType' object has no attribute 'shape'

Running pipeline on dataset ID: 227
Fetching dataset 227 from OpenML...
Data shape: (8192, 12)
❌ Skipped dataset 227: final dimension = 12 (expected 20)
Failed on dataset 227: 'NoneType' object has no attribute 'shape'

Running pipeline on dataset ID: 287
Fetching dataset 287 from OpenML...
Data sha



Data shape: (10000, 14)
Failed on dataset 46726: Shape of passed values is (10000, 1), indices imply (10000, 69)

Running pipeline on dataset ID: 46729
Fetching dataset 46729 from OpenML...




Data shape: (10000, 8)
Failed on dataset 46729: Shape of passed values is (10000, 1), indices imply (10000, 18012)

Running pipeline on dataset ID: 46748
Fetching dataset 46748 from OpenML...




Data shape: (6016, 20)
Failed on dataset 46748: Shape of passed values is (6016, 1), indices imply (6016, 113)

Running pipeline on dataset ID: 46911
Fetching dataset 46911 from OpenML...
Target is categorical, encoding to numerical...
Data shape: (10000, 10)
❌ Skipped dataset 46911: final dimension = 15 (expected 20)
Failed on dataset 46911: 'NoneType' object has no attribute 'shape'

Running pipeline on dataset ID: 46964
Fetching dataset 46964 from OpenML...
Data shape: (6497, 12)
❌ Skipped dataset 46964: final dimension = 13 (expected 20)
Failed on dataset 46964: 'NoneType' object has no attribute 'shape'



In [None]:
import shutil

# Zip the folder
shutil.make_archive('/content/data_folder', 'zip', '/content/data')


'/content/data_folder.zip'

In [None]:
from google.colab import files

# Download the zip file
files.download('/content/data_folder.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>