In [2]:
pip install openml

Collecting openml
  Downloading openml-0.15.1-py3-none-any.whl.metadata (10 kB)
Collecting liac-arff>=2.4.0 (from openml)
  Downloading liac-arff-2.5.0.tar.gz (13 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting xmltodict (from openml)
  Downloading xmltodict-0.14.2-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting minio (from openml)
  Downloading minio-7.2.15-py3-none-any.whl.metadata (6.7 kB)
Collecting pycryptodome (from minio->openml)
  Downloading pycryptodome-3.21.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Downloading openml-0.15.1-py3-none-any.whl (160 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m160.4/160.4 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading minio-7.2.15-py3-none-any.whl (95 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.1/95.1 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading xmltodict-0.14.2-py2.py3-none-any.whl (10.0 kB)
Downloading pycr

In [3]:
pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.8.0-py3-none-any.whl.metadata (7.9 kB)
Downloading category_encoders-2.8.0-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.8.0


In [None]:
from openml.datasets import get_dataset

dataset = get_dataset(42706)
X, y, _, _ = dataset.get_data(target=dataset.default_target_attribute)

print(X.dtypes.value_counts())

uint8    197631
int64      2369
Name: count, dtype: int64


In [17]:
import openml
import pandas as pd
# import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

class OpenMLPipeline:
    def __init__(self, dataset_id, test_size=0.2, random_state=42):
        self.dataset_id = dataset_id
        self.test_size = test_size
        self.random_state = random_state
        self.dataset = None
        self.features = None
        self.labels = None
        self.preprocessor = None

    def fetch_data(self):
        print(f"Fetching dataset {self.dataset_id} from OpenML...")
        dataset = openml.datasets.get_dataset(self.dataset_id)
        X, y, _, _ = dataset.get_data(target=dataset.default_target_attribute)
        self.features = X
        self.labels = y
        print("fetch data:", X.shape)
        return X, y

    def preprocess_data(self):
        if self.features is None or self.labels is None:
            raise ValueError("Dataset not loaded. Call fetch_data() first.")

        numeric_features = self.features.select_dtypes(include=['int64', 'float64', 'uint8']).columns
        categorical_features = self.features.select_dtypes(include=['object', 'category']).columns

        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ])

        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ])

        self.preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features)
            ]
        )

        X_processed = self.preprocessor.fit_transform(self.features)
        print("Feature Data Types:\n", self.features.dtypes)
        print("Original features:", list(self.features.columns))
        print("Numeric features used:", list(numeric_features))
        print("Categorical features used:", list(categorical_features))
        return X_processed, self.labels

    def split_data(self, X, y):
        return train_test_split(X, y, test_size=self.test_size, random_state=self.random_state)

    def save_to_csv(self, X, y, X_train, X_test, y_train, y_test):
        df_full = pd.DataFrame(X)
        df_full['target'] = y.values
        df_full.to_csv("processed_dataset.csv", index=False)

        df_train = pd.DataFrame(X_train)
        df_train['target'] = y_train.values
        df_train.to_csv("train_dataset.csv", index=False)

        df_test = pd.DataFrame(X_test)
        df_test['target'] = y_test.values
        df_test.to_csv("test_dataset.csv", index=False)

        print("Processed datasets saved: processed_dataset.csv, train_dataset.csv, test_dataset.csv")

    def run_pipeline(self):
        self.fetch_data()
        X_processed, y = self.preprocess_data()
        X_train, X_test, y_train, y_test = self.split_data(X_processed, y)
        self.save_to_csv(X_processed, y, X_train, X_test, y_train, y_test)
        print("Data pipeline completed.")
        return X_train, X_test, y_train, y_test

if __name__ == "__main__":
    pipeline = OpenMLPipeline(dataset_id=50)  # Example dataset ID
    X_train, X_test, y_train, y_test = pipeline.run_pipeline()
    print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")


Fetching dataset 50 from OpenML...
fetch data: (958, 9)
Feature Data Types:
 top-left-square         category
top-middle-square       category
top-right-square        category
middle-left-square      category
middle-middle-square    category
middle-right-square     category
bottom-left-square      category
bottom-middle-square    category
bottom-right-square     category
dtype: object
Original features: ['top-left-square', 'top-middle-square', 'top-right-square', 'middle-left-square', 'middle-middle-square', 'middle-right-square', 'bottom-left-square', 'bottom-middle-square', 'bottom-right-square']
Numeric features used: []
Categorical features used: ['top-left-square', 'top-middle-square', 'top-right-square', 'middle-left-square', 'middle-middle-square', 'middle-right-square', 'bottom-left-square', 'bottom-middle-square', 'bottom-right-square']
Processed datasets saved: processed_dataset.csv, train_dataset.csv, test_dataset.csv
Data pipeline completed.
Train shape: (766, 27), Test sha

In [None]:
import openml
import pandas as pd
import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

class OpenMLPipeline:
    def __init__(self, dataset_id, test_size=0.2, random_state=42):
        self.dataset_id = dataset_id
        self.test_size = test_size
        self.random_state = random_state
        self.dataset = None
        self.features = None
        self.labels = None
        self.preprocessor = None

    def fetch_data(self):
        print(f"Fetching dataset {self.dataset_id} from OpenML...")
        dataset = openml.datasets.get_dataset(self.dataset_id)
        X, y, _, _ = dataset.get_data(target=dataset.default_target_attribute)
        self.features = X
        self.labels = y
        print("fetch data:", X.shape)
        return X, y

    def preprocess_data(self):
      if self.features is None or self.labels is None:
        raise ValueError("Dataset not loaded. Call fetch_data() first.")

      # Identify numeric and categorical features
      numeric_features = self.features.select_dtypes(include=['int64', 'float64', 'uint8']).columns
      categorical_features = self.features.select_dtypes(include=['object', 'category']).columns

      # Apply Target Encoding separately
      if not categorical_features.empty:
        target_encoder = ce.TargetEncoder(cols=categorical_features)
        self.features[categorical_features] = target_encoder.fit_transform(self.features[categorical_features], self.labels)

      # Define numeric transformer pipeline
      numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
      ])

      # Create ColumnTransformer for numeric features only (categoricals are already encoded)
      self.preprocessor = ColumnTransformer(
        transformers=[
          ('num', numeric_transformer, numeric_features)
        ]
      )

      # Transform numeric features
      X_processed = self.preprocessor.fit_transform(self.features)

      print("Feature Data Types:\n", self.features.dtypes)
      print("Numeric features used:", list(numeric_features))
      print("Categorical features used (Target Encoded):", list(categorical_features))
      print("Processed dataset shape:", X_processed.shape)

      return X_processed, self.labels

    def split_data(self, X, y):
        return train_test_split(X, y, test_size=self.test_size, random_state=self.random_state)

    def save_to_csv(self, X, y, filename="processed_dataset.csv"):
        df = pd.DataFrame(X)
        df['target'] = y.values
        df.to_csv(filename, index=False)
        print(f"Processed dataset saved to {filename}")

    def run_pipeline(self):
        self.fetch_data()
        X_processed, y = self.preprocess_data()
        self.save_to_csv(X_processed, y)
        X_train, X_test, y_train, y_test = self.split_data(X_processed, y)
        print("Data pipeline completed.")
        return X_train, X_test, y_train, y_test

if __name__ == "__main__":
    pipeline = OpenMLPipeline(dataset_id=31)
    X_train, X_test, y_train, y_test = pipeline.run_pipeline()
    print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")


Fetching dataset 31 from OpenML...
fetch data: (1000, 20)


AttributeError: 'numpy.ndarray' object has no attribute 'groupby'