In [None]:
!pip install orkg
!pip install openml

# **Define variables**

In [272]:
# --- Settings to edit before running the notebook ---
# ID passed to ORKG when the class is created.
custom_classID = 'C75446'
# Base label of that class.
dataset_name_label = 'Class Test3'
# Name of a single dataset (not referenced by the other cells shown here).
dataset_name = 'ilpd'

# Meta-feature label -> ORKG predicate ID used when statements are written.
feature_predicates = {
    'Number of instances': 'P39021',
    'Number of features': 'P72007',
    'Number of classes': 'P52036',
    'Standard Deviation Ratio': 'P114000',
    'Class Entropy': 'P114002',
    'Normal Entropy': 'P114003',
    'hasFeature': 'P114013',
}

# Dataset names that the upload loop fetches and processes, in this order.
automl_datasets = [
    'kr-vs-kp', 'letter', 'balance-scale', 'mfeat-factors', 'mfeat-fourier',
    'breast-w', 'mfeat-karhunen', 'mfeat-morphological', 'mfeat-zernike',
    'optdigits', 'credit-approval', 'credit-g', 'pendigits', 'diabetes',
    'sick', 'spambase', 'splice', 'tic-tac-toe', 'vehicle', 'electricity',
    'satimage', 'isolet', 'vowel', 'analcatdata_authorship',
    'analcatdata_dmft', 'mnist_784', 'pc4', 'pc3', 'jm1', 'kc2', 'kc1',
    'pc1', 'bank-marketing', 'banknote-authentication',
    'blood-transfusion-service-center', 'cnae-9',
    'first-order-theorem-proving', 'har', 'ilpd', 'madelon', 'nomao',
    'ozone-level-8hr', 'phoneme', 'qsar-biodeg', 'wall-robot-navigation',
    'semeion', 'wdbc', 'adult', 'Bioresponse', 'PhishingWebsites',
    'GesturePhaseSegmentationProcessed', 'cylinder-bands', 'dresses-sales',
    'numerai28.6', 'texture', 'dna', 'churn', 'Devnagari-Script',
    'CIFAR_10', 'MiceProtein', 'car', 'Internet-Advertisements',
    'mfeat-pixel', 'steel-plates-fault', 'wilt', 'segment',
    'climate-model-simulation-crashes', 'Fashion-MNIST',
    'jungle_chess_2pcs_raw_endgame_complete', 'JapaneseVowels',
]

In [160]:
from google.colab import userdata
# Read the ORKG login from the Colab secrets stored under the keys
# 'email' and 'password', so no credential is written into the notebook.
email, password = (userdata.get(secret_key) for secret_key in ('email', 'password'))

In [161]:
from orkg import ORKG, Hosts

# ORKG client pointed at the sandbox host, authenticated with the `email`
# and `password` variables; every later cell talks to ORKG through it.
orkg = ORKG(host=Hosts.SANDBOX, creds=(email, password))

# **Create the Class with the CustomID**

In [257]:
# Add a class to ORKG with the ID from `custom_classID`; the label is
# `dataset_name_label` followed by ' Test'. `.content` is the response body.
# NOTE(review): re-running this cell with an ID that already exists
# presumably fails - confirm before a Restart & Run All.
classes = orkg.classes.add(id=custom_classID, label=f'{dataset_name_label} Test').content
# Show the returned class description as the cell output.
classes

{'id': 'C75446',
 'label': 'Class Test3 Test',
 'uri': None,
 'description': None,
 'created_at': '2024-03-10T23:28:19.374313343+01:00',
 'created_by': 'ba7b42b0-dbeb-41fb-99d5-cb6b4140660e',
 'modifiable': True,
 '_class': 'class'}

# **Extract the Meta-Features from the Datasets**

In [267]:
import pandas as pd
import numpy as np
from sklearn import datasets
from scipy.stats import entropy

def extract_meta_features(dataset_name):
    """Fetch an OpenML dataset by name and compute its meta-features.

    Parameters
    ----------
    dataset_name : str
        Dataset name handed to ``sklearn.datasets.fetch_openml``.

    Returns
    -------
    pandas.DataFrame
        One row per meta-feature, in this fixed order: number of instances,
        number of features, number of classes, skewness, kurtosis, min, max,
        mean, median, standard deviation ratio, class entropy, normal entropy.
        Columns:

        * ``'Meta-Feature'`` - the label of the meta-feature.
        * ``'Value'`` - a plain Python scalar for single-valued rows, or a
          ``pandas.Series`` (one entry per numeric column, rounded to two
          decimals) for the six per-column statistics.
        * ``'hasFeatures'`` - the same per-column Series on the six
          multi-valued rows, NaN on all other rows.

    Raises
    ------
    ValueError
        If the fetched dataset exposes no DataFrame, or if none of the known
        target column names is present in it.
    """
    # Fetch the dataset from OpenML
    dataset = datasets.fetch_openml(dataset_name, parser='auto', as_frame='auto')
    frame = getattr(dataset, 'frame', None)
    if frame is None:
        # Everything below needs the combined DataFrame; fail with a clear
        # message instead of an AttributeError on None.
        raise ValueError(f"Dataset '{dataset_name}' is not available as a DataFrame.")

    # Identify the target variable: the first known name found among the columns.
    possible_targets = ['Class', 'class', 'Defects', 'c', 'Author','band_type','defects','Prevention','problems', 'binaryClass','y','character','attribute_21' ,'target', 'Result','Phase', 'result']
    target = next((t for t in possible_targets if t in frame.columns), None)

    if target is None:
        raise ValueError("Target variable not found in the dataset.")

    target_column = frame[target]

    # Extract meta-features
    # The feature count is the frame's column count, so it includes the target.
    num_instances, num_features = frame.shape
    # Count classes on the same column the entropy is computed from, so both
    # figures agree and no separate `dataset.target` is required.
    num_classes = int(target_column.nunique())

    # Per-column statistics over the numeric columns only.
    skewness = round(frame.skew(numeric_only=True), 2)
    kurtosis = round(frame.kurt(numeric_only=True), 2)
    min_values = round(frame.min(numeric_only=True), 2)
    max_values = round(frame.max(numeric_only=True), 2)
    column_means = frame.mean(numeric_only=True)
    mean_values = round(column_means, 2)
    median_values = round(frame.median(numeric_only=True), 2)

    # Average std/mean ratio across numeric columns. A zero-mean column gives
    # an infinite ratio that would turn the whole average into inf, so such
    # columns are left out of the average.
    std_to_mean = frame.std(numeric_only=True) / column_means
    std_to_mean = std_to_mean.replace([np.inf, -np.inf], np.nan)
    sd_ratio = float(round(std_to_mean.mean(), 2))

    # Shannon entropy (base 2) of the class distribution.
    class_entropy = float(round(entropy(target_column.value_counts(normalize=True), base=2), 2))
    # Normalise by the maximum entropy log2(k). With fewer than two classes
    # that maximum is 0, so the normalised value is defined as 0.0.
    if num_classes > 1:
        normal_entropy = float(round(class_entropy / np.log2(num_classes), 2))
    else:
        normal_entropy = 0.0

    # Store meta-features in a DataFrame
    labels = ['Number of instances', 'Number of features', 'Number of classes', 'Skewness', 'Kurtosis', 'Min values', 'Max values',
              'Mean values', 'Median values', 'Standard Deviation Ratio', 'Class Entropy',
              'Normal Entropy']
    values = [num_instances, num_features, num_classes, skewness,
              kurtosis, min_values, max_values, mean_values, median_values, sd_ratio, class_entropy,
              normal_entropy]
    meta_features_df = pd.DataFrame({'Meta-Feature': labels, 'Value': values})

    # Create 'hasFeatures' column for meta-features that have multiple values:
    # these are exactly the rows whose value is a per-column Series.
    multi_value_indices = [i for i, value in enumerate(values) if isinstance(value, pd.Series)]
    meta_features_df['hasFeatures'] = pd.Series([values[i] for i in multi_value_indices], index=multi_value_indices)
    return meta_features_df


# **Adding single-valued meta-features to ORKG resources**

In [None]:
# For every dataset: compute its meta-features, create one ORKG resource of
# the custom class, and attach each single-valued meta-feature to that
# resource as a literal through the matching predicate.
for dataset in automl_datasets:
    # Extract first: if this raises, no empty resource has been created in
    # ORKG for the dataset.
    data = extract_meta_features(dataset)
    # Look values up by meta-feature label instead of by row position.
    values_by_name = data.set_index('Meta-Feature')['Value']

    resource = orkg.resources.add(label=dataset, classes=[custom_classID]).content
    resource_id = resource['id']

    single_value_names = [
        'Number of instances',
        'Number of features',
        'Number of classes',
        'Standard Deviation Ratio',
        'Class Entropy',
        'Normal Entropy',
    ]
    single_value_metaFeatures = {name: values_by_name[name] for name in single_value_names}
    # NOTE(review): 'hasFeature' is written as the literal 0 - presumably a
    # placeholder for the multi-valued meta-features; confirm.
    single_value_metaFeatures['hasFeature'] = 0

    print(f"adding {dataset}")
    for feature, value in single_value_metaFeatures.items():
        # add Literal
        literal_id = orkg.literals.add(label=str(value)).content['id']
        # look up the predicate for this meta-feature
        predicate_id = feature_predicates[feature]

        # connect them: resource --predicate--> literal
        statement_data = {
            'subject_id': resource_id,
            'predicate_id': predicate_id,
            'object_id': literal_id
        }
        orkg.statements.add(**statement_data)
    print(f"Done {dataset}")