In [1]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [2]:
import openml
import os
import pandas as pd
import numpy as np
import math

from sklearn.preprocessing import (
    MinMaxScaler
)
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import f_regression, mutual_info_regression
from sklearn.tree import (
    DecisionTreeRegressor
)
from sklearn.model_selection import train_test_split, KFold, GridSearchCV

from typing import Iterable, List

In [3]:
# Prequirements:
#   - Preprocessed data is loaded
#   - Check the dataset for 
#     - Data types (only numerical)
#     - Null values
#     - Normalization

# Define: 
#   - Model
#   - Feature selection
#   - Hyperparameter grid

# Perform: 
#   - Correlation analysis and dropping of highly correlated features
#   - Feature selection
#   - Hyperparameter optimization

# Load preprocessed data and function

In [4]:
import networkx as nx
from pathlib import Path
from typing import Union, Optional, Literal

import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from gensim.models.poincare import PoincareModel


def load_graph(path: Union[Path, str]) -> nx.Graph:
    """
    Load a graph from a file. The file must be in the format of an adjacency list.
    
    :param path: Path to the file containing the graph.
    :type path: Union[Path, str]

    :return: The graph.
    """
    G = nx.read_adjlist(path)

    # add node names as labels to the graph
    node_names = {node: node for node in G.nodes()}
    nx.set_node_attributes(G, node_names, "label")

    return G


def poincare_encoding(path_to_embeddings: str, path_to_graph: str = None, data=None, column_to_encode=None,
                      dim_reduction: Optional[Literal['pca', 'tsne']] = None, n_components=2, encode_dim=50, epochs=500, seed=7,
                      explode_dim=True, verbosity=1) -> Union[pd.DataFrame, tuple[pd.DataFrame, PoincareModel]]:
    """
    Generates the Poincarè embedding for the given graph and encodes the given column of the given data with it. The
    encoding can be done in different formats. The function can also be used to just generate the embedding for the
    given graph. The graph has to be given as an edge list.

    :param dim_reduction: Dimensionality reduction method to use. Either 'pca' or 'tsne'. If None, no dimensionality
        reduction is applied.
    :type dim_reduction: Optional[Literal['pca', 'tsne']]
    :param n_components: Number of components to reduce the dimensionality to.
    :type n_components: int
    :param path_to_graph: Path to the graph.
    :type path_to_graph: str
    :param path_to_embeddings: Path to the embeddings.
    :type path_to_embeddings: str
    :param data: Data to encode.
    :type data: pandas.DataFrame
    :param column_to_encode: Column to encode.
    :type column_to_encode: str
    :param encode_dim: Dimension of the embedding.
    :type encode_dim: int
    :param epochs: Number of epochs to train the model.
    :type epochs: int
    :param seed: Seed for the random number generator.
    :type seed: int
    :param explode_dim: If True, the embedding is exploded into multiple columns.
    :type explode_dim: bool
    :param verbosity: Verbosity level.
    :type verbosity: int

    :return: The encoded data.
    :rtype: pandas.DataFrame
    """
    if path_to_graph is None:
        # Load the embeddings
        if verbosity > 0:
            print(f"Loading the embeddings from '{path_to_embeddings}'...")
        emb_df = pd.read_csv(path_to_embeddings, index_col=0)
        model = None
    else:
        # Load Graph
        G = load_graph(path_to_graph)
        # Embed the graph
        if verbosity > 0:
            print("(Poincare) Embedding the graph ...")
        model = PoincareModel(list(G.edges()), seed=seed, size=encode_dim)
        model.train(epochs=epochs, print_every=500)
        # Get the embeddings and map them to the node names
        embeddings_dict = {node: model.kv[node] for node in G.nodes}
        emb_df = pd.DataFrame.from_dict(embeddings_dict, orient='index')
        if path_to_embeddings is not None:
            # Save the embeddings
            if verbosity > 1:
                print(f"Saving the embeddings to '{path_to_embeddings}'...")
            emb_df.to_csv(path_to_embeddings)

    if dim_reduction == 'pca':
        # Reduce the dimensionality of the embeddings
        if verbosity > 1:
            print("Reducing the dimensionality of the embeddings by applying PCA...")
        pca = PCA(n_components=n_components, random_state=seed)
        emb_df = pd.DataFrame(pca.fit_transform(emb_df), index=emb_df.index)
    elif dim_reduction == 'tsne':
        # Reduce the dimensionality of the embeddings
        if verbosity > 1:
            print("Reducing the dimensionality of the embeddings by applying t-SNE...")
        tsne = TSNE(n_components=n_components, random_state=seed)
        emb_df = pd.DataFrame(tsne.fit_transform(emb_df), index=emb_df.index)

    if data is None or column_to_encode is None:
        return emb_df, model
    else:
        if verbosity > 0:
            print(f"Encoding the data feature '{column_to_encode}'...")
        if explode_dim:
            # Rename the columns to enc_dim_0, enc_dim_1, ...
            emb_df.columns = [f'enc_dim_{col}' for col in emb_df.columns]
            # Merge the embeddings with the data
            encoded_data_df = data.merge(emb_df, left_on=column_to_encode, right_index=True, how='left')
        else:
            # Combine the embeddings into one column
            emb_df['combined_enc_emb'] = emb_df.values.tolist()
            encoded_data_df = data.merge(emb_df['combined_emb'], left_on=column_to_encode, right_index=True, how='left')
        # Drop the column to encode
        encoded_data_df.drop(column_to_encode, axis=1, inplace=True)
        return encoded_data_df, model


def ohe_encode_train_data(X_train: pd.DataFrame, cols_to_encode: list, verbosity=1) -> (pd.DataFrame, OneHotEncoder):
    """
    Function to One Hot Encode the train data: Fits and transforms the OHE Object on the train data;
    more specifically: The provided cols_to_encode (list of features). Function also makes sure that a
    pd.DataFrame is returned by dropping the old features and concatenating the encoded ones.

    :param X_train: pd.DataFrame -- Provided Train Dataset
    :param cols_to_encode: list -- Provided list of features to apply OHE on
    :param verbosity: int -- Level of verbosity

    :return: Tuple with pd.DataFrame with encoded features and fitted OHE object
    """
    if verbosity > 0:
        print(f"One Hot Encoding the features {cols_to_encode} of the train data ...")

    # Get DataFrame with only relevant features, i.e. cols_to_encode
    X_train_cats = X_train[cols_to_encode]

    # Fit OneHotEncoding object
    ohe = OneHotEncoder(handle_unknown="ignore", dtype=np.float32)
    X_train_cats_encoded = ohe.fit_transform(X_train_cats).toarray()

    # Transform encoded data to pandas dataframe
    X_train_cats_encoded = pd.DataFrame(X_train_cats_encoded, columns=ohe.get_feature_names_out(), index=X_train.index)

    # Drop old features
    feats_to_drop = list(ohe.feature_names_in_)
    X_train = X_train.drop(columns=feats_to_drop, axis=1)

    # Concat old dataframe with new encoded features
    X_train_encoded = pd.concat([X_train, X_train_cats_encoded], axis=1)

    return X_train_encoded, ohe


def ohe_encode_test_data(X_test: pd.DataFrame, cols_to_encode: list, ohe: OneHotEncoder, verbosity=1) -> pd.DataFrame:
    """
    Function to apply the fitted OHE object on the test set features provided in param cols_to_encode.
    Also makes sure that pd.DataFrame is returned by dropping the old features and concatenating the encoded ones.

    :param X_test: pd.DataFrame -- Provided Test Dataset
    :param cols_to_encode: list -- Provided list of features to apply OHE on
    :param ohe: OneHotEncoder -- Fitted OHE object
    :param verbosity: int -- Level of verbosity

    :return: pd.DataFrame -- Encoded Test Dataset
    """
    if verbosity > 0:
        print(f"One Hot Encoding the features {cols_to_encode} of the test data ...")

    # Get DataFrame with only relevant features, i.e. cols_to_encode and transform them
    X_test_cats = X_test[cols_to_encode]
    X_test_cats_encoded = ohe.transform(X_test_cats).toarray()

    # Transform to pandas DataFrame
    X_test_cats_encoded = pd.DataFrame(X_test_cats_encoded, columns=ohe.get_feature_names_out(), index=X_test.index)

    # Drop old features
    feats_to_drop = list(ohe.feature_names_in_)
    X_test = X_test.drop(columns=feats_to_drop, axis=1)

    # Concat old dataframe with new encoded features
    X_test_encoded = pd.concat([X_test, X_test_cats_encoded], axis=1)

    return X_test_encoded


In [5]:
def custom_cross_validated_indices(df: pd.DataFrame, factors: Iterable[str], target: str,
                                   **kfoldargs) -> List[List[Iterable[int]]]:
    df_factors = df.groupby(factors)[target].mean().reset_index()
    X_factors, y_factors = df_factors.drop(target, axis=1), df_factors[target]

    indices = []
    for itr, ite in KFold(**kfoldargs).split(X_factors, y_factors):
        tr = pd.merge(X_factors.iloc[itr], df.reset_index(), on=factors)["index"]  # "index" is the index of df
        te = pd.merge(X_factors.iloc[ite], df.reset_index(), on=factors)["index"]  # "index" is the index of df
        indices.append([tr, te])

    return indices

In [6]:
import sys
sys.path.append("../..")
from src.load_datasets import load_dataset, load_rankings, load_train_data

FACTORS = ["dataset", "model", "tuning", "scoring"]
NEW_INDEX = "encoder"
TARGET = "cv_score"

X_train, y_train = load_train_data('../../data/raw/dataset_train.csv')
indices = custom_cross_validated_indices(pd.concat([X_train, y_train], axis=1).copy(), FACTORS, TARGET, n_splits=5, shuffle=True, random_state=42)

Loading train data ...


In [7]:
import src.encoding
from src.meta_information import add_dataset_meta_information

X_train, _ = poincare_encoding(path_to_graph="../../data/raw/graph.adjlist",
                                            path_to_embeddings="../../data/preprocessed/embeddings.csv",
                                            data=X_train,
                                            column_to_encode="encoder",
                                            encode_dim=50,
                                            explode_dim=True,
                                            epochs=5000,
                                            dim_reduction=False,
                                            verbosity=2)

X_train = add_dataset_meta_information(df=X_train,
                                       path_to_meta_df="../../data/preprocessed/dataset_agg.csv",
                                       nan_threshold=0.4,
                                       replacing_strategy="mean")


(Poincare) Embedding the graph ...
Saving the embeddings to '../../data/preprocessed/embeddings.csv'...
Encoding the data feature 'encoder'...


# Check dataset

- Data types (only numerical)
- Null values
- Normalization

In [8]:
all_columns = X_train.columns
numeric_columns = X_train.select_dtypes(include=np.number).columns

encode = False
scale = False

# Check if there are only numeric columns
if len(all_columns) > len(numeric_columns):
    encode_cols = set(all_columns) - set(numeric_columns)
    encode = True
    print(f"There are columns which are not encoded: {encode_cols}")
    

# Check if there are null values, if yes, fill them with the mean
if X_train.isna().any().sum() > 0:
    X_train = X_train.fillna(X_train.mean())
#if X_val.isna().any().sum() > 0:
#    X_val = X_val.fillna(X_val.mean())
    
    
# Check if data is normalized to [0, 1]
#if max(X_train.max()) > 1 or min(X_train.min()) < 0:
#    scale = True
#    print("Dataframe will be scaled")

There are columns which are not encoded: {'scoring', 'tuning', 'model'}


# Define 

- Model
- Feature selection (https://machinelearningmastery.com/feature-selection-for-regression-data/)
- Hyperparameter grid

In [9]:
model = DecisionTreeRegressor(random_state=42)

In [10]:
param_grid = {
    "model__max_depth": [5, 10, 15, None]
}

In [11]:
def get_pearson_correlated_features(data=None, threshold=0.7):
    """
    Calculates the pearson correlation of all features in the dataframe and returns a set of features with a
    correlation greater than the threshold.

    :param data: The input dataframe.
    :type data: pd.DataFrame
    :param threshold: The threshold for the correlation coefficient in the range of [0.0, 1.0].
    :type threshold: float,optional(default=0.7)

    :return: The set of features with a correlation greater than the threshold.
    :rtype: set
    """
    # Calculate correlation matrix
    corr_matrix = data.corr()

    # Get the set of correlated features
    correlated_features = set()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold:
                colname = corr_matrix.columns[i]
                correlated_features.add(colname)

    return correlated_features

In [12]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_regression, f_regression
import numpy as np


def select_features(X_train, y_train, quantile):
    # configure to select a subset of features
    fs = SelectKBest(score_func=f_regression, k='all')  # or mutual_info_regression
    fs.fit(X_train, y_train)
    
    # Select columns based on mask
    mask = [x >= np.quantile(fs.scores_, quantile) for x in fs.scores_]  # 0.4
    X_train_fs = X_train.loc[:, mask]
    #X_val_fs = X_val.loc[:, mask]
    
    #fs = SelectKBest(score_func=f_regression, k=k)
    # learn relationship from training data
    #fs.fit(X_train, y_train)
    # transform train input data
    #X_train_fs = fs.transform(X_train)
    # transform test input data
    #X_test_fs = fs.transform(X_test)
    
    return list(X_train_fs.columns)
    #return pd.DataFrame(X_train_fs), pd.DataFrame(X_test_fs)

# Perform

- Correlation analysis and dropping of highly correlated features
- Feature selection
- Hyperparameter optimization

In [13]:
# Drop correlated features
column_list = set(X_train.columns) - set(["model", "tuning", "scoring"])

correlated_features = get_pearson_correlated_features(data=X_train[list(column_list)], threshold=0.8)
print("Drop correlated features: \n", correlated_features)
X_train_clean = X_train.drop(correlated_features, axis=1)

Drop correlated features: 
 {'MaxAttributeEntropy', 'Quartile2MutualInformation', 'MeanMutualInformation', 'StdvNominalAttDistinctValues', 'MinAttributeEntropy', 'enc_dim_15', 'PercentageOfMissingValues', 'enc_dim_22', 'non_categorical_features_count', 'enc_dim_40', 'MaxMutualInformation', 'total_feature_count', 'MinorityClassSize', 'enc_dim_28', 'enc_dim_36', 'MaxNominalAttDistinctValues', 'enc_dim_29', 'enc_dim_48', 'enc_dim_10', 'avg_number_of_categories_per_cat_feature', 'MajorityClassPercentage', 'row_count', 'null_value_count', 'ratio_of_null_values_to_all', 'MeanAttributeEntropy', 'enc_dim_7', 'enc_dim_2', 'Quartile2AttributeEntropy', 'enc_dim_41', 'enc_dim_43', 'enc_dim_47', 'enc_dim_30', 'EquivalentNumberOfAtts', 'enc_dim_13', 'NumberOfSymbolicFeatures', 'ratio_of_categorical_features_to_all', 'enc_dim_49', 'sum_of_all_categories', 'enc_dim_12', 'enc_dim_44', 'column_count', 'enc_dim_46', 'enc_dim_16', 'enc_dim_5', 'columns_with_null_values_count', 'enc_dim_35'}


In [14]:
# Select features
print(f"Number of features before selection: {X_train_clean.shape[1]}")

column_list = set(X_train_clean.columns) - set(["model", "tuning", "scoring"])
selected_feature_set = select_features(X_train_clean[list(column_list)], y_train.values.ravel(), 0.4)

print(f"Number of features after selection : {len(selected_feature_set)}")
print(selected_feature_set)

Number of features before selection: 58
Number of features after selection : 33
['enc_dim_11', 'PercentageOfSymbolicFeatures', 'enc_dim_20', 'MinMutualInformation', 'MeanNominalAttDistinctValues', 'categorical_features_count', 'enc_dim_8', 'AutoCorrelation', 'rows_with_null_values_count', 'PercentageOfInstancesWithMissingValues', 'enc_dim_45', 'PercentageOfNumericFeatures', 'MajorityClassSize', 'ClassEntropy', 'Quartile1MutualInformation', 'Dimensionality', 'max_number_of_categories_per_cat_feature', 'Quartile3AttributeEntropy', 'MinNominalAttDistinctValues', 'NumberOfBinaryFeatures', 'enc_dim_42', 'MinorityClassPercentage', 'Quartile1AttributeEntropy', 'min_number_of_categories_per_cat_feature', 'enc_dim_34', 'enc_dim_17', 'enc_dim_23', 'enc_dim_6', 'dataset', 'PercentageOfBinaryFeatures', 'NumberOfNumericFeatures', 'Quartile3MutualInformation', 'enc_dim_14']


In [17]:
# Perform GridSearchCV
pipeline = Pipeline([("scaler", MinMaxScaler()), ("model", model)])
#indices = custom_cross_validated_indices(X_train_clean, FACTORS, TARGET, n_splits=5, shuffle=True, random_state=42)

column_list = set(X_train_clean.columns) - set(["model", "tuning", "scoring"])

gs = GridSearchCV(pipeline, param_grid, cv=indices, scoring="r2").fit(X_train_clean[list(column_list)], y_train)

print("Best parameters: ")
print(gs.best_params_)

Best parameters: 
{'model__max_depth': None}


In [18]:
print(f"With score: {gs.best_score_} +/- {gs.cv_results_['std_test_score'][gs.cv_results_['mean_test_score'].argmax()]}")

With score: 0.4479765366422243 +/- 0.047288720620854124


# Runs for different models and parameters

- Define a functions with the above functionality
- Parameters for
  - Model
  - Parameter grid
  - Feature selection
  - Correlation (yes / no / threshold)

In [25]:
def parameter_search(X_train=None,
                     y_train=None,
                     indices=None,
                     model=None, 
                     parameter_grid=None, 
                     feature_selection=False, 
                     qunatile=0.4,
                     drop_correlated_features=True, 
                     corr_threshold=0.8):
    # If you want to calculate spearman you need the inital features, so keep 
    # column_list = set(X_train_clean.columns) - set(["model", "tuning", "scoring"])
    # out of the removals
    column_list = list(set(X_train_clean.columns) - set(["model", "tuning", "scoring"]))
    
    if drop_correlated_features:
        # Calculate correlation matrix
        
        corr_matrix = X_train[column_list].corr()

        # Get the set of correlated features
        correlated_features = set()
        for i in range(len(corr_matrix.columns)):
            for j in range(i):
                if abs(corr_matrix.iloc[i, j]) > corr_threshold:
                    colname = corr_matrix.columns[i]
                    correlated_features.add(colname)

        X_train = X_train.drop(correlated_features, axis=1)
    
    if feature_selection: 
        fs = SelectKBest(score_func=f_regression, k='all')  # or mutual_info_regression
        fs.fit(X_train, y_train)

        # Select columns based on mask
        mask = [x >= np.quantile(fs.scores_, quantile) for x in fs.scores_]  # 0.4
        X_train_fs = X_train.loc[:, mask]
        selected_features = list(X_train_fs.columns)
        X_train = X_train[selected_features]
    
    # Perform GridSearchCV
    pipeline = Pipeline([("scaler", MinMaxScaler()), ("model", model)])
    #indices = custom_cross_validated_indices(X_train_clean, FACTORS, TARGET, n_splits=5, shuffle=True, random_state=42)

    column_list = list(set(X_train.columns) - set(["model", "tuning", "scoring"]))
    gs = GridSearchCV(pipeline, param_grid, cv=indices, scoring="r2").fit(X_train[column_list], y_train)

    print("Best parameters: ")
    print(gs.best_params_)
    print(f"With score: {gs.best_score_} +/- {gs.cv_results_['std_test_score'][gs.cv_results_['mean_test_score'].argmax()]}")
    
    return gs

In [20]:
# At this point indices are given, so define models and parameter grids

### Which models do I want to test? 

- DecisionTree
- RandomForest
- XGBoost
- LGBM
- Linear Regression
- Ridge Regression
- Lasso Regression 

In [21]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(random_state=42)

grid_rf = {
    "model__n_estimators": [25, 50, 100, 150, 200],
    "model__max_depth"   : [10, 25, 50, 75, None], 
    "model__min_samples_split": [2, 5, 10, 15, 20, 25],
    "model__min_samples_leaf" : [1, 2, 5, 10, 15]
}

In [26]:
gs = parameter_search(X_train=X_train,
                      y_train=y_train,
                      indices=indices,
                      model=model, 
                      parameter_grid=grid_rf, 
                      feature_selection=False, 
                      qunatile=0.4,
                      drop_correlated_features=True, 
                      corr_threshold=0.8)

Best parameters: 
{'model__max_depth': 10}
With score: 0.44813157695338574 +/- 0.04703969013048095


In [31]:
from xgboost import XGBRegressor
model = XGBRegressor(random_state=42)

grid_xgb = {
    "model__n_estimators"     : [25, 50, 100, 150, 200],
    "model__max_depth"        : [10, 25, 50, 75, None], 
    "model__max_leaves"       : [1, 5, 10, 25, 50, None]
}

In [32]:
gs = parameter_search(X_train=X_train,
                      y_train=y_train,
                      indices=indices,
                      model=model, 
                      parameter_grid=grid_xgb, 
                      feature_selection=False, 
                      qunatile=0.4,
                      drop_correlated_features=True, 
                      corr_threshold=0.8)

Best parameters: 
{'model__max_depth': 5}
With score: 0.44848212768752055 +/- 0.04685683933957084


In [None]:
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 4, 6, 8],
    'min_samples_leaf': [1, 2, 4]
}

param_grid_dt = {
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 4, 6, 8],
    'min_samples_leaf': [1, 2, 4]
}

param_grid_xgb = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

param_grid_lgbm = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

param_grid_lr = {
    'fit_intercept': [True, False],
    'normalize': [True, False]
}

param_grid_ridge = {
    'alpha': [0.01, 0.1, 1.0, 10.0],
    'fit_intercept': [True, False],
    'normalize': [True, False],
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
}

param_grid_lasso = {
    'alpha': [0.01, 0.1, 1.0, 10.0],
    'fit_intercept': [True, False],
    'normalize': [True, False],
    'selection': ['cyclic', 'random']
}