In [1]:
import os
import random
import time
import typing as ty
import yaml
import argparse
from collections import defaultdict

import numpy as np
import pandas as pd

import torch
import torch.nn as nn

from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression

# Import models and encoders
# The project-local imports below and relative paths used later (e.g. the
# 'configs/simulation.yaml' config file) are resolved against the current
# working directory, so switch into the TabLLM repository root first.
# Set the TABLLM_ROOT environment variable to run this notebook on another
# machine; when it is unset the original hard-coded location is used.
os.chdir(os.environ.get('TABLLM_ROOT', '/home/mrsergazinov/TabLLM/'))
from encoders.numEncoders import (
    FourierFeatures, 
    BinningFeatures, 
    ComboFeatures,
    SquareScalingFeatures,
)

from train_eval import preprocess_data

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class MLP(nn.Module):
    """Fully connected network with ReLU activations.

    The stack is ``Linear(input_dim, hidden_dim)``, then ``num_layers - 1``
    ``Linear(hidden_dim, hidden_dim)`` layers, each followed by ReLU, and a
    final ``Linear(hidden_dim, output_dim)`` with no activation. ``num_layers``
    therefore counts the ReLU-activated layers, and the module holds
    ``num_layers + 1`` linear layers in total.

    Args:
        input_dim: Size of the last dimension of the input.
        hidden_dim: Width of every hidden layer.
        output_dim: Size of the last dimension of the output.
        num_layers: Number of ReLU-activated layers; must be at least 1.

    Raises:
        ValueError: If ``num_layers`` is smaller than 1. Previously such a
            value built a network whose forward pass skipped the input layer.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=1):
        super().__init__()
        if num_layers < 1:
            raise ValueError(
                f"num_layers must be at least 1, got {num_layers}"
            )
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.num_layers = num_layers

        # Layers are created in input -> hidden -> output order so that
        # parameter initialisation under a fixed seed and the state_dict keys
        # ('layers.0', 'layers.1', ...) stay the same as before.
        self.layers = nn.ModuleList()
        self.layers.append(nn.Linear(input_dim, hidden_dim))
        for _ in range(num_layers - 1):
            self.layers.append(nn.Linear(hidden_dim, hidden_dim))
        self.layers.append(nn.Linear(hidden_dim, output_dim))

    def forward(self, x):
        """Apply ReLU after every layer except the last and return the output."""
        for layer in self.layers[:-1]:
            x = torch.relu(layer(x))
        # The output layer is linear (no activation).
        return self.layers[-1](x)


def generate_dataset(n, p, M=10, kernel='gaussian', sigma=0.5):
    """Simulate a regression dataset whose target is an additive kernel expansion.

    Features ``X`` (n x p), centres ``Z`` (M x p) and coefficients ``alpha``
    (p x M) are drawn from a standard normal via the global NumPy RNG. The
    target is

        y[i] = sum_j sum_m alpha[j, m] * k(X[i, j], Z[m, j]) + 0.01 * eps[i]

    with ``eps`` standard normal, i.e. each feature contributes independently
    through a one-dimensional kernel evaluated against M centres.

    Args:
        n: Number of samples.
        p: Number of features.
        M: Number of kernel centres per feature.
        kernel: One of ``'gaussian'``, ``'hat'`` or ``'laplace'``.
        sigma: Kernel width parameter.

    Returns:
        Tuple ``(X, y)`` with ``X`` of shape ``(n, p)`` and ``y`` of shape ``(n,)``.

    Raises:
        ValueError: If ``kernel`` is not one of the supported names. The check
            happens before any random numbers are drawn.
    """
    # Define kernels. All of them are elementwise, so they accept broadcastable
    # arrays as well as scalars.
    def gaussian_kernel(x, z, sigma=sigma):
        # Note the exponent is divided by sigma itself, not 2 * sigma ** 2.
        return np.exp(-((x - z) ** 2) / sigma)

    def hat_kernel(x, z, sigma=sigma):
        return (np.abs(x - z) < sigma).astype(float)

    def laplace_kernel(x, z, sigma=sigma):
        return np.exp(-np.abs(x - z) / sigma)

    # Choose kernel function based on input parameter
    kernels = {
        'gaussian': gaussian_kernel,
        'hat': hat_kernel,
        'laplace': laplace_kernel,
    }
    try:
        k_func = kernels[kernel]
    except (KeyError, TypeError):
        raise ValueError("Unknown kernel") from None

    # Random draws stay in the order X, Z, alpha, noise so that a given seed
    # yields the same dataset as the original loop-based implementation.
    X = np.random.randn(n, p)
    Z = np.random.randn(M, p)
    alpha = np.random.randn(p, M)

    # K[i, j, m] = k(X[i, j], Z[m, j]), computed in one broadcasted call
    # instead of n * p * M scalar kernel evaluations.
    K = k_func(X[:, :, np.newaxis], Z.T[np.newaxis, :, :])

    # Weight each kernel feature by alpha[j, m] and sum over features and centres.
    y = np.einsum('ijm,jm->i', K, alpha)

    # Add small Gaussian noise
    y += np.random.randn(n) * 0.01

    return X, y


In [3]:
# Experiment: for 30 seeds, simulate a dataset with a Laplace-kernel target and
# compare test MSE of ordinary least squares on (a) the raw features and
# (b)/(c) Fourier features built with 'normal' and 'cauchy' distributions.
# `results` maps a representation name to its list of per-seed test MSEs.
results = defaultdict(list)
for seed in range(30):
    print(f"Seed: {seed}")
    # Seed torch and numpy; generate_dataset draws from the global numpy RNG.
    # Presumably the encoders draw their frequencies from torch -- TODO confirm.
    torch.manual_seed(seed)
    np.random.seed(seed)
    X, y = generate_dataset(10000, 10, kernel='laplace')
    # Options handed to preprocess_data; which keys it actually reads is not
    # visible from this notebook.
    # NOTE(review): 'num_encoder' names 'NewFourierFeatures' while the encoders
    # built below are FourierFeatures -- confirm this key is unused here.
    params = {
        'model_name': 'MLP',
        'num_encoder': 'NewFourierFeatures',
        'num_encoder_trainable': False,
        'scaler': 'SquareScalingFeatures',
        'n_run': 1,
        'config_file': 'configs/simulation.yaml',
        'random_state': seed,
        'test_size': 0.1,
    }
    task_type = 'regression'

    # convert X to DataFrame and y to Series
    X = pd.DataFrame(X)
    y = pd.Series(y)

    # Pre-process the data (behaviour as described by the original author;
    # the implementation lives in train_eval.preprocess_data):
    # 1. standardizes columns to mean=0 and std=1
    # 2. standardizes target to mean=0 and std=1
    # 3. splits the data into training, validation, and test sets
    (y_train, 
     y_val, 
     y_test, 
     X_train_num, 
     X_val_num, 
     X_test_num, 
     X_train_cat, 
     X_val_cat, 
     X_test_cat) = preprocess_data(X, y, task_type, params)
    # Only y_test is converted to numpy (it is compared with sklearn's numpy
    # predictions below); the other outputs are passed to sklearn as returned.
    y_test = y_test.numpy()

    # Raw features -- linear regression
    model = LinearRegression()
    model.fit(X_train_num, y_train)
    y_pred = model.predict(X_test_num)
    # NOTE(review): assumes y_pred and y_test have the same shape; a (n,) vs
    # (n, 1) mismatch would broadcast silently -- confirm.
    mse = np.mean((y_pred - y_test) ** 2)
    results['Raw'].append(mse)
    print(f"Raw features: {mse}")

    # Fourier features with normal -- linear regression
    # Encode the numerical features
    encoder = FourierFeatures(
       n_features=X_train_num.shape[1],
       n_frequencies=10,
       frequency_scale=0.5,
       distribution='normal',
    )
    # The encoder is used as a fixed feature map, so no gradients are needed.
    # The encoded validation split is computed but not used in this cell.
    with torch.no_grad():
        X_train_num_normal = encoder(X_train_num)
        X_val_num_normal = encoder(X_val_num)
        X_test_num_normal = encoder(X_test_num)
    # Fit the linear regression model
    model = LinearRegression()
    model.fit(X_train_num_normal, y_train)
    y_pred = model.predict(X_test_num_normal)
    # Compute the mean squared error
    mse = np.mean((y_pred - y_test) ** 2)
    results['Fourier_normal'].append(mse)
    print(f"Fourier features with Normal: {mse}")


    # Fourier features with cauchy -- linear regression
    # Encode the numerical features (same settings as above, only the
    # distribution differs)
    encoder = FourierFeatures(
       n_features=X_train_num.shape[1],
       n_frequencies=10,
       frequency_scale=0.5,
       distribution='cauchy',
    )
    # As above, the encoded validation split is computed but not used here.
    with torch.no_grad():
        X_train_num_cauchy = encoder(X_train_num)
        X_val_num_cauchy = encoder(X_val_num)
        X_test_num_cauchy = encoder(X_test_num)
    # Fit the linear regression model
    model = LinearRegression()
    model.fit(X_train_num_cauchy, y_train)
    y_pred = model.predict(X_test_num_cauchy)
    # Compute the mean squared error
    mse = np.mean((y_pred - y_test) ** 2)
    results['Fourier_cauchy'].append(mse)
    print(f"Fourier features with Cauchy: {mse}")


Seed: 0
Raw features: 0.8932828307151794
Fourier features with Normal: 0.12520308792591095
Fourier features with Cauchy: 0.04882017523050308
Seed: 1
Raw features: 0.8054531216621399
Fourier features with Normal: 6.725131034851074
Fourier features with Cauchy: 4.577852725982666
Seed: 2
Raw features: 0.78705894947052
Fourier features with Normal: 0.09973685443401337
Fourier features with Cauchy: 0.07541023939847946
Seed: 3
Raw features: 0.789662778377533
Fourier features with Normal: 0.04982997849583626
Fourier features with Cauchy: 0.04209383577108383
Seed: 4
Raw features: 0.8271408677101135
Fourier features with Normal: 0.13838732242584229
Fourier features with Cauchy: 0.07183782756328583
Seed: 5
Raw features: 0.570482611656189
Fourier features with Normal: 2.9672436714172363
Fourier features with Cauchy: 0.604804277420044
Seed: 6
Raw features: 0.7397266626358032
Fourier features with Normal: 0.030324863269925117
Fourier features with Cauchy: 0.036668021231889725
Seed: 7
Raw features: 

In [6]:
# Print the median of the collected scores for every entry in `results`.
for feature_name, scores in results.items():
    print(f"{feature_name}: {np.median(scores)}")

Raw: 0.7508153915405273
Fourier_normal: 0.11246997117996216
Fourier_cauchy: 0.056922681629657745
