In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

In [None]:
# Load the inequality/education table and split it into train and test rows,
# stratified by the "Human Development Groups" column.
df = pd.read_parquet("/content/inequality_education_fev.parquet")
X = df.drop(columns=["target", "id", "timestamp"])
y = df["target"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=df["Human Development Groups"])

# NOTE: X and y are rebound here from DataFrame/Series to the *training*
# tensors; train() reads X, y and groups as module-level globals.
# Each row is flattened into one feature vector. The rows are combined into a
# single ndarray first because torch.tensor() on a list of ndarrays is very slow.
X = torch.tensor(np.array([np.hstack(row) for row in X_train.values]), dtype=torch.float32)
# .values hands torch a plain ndarray (same as the test-set conversion), so the
# conversion is positional and does not depend on the shuffled row labels.
y = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1)
groups = torch.tensor(X_train["Human Development Groups"].values, dtype=torch.int64)  # fairness groups

  X = torch.tensor( [np.hstack(X_train.values[i]) for i in range(len(X_train))], dtype=torch.float32)


In [None]:


# Simple model
class GlobalTimeSeriesModel(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear with a single output unit.

    Args:
        input_size: Number of input features per sample.
        hidden_size: Width of the hidden layer.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        # Layers are created in forward order and wrapped in one Sequential.
        layers = [
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the network output for ``x`` (last dimension has size 1)."""
        prediction = self.net(x)
        return prediction

# Fairness penalty
def fairness_penalty(y_true, y_pred, groups):
    """Spread of the mean absolute error across groups.

    For every distinct value in ``groups`` the mean absolute error of the
    samples belonging to that group is computed; the penalty is the
    population standard deviation of these per-group means.

    Args:
        y_true: Tensor of targets.
        y_pred: Tensor of predictions, broadcast-compatible with ``y_true``.
        groups: 1-D tensor with one group id per sample (first dimension of
            the error tensor).

    Returns:
        A 0-dim tensor. It stays attached to the autograd graph of
        ``y_pred``, so it can be added to a loss and back-propagated.
        With fewer than two groups present the penalty is a constant zero.
    """
    # No .detach(): the penalty has to remain differentiable, otherwise adding
    # it to the training loss cannot influence the gradients.
    errors = torch.abs(y_true - y_pred)

    # Only groups that actually occur are used, so a missing group cannot
    # produce a NaN mean over an empty selection.
    group_means = [errors[groups == group_id].mean() for group_id in torch.unique(groups)]

    if len(group_means) < 2:
        # A spread is undefined for zero or one group; report "no disparity".
        return torch.zeros((), dtype=errors.dtype, device=errors.device)

    # unbiased=False gives the population standard deviation (divide by N).
    return torch.stack(group_means).std(unbiased=False)


def train(model, with_fairness=True, epochs=10000, lr=0.01, lambda_fair=0.5, log_every=1000):
    """Fit ``model`` with full-batch Adam on the notebook-level training tensors.

    The data is read from the module-level names ``X``, ``y`` and ``groups``.

    Args:
        model: Module to optimise in place.
        with_fairness: If True, ``lambda_fair * fairness_penalty`` is added to
            the MSE loss; if False only the MSE is optimised.
        epochs: Number of full-batch optimisation steps.
        lr: Adam learning rate.
        lambda_fair: Weight of the fairness penalty in the total loss.
        log_every: Print the losses every ``log_every`` epochs; 0 or None
            disables logging.
    """
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()

    for epoch in range(epochs):
        optimizer.zero_grad()
        y_pred = model(X)
        mse_loss = criterion(y_pred, y)
        # Evaluated in both modes so it can be reported for the baseline too.
        fair_loss = fairness_penalty(y, y_pred, groups)
        if with_fairness:
            loss = mse_loss + lambda_fair * fair_loss
        else:
            loss = mse_loss
        loss.backward()
        optimizer.step()

        if log_every and epoch % log_every == 0:
            print(f"Epoch {epoch}: Gesamtverlust={loss.item():.4f} "
                  f"(MSE={mse_loss.item():.4f}, Fairness={fair_loss.item():.4f})")

# Re-seed before each construction so both models start from identical
# initial weights; differences after training then come from the loss,
# not from the random initialisation.
torch.manual_seed(42)
model1 = GlobalTimeSeriesModel(input_size=len(X[0]), hidden_size=9)
torch.manual_seed(42)
model2 = GlobalTimeSeriesModel(input_size=len(X[0]), hidden_size=9)

# model1: MSE + fairness penalty, model2: MSE only (baseline).
train(model1, with_fairness=True)
print("\n\n")
train(model2, with_fairness=False)


Epoch 0: Gesamtverlust=775.0729 (MSE=769.6347, Fairness=10.8765)
Epoch 1000: Gesamtverlust=0.7298 (MSE=0.6692, Fairness=0.1212)
Epoch 2000: Gesamtverlust=0.0471 (MSE=0.0324, Fairness=0.0295)
Epoch 3000: Gesamtverlust=0.0109 (MSE=0.0059, Fairness=0.0100)
Epoch 4000: Gesamtverlust=0.0064 (MSE=0.0026, Fairness=0.0075)
Epoch 5000: Gesamtverlust=0.0044 (MSE=0.0014, Fairness=0.0060)
Epoch 6000: Gesamtverlust=0.0031 (MSE=0.0009, Fairness=0.0045)
Epoch 7000: Gesamtverlust=0.0024 (MSE=0.0007, Fairness=0.0034)
Epoch 8000: Gesamtverlust=0.0031 (MSE=0.0011, Fairness=0.0040)
Epoch 9000: Gesamtverlust=0.0015 (MSE=0.0002, Fairness=0.0027)



Epoch 0: Gesamtverlust=779.1244 (MSE=779.1244, Fairness=10.8778)
Epoch 1000: Gesamtverlust=2.0179 (MSE=2.0179, Fairness=0.1625)
Epoch 2000: Gesamtverlust=0.2821 (MSE=0.2821, Fairness=0.0791)
Epoch 3000: Gesamtverlust=0.0716 (MSE=0.0716, Fairness=0.0279)
Epoch 4000: Gesamtverlust=0.0237 (MSE=0.0237, Fairness=0.0137)
Epoch 5000: Gesamtverlust=0.0060 (MSE=0.0060, Fa

In [None]:
# Flatten every test row into one feature vector. The rows are combined into a
# single ndarray first because torch.tensor() on a list of ndarrays is very slow.
X_test_tensor = torch.tensor(np.array([np.hstack(row) for row in X_test.values]), dtype=torch.float32)

# Column vector of targets, matching the (n, 1) output of the model.
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).unsqueeze(1)

def test(model):
    """Print the test-set MSE and fairness penalty of ``model``.

    Reads the module-level ``X_test``, ``X_test_tensor`` and ``y_test_tensor``.

    Args:
        model: Trained module to evaluate.
    """
    groups_test = torch.tensor(X_test["Human Development Groups"].values, dtype=torch.int64)

    # Pure evaluation: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        preds = model(X_test_tensor)
        test_mse = nn.MSELoss()(preds, y_test_tensor).item()
        test_fairness = fairness_penalty(y_test_tensor, preds, groups_test).item()

    print(f"Test MSE: {test_mse:.4f}")
    print(f"Test fairness (std of group mean abs errors): {test_fairness:.4f}")

# Evaluate both models, separated by the same blank block as the training logs.
for position, fitted_model in enumerate((model1, model2)):
    if position:
        print("\n\n")
    test(fitted_model)

Test MSE: 0.0153
Test fairness (std of group mean abs errors): 0.0421



Test MSE: 0.0122
Test fairness (std of group mean abs errors): 0.0214


# Restaurant Dataset

In [None]:
from datasets import load_dataset

# Fetch the train split of the "restaurant" config and convert it to pandas.
ds = load_dataset(path="autogluon/fev_datasets", name="restaurant", split="train")
df = ds.to_pandas()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

restaurant/train-00000-of-00001.parquet:   0%|          | 0.00/404k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/817 [00:00<?, ? examples/s]

In [None]:
# Preview the first five rows; timestamp and target hold one list per series.
df.head(n=5)

Unnamed: 0,id,timestamp,target,air_genre_name,air_area_name,latitude,longitude
0,air_00a91d42b08b08d9,"[2016-07-01T00:00:00.000, 2016-07-02T00:00:00....","[35.0, 9.0, 9.0, 20.0, 25.0, 29.0, 34.0, 42.0,...",Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694004,139.753601
1,air_0164b9927d20bcc3,"[2016-10-03T00:00:00.000, 2016-10-04T00:00:00....","[3.0, 8.0, 10.0, 11.0, 13.0, 2.0, 2.0, 2.0, 3....",Italian/French,Tōkyō-to Minato-ku Shibakōen,35.65807,139.751602
2,air_0241aa3964b7f861,"[2016-01-03T00:00:00.000, 2016-01-04T00:00:00....","[10.0, 9.0, 17.0, 10.0, 10.0, 5.0, 8.0, 16.0, ...",Izakaya,Tōkyō-to Taitō-ku Higashiueno,35.712608,139.779999
3,air_0328696196e46f18,"[2016-07-03T00:00:00.000, 2016-07-04T00:00:00....","[11.0, 4.0, 6.0, 4.0, 8.0, 8.0, 12.0, 4.0, 4.0...",Dining bar,Ōsaka-fu Ōsaka-shi Nakanochō,34.701279,135.528091
4,air_034a3d5b40d5b1b1,"[2016-07-01T00:00:00.000, 2016-07-02T00:00:00....","[19.0, 23.0, 5.0, 13.0, 13.0, 10.0, 3.0, 23.0,...",Cafe/Sweets,Ōsaka-fu Ōsaka-shi Ōhiraki,34.692337,135.472229


Only keep time series of equal length (296 observations, which leaves 425 of the 817 series).

In [None]:
# Length of every target series, in row order.
lens = [len(series) for series in df["target"].values]

# Keep only the rows whose series has exactly 296 observations.
mask = pd.Series(lens).eq(296)
df_reduced = df.loc[mask.index[mask], :]

# Sanity check: recompute the lengths on the filtered frame and count them.
lens = [len(series) for series in df_reduced["target"].values]

pd.Series(lens).value_counts()

Unnamed: 0,count
296,425


Encoding

In [None]:
# air_area_name has 92 unique values, OHE would make the data too large so drop that column (the id is dropped as well)
df_reduced = df_reduced.drop(columns=["id", "air_area_name"])
# Only keep the three most frequent genres, then renumber the rows from 0.
df_reduced = df_reduced[
    df_reduced["air_genre_name"].isin(df_reduced["air_genre_name"].value_counts().index[:3])
].reset_index(drop=True)

In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# One-hot encode the genre; drop="first" leaves out the first category's column.
encoder = OneHotEncoder(sparse_output=False, drop="first")
encoded = encoder.fit_transform(df_reduced[["air_genre_name"]])
encoded_df = pd.DataFrame(
    data=encoded,
    columns=encoder.get_feature_names_out(["air_genre_name"]),
)

# Replace the raw genre column by its encoded columns. The concat aligns on
# the row index, which both frames share after the earlier reset_index.
df_encoded = pd.concat(
    [df_reduced.drop(columns="air_genre_name"), encoded_df],
    axis=1,
)

# Standardise the two coordinate columns.
scaler = StandardScaler()
df_encoded[["latitude", "longitude"]] = scaler.fit(
    df_encoded[["latitude", "longitude"]]
).transform(df_encoded[["latitude", "longitude"]])
df_encoded

Unnamed: 0,timestamp,target,latitude,longitude,air_genre_name_Dining bar,air_genre_name_Izakaya
0,"[2016-07-01T00:00:00.000, 2016-07-02T00:00:00....","[19.0, 23.0, 5.0, 13.0, 13.0, 10.0, 3.0, 23.0,...",-0.407716,-0.418432,0.0,0.0
1,"[2016-07-01T00:00:00.000, 2016-07-02T00:00:00....","[37.0, 26.0, 37.0, 25.0, 34.0, 26.0, 27.0, 29....",-0.354461,-0.448273,0.0,0.0
2,"[2016-07-01T00:00:00.000, 2016-07-02T00:00:00....","[29.0, 35.0, 17.0, 6.0, 12.0, 25.0, 6.0, 21.0,...",3.747518,1.132463,0.0,1.0
3,"[2016-07-01T00:00:00.000, 2016-07-02T00:00:00....","[7.0, 26.0, 3.0, 6.0, 10.0, 12.0, 4.0, 14.0, 1...",-0.954562,-1.769969,1.0,0.0
4,"[2016-07-01T00:00:00.000, 2016-07-02T00:00:00....","[31.0, 57.0, 45.0, 25.0, 25.0, 26.0, 23.0, 20....",0.094130,0.673346,0.0,0.0
...,...,...,...,...,...,...
244,"[2016-07-01T00:00:00.000, 2016-07-02T00:00:00....","[16.0, 12.0, 4.0, 10.0, 18.0, 7.0, 7.0, 29.0, ...",0.072860,0.707576,0.0,1.0
245,"[2016-07-01T00:00:00.000, 2016-07-02T00:00:00....","[7.0, 7.0, 1.0, 1.0, 7.0, 6.0, 6.0, 1.0, 7.0, ...",-0.970373,-1.822453,0.0,0.0
246,"[2016-07-01T00:00:00.000, 2016-07-02T00:00:00....","[41.0, 32.0, 18.0, 16.0, 15.0, 11.0, 24.0, 41....",0.078351,0.737738,0.0,1.0
247,"[2016-07-01T00:00:00.000, 2016-07-02T00:00:00....","[15.0, 31.0, 21.0, 21.0, 2.0, 5.0, 1.0, 2.0, 1...",-0.406334,-0.491440,1.0,0.0


In [None]:
# Number of leading observations of each series used as model input; the
# remaining observations of the series form the forecast target.
HISTORY_LENGTH = 270
# Natural period of each calendar feature for the sin/cos encoding.
# (Dividing all three by 7 made e.g. month 1 and month 8 indistinguishable.)
CYCLE_PERIODS = {"month": 12, "day": 31, "dayofweek": 7}

# One-hot genre columns; they are constant within a series and serve as the
# pivot index that collapses each series into a single row of lists.
genre_cols = [col for col in df_encoded.columns if col.startswith("air_genre_name_")]

# Collect the per-series pieces and concatenate once after the loop instead of
# growing the frames with pd.concat on every iteration.
X_parts, y_parts = [], []
for i in range(len(df_encoded)):
  # Turn the parallel timestamp/target lists of row i into one row per time step.
  df_reduced_one = df_encoded.loc[i,:]
  df_reduced_one = pd.DataFrame(df_reduced_one).T.explode(["timestamp", "target"]).reset_index(drop= True)
  df_reduced_one["timestamp"] = pd.to_datetime(df_reduced_one["timestamp"])

  # Extract useful time series features
  df_reduced_one["year"] = df_reduced_one["timestamp"].dt.year
  df_reduced_one["month"] = df_reduced_one["timestamp"].dt.month
  df_reduced_one["day"] = df_reduced_one["timestamp"].dt.day
  df_reduced_one["dayofweek"] = df_reduced_one["timestamp"].dt.dayofweek  # Monday=0
  df_reduced_one["is_weekend"] = (df_reduced_one["timestamp"].dt.dayofweek >= 5).astype(int)

  # Cyclical encoding: map each calendar feature onto the unit circle using
  # its own period, then drop the raw integer column.
  for x, period in CYCLE_PERIODS.items():
    df_reduced_one[f"{x}_sin"] = np.sin(2 * np.pi * df_reduced_one[x] / period)
    df_reduced_one[f"{x}_cos"] = np.cos(2 * np.pi * df_reduced_one[x] / period)
    df_reduced_one.drop(columns=[x], inplace=True)

  df_reduced_one = df_reduced_one.drop(columns=["timestamp"])
  # aggfunc=list gathers every remaining column into one list per series.
  X_one = df_reduced_one.iloc[:HISTORY_LENGTH, :].pivot_table(index=genre_cols, aggfunc=list).reset_index()
  y_one = df_reduced_one.iloc[HISTORY_LENGTH:, :].pivot_table(index=genre_cols, aggfunc=list).reset_index()
  X_parts.append(X_one)
  y_parts.append(y_one["target"])

# The empty DataFrame seed keeps the result identical to the former
# incremental concat (in particular, y stays a DataFrame, not a Series).
X = pd.concat([pd.DataFrame(), *X_parts])
y = pd.concat([pd.DataFrame(), *y_parts])

In [None]:
# Renumber the rows 0..n-1 (every concatenated piece carried index 0).
X = X.reset_index(drop=True)

In [None]:
# Renumber the rows 0..n-1 so y lines up with X by position and label.
y = y.reset_index(drop=True)

In [None]:
# Standard-scale the list-valued "target" and "year" columns using one global
# mean/std per column, computed over all values of all rows.
for c in ("target", "year"):
  all_vals = np.concatenate(X[c].values)
  mean, std = all_vals.mean(), all_vals.std()
  # Rescale element by element so every cell stays a plain Python list.
  X[c] = X[c].map(lambda values: [(a - mean) / std for a in values])

X

Unnamed: 0,air_genre_name_Dining bar,air_genre_name_Izakaya,day_cos,day_sin,dayofweek_cos,dayofweek_sin,is_weekend,latitude,longitude,month_cos,month_sin,target,year
0,0.0,0.0,"[0.6234898018587336, -0.22252093395631434, -0....","[0.7818314824680298, 0.9749279121818236, 0.433...","[-0.9009688679024191, -0.2225209339563146, 0.6...","[-0.433883739117558, -0.9749279121818236, -0.7...","[0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, ...","[-0.40771613, -0.40771613, -0.40771613, -0.407...","[-0.41843176, -0.41843176, -0.41843176, -0.418...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[-2.4492935982947064e-16, -2.4492935982947064e...","[-0.12445488, 0.066016056, -0.7911031, -0.4101...","[-0.6836602258048724, -0.6836602258048724, -0...."
1,0.0,0.0,"[0.6234898018587336, -0.22252093395631434, -0....","[0.7818314824680298, 0.9749279121818236, 0.433...","[-0.9009688679024191, -0.2225209339563146, 0.6...","[-0.433883739117558, -0.9749279121818236, -0.7...","[0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, ...","[-0.35446063, -0.35446063, -0.35446063, -0.354...","[-0.44827327, -0.44827327, -0.44827327, -0.448...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[-2.4492935982947064e-16, -2.4492935982947064e...","[0.7326643, 0.20886925, 0.7326643, 0.16125152,...","[-0.6836602258048724, -0.6836602258048724, -0...."
2,0.0,1.0,"[0.6234898018587336, -0.22252093395631434, -0....","[0.7818314824680298, 0.9749279121818236, 0.433...","[-0.9009688679024191, -0.2225209339563146, 0.6...","[-0.433883739117558, -0.9749279121818236, -0.7...","[0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, ...","[3.747518, 3.747518, 3.747518, 3.747518, 3.747...","[1.1324627, 1.1324627, 1.1324627, 1.1324627, 1...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[-2.4492935982947064e-16, -2.4492935982947064e...","[0.35172245, 0.6374288, -0.21969034, -0.743485...","[-0.6836602258048724, -0.6836602258048724, -0...."
3,1.0,0.0,"[0.6234898018587336, -0.22252093395631434, -0....","[0.7818314824680298, 0.9749279121818236, 0.433...","[-0.9009688679024191, -0.2225209339563146, 0.6...","[-0.433883739117558, -0.9749279121818236, -0.7...","[0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, ...","[-0.95456177, -0.95456177, -0.95456177, -0.954...","[-1.7699685, -1.7699685, -1.7699685, -1.769968...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[-2.4492935982947064e-16, -2.4492935982947064e...","[-0.69586766, 0.20886925, -0.8863386, -0.74348...","[-0.6836602258048724, -0.6836602258048724, -0...."
4,0.0,0.0,"[0.6234898018587336, -0.22252093395631434, -0....","[0.7818314824680298, 0.9749279121818236, 0.433...","[-0.9009688679024191, -0.2225209339563146, 0.6...","[-0.433883739117558, -0.9749279121818236, -0.7...","[0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, ...","[0.094130315, 0.094130315, 0.094130315, 0.0941...","[0.67334574, 0.67334574, 0.67334574, 0.6733457...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[-2.4492935982947064e-16, -2.4492935982947064e...","[0.44695792, 1.685019, 1.1136062, 0.16125152, ...","[-0.6836602258048724, -0.6836602258048724, -0...."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
244,0.0,1.0,"[0.6234898018587336, -0.22252093395631434, -0....","[0.7818314824680298, 0.9749279121818236, 0.433...","[-0.9009688679024191, -0.2225209339563146, 0.6...","[-0.433883739117558, -0.9749279121818236, -0.7...","[0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, ...","[0.072859876, 0.072859876, 0.072859876, 0.0728...","[0.7075762, 0.7075762, 0.7075762, 0.7075762, 0...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[-2.4492935982947064e-16, -2.4492935982947064e...","[-0.2673081, -0.45777902, -0.83872086, -0.5530...","[-0.6836602258048724, -0.6836602258048724, -0...."
245,0.0,0.0,"[0.6234898018587336, -0.22252093395631434, -0....","[0.7818314824680298, 0.9749279121818236, 0.433...","[-0.9009688679024191, -0.2225209339563146, 0.6...","[-0.433883739117558, -0.9749279121818236, -0.7...","[0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, ...","[-0.9703728, -0.9703728, -0.9703728, -0.970372...","[-1.8224531, -1.8224531, -1.8224531, -1.822453...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[-2.4492935982947064e-16, -2.4492935982947064e...","[-0.69586766, -0.69586766, -0.98157406, -0.981...","[-0.6836602258048724, -0.6836602258048724, -0...."
246,0.0,1.0,"[0.6234898018587336, -0.22252093395631434, -0....","[0.7818314824680298, 0.9749279121818236, 0.433...","[-0.9009688679024191, -0.2225209339563146, 0.6...","[-0.433883739117558, -0.9749279121818236, -0.7...","[0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, ...","[0.07835146, 0.07835146, 0.07835146, 0.0783514...","[0.7377385, 0.7377385, 0.7377385, 0.7377385, 0...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[-2.4492935982947064e-16, -2.4492935982947064e...","[0.9231352, 0.49457565, -0.1720726, -0.2673081...","[-0.6836602258048724, -0.6836602258048724, -0...."
247,1.0,0.0,"[0.6234898018587336, -0.22252093395631434, -0....","[0.7818314824680298, 0.9749279121818236, 0.433...","[-0.9009688679024191, -0.2225209339563146, 0.6...","[-0.433883739117558, -0.9749279121818236, -0.7...","[0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, ...","[-0.40633377, -0.40633377, -0.40633377, -0.406...","[-0.49143994, -0.49143994, -0.49143994, -0.491...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[-2.4492935982947064e-16, -2.4492935982947064e...","[-0.31492582, 0.44695792, -0.029219411, -0.029...","[-0.6836602258048724, -0.6836602258048724, -0...."


In [None]:
from sklearn.model_selection import train_test_split

# 80/20 split of the series, stratified by genre so both parts keep the same
# genre proportions; df_reduced supplies one genre label per row.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=df_reduced["air_genre_name"],
)