In [91]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score, roc_auc_score, accuracy_score
from sklearn.preprocessing import LabelEncoder
from scipy import stats
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import torch_explain as te
from torch.nn.functional import one_hot
from torch_explain.logic.metrics import test_explanation, complexity, concept_consistency, formula_consistency
from torch_explain.logic.nn import entropy, psi
from torch_explain.logic.utils import replace_names

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
n_splits = 5
test_size = 0.2
random_state = 42
shuffle = True
flip = False

In [104]:
y = pd.read_csv('../../data/node_lens/target.csv', index_col=None, header=0)
x = pd.read_csv('../../data/node_lens/u.csv', index_col=None, header=0)
x = x.iloc[:, 1:]
feature_names = x.columns
x = x.values
le = LabelEncoder()
y = le.fit_transform(y.values[:, 2])
# y = y.values
# if flip:
#     t = x
#     x = y
#     y = t
x.shape, y.shape

((250, 19), (250,))

In [38]:
layers = [
    te.nn.EntropyLinear(x.shape[1], 20, n_classes=2, temperature=0.3),
    torch.nn.LeakyReLU(),
    torch.nn.Linear(20, 10),
    torch.nn.LeakyReLU(),
    torch.nn.Linear(10, 1),
]
model = torch.nn.Sequential(*layers)

optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)
loss_form = torch.nn.BCEWithLogitsLoss()
model.train()

Sequential(
  (0): EntropyLinear(
    in_features=19, out_features=20, n_classes=2
    (conceptizator): Conceptizator(activation=identity_bool, threshold=0.5)
  )
  (1): LeakyReLU(negative_slope=0.01)
  (2): Linear(in_features=20, out_features=10, bias=True)
  (3): LeakyReLU(negative_slope=0.01)
  (4): Linear(in_features=10, out_features=1, bias=True)
)

In [116]:
r2_train_list, r2_test_list = [], []
columns=['Class name', 'Test model accuracy', 'Explanation', 'Test explanation accuracy', 'Explanation complexity']
results = pd.DataFrame(columns=columns)
sss = KFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)
for split, (train_index, test_index) in enumerate(sss.split(x, y)):
    print(f"Fold {split+1}/{n_splits} \n\t Train...")
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    scaler_x = StandardScaler()
    scaler_x.fit(x_train)
    x_train = scaler_x.transform(x_train)
    x_test = scaler_x.transform(x_test)
    
    x_train = torch.FloatTensor(x_train)
    x_test = torch.FloatTensor(x_test)
    y_train = one_hot(torch.LongTensor(y_train)).to(torch.float)
    y_test = one_hot(torch.LongTensor(y_test)).to(torch.float)
    x2 = torch.concat((x_train, x_test))
    y2 = torch.concat((y_train, y_test))
#     model = RandomForestClassifier(random_state=random_state)
#     model = LinearRegression()
#     model = MLPRegressor()
#     model.fit(x_train, y_train)
    layers = [
        te.nn.EntropyLinear(x.shape[1], 10, n_classes=2, temperature=0.9),
        torch.nn.LeakyReLU(),
#         torch.nn.Linear(50, 10),
#         torch.nn.LeakyReLU(),
        torch.nn.Linear(10, 1),
    ]
    model = torch.nn.Sequential(*layers)

    optimizer = torch.optim.AdamW(model.parameters(), lr=0.005)
    loss_form = torch.nn.BCEWithLogitsLoss()
    model.train()

    train_mask = torch.where(torch.FloatTensor(train_index))[0]
    test_mask = torch.where(torch.FloatTensor(test_index))[0]
    for epoch in range(2001):
        # train step
        optimizer.zero_grad()
        y_pred = model(x2).squeeze(-1)
        loss = loss_form(y_pred[train_mask], y2[train_mask]) + 0.0001 * te.nn.functional.entropy_logic_loss(model)
        loss.backward()
        optimizer.step()

#         # compute accuracy
#         if epoch % 100 == 0:
    train_auc = roc_auc_score(y2[train_mask], y_pred[train_mask].detach().numpy())
    test_auc = roc_auc_score(y2[test_mask], y_pred[test_mask].detach().numpy())
    print(f'Epoch {epoch}: loss {loss:.4f} train accuracy: {train_auc:.4f} test accuracy: {test_auc:.4f}')
    
    explanations, local_exp = entropy.explain_classes(model, x2, y2, train_mask, test_mask,
                                                  c_threshold=0., y_threshold=0., verbose=True,
                                                  concept_names=feature_names, class_names=le.classes_,
                                                  topk_explanations=10, max_minterm_complexity=5,
                                                  material=True, good_bad_terms=True, 
                                                  max_accuracy=True)
    
    c0 = pd.DataFrame([[
        le.classes_[0], 
        test_auc, 
        explanations['0']['explanation'], 
        explanations['0']['explanation_accuracy'],
        explanations['0']['explanation_complexity'],
    ]], columns=columns)
    
    c1 = pd.DataFrame([[
        le.classes_[1], 
        test_auc, 
        explanations['1']['explanation'], 
        explanations['1']['explanation_accuracy'],
        explanations['1']['explanation_complexity'],
    ]], columns=columns)
    
    results = pd.concat((results, c0, c1))

Fold 1/5 
	 Train...
Epoch 2000: loss 0.0062 train accuracy: 1.0000 test accuracy: 1.0000
Explanation class down: (CD8 act & ~virus LRT) | (B act & ACT & ~virus LRT) - acc. = 1.0000 - compl. = 5.0000
Explanation class up: APC nai LRT & CD4 nai & ~B act - acc. = 0.8800 - compl. = 3.0000
Fold 2/5 
	 Train...
Epoch 2000: loss 0.0076 train accuracy: 1.0000 test accuracy: 1.0000
Explanation class down: (B act & ACT) | (~inf cell LRT & ~virus LRT) - acc. = 1.0000 - compl. = 4.0000
Explanation class up: (virus LRT & anti) | (healthy cell LRT & virus LRT & CD8 nai & ~B act) - acc. = 0.8800 - compl. = 6.0000
Fold 3/5 
	 Train...
Epoch 2000: loss 0.0084 train accuracy: 1.0000 test accuracy: 1.0000
Explanation class down: (APC act & B act) | (~CD8 nai & ~anti) | (APC act & short lived plasma & ~CD8 nai) - acc. = 1.0000 - compl. = 7.0000
Explanation class up: (healthy cell LRT & virus LRT & ~APC act) | (healthy cell LRT & virus LRT & anti & ~short lived plasma) - acc. = 0.9388 - compl. = 7.0000
Fo

In [118]:
results.to_csv('lens_results.csv')

In [119]:
results

Unnamed: 0,Class name,Test model accuracy,Explanation,Test explanation accuracy,Explanation complexity
0,down,1.0,(CD8 act & ~virus LRT) | (B act & ACT & ~virus...,1.0,5
0,up,1.0,APC nai LRT & CD4 nai & ~B act,0.88,3
0,down,1.0,(B act & ACT) | (~inf cell LRT & ~virus LRT),1.0,4
0,up,1.0,(virus LRT & anti) | (healthy cell LRT & virus...,0.88,6
0,down,1.0,(APC act & B act) | (~CD8 nai & ~anti) | (APC ...,1.0,7
0,up,1.0,(healthy cell LRT & virus LRT & ~APC act) | (h...,0.938776,7
0,down,1.0,(B act & ACT) | (short lived plasma & ~virus L...,1.0,6
0,up,1.0,(virus LRT & anti & ~short lived plasma) | (vi...,0.88,6
0,down,1.0,(APC act & ~healthy cell LRT) | (~healthy cell...,1.0,6
0,up,1.0,(inf cell LRT & virus LRT & anti) | (virus LRT...,0.88,10
