In [1]:
import pickle
from pathlib import Path

import pandas as pd
import numpy as np
from pulearn.elkanoto import ElkanotoPuClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import ParameterGrid, GridSearchCV
from sklearn.decomposition import PCA
from tqdm import tqdm

import pul_config
import importlib
importlib.reload(pul_config)

def predict_proba(m, X):
    """Return per-row scores for ``X`` from whichever scoring method ``m`` offers.

    The following calls are attempted in order and the first one that does
    not raise wins:

    1. ``m.predict_proba(X)``
    2. ``m.decision_function(X)``
    3. ``m.best_estimator_.decision_function(X)``

    Args:
        m: Fitted model or search object exposing at least one of the
            methods above.
        X: Feature rows to score; passed through unchanged.

    Returns:
        Whatever the first successful call returns.

    Raises:
        Exception: The error of the last fallback if all three fail.
        Non-``Exception`` signals such as ``KeyboardInterrupt`` are never
        swallowed and propagate immediately.
    """
    # ``except Exception`` (not a bare ``except``) so that Ctrl-C / SystemExit
    # abort the run instead of silently triggering the next fallback.
    try:
        return m.predict_proba(X)
    except Exception:
        pass

    try:
        return m.decision_function(X)
    except Exception:
        return m.best_estimator_.decision_function(X)

def evaluate(m, X, y, n=100, scale=True):
    """Sum the labels of the ``n`` highest-scored rows (a hits@n style metric).

    Rows of ``X`` are scored with ``predict_proba(m, X)``, ordered from the
    highest score to the lowest, and the labels ``y`` of the first ``n`` rows
    are summed. The ``(m, X, y)`` leading signature lets this function be
    passed directly as ``scoring=`` to ``GridSearchCV``.

    Args:
        m: Fitted model or search object accepted by ``predict_proba``.
        X: Feature rows to rank.
        y: Labels aligned with the rows of ``X``; converted with
            ``np.asarray`` so that positional indexing is used.
        n: Number of top-ranked rows to count.
        scale: When true, divide the top-``n`` label sum by ``y.sum()``.

    Returns:
        The top-``n`` label sum, divided by the total label sum if ``scale``.
    """
    y = np.asarray(y)
    scores = np.asarray(predict_proba(m, X))
    if scores.ndim == 2:
        # Two-dimensional scores cannot be ranked with a flat argsort; rank by
        # the last column.
        # NOTE(review): assumes the last column is the positive class — confirm.
        scores = scores[:, -1]

    order = np.argsort(scores)[::-1]
    hits = y[order][:n].sum()
    if scale:
        # NOTE(review): if ``y`` sums to zero this division follows NumPy
        # semantics (nan/inf with a warning) — confirm that is acceptable.
        return hits / y.sum()
    return hits
    

# CHANGE HERE
# Input feature directories and the matching output roots (paired by position).
input_dirs = [
    Path('1_outputs/standard/'),
    Path('1_outputs/small_DS/'),
]
output_dirs = [
    Path('2_outputs/standard/'),
    Path('2_outputs/small_DS/'),
]
# Name of the method (used as an output sub-directory) and its config object.
method = 'IsolationForest'
pul_cfg = pul_config.IsolationForestConfig
# -----

# Model names; they appear in the feature and pickle file names.
MODEL_NAMES = ('RotatE', 'TransE')
# Nest every output root under the method name.
output_dirs = [root_dir / method for root_dir in output_dirs]

## Training

In [4]:
# Seed NumPy's global random state before training.
# NOTE(review): training runs with n_jobs > 1; confirm whether worker
# processes actually draw from this seeded state or need an explicit seed.
np.random.seed(42)

In [5]:
def train(input_dir, output_dir, n_jobs, cv, model_names, rerun=False, n_runs=10):
    """Grid-search the configured pipeline on each feature file and pickle it.

    For every name in ``model_names`` and every run index ``i`` in
    ``range(n_runs)`` this loads ``input_dir / f'{model_name}_X_{i}.npy'``,
    keeps the rows listed for the training split in ``ref_df.csv``, fits a
    ``GridSearchCV`` scored with ``evaluate`` and writes the fitted search to
    ``output_dir / model_name / f'{model_name}_{i}.pkl'``.

    The pipeline and parameter grid come from the module-level ``pul_cfg``.

    Args:
        input_dir: Directory (``Path``) holding ``ref_df.csv`` and the
            ``*_X_*.npy`` feature files.
        output_dir: Directory (``Path``) under which one sub-directory per
            model name is created.
        n_jobs: Forwarded to ``GridSearchCV``.
        cv: Forwarded to ``GridSearchCV``.
        model_names: Iterable of model names to process.
        rerun: When false, a run whose pickle already exists and can be
            loaded is skipped. When true, every run is retrained.
        n_runs: Number of run indices per model (defaults to the previously
            hard-coded 10).
    """
    ref_df = pd.read_csv(input_dir / 'ref_df.csv', index_col=0)

    # The training split does not depend on the model or the run index, so it
    # is selected once instead of inside the loops.
    train_df = ref_df.query('train')
    train_ids = train_df['id'].values
    y = train_df['y'].values

    for model_name in tqdm(model_names):
        out_dir = output_dir / model_name
        out_dir.mkdir(exist_ok=True, parents=True)
        for i in tqdm(range(n_runs)):
            m_file = out_dir / f'{model_name}_{i}.pkl'
            if m_file.exists() and not rerun:
                # Only trust an existing file if it can actually be unpickled;
                # an unreadable file is retrained.
                try:
                    with open(m_file, 'rb') as f:
                        pickle.load(f)
                except Exception as exc:
                    # ``Exception`` rather than a bare ``except`` so that an
                    # interrupt during loading stops the run.
                    print(m_file, 'could not be loaded, retraining:', repr(exc))
                else:
                    print(m_file, 'exists. Skipping...')
                    continue

            X_all = np.load(input_dir / f'{model_name}_X_{i}.npy')
            X = X_all[train_ids]

            pipe = pul_cfg.build_pipeline()
            param_grid = pul_cfg.build_param_grid()

            grid_search = GridSearchCV(
                pipe,
                param_grid=param_grid,
                cv=cv,
                refit=True,
                n_jobs=n_jobs,
                scoring=evaluate,
            )
            grid_search.fit(X=X, y=y)

            with open(m_file, 'wb') as f:
                pickle.dump(grid_search, f)

In [None]:
# Parallelism and number of cross-validation folds passed to the grid search.
N_JOBS = 40
CV = 5

# Train on every (input directory, output directory) pair, in order.
dir_pairs = zip(input_dirs, output_dirs)
for input_dir, output_dir in tqdm(dir_pairs, total=len(input_dirs)):
    train(
        input_dir=input_dir,
        output_dir=output_dir,
        model_names=MODEL_NAMES,
        cv=CV,
        n_jobs=N_JOBS,
    )

  0%|          | 0/2 [00:00<?, ?it/s]
  0%|          | 0/2 [00:00<?, ?it/s][A

  0%|          | 0/10 [00:00<?, ?it/s][A[A

100%|██████████| 10/10 [00:00<00:00, 83.68it/s][A[A

 50%|█████     | 1/2 [00:00<00:00,  7.92it/s][A

2_outputs/standard/IsolationForest/RotatE/RotatE_0.pkl exists. Skipping...
2_outputs/standard/IsolationForest/RotatE/RotatE_1.pkl exists. Skipping...
2_outputs/standard/IsolationForest/RotatE/RotatE_2.pkl exists. Skipping...
2_outputs/standard/IsolationForest/RotatE/RotatE_3.pkl exists. Skipping...
2_outputs/standard/IsolationForest/RotatE/RotatE_4.pkl exists. Skipping...
2_outputs/standard/IsolationForest/RotatE/RotatE_5.pkl exists. Skipping...
2_outputs/standard/IsolationForest/RotatE/RotatE_6.pkl exists. Skipping...
2_outputs/standard/IsolationForest/RotatE/RotatE_7.pkl exists. Skipping...
2_outputs/standard/IsolationForest/RotatE/RotatE_8.pkl exists. Skipping...
2_outputs/standard/IsolationForest/RotatE/RotatE_9.pkl exists. Skipping...




  0%|          | 0/10 [00:00<?, ?it/s][A[A

2_outputs/standard/IsolationForest/TransE/TransE_0.pkl exists. Skipping...
2_outputs/standard/IsolationForest/TransE/TransE_1.pkl exists. Skipping...




 30%|███       | 3/10 [00:00<00:00, 27.88it/s][A[A

2_outputs/standard/IsolationForest/TransE/TransE_2.pkl exists. Skipping...
2_outputs/standard/IsolationForest/TransE/TransE_3.pkl exists. Skipping...
2_outputs/standard/IsolationForest/TransE/TransE_4.pkl exists. Skipping...




 60%|██████    | 6/10 [00:00<00:00, 28.06it/s][A[A

2_outputs/standard/IsolationForest/TransE/TransE_5.pkl exists. Skipping...
2_outputs/standard/IsolationForest/TransE/TransE_6.pkl exists. Skipping...
2_outputs/standard/IsolationForest/TransE/TransE_7.pkl exists. Skipping...
2_outputs/standard/IsolationForest/TransE/TransE_8.pkl exists. Skipping...




100%|██████████| 10/10 [00:00<00:00, 30.49it/s][A[A

100%|██████████| 2/2 [00:00<00:00,  4.35it/s][A
 50%|█████     | 1/2 [00:00<00:00,  1.60it/s]

2_outputs/standard/IsolationForest/TransE/TransE_9.pkl exists. Skipping...



  0%|          | 0/2 [00:00<?, ?it/s][A

  0%|          | 0/10 [00:00<?, ?it/s][A[A

 10%|█         | 1/10 [01:44<15:44, 104.91s/it][A[A

 20%|██        | 2/10 [03:12<12:36, 94.62s/it] [A[A

 30%|███       | 3/10 [04:39<10:38, 91.26s/it][A[A

 40%|████      | 4/10 [06:05<08:54, 89.04s/it][A[A

 50%|█████     | 5/10 [07:29<07:16, 87.37s/it][A[A

 60%|██████    | 6/10 [08:48<05:37, 84.43s/it][A[A

 70%|███████   | 7/10 [10:07<04:08, 82.85s/it][A[A

 80%|████████  | 8/10 [11:29<02:44, 82.32s/it][A[A

 90%|█████████ | 9/10 [12:48<01:21, 81.46s/it][A[A

100%|██████████| 10/10 [14:07<00:00, 84.74s/it][A[A

 50%|█████     | 1/2 [14:07<14:07, 847.40s/it][A

  0%|          | 0/10 [00:00<?, ?it/s][A[A

## Evaluation

In [None]:
def load_grid_searches(output_dir, model_names, n_runs=10):
    """Load the pickled objects written for every model name and run index.

    Args:
        output_dir: Directory (``Path``) containing one sub-directory per
            model name.
        model_names: Iterable of model names to load.
        n_runs: Number of run indices per model; files
            ``{model_name}_0.pkl`` .. ``{model_name}_{n_runs - 1}.pkl`` are
            read. Defaults to the previously hard-coded 10.

    Returns:
        Nested dict ``{model_name: {f'{model_name}_{j}': loaded_object}}``
        with runs in ascending index order.

    Raises:
        FileNotFoundError: If an expected pickle file is missing.
    """
    grid_searches = {}
    for model_name in model_names:
        model_runs = {}
        for j in range(n_runs):
            m_name = f'{model_name}_{j}'
            m_file = output_dir / model_name / f'{m_name}.pkl'
            # SECURITY: unpickling executes arbitrary code; only point this at
            # files produced by this project.
            with open(m_file, 'rb') as f:
                model_runs[m_name] = pickle.load(f)
        grid_searches[model_name] = model_runs

    return grid_searches

In [None]:
# One nested dict of loaded searches per output directory, in the same order
# as ``output_dirs``.
grid_searches_list = [
    load_grid_searches(output_dir=run_dir, model_names=MODEL_NAMES)
    for run_dir in output_dirs
]

In [None]:
def score_df_from_grid_searches(grid_searches, input_dir, ns=(10,100)):
    """Score every loaded search on the validation and test splits.

    For each search, ``evaluate`` is called with ``scale=False`` for every
    cut-off in ``ns`` on both the ``val`` and ``test`` rows of ``ref_df.csv``.

    Args:
        grid_searches: Nested dict ``{model_name: {m_name: search}}``. The
            position of ``m_name`` within its inner dict is used as the run
            index of the feature file ``{model_name}_X_{i}.npy``.
        input_dir: Directory (``Path``) holding ``ref_df.csv`` and the
            feature files.
        ns: Cut-offs to evaluate; one ``score{n}_val`` and one
            ``score{n}_test`` column is produced per entry.

    Returns:
        DataFrame with columns ``model_name``, ``m_name``, all
        ``score{n}_val`` columns, then all ``score{n}_test`` columns, one row
        per search.
    """
    ref_df = pd.read_csv(input_dir / 'ref_df.csv', index_col=0)

    eval_splits = ('val', 'test')
    split_ids = {ds: ref_df.query(ds)['id'].values for ds in eval_splits}
    split_ys = {ds: ref_df.query(ds)['y'].values for ds in eval_splits}

    # Score columns are derived from ``ns`` so that any set of cut-offs works;
    # for the default ``ns`` the column order is the same as before.
    score_dict = {'model_name': [], 'm_name': []}
    for ds in eval_splits:
        for n in ns:
            score_dict[f'score{n}_{ds}'] = []

    scale = False
    for model_name, model_runs in grid_searches.items():
        for i, (m_name, grid_search) in enumerate(model_runs.items()):
            score_dict['model_name'].append(model_name)
            score_dict['m_name'].append(m_name)
            X_all = np.load(input_dir / f'{model_name}_X_{i}.npy')

            for ds in eval_splits:
                X = X_all[split_ids[ds]]
                for n in ns:
                    score = evaluate(
                        m=grid_search,
                        X=X,
                        y=split_ys[ds],
                        n=n,
                        scale=scale,
                    )
                    score_dict[f'score{n}_{ds}'].append(score)

    return pd.DataFrame(score_dict)

In [None]:
# Build one score table per dataset first, then persist each next to its models.
score_dfs = []
for searches, source_dir in zip(grid_searches_list, input_dirs):
    score_dfs.append(score_df_from_grid_searches(searches, source_dir))

for output_dir, score_df in zip(output_dirs, score_dfs):
    score_path = output_dir / 'score_df.csv'
    score_df.to_csv(score_path)

In [None]:
# Mean score per model for the first dataset. ``numeric_only=True`` keeps the
# string column ``m_name`` out of the aggregation, which otherwise raises a
# TypeError on pandas >= 2.0.
score_dfs[0].groupby('model_name').mean(numeric_only=True)

In [None]:
import seaborn as sns

# Neither call passes ``ax=``, so both boxplots are drawn with seaborn's
# default axes handling; the title is set via the first, the y-label via the
# second.
hits10_ax = sns.boxplot(score_dfs[0], x='model_name', y='score10_test')
hits10_ax.set(title='Hits@100 and hits@10\n(large positive set)')
hits100_ax = sns.boxplot(score_dfs[0], x='model_name', y='score100_test')
hits100_ax.set_ylabel('score')

In [None]:
# Mean score per model for the second dataset. ``numeric_only=True`` keeps the
# string column ``m_name`` out of the aggregation, which otherwise raises a
# TypeError on pandas >= 2.0.
score_dfs[1].groupby('model_name').mean(numeric_only=True)

In [None]:
import seaborn as sns

# Neither call passes ``ax=``, so both boxplots are drawn with seaborn's
# default axes handling; the title is set via the first, the y-label via the
# second.
hits10_ax = sns.boxplot(score_dfs[1], x='model_name', y='score10_test')
hits10_ax.set(title='Hits@100 and hits@10\n(small positive set)')
hits100_ax = sns.boxplot(score_dfs[1], x='model_name', y='score100_test')
hits100_ax.set_ylabel('score')

In [None]:
# Re-read the persisted score tables, one per output directory.
score_dfs = [
    pd.read_csv(output_dir / 'score_df.csv', index_col=0)
    for output_dir in output_dirs
]

# Keep only the test scores, relabel them and tag each table with its dataset.
# The labels are paired with the tables by position.
test_score_labels = {'score10_test': 'hits@10', 'score100_test': 'hits@100'}
plot_dfs = [
    score_df[['model_name', 'score10_test', 'score100_test']]
    .rename(columns=test_score_labels)
    .assign(dataset=ds)
    for ds, score_df in zip(('large', 'small'), score_dfs)
]

# Long format (one row per model / dataset / metric), then one frame per model.
plot_df = pd.concat(plot_dfs).melt(id_vars=['model_name', 'dataset'])
plot_dfs = {name: group for name, group in plot_df.groupby('model_name')}

In [None]:
import seaborn as sns

# Grouped bars of the test scores for one model, small dataset first.
model_name = 'TransE'
plot_df = plot_dfs[model_name]
dataset_order = ['small', 'large']

plot = sns.barplot(
    plot_df,
    x='dataset',
    y='value',
    hue='variable',
    order=dataset_order,
)
sns.move_legend(plot, 'upper left')
plot.set(
    xlabel='positive dataset',
    ylabel='score',
    title=f'Isolation Forest - {model_name}',
    ylim=(0, 55),
)
_ = plot.set_xticklabels(labels=dataset_order, rotation=0)

In [None]:
import seaborn as sns

# Grouped bars of the test scores for one model, small dataset first.
model_name = 'RotatE'
plot_df = plot_dfs[model_name]
dataset_order = ['small', 'large']

plot = sns.barplot(
    plot_df,
    x='dataset',
    y='value',
    hue='variable',
    order=dataset_order,
)
sns.move_legend(plot, 'upper left')
plot.set(
    xlabel='positive dataset',
    ylabel='score',
    title=f'Isolation Forest - {model_name}',
    ylim=(0, 55),
)
_ = plot.set_xticklabels(labels=dataset_order, rotation=0)

In [None]:
def calc_predictions(grid_searches, input_dir):
    """Score every row listed in ``ref_df.csv`` with every loaded search.

    Args:
        grid_searches: Nested dict ``{model_name: {m_name: search}}``. The
            position of ``m_name`` within its inner dict is used as the run
            index of the feature file ``{model_name}_X_{i}.npy``.
        input_dir: Directory (``Path``) holding ``ref_df.csv`` and the
            feature files.

    Returns:
        Nested dict ``{model_name: {m_name: scores}}`` where ``scores`` is the
        result of ``predict_proba`` on the feature rows selected by the
        ``id`` column of ``ref_df.csv``, in that column's order.
    """
    ref_df = pd.read_csv(input_dir / 'ref_df.csv', index_col=0)
    ids = ref_df['id'].values

    y_preds = {}
    for model_name, model_runs in grid_searches.items():
        y_preds[model_name] = {}
        for i, (m_name, grid_search) in enumerate(model_runs.items()):
            X_all = np.load(input_dir / f'{model_name}_X_{i}.npy')
            y_preds[model_name][m_name] = predict_proba(grid_search, X_all[ids])

    return y_preds

def build_proba_df(grid_searches, input_dir):
    """Assemble one table of per-entity scores from all loaded searches.

    Args:
        grid_searches: Nested dict ``{model_name: {m_name: search}}`` as
            accepted by ``calc_predictions``.
        input_dir: Directory (``Path``) holding ``ref_df.csv`` and the
            feature files.

    Returns:
        DataFrame with the ``id``, ``y``, ``train``, ``val`` and ``test``
        columns of ``ref_df.csv``, an ``entity`` column looked up from the
        OpenBioLink entity mapping, one score column per ``m_name`` and one
        ``{model_name}_sum`` column per model holding the row-wise sum of
        that model's score columns.
    """
    # Kept as a function-local import, as in the original.
    # NOTE(review): presumably to avoid loading pykeen unless needed — confirm.
    from pykeen.datasets import OpenBioLink
    obl = OpenBioLink()
    id_to_entity = {i: e for e, i in obl.entity_to_id.items()}

    y_preds = calc_predictions(grid_searches=grid_searches, input_dir=input_dir)

    ref_df = pd.read_csv(input_dir / 'ref_df.csv', index_col=0)
    ids = ref_df['id'].values

    columns = dict(
        id=ids,
        entity=[id_to_entity[i] for i in ids],
        y=ref_df['y'].values,
        train=ref_df['train'].values,
        val=ref_df['val'].values,
        test=ref_df['test'].values,
    )
    # Add the score columns of every model present instead of hard-coding the
    # model names, so the function follows whatever models were loaded.
    for model_scores in y_preds.values():
        columns.update(model_scores)
    proba_df = pd.DataFrame(columns)

    for model_name, model_scores in y_preds.items():
        proba_df[f'{model_name}_sum'] = proba_df[list(model_scores)].sum(axis=1)

    return proba_df

In [None]:
# Build one score table per dataset first, then persist each next to its models.
proba_dfs = []
for searches, source_dir in zip(grid_searches_list, input_dirs):
    proba_dfs.append(build_proba_df(searches, source_dir))

for output_dir, proba_df in zip(output_dirs, proba_dfs):
    proba_path = output_dir / 'proba_df.csv'
    proba_df.to_csv(proba_path)

In [None]:
# Inspect the table built from the first entry of ``input_dirs``.
proba_df = proba_dfs[0]

In [None]:
# Rows selected by the ``test`` column, ordered by the summed RotatE score,
# highest first.
proba_df.query('test').sort_values('RotatE_sum', ascending=False)