In [3]:
import pandas as pd
import numpy as np

### Load some data and prepare for PU learning.
Each row corresponds to a material and each column is a feature from either elemental data or density functional theory calculations.
We discard the non-numeric columns (e.g. compound labels) and rename the "Synt" column to "PU_label" so that the PULearner knows where to find the label of each material: positive (1) or unlabeled (0).

In [4]:
# Each entry: (Excel workbook to read, number of leading non-numeric columns
# to drop, JSON file to write).
for xlsx_path, n_leading, json_path in (
    ('../test_files/MAX_dataset.xlsx', 4, 'MAX_dataset.json'),
    ('../test_files/MX_dataset.xlsx', 3, 'MX_dataset.json'),
):
    df = pd.read_excel(xlsx_path, index_col=0)
    # Get rid of non-numeric features, which occupy the leading columns.
    df = df.iloc[:, n_leading:]
    # Stringify the index labels and expose the "Synt" column as "PU_label".
    df = df.rename(index=str, columns={'Synt': 'PU_label'})
    df.to_json(json_path)

### Do k-fold cross validation with bagged decision tree base classifiers.
Run PU learning on a sample data set. For production runs, `n_repeats` and `n_bags` should be set to large values; here we use small values so that the example runs quickly.

In [5]:
from mlmsynth import PULearner

In [19]:
pul = PULearner()
n_splits = 3  # 3-fold CV
n_repeats = 5  # Repeat the entire k-fold CV 5 times for averaging (small value for a quick demo)
n_bags = 5  # 5 bags for bootstrap aggregating (small value for a quick demo)

# Cross-validate on the MX data set that the data-preparation cell wrote to JSON.
pu_stats = pul.cv_baggingDT('MX_dataset.json', splits=n_splits, repeats=n_repeats, bags=n_bags)

Performed Repeated 3-fold: 1 out of 5
True Positive Rate: 0.75 (+/- 0.00)
Performed Repeated 3-fold: 2 out of 5
True Positive Rate: 0.42 (+/- 0.36)
Performed Repeated 3-fold: 3 out of 5
True Positive Rate: 0.33 (+/- 0.54)
Performed Repeated 3-fold: 4 out of 5
True Positive Rate: 0.44 (+/- 0.63)
Performed Repeated 3-fold: 5 out of 5
True Positive Rate: 0.39 (+/- 0.16)
Accuracy: 0.43
95% confidence interval: [0.32, 0.53]


In [33]:
# Peek at the results object returned by the cross-validation run.

# Names of all entries stored in the results.
result_keys = pu_stats.keys()
print(result_keys)

# How many items are stored under 'prob_rp'.
n_prob_rp = len(pu_stats['prob_rp'])
print(n_prob_rp)

# First item stored under 'feat_rank_rp'.
first_feat_rank = pu_stats['feat_rank_rp'][0]
print(first_feat_rank)

# First item stored under 'prob'.
first_prob = pu_stats['prob'][0]
print(first_prob)

dict_keys(['prob', 'labels', 'metrics', 'prob_rp', 'label_rp', 'tpr_rp', 'scores_rp', 'feat_rank_rp'])
56
[0.09090909 0.         0.         0.         0.        ]
0.5544444444444444


In [57]:
# Toy scores (`fimp`, presumably feature importances): rank them and report
# which original positions hold the two largest values.
fimp = [234, 23511, 123]
df = pd.Series(data=fimp).sort_values(ascending=False)  # largest score first
df = df.div(df.max())  # rescale so the largest score becomes 1
df.iloc[:2].index.values  # original positions of the top two scores

array([1, 0])