In [3]:
import pandas as pd
import numpy as np

### Load some data and prepare for PU learning.
Each row corresponds to a material and each column is a feature from either elemental data or density functional theory calculations.
We discard non-numeric columns (e.g. compound labels) and rename the "Synt" column to "PU_label" so that the PULearner knows where to find the labels: positive (1) or unlabeled (0).

In [4]:
def export_pu_dataset(excel_path, first_feature_col, json_path):
    """Convert one Excel dataset into the JSON file consumed by PULearner.

    Parameters
    ----------
    excel_path : str
        Path of the Excel workbook to read. Its first column is used as the
        row index.
    first_feature_col : int
        Positional index of the first column to keep. Every column before it
        is dropped (in this notebook those are the non-numeric columns).
    json_path : str
        Destination path of the JSON file.

    Returns
    -------
    pandas.DataFrame
        The frame that was written to ``json_path``.
    """
    frame = pd.read_excel(excel_path, index_col=0)
    # Positional slice: drop the leading non-numeric columns.
    frame = frame.iloc[:, first_feature_col:]
    # `index=str` turns every index label into a string; the 'Synt' column is
    # renamed to 'PU_label' so the PULearner knows where to find the labels.
    frame = frame.rename(index=str, columns={'Synt': 'PU_label'})
    frame.to_json(json_path)
    return frame


# The two datasets differ only in how many leading columns must be dropped.
# `df` is rebound on each call, so it ends up holding the MX frame.
df = export_pu_dataset('../test_files/MAX_dataset.xlsx', 4, 'MAX_dataset.json')
df = export_pu_dataset('../test_files/MX_dataset.xlsx', 3, 'MX_dataset.json')

### Do k-fold cross validation with bagged decision tree base classifiers.
Run PU learning on a sample data set. n_repeats and n_bags should be large values for production runs. Here we use small values so the example runs quickly.

In [5]:
from mlmsynth import PULearner

In [7]:
# Cross-validation settings, kept small so the example runs quickly:
#   n_splits  -> number of folds (3-fold CV)
#   n_repeats -> how many times the entire k-fold CV is repeated for averaging
#   n_bags    -> number of bags used for bootstrap aggregating
n_splits, n_repeats, n_bags = 3, 10, 10

pul = PULearner()
pu_stats = pul.cv_baggingDT(
    'MAX_dataset.json',
    splits=n_splits,
    repeats=n_repeats,
    bags=n_bags,
)

Performed Repeated 3-fold: 1 out of 10
True Positive Rate: 0.86 (+/- 0.00)
Performed Repeated 3-fold: 2 out of 10
True Positive Rate: 0.87 (+/- 0.16)
Performed Repeated 3-fold: 3 out of 10
True Positive Rate: 0.86 (+/- 0.28)
Performed Repeated 3-fold: 4 out of 10
True Positive Rate: 0.87 (+/- 0.04)
Performed Repeated 3-fold: 5 out of 10
True Positive Rate: 0.86 (+/- 0.34)
Performed Repeated 3-fold: 6 out of 10
True Positive Rate: 0.78 (+/- 0.20)
Performed Repeated 3-fold: 7 out of 10
True Positive Rate: 0.84 (+/- 0.20)
Performed Repeated 3-fold: 8 out of 10
True Positive Rate: 0.73 (+/- 0.12)
Performed Repeated 3-fold: 9 out of 10
True Positive Rate: 0.81 (+/- 0.21)
Performed Repeated 3-fold: 10 out of 10
True Positive Rate: 0.92 (+/- 0.16)
Accuracy: 0.83
95% confidence interval: [0.78, 0.87]
