In [None]:
import pandas as pd
import numpy as np

### Load some data and prepare for PU learning.
Each row corresponds to a material and each column is a feature from either elemental data or density functional theory calculations.
Non-numeric columns (e.g. compound labels) are not dropped at this stage; we only rename the "Synt" column to "PU_label" so the PULearner knows where to find the labels: positive (1) or unlabeled (0).

In [None]:
# Convert each Excel dataset to JSON, renaming the "Synt" column to "PU_label"
# on the way. Non-numeric columns are left in place (the former
# `df.iloc[:, 4:]` / `df.iloc[:, 3:]` slicing is not applied), and `index=str`
# turns the row labels into strings before the export.
# After the loop, `df` holds the renamed MX dataset.
conversions = (
    ('../test_files/MAX_dataset.xlsx', '../test_files/MAX_dataset.json'),
    ('../test_files/MX_dataset.xlsx', '../test_files/MX_dataset.json'),
)
for xlsx_path, json_path in conversions:
    df = pd.read_excel(xlsx_path, index_col=0).rename(
        index=str, columns={'Synt': 'PU_label'}
    )
    df.to_json(json_path)

### Do k-fold cross validation with bagged decision tree base classifiers.
Run PU learning on a sample data set. n_repeats and n_bags should be large values for production runs. Here we use small values so the example runs quickly.

In [1]:
from mlmsynth import PULearner

In [2]:
# Cross-validation settings. The values are deliberately small so the example
# runs quickly; use larger n_repeats / n_bags for production runs.
n_splits = 3   # 3-fold CV
n_repeats = 5  # Repeat the entire k-fold CV 5 times for averaging
n_bags = 5     # 5 bags for bootstrap aggregating

# Run the repeated k-fold PU learning on the MX dataset written out as JSON.
pul = PULearner()
pu_stats = pul.cv_baggingDT(
    '../test_files/MX_dataset.json',
    splits=n_splits,
    repeats=n_repeats,
    bags=n_bags,
)

Start PU Learning.
Performed Repeated 3-fold: 1 out of 5
True Positive Rate: 0.50 (+/- 0.00)
Performed Repeated 3-fold: 2 out of 5
True Positive Rate: 0.11 (+/- 0.31)
Performed Repeated 3-fold: 3 out of 5
True Positive Rate: 0.42 (+/- 0.36)


  predict_utrain = f_oob[:, 1] / n_oob
  predict_utrain = f_oob[:, 1] / n_oob
  label_U[:, :splits * repeats][np.where(prob_U > 0.5)] = 1
  label_U_rp[np.where(prob_U_rp > 0.5)] = 1
  labels[np.where(prob > 0.5)] = 1


Performed Repeated 3-fold: 4 out of 5
True Positive Rate: 0.44 (+/- 0.83)
Performed Repeated 3-fold: 5 out of 5
True Positive Rate: 0.39 (+/- 0.16)
Accuracy: 0.39
95% confidence interval: [0.27, 0.53]


In [None]:
# Work on a copy of the learner's `df_U` frame (presumably the unlabeled
# samples -- confirm against mlmsynth) and add the values stored under
# pu_stats['prob'] as a new 'synth_score' column, then preview the first rows.
df = pul.df_U.copy().assign(synth_score=pu_stats['prob'])
df.head()

In [None]:
# Outer-join the MAX and MX JSON datasets on their shared key columns.
# Overlapping non-key columns get the '_p' suffix for the left (MAX) frame
# and '_c' for the right (MX) frame.
df1 = pd.read_json('../test_files/MAX_dataset.json')
df2 = pd.read_json('../test_files/MX_dataset.json')
merge = ['M', 'X', 'n']
df = df1.merge(df2, how='outer', on=merge, suffixes=['_p', '_c'])
df.head()

In [None]:
# Reload the MX dataset and show its first row, restricted to the numeric
# columns, as a NumPy array.
df = pd.read_json('../test_files/MX_dataset.json')
# `select_dtypes` is the public replacement for the private
# `DataFrame._get_numeric_data()` helper; 'bool' is included as well so that
# boolean columns, which that helper also keeps, are not dropped.
df.select_dtypes(include=['number', 'bool']).to_numpy()[0, :]