In [None]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from scipy.interpolate import griddata
from scipy.spatial import Delaunay

from utils.data import (load_data, bonds, bond_to_float, calculate_moments,
                        plot_cevr, plot_pca, plot_profiles, plot_crystal_graph)
from utils.model import load, prepare_data, train_models, plot_scores, predict, plot_confusion_matrix

# Random seed forwarded to prepare_data(...) and train_models(...) in the training cell.
seed = 12

In [None]:
# Load the dataset from the local data directory.
# NOTE(review): labeled=False is passed here, yet later cells colour by 'label'
# and plot a confusion matrix -- confirm where the labels come from.
dirname = 'data/'
data = load_data(
    dirname,
    structure=True,
    labeled=False,
    sort=True,
)

In [None]:
# compute and plot PCA summary
# Build the feature matrix from the 'elf' column.
# NOTE(review): presumably data['elf'].sum() concatenates per-row lists of
# profiles so np.stack yields one row per profile -- confirm against load_data.
X = np.stack(data['elf'].sum())

# Keep every available component. With svd_solver='full' scikit-learn rejects
# n_components > min(n_samples, n_features), so cap at that bound instead of
# assuming there are at least as many samples as features.
n_components = min(X.shape[0], X.shape[-1])
pca = PCA(n_components=n_components, svd_solver='full')
z = pca.fit_transform(X)

# plot cumulative explained variance
# (0.999 is forwarded to plot_cevr; presumably the variance threshold to mark -- confirm)
plot_cevr(pca.explained_variance_ratio_, 0.999, save_path=None)

In [None]:
# calculate moments
# `data` is rebound to the return value of calculate_moments(data), so re-running
# this cell feeds the already-processed object back in.
# NOTE(review): presumably idempotent -- confirm in utils.data.
data = calculate_moments(data)

In [None]:
# Visualise the PCA scores `z` along components 0, 1 and 2 (figure is not saved).
plot_pca(
    z,
    data,
    bonds,
    axes=[0, 1, 2],
    save_path=None,
)

In [None]:
# Show profiles along principal axes 0 and 1, coloured by the 'label' field
# (figure is not saved).
plot_profiles(
    z,
    data,
    pca,
    colorby='label',
    axes=[0, 1],
    save_path=None,
)

In [None]:
# train or load model
# Set fit=True to retrain the ensemble and save it under model_path;
# leave it False to reuse the previously saved models.
fit = False
model_path = 'models/clfs_051022_pc8_en_srt'
# NOTE: this rebinds n_components (previously the full-rank value used for the
# PCA summary) to the number of components fed to the classifiers.
n_components = 8
test_size = 0.15
n_models = 100
columns = ['en_diff']

if fit:
    # train models
    # NOTE: `pca` is rebound here to the object returned by prepare_data, replacing
    # the full PCA fitted earlier in the notebook.
    X_data, y_data, pca, scaler = prepare_data(data, n_components, test_size, seed=seed, pca=None, scaler=None,
                                               columns=columns)
    CLFs = train_models(X_data, y_data, pca, scaler, n_models=n_models, seed=seed, save_path=model_path)

    # plot model performance on held-out test set
    # Run every classifier exactly once and derive both statistics from the same
    # stacked predictions instead of evaluating the whole ensemble twice.
    y_pred_all = np.stack([CLFs['clfs'][i].predict_proba(X_data[1]) for i in range(n_models)])
    y_pred_mean = y_pred_all.mean(axis=0)
    y_pred_std = y_pred_all.std(axis=0)
    plot_scores(y_pred_mean, y_pred_std, y_data[1], save_path=None)

else:
    # load models
    # NOTE(review): the '.joblib' file is presumably pickle-backed; only load
    # model files from a trusted source.
    CLFs = load(model_path + '.joblib')

In [None]:
# Run the ensemble on `data`; pass a path as save_path to also write the
# resulting dataframe to disk (disabled here).
data = predict(
    data,
    CLFs,
    columns=columns,
    save_path=None,
)

In [None]:
# Confusion matrix with normalize=False; the figure is not saved.
plot_confusion_matrix(
    data,
    normalize=False,
    save_path=None,
)

In [None]:
# Draw the crystal graph for entry 0, with edges coloured by predicted bond type
# (figure is not saved).
plot_crystal_graph(
    data,
    index=0,
    save_path=None,
)