# Data Splitting Example
## Goal: Demonstrate how to extract (wrangle) data from DFT outputs (e.g., VASP `vasprun.xml` files) for later featurization
### Data
**Relaxation**: *tests/data/vasp_relax/vasprun.xml*  

**AIMD Trajectory**: *tests/data/vasp_md/vasprun.xml*

## Imports

In [None]:
import numpy as np
import pickle
from pymatgen.io.vasp.outputs import Vasprun
from uf3.data.io import DataCoordinator 
from uf3.util import subsample
import pandas as pd
import traceback

## Data Files

In [2]:
# Locations of the example VASP outputs. Both paths are relative, so they
# resolve against the current working directory of the notebook kernel.
relax, AIMD = (
    './tests/data/vasp_relax/vasprun.xml',  # relaxation run
    './tests/data/vasp_md/vasprun.xml',  # AIMD trajectory
)

## Data Extraction
### Relaxation

In [None]:
# Read the relaxation trajectory into a DataFrame. It is requested with
# load=False and handed back to the coordinator explicitly further down,
# after subsampling (presumably load=False skips automatic registration --
# confirm against DataCoordinator).
prefix = 'relax_'
df_relax = DataCoordinator()
df = df_relax.dataframe_from_trajectory(
    relax,
    prefix=prefix,
    load=False,
)

# Subsample on the 'energy' column. max_samples is set to the number of
# rows, so the sampler is allowed to return up to every entry.
energy_list = df['energy'].values
n_samples = len(df)
assert n_samples <= len(energy_list)
subsamples = subsample.farthest_point_sampling(
    energy_list,
    max_samples=n_samples,
    min_diff=0.0,
)
print(f"Subsampling {n_samples} samples from {len(energy_list)}")

# Sort the selected positions so the retained rows keep their original order.
df = df.iloc[np.sort(subsamples)]

# Hand the subsampled rows to the coordinator, then rebind df_relax from the
# coordinator object to the result of consolidate().
df_relax.load_dataframe(df, prefix=prefix)
df_relax = df_relax.consolidate()

### AIMD

In [None]:
# Read the AIMD trajectory into a DataFrame. It is requested with load=False
# and handed back to the coordinator explicitly further down, after
# subsampling (presumably load=False skips automatic registration --
# confirm against DataCoordinator).
prefix = 'aimd_'
df_aimd = DataCoordinator()
df = df_aimd.dataframe_from_trajectory(
    AIMD,
    prefix=prefix,
    load=False,
)

# Subsample on the 'energy' column. max_samples is set to the number of
# rows, so the sampler is allowed to return up to every entry.
energy_list = df['energy'].values
n_samples = len(df)
assert n_samples <= len(energy_list)
subsamples = subsample.farthest_point_sampling(
    energy_list,
    max_samples=n_samples,
    min_diff=0.0,
)
print(f"Subsampling {n_samples} samples from {len(energy_list)}")

# Sort the selected positions so the retained rows keep their original order.
df = df.iloc[np.sort(subsamples)]

# Hand the subsampled rows to the coordinator, then rebind df_aimd from the
# coordinator object to the result of consolidate().
df_aimd.load_dataframe(df, prefix=prefix)
df_aimd = df_aimd.consolidate()

## Combine extracted data and save to file

In [22]:
# Stack the relaxation and AIMD tables (in that order) into a single table
# and pickle it. The output path is relative, so the file lands in the
# current working directory.
filepath = 'data.pkl'
data = pd.concat(
    [
        df_relax,
        df_aimd,
    ]
)
data.to_pickle(filepath)