In [1]:
import pyarrow.dataset as ds
import pyarrow.compute as pc
import pandas as pd

from matplotlib import pyplot as plt
import seaborn as sns

In [2]:
# Open the results directory as an Arrow dataset; nothing is loaded into
# pandas until the table is explicitly materialised in a later cell.
dataset = ds.dataset(source="all-to-all-rmsd-tfd/")

In [3]:
# Show the dataset's column names and Arrow types.
dataset.schema

mapped_smiles: string
cmiles: string
inchi: string
ff_qcarchive_id: int64
ff_energy: double
rmsd: double
tfd: double
qm_qcarchive_id: int64
qm_energy: double
method: string

In [4]:
# Total number of rows in the dataset, before any filtering.
dataset.count_rows()

1033322

In [5]:
# Read the full dataset into an Arrow table, convert it to a pandas
# DataFrame, then drop the intermediate table so only `df` stays bound.
arrow_table = dataset.to_table()
df = arrow_table.to_pandas()
del arrow_table

In [6]:
# List every distinct value of the `method` column present in the data.
df["method"].unique()

array(['fb-fit-v0-single-mean-k100_unconstrained',
       'fb-fit-v0-single-mean-k20_unconstrained',
       'fb-fit-v1-single-mean-k100_unconstrained',
       'fb-fit-v2-single-mean-k100_unconstrained',
       'fb-fit-v3-single-mean-k100_unconstrained',
       'openff_unconstrained-1.3.1', 'openff_unconstrained-2.0.0',
       'openff_unconstrained-2.1.0', 'openff_unconstrained-2.2.1-ashgc',
       'openff_unconstrained-2.2.1', 'openff_unconstrained-2.3.0rc1'],
      dtype=object)

In [7]:
# Lookup from a raw `method` value to the short label used for display.
# Only the methods listed here are kept by the filtering cells that follow;
# insertion order is preserved.
STEM_TO_NAME = dict(
    [
        ("openff_unconstrained-2.0.0", "Sage 2.0.0"),
        ("openff_unconstrained-2.2.1", "Sage 2.2.1"),
        ("openff_unconstrained-2.3.0rc1", "Sage 2.3.0rc1"),
        ("fb-fit-v1-single-mean-k100_unconstrained", "v1-k100"),
        ("fb-fit-v3-single-mean-k100_unconstrained", "v3-k100"),
    ]
)

In [8]:
# Restrict the frame to rows whose method is one of the STEM_TO_NAME keys,
# then report how many rows remain.
is_selected = df["method"].isin(list(STEM_TO_NAME))
df = df.loc[is_selected]
len(df)

469684

In [9]:
# Keep only the ff_qcarchive_id values that have a result for every selected
# method, so that all force fields are compared on the same set of records.
counts = df.groupby("ff_qcarchive_id").count()

# A row count equal to the number of methods is not enough on its own: an id
# with a duplicated row for one method and no row for another would still
# total len(STEM_TO_NAME). Also require that many *distinct* methods.
n_methods = len(STEM_TO_NAME)
distinct_methods = df.groupby("ff_qcarchive_id")["method"].nunique()
is_complete = (counts.method == n_methods) & (distinct_methods == n_methods)
valid_ids = counts[is_complete].index

# .copy() detaches the result from the frame it was sliced from, so the
# column assignments in later cells write to an independent frame instead of
# triggering pandas' SettingWithCopyWarning.
df = df[df.ff_qcarchive_id.isin(valid_ids)].copy()
len(df)

467685

In [10]:
# Add a display-label column: translate each method through STEM_TO_NAME,
# falling back to the raw method string when no label is defined.
df["FF"] = df["method"].map(lambda stem: STEM_TO_NAME.get(stem, stem))

In [11]:
# Boolean flag per row: True where the two QCArchive ids on the row differ.
df["mismatches"] = df["ff_qcarchive_id"].ne(df["qm_qcarchive_id"])

In [12]:
# Summing the boolean flag per force field gives the number of mismatched rows.
mismatches_by_ff = df.groupby("FF")[["mismatches"]].sum()

In [13]:
# Display the number of mismatched rows for each force-field label.
mismatches_by_ff

Unnamed: 0_level_0,mismatches
FF,Unnamed: 1_level_1
Sage 2.0.0,66347
Sage 2.2.1,66170
Sage 2.3.0rc1,66053
v1-k100,66181
v3-k100,66097


In [14]:
# Non-null value count of every column, per force field. Equal counts across
# rows confirm each force field covers the same number of records; a column
# with a lower count than its neighbours contains missing values.
df.groupby(by="FF").count()

Unnamed: 0_level_0,mapped_smiles,cmiles,inchi,ff_qcarchive_id,ff_energy,rmsd,tfd,qm_qcarchive_id,qm_energy,method,mismatches
FF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Sage 2.0.0,93537,93537,93537,93537,93537,93537,93501,93537,93537,93537,93537
Sage 2.2.1,93537,93537,93537,93537,93537,93537,93501,93537,93537,93537,93537
Sage 2.3.0rc1,93537,93537,93537,93537,93537,93537,93501,93537,93537,93537,93537
v1-k100,93537,93537,93537,93537,93537,93537,93501,93537,93537,93537,93537
v3-k100,93537,93537,93537,93537,93537,93537,93501,93537,93537,93537,93537
