# Example Statistics
Example: We want to know how many datasets are in MDF and which datasets have the most records.

In [1]:
from tqdm import tqdm
import pandas as pd
from mdf_forge.forge import Forge

In [2]:
# Instantiate Forge with its default arguments. `mdf` is the object whose
# `search()` method is used for every query in the cells below.
mdf = Forge()

In [3]:
# To begin, fetch every dataset entry. There are fewer than 10,000 at present,
# so `search()` will work fine.
res = mdf.search("mdf.resource_type:dataset", advanced=True)

# Next, gather the source_name, title, and number of records of each dataset.
# The record count is taken from the query info of a second search for entries
# whose parent_id is the dataset's mdf_id; the entries returned by that search
# are not used (limit=0 presumably keeps them from being returned at all --
# confirm against the Forge.search documentation).
mdf_resources = []
for dataset in tqdm(res):
    meta = dataset["mdf"]
    child_query = "mdf.links.parent_id:" + meta["mdf_id"]
    # Deliberately not bound to `_`, which IPython reserves for the last output.
    unused_entries, query_info = mdf.search(child_query, advanced=True, info=True, limit=0)
    summary_row = (meta["source_name"], meta["title"], query_info["total_query_matches"])
    mdf_resources.append(summary_row)

# One row per dataset, in the order the datasets were returned by the search.
df = pd.DataFrame(mdf_resources, columns=["source_name", "title", "num_records"])

100%|██████████| 1280/1280 [06:45<00:00,  3.16it/s]


In [4]:
# Last step: report how many datasets were gathered, then display the 15
# datasets with the most records (largest first).
n_datasets = len(df)
print("Number of data resources: {n_datasets}".format(n_datasets=n_datasets))

by_record_count = df.sort_values(by="num_records", ascending=False)
by_record_count.head(15)

Number of data resources: 1280


Unnamed: 0,source_name,title,num_records
430,chembl_db,ChEMBL Database,1727112
411,oqmd,The Open Quantum Materials Database,686999
8,cod,Crystallography Open Database,373631
1015,dss_tox,Distributed Structure-Searchable Toxicity (DSS...,162371
1012,gdb9_14,Quantum Chemistry Structures and Properties of...,139980
831,h2o_13,Machine-learning approach for one- and two-bod...,45482
180,nist_xps_db,NIST X-ray Photoelectron Spectroscopy Database,29189
830,gdb8_15,Electronic spectra from TDDFT and machine lear...,21786
426,amcs,The American Mineralogist Crystal Structure Da...,19540
31,ecp_sar_environments,Prediction of Compounds in Different Local SAR...,10914


In [5]:
# Bonus: what is the total number of records in MDF?
# Add up the per-dataset record counts gathered above.
total_records = df["num_records"].sum()
total_records

3325791