Skip to content

ACE: compare extractions

James Kent edited this page May 6, 2024 · 2 revisions
6/1: coordinates_df = pd.read_table('/coordinates.csv', sep=",", dtype={"pmid": str})
 6/2: import pandas as pd
 6/3: coordinates_df = pd.read_table('/coordinates.csv', sep=",", dtype={"pmid": str})
 6/4: coordinates_df
 6/5: coordinates_df.loc[0,:]
 6/6: coordinates_df.loc[0,:].table_caption
 6/7: coordinates_df.loc[1,:].table_caption
 6/8: coordinates_df.loc[4,:].table_caption
 6/9: coordinates_df.loc[10,:].table_caption
6/10: coordinates_df.loc[15,:].table_caption
6/11: coordinates_df.loc[150,:].table_caption
6/12: pmids = coordinates_df['pmid'].values
6/13: pmids
6/14: list(pmids)
6/15: pmids = list(coordinates_df['pmid'].values)
6/16: pmids
6/17: import numpy as np
6/18: pmids = list(np.unique(coordinates_df['pmid'].values))
6/19: pmids
6/20: len(pmids)
6/21: BaseStudy.query.filter(BaseStudy.pmid.in_(pmids)).all()
6/22: len(BaseStudy.query.filter(BaseStudy.pmid.in_(pmids)).all())
6/23: existing_base_studies = BaseStudy.query.filter(BaseStudy.pmid.in_(pmids)).all()
6/24:
len(existing_base_studies
)
6/25: existing_base_studies.versions
6/26: existing_base_studies[0].versions
6/27: existing_base_studies[0].versions[0].source
6/28: existing_base_studies[0].versions[0].analyses
6/29: existing_base_studies[0].versions[0].analyses[0]
6/30: existing_base_studies[0].versions[0].analyses[0].table_id
6/31: from sqlalchemy.orm import selectinload
6/32: existing_base_studies = BaseStudy.query.filter(BaseStudy.pmid.in_(pmids)).options(selectinload(BaseStudy.versions)).all()
6/33:
existing_base_studies = BaseStudy.query.filter(BaseStudy.pmid.in_(pmids)).options(selectinload(BaseStudy.versions.options(selectinload(Study.analyses
)))).all()
6/34: existing_base_studies = BaseStudy.query.filter(BaseStudy.pmid.in_(pmids)).options(selectinload(BaseStudy.versions).options(selectinload(Study.analyses))).all()
6/35: len(existing_base_studies)
6/36: existing_base_studies[0]
6/37: existing_base_studies[0].updated_at
6/38: existing_base_studies[0].pmid
6/39: '10585521' in pmids
6/40: coordinates_df.query('pmid == "10585521"')
6/41: existing_base_studies[0].versions
6/42: existing_base_studies[0].versions[0]
6/43: existing_base_studies[0].versions[0].updated_at
6/44: existing_base_studies[100].versions[0].updated_at
6/45: existing_base_studies[106].versions[0].updated_at
6/46: existing_base_studies[150].versions[0].updated_at
6/47: existing_base_studies[150].versions[0].created_at
6/48: len(existing_base_studies)
6/49: coordinates_df['pmid']
6/50: coordinates_df['pmid'].unique()
6/51: len(coordinates_df['pmid'].unique())
6/52: existing_base_studies
6/53: existing_studies_dict = {bs.pmid: next(filter(lambda s:  s.source == "neurosynth", bs.versions)) for bs in existing_base_studies}
6/54: existing_studies_dict = {bs.pmid: next(filter(lambda s: s.source == "neurosynth", bs.versions), None) for bs in existing_base_studies}
6/55: existing_studies_dict
6/56: for coordinates_df.groupby("pmid")
6/57: coordinates_df.groupby("pmid")
6/58: coordinates_df.groupby("pmid").groupby("table_id")
6/59: coordinates_df.groupby(("pmid", "table_id"))
6/60: coordinates_df.groupby(["pmid", "table_id"])
6/61: list(coordinates_df.groupby(["pmid", "table_id"]))
6/62: coordinates_df.groupby(["pmid", "table_id"])[0]
6/63: coordinates_df.groupby(["pmid", "table_id"]).pmid
6/64: next(coordinates_df.groupby(["pmid", "table_id"]))
6/65: coordinates_df.groupby(["pmid", "table_id"]).get_group(grouped_data.groups.keys()[0])
6/66: coordinates_df.groupby(["pmid", "table_id"]).get_group(coordinates_df.groupby(["pmid", "table_id"]).groups.keys()[0])
6/67: coordinates_df.groupby(["pmid", "table_id"]).get_group(list(coordinates_df.groupby(["pmid", "table_id"]).groups.keys())[0])
6/68: coordinates_df.groupby(["pmid", "table_id"]).get_group(list(coordinates_df.groupby(["pmid", "table_id"]).groups.keys())[2])
6/69: coordinates_df.groupby(["pmid", "table_id"]).get_group(list(coordinates_df.groupby(["pmid", "table_id"]).groups.keys())[3])
6/70:
for coord_analysis in coordinates_df.groupby(["pmid", "table_id"]):
    existing_studies_dict[coord_analysis.pmid[0]]
6/71:
for (pmid, table_id, coord_df) in coordinates_df.groupby(["pmid", "table_id"]):
    existing_studies_dict[pmid]
6/72:
for ((pmid, table_id), coord_df) in coordinates_df.groupby(["pmid", "table_id"]):
    existing_studies_dict[pmid]
6/73:
for ((pmid, table_id), coord_df) in coordinates_df.groupby(["pmid", "table_id"]):
    if pmid not in existing_studies_dict:
        continue
    existing_studies_dict[pmid]
6/74:
for ((pmid, table_id), coord_df) in coordinates_df.groupby(["pmid", "table_id"]):
    if pmid not in existing_studies_dict:
        continue
    study = existing_studies_dict[pmid]
    analyses = study.analyses
    matching_analysis = next(filter(lambda a: a.table_id == table_id, analyses), None)
    if not matching_analysis:
        print('no matching analysis')
6/75:
for ((pmid, table_id), coord_df) in coordinates_df.groupby(["pmid", "table_id"]):
    if pmid not in existing_studies_dict:
        continue
    study = existing_studies_dict[pmid]
    if not study:
        continue
    analyses = study.analyses
    matching_analysis = next(filter(lambda a: a.table_id == table_id, analyses), None)
    if not matching_analysis:
        print('no matching analysis')
6/76: table_id
6/77: analyses
6/78: analyses[0].table_id
6/79:
for ((pmid, table_id), coord_df) in coordinates_df.groupby(["pmid", "table_id"]):
    if pmid not in existing_studies_dict:
        continue
    study = existing_studies_dict[pmid]
    if not study:
        continue
    analyses = study.analyses
    matching_analysis = next(filter(lambda a: a.table_id == str(table_id), analyses), None)
    if not matching_analysis:
        print('no matching analysis')
6/80:
for ((pmid, table_id), coord_df) in coordinates_df.groupby(["pmid", "table_id"]):
    if pmid not in existing_studies_dict:
        continue
    study = existing_studies_dict[pmid]
    if not study:
        continue
    analyses = study.analyses
    matching_analysis = next(filter(lambda a: a.table_id == str(table_id), analyses), None)
    if not matching_analysis:
        print(f'no matching analysis: {pmid}')
6/81: existing_studies_dict['34490458']
6/82: existing_studies_dict['34490458'].analyses
6/83: existing_studies_dict['34490458'].analyses[0].table_id
6/84:
for ((pmid, table_id), coord_df) in coordinates_df.groupby(["pmid", "table_id"]):
    if pmid not in existing_studies_dict:
        continue
    study = existing_studies_dict[pmid]
    if not study:
        continue
    analyses = study.analyses
    matching_analysis = next(filter(lambda a: a.table_id == str(table_id), analyses), None)
    if not matching_analysis:
        print(f'no matching analysis: {pmid} for {table_id}')
6/85: existing_studies_dict['34490458'].analyses[0].table_id
6/86:
for ((pmid, table_id), coord_df) in coordinates_df.groupby(["pmid", "table_id"]):
    if pmid not in existing_studies_dict:
        continue
    study = existing_studies_dict[pmid]
    if not study:
        continue
    analyses = study.analyses
    matching_analysis = next(filter(lambda a: a.table_id == str(table_id), analyses), None)
    if not matching_analysis:
        print(f'no matching analysis: {pmid} for {table_id}')
    db_point_set = {(p.x, p.y, p.z) for p in matching_analysis.points}
    table_point_set = set(coord_df[["x", "y", "z"]])
    assert db_point_set == table_point_set
6/87:
for ((pmid, table_id), coord_df) in coordinates_df.groupby(["pmid", "table_id"]):
    if pmid not in existing_studies_dict:
        continue
    study = existing_studies_dict[pmid]
    if not study:
        continue
    analyses = study.analyses
    matching_analysis = next(filter(lambda a: a.table_id == str(table_id), analyses), None)
    if not matching_analysis:
        print(f'no matching analysis: {pmid} for {table_id}')
    db_point_set = {(p.x, p.y, p.z) for p in matching_analysis.points}
    table_point_set = set(coord_df[["x", "y", "z"]])
    print(db_point_set == table_point_set)
6/88: table_point_set
6/89: db_point_set
6/90:
for ((pmid, table_id), coord_df) in coordinates_df.groupby(["pmid", "table_id"]):
    if pmid not in existing_studies_dict:
        continue
    study = existing_studies_dict[pmid]
    if not study:
        continue
    analyses = study.analyses
    matching_analysis = next(filter(lambda a: a.table_id == str(table_id), analyses), None)
    if not matching_analysis:
        print(f'no matching analysis: {pmid} for {table_id}')
    db_point_set = {(p.x, p.y, p.z) for p in matching_analysis.points}
    table_point_set = set(coord_df[["x", "y", "z"]])
    print(db_point_set == table_point_set)
6/91: db_point_set
6/92: table_point_set
6/93: coord_df
6/94: coord_df[["x", "y", "z"]]
6/95: coord_df[["x", "y", "z"]].values
6/96: set(coord_df[["x", "y", "z"]].values)
6/97: coord_df[["x", "y", "z"]].values
6/98: set(coord_df[["x", "y", "z"]].itertuples(index=False, name=None))
6/99:
for ((pmid, table_id), coord_df) in coordinates_df.groupby(["pmid", "table_id"]):
    if pmid not in existing_studies_dict:
        continue
    study = existing_studies_dict[pmid]
    if not study:
        continue
    analyses = study.analyses
    matching_analysis = next(filter(lambda a: a.table_id == str(table_id), analyses), None)
    if not matching_analysis:
        print(f'no matching analysis: {pmid} for {table_id}')
    db_point_set = {(p.x, p.y, p.z) for p in matching_analysis.points}
    table_point_set = set(coord_df[["x", "y", "z"]].itertuples(index=False, name=None))
    if db_point_set == table_point_set:
        print(f"extraction is different: {pmid} for {table_id}")
6/100:
for ((pmid, table_id), coord_df) in coordinates_df.groupby(["pmid", "table_id"]):
    if pmid not in existing_studies_dict:
        continue
    study = existing_studies_dict[pmid]
    if not study:
        continue
    analyses = study.analyses
    matching_analysis = next(filter(lambda a: a.table_id == str(table_id), analyses), None)
    if not matching_analysis:
        print(f'no matching analysis: {pmid} for {table_id}')
        continue
    db_point_set = {(p.x, p.y, p.z) for p in matching_analysis.points}
    table_point_set = set(coord_df[["x", "y", "z"]].itertuples(index=False, name=None))
    if db_point_set == table_point_set:
        print(f"extraction is different: {pmid} for {table_id}")
6/101: table_point_set
6/102: db_point_set
6/103: db_point_set == table_point_set
6/104:
# Compare every extracted coordinate table with the analysis stored for the
# same study. Each group is one (pmid, table_id) pair from the coordinates
# table; groups without a usable stored study are skipped, groups without a
# stored analysis for that table are reported, and the remaining groups have
# their (x, y, z) points compared as unordered sets.
for (pmid, table_id), coord_df in coordinates_df.groupby(["pmid", "table_id"]):
    # PMIDs absent from the lookup are ignored.
    try:
        study = existing_studies_dict[pmid]
    except KeyError:
        continue
    # The lookup may hold a falsy placeholder (e.g. None) for a PMID.
    if not study:
        continue
    analyses = study.analyses
    # Stored table ids are compared against the string form of the group key.
    matching_analysis = next(
        (a for a in analyses if a.table_id == str(table_id)),
        None,
    )
    if not matching_analysis:
        print(f'no matching analysis: {pmid} for {table_id}')
        continue
    db_point_set = {
        (p.x, p.y, p.z)
        for p in matching_analysis.points
    }
    # Plain tuples (no index, no namedtuple) so rows compare with the DB triples.
    table_point_set = set(
        coord_df[["x", "y", "z"]].itertuples(index=False, name=None)
    )
    if db_point_set == table_point_set:
        continue
    print(f"extraction is different: {pmid} for {table_id}")