Commit

improve ingestion of studies
jdkent committed Jun 15, 2024
1 parent 7df1e11 commit a31c373
Showing 6 changed files with 959 additions and 305 deletions.
257 changes: 124 additions & 133 deletions store/neurostore/data/ace/sample_coordinates.csv

Large diffs are not rendered by default.

17 changes: 6 additions & 11 deletions store/neurostore/data/ace/sample_metadata.csv
@@ -1,11 +1,6 @@
doi,title,journal,coordinate_space,publication_year,authors,pmid
,Temporal cortex activation during speech recognition: an optical topography study.,Cognition,TAL,1999,K L Sakai T Takeuchi H Sato,10585521
,The amygdala theory of autism.,Neuroscience and biobehavioral reviews,TAL,2000,S C Williams C Ashwin S Wheelwright E T Bullmore H A Ring S Baron-Cohen,10781695
,Recognizing one's own face.,Cognition,TAL,2001,"T T Kircher, C Senior, M L Phillips, S Rabe-Hesketh, P J Benson, E T Bullmore, M Brammer, A Simmons, M Bartels, A S David",11062324
,Functional imaging of brain responses to pain. A review and meta-analysis (2000).,Neurophysiologie clinique = Clinical neurophysiology,UNKNOWN,2000,"R Peyron, B Laurent, L García-Larrea",11126640
,[EEG and ischemic stroke in full-term newborns].,Neurophysiologie clinique = Clinical neurophysiology,UNKNOWN,2003,"D Selton, M André, J M Hascoët",12909390
,Differential involvement of left prefrontal cortex in inductive and deductive reasoning.,Cognition,MNI,2004,"Vinod Goel, Raymond J Dolan",15178381
,A meta-analysis of structural brain abnormalities in PTSD.,Neuroscience and biobehavioral reviews,MNI,2006,"Anke Karl, Michael Schaefer, Loretta S Malta, Denise Dörfel, Nicolas Rohleder, Annett Werner",16730374
10.1016/j.biopsych.2008.04.033,Neural responses to sad facial expressions in major depression following cognitive behavioral therapy.,Biological psychiatry,TAL,2008,"Cynthia H Y Fu, Steven C R Williams, Anthony J Cleare, Jan Scott, Martina T Mitterschiffthaler, Nicholas D Walsh, Catherine Donaldson, John Suckling, Chris Andrew, Herbert Steiner, Robin M Murray",18550030
10.1016/j.biopsych.2008.04.032,Regional gray matter changes are associated with cognitive deficits in remitted geriatric depression: an optimized voxel-based morphometry study.,Biological psychiatry,MNI,2008,"Yonggui Yuan, Wanlin Zhu, Zhijun Zhang, Feng Bai, Hui Yu, Yongmei Shi, Yun Qian, Wen Liu, Tianzi Jiang, Jiayong You, Zhening Liu",18550031
10.1093/cercor/bhn100,Sex differences and the impact of steroid hormones on the developing human brain.,"Cerebral cortex (New York, N.Y. : 1991)",MNI,2008,"Susanne Neufang, Karsten Specht, Markus Hausmann, Onur Güntürkün, Beate Herpertz-Dahlmann, Gereon R Fink, Kerstin Konrad",18550597
pmcid,pmid,doi,title,journal,publication_year,license,authors,coordinate_space,source
9001100.0,35419051,10.1155/2022/8068988,BMRMI Reduces Depressive Rumination Possibly through Improving Abnormal FC of Dorsal ACC,Neural Plast,2022.0,https://creativecommons.org/licenses/by/4.0/,"Yang, Ming-Hao; Guo, Zhi-Peng; Lv, Xue-Yu; Zhang, Zhu-Qing; Wang, Wei-Dong; Wang, Jian; Hong, Lan; Lin, Ying-Na; Liu, Chun-Hong",MNI,pubget
5789340.0,29300304,10.3390/brainsci8010009,Behavioral and Brain Activity Indices of Cognitive Control Deficits in Binge Drinkers,Brain Sci,2018.0,http://creativecommons.org/licenses/by/4.0/,"Molnar, Sean M.; Beaton, Lauren E.; Happer, Joseph P.; Holcomb, Lee A.; Huang, Siyuan; Arienzo, Donatello; Marinkovic, Ksenija",TAL,pubget
3242169.0,22194728,10.3389/fpsyt.2011.00068,Behavioral Risk Elicits Selective Activation of the Executive System in Adolescents: Clinical Implications,Front Psychiatry,2011.0,http://www.frontiersin.org/licenseagreement,"Yaxley, Richard H.; Van Voorhees, Elizabeth E.; Bergman, Sara; Hooper, Stephen R.; Huettel, Scott A.; De Bellis, Michael D.",MNI,pubget
7821103.0,33064887,10.1002/ejp.1680,Polymorphisms of the μ‐opioid receptor gene influence cerebral pain processing in fibromyalgia,Eur J Pain,2020.0,http://creativecommons.org/licenses/by/4.0/,"Ellerbrock, Isabel; Sandström, Angelica; Tour, Jeanette; Kadetoff, Diana; Schalling, Martin; Jensen, Karin B.; Kosek, Eva",MNI,pubget
3960334.0,24397999,10.1016/j.dcn.2013.12.003,Girls’ challenging social experiences in early adolescence predict neural response to rewards and depressive symptoms,Dev Cogn Neurosci,2013.0,http://creativecommons.org/licenses/by-nc-nd/3.0/,"Casement, Melynda D.; Guyer, Amanda E.; Hipwell, Alison E.; McAloon, Rose L.; Hoffmann, Amy M.; Keenan, Kathryn E.; Forbes, Erika E.",UNKNOWN,pubget
717 changes: 706 additions & 11 deletions store/neurostore/data/ace/sample_text.csv

Large diffs are not rendered by default.

268 changes: 120 additions & 148 deletions store/neurostore/ingest/__init__.py
@@ -29,6 +29,8 @@
)
from neurostore.models.data import StudysetStudy, _check_type

META_ANALYSIS_WORDS = ['meta analysis', 'meta-analysis', 'systematic review']


def ingest_neurovault(verbose=False, limit=20, overwrite=False, max_images=None):
# Store existing studies for quick lookup
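The new META_ANALYSIS_WORDS list is consumed further down in this diff, where ace_ingestion_logic checks study titles to decide between a 'meta' and a 'group' level. A minimal sketch of that check (classify_level is a hypothetical helper name, not part of the commit):

META_ANALYSIS_WORDS = ['meta analysis', 'meta-analysis', 'systematic review']

def classify_level(title):
    # hypothetical helper; mirrors the inline check used in ace_ingestion_logic below
    return 'meta' if any(word in title.lower() for word in META_ANALYSIS_WORDS) else 'group'

# classify_level("A meta-analysis of structural brain abnormalities in PTSD.") -> 'meta'
# classify_level("Recognizing one's own face.") -> 'group'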
@@ -125,6 +127,7 @@ def add_collection(data):
)
images.append(image)

base_study.update_has_images_and_points()
db.session.add_all(
[base_study] + [s] + list(analyses.values()) + images + list(conditions)
)
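update_has_images_and_points is now called before the collection is added to the session; its body is not part of this diff, so the following is only a hypothetical sketch of what such a method on BaseStudy might recompute (the has_images/has_coordinates attribute names are assumptions):

def update_has_images_and_points(self):
    # hypothetical sketch; the real method lives elsewhere in the models
    self.has_images = any(
        bool(analysis.images) for version in self.versions for analysis in version.analyses
    )
    self.has_coordinates = any(
        bool(analysis.points) for version in self.versions for analysis in version.analyses
    )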
@@ -232,7 +235,7 @@ def ingest_neurosynth(max_rows=None):
columns = [
c
for c in source_base_study.__table__.columns
if c != "versions"
if c not in ("versions", "__ts_vector__")
]
for ab in base_studies[1:]:
for col in columns:
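The filter above now also excludes the generated __ts_vector__ column (renamed on the model further down in this commit), keeping the duplicate-merge loop away from computed columns. A short sketch of the fill-only-missing merge pattern this loop implements, written as a standalone helper for clarity (merge_into is an illustrative name):

def merge_into(keep, discard):
    # copy a value from the duplicate only where the kept record has none,
    # skipping the versions relationship and the generated search column
    mergeable = [
        c.name
        for c in keep.__table__.columns
        if c.name not in ("versions", "__ts_vector__")
    ]
    for col in mergeable:
        setattr(keep, col, getattr(keep, col) or getattr(discard, col))
    keep.versions.extend(discard.versions)
    db.session.delete(discard)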
@@ -360,9 +363,10 @@ def ingest_neurosynth(max_rows=None):
for note in notes:
to_commit.append(note.analysis)
db.session.add_all([annot] + notes + to_commit + [d])
db.session.flush()
db.session.commit()
for bs in base_studies:
bs.update_has_images_and_points()
db.session.add_all(base_studies)
db.session.commit()


@@ -442,17 +446,28 @@ def ingest_neuroquery(max_rows=None):
studies=Study.query.filter_by(source="neuroquery").all(),
)
db.session.add(d)
db.session.flush()
db.session.commit()
for bs in base_studies:
bs.update_has_images_and_points()
db.session.add_all(base_studies)
db.session.commit()


def load_ace_files(coordinates_file, metadata_file, text_file):
coordinates_df = pd.read_table(coordinates_file, sep=",", dtype={"pmid": str})
metadata_df = pd.read_table(metadata_file, sep=",", dtype={"pmid": str})
text_df = pd.read_table(text_file, sep=",", dtype={"pmid": str})
coordinates_df = pd.read_table(coordinates_file, sep=",", dtype=str)
metadata_df = pd.read_table(metadata_file, sep=",", dtype=str)
text_df = pd.read_table(text_file, sep=",", dtype=str)

for col in ['x', 'y', 'z']:
if col in coordinates_df.columns:
coordinates_df[col] = pd.to_numeric(coordinates_df[col], errors='coerce')

text_df.fillna("", inplace=True)
metadata_df.fillna("", inplace=True)
coordinates_df.fillna("", inplace=True)

for df in [coordinates_df, metadata_df, text_df]:
df.pmid = df.pmid.str.split(".").str[0]
# preprocessing
metadata_df.set_index("pmid", inplace=True)
text_df.set_index("pmid", inplace=True)
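load_ace_files now reads every column as a string, coerces only x/y/z back to numbers, and strips the trailing ".0" that earlier float-typed exports left on identifier columns (the new sample_metadata.csv above carries pmcid values like 9001100.0). A small self-contained illustration of what that buys, using an in-memory CSV instead of the real sample files:

import io
import pandas as pd

csv = io.StringIO("pmid,x,y,z\n35419051.0,-12,4,n/a\n")
df = pd.read_table(csv, sep=",", dtype=str)

for col in ["x", "y", "z"]:
    # unparseable cells become NaN instead of raising
    df[col] = pd.to_numeric(df[col], errors="coerce")

# identifiers stay strings; drop the float artifact
df["pmid"] = df["pmid"].str.split(".").str[0]

print(df.iloc[0].to_dict())  # {'pmid': '35419051', 'x': -12.0, 'y': 4.0, 'z': nan}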
@@ -464,173 +479,130 @@ def load_ace_files(coordinates_file, metadata_file, text_file):
return coordinates_df, metadata_df, text_df


def ace_ingestion_logic(coordinates_df, metadata_df, text_df):
to_commit = []
# see if there are duplicates for the newly created base_studies
all_base_studies = []
with db.session.no_autoflush:
all_studies = {
s.pmid: s for s in Study.query.filter_by(source="neurosynth").all()
def ace_ingestion_logic(coordinates_df, metadata_df, text_df, skip_existing=False):
def get_base_study(metadata_row):
doi = None if isinstance(metadata_row.doi, float) or metadata_row.doi == '' else metadata_row.doi
pmid = metadata_row.Index
base_studies = BaseStudy.query.filter(or_(BaseStudy.doi == doi, BaseStudy.pmid == pmid)).all()

if len(base_studies) == 1:
return base_studies[0]
elif len(base_studies) > 1:
return merge_base_studies(base_studies)
else:
created_bs = [bs for bs in all_base_studies if bs.doi == doi and bs.pmid == pmid]
if created_bs:
return created_bs[0]
return BaseStudy.query.filter_by(pmid=pmid).one_or_none()

def merge_base_studies(base_studies):
source_base_study = next(filter(lambda bs: bs.pmid == pmid and bs.doi == doi, base_studies), base_studies[0])
other_base_studies = [bs for bs in base_studies if bs.id != source_base_study.id]
columns = [c.name for c in source_base_study.__table__.columns if c.name not in ("versions", "__ts_vector__")]
for ab in other_base_studies:
for col in columns:
source_attr = getattr(source_base_study, col)
new_attr = getattr(ab, col)
setattr(source_base_study, col, source_attr or new_attr)
source_base_study.versions.extend(ab.versions)
db.session.delete(ab)
return source_base_study

def update_study_info(study, metadata_row, text_row, doi, pmcid, year, level):
study_info = {
"name": metadata_row.title,
"doi": doi,
"pmid": metadata_row.Index,
"pmcid": pmcid,
"description": text_row.abstract,
"authors": metadata_row.authors,
"publication": metadata_row.journal,
"year": year,
"level": level,
}
for metadata_row, text_row in zip(
metadata_df.itertuples(), text_df.itertuples()
):
base_study = None
doi = None if isinstance(metadata_row.doi, float) else metadata_row.doi
id_ = pmid = metadata_row.Index
year = (
None
if np.isnan(metadata_row.publication_year)
else int(metadata_row.publication_year)
)
# find an base_study based on available information
if doi is not None:
base_studies = BaseStudy.query.filter(
or_(BaseStudy.doi == doi, BaseStudy.pmid == pmid)
).all()
if isinstance(study, Study):
study_info["source"] = "neurosynth" if "ace" in metadata_row.source else "pubget",
for col, value in study_info.items():
source_attr = getattr(study, col)
setattr(study, col, source_attr or value)

if len(base_studies) == 1:
base_study = base_studies[0]
elif len(base_studies) > 1:
# find the first abstract study with both pmid and doi
source_base_study = next(
filter(
lambda bs: bs.pmid == pmid and bs.doi == doi, base_studies
),
base_studies[0],
)
other_base_studies = [
bs for bs in base_studies if bs.id != source_base_study.id
]
# do not overwrite the versions column
# we want to append to this column
columns = [
c.name
for c in source_base_study.__table__.columns
if c != "versions"
]
for ab in other_base_studies:
for col in columns:
source_attr = getattr(source_base_study, col)
new_attr = getattr(ab, col)
setattr(source_base_study, col, source_attr or new_attr)
source_base_study.versions.extend(ab.versions)
# delete the extraneous record
db.session.delete(ab)
def process_coordinates(id_, s, metadata_row):
analyses = []
points = []
try:
study_coord_data = coordinates_df.loc[[id_]]
except KeyError:
print(f"pmid: {id_} has no coordinates")
return analyses, points
for order, (t_id, df) in enumerate(study_coord_data.groupby("table_id")):
a = Analysis.query.filter_by(table_id=str(t_id), study_id=s.id).one_or_none() or Analysis()
a.name = df["table_label"][0] or str(t_id)
a.table_id = str(t_id)
a.order = a.order or order
a.description = df["table_caption"][0] if not df["table_caption"].isna()[0] else None
if not a.study:
a.study = s
analyses.append(a)
point_idx = 0
for _, p in df.iterrows():
point = Point(
x=p["x"], y=p["y"], z=p["z"],
space=metadata_row.coordinate_space,
kind=df["statistic"][0] if not df["statistic"].isna()[0] else "unknown",
analysis=a,
order=point_idx
)
points.append(point)
point_idx += 1
return analyses, points

base_study = source_base_study
else:
# see if it exists in the already created base_studies
created_bs = [
bs
for bs in all_base_studies
if bs.doi == doi and bs.pmid == pmid
]
if created_bs:
base_study = created_bs[0]
to_commit = []
all_base_studies = []

if doi is None:
base_study = BaseStudy.query.filter_by(pmid=pmid).one_or_none()
with db.session.no_autoflush:
all_studies = {s.pmid: s for s in Study.query.filter_by(source="neurosynth").all()}
for metadata_row, text_row in zip(metadata_df.itertuples(), text_df.itertuples()):
level = 'meta' if any(word in metadata_row.title.lower() for word in META_ANALYSIS_WORDS) else 'group'
base_study = get_base_study(metadata_row)
pmid = metadata_row.Index
pmcid = None if isinstance(metadata_row.pmcid, float) or metadata_row.pmcid == '' else metadata_row.pmcid
doi = None if isinstance(metadata_row.doi, float) or metadata_row.doi == '' else metadata_row.doi
year = None if isinstance(metadata_row.publication_year, float) or metadata_row.publication_year == '' else int(float(metadata_row.publication_year))

if skip_existing and base_study is not None and any(s.source == "neurosynth" for s in base_study.versions):
continue

if base_study is None:

base_study = BaseStudy(
name=metadata_row.title,
doi=doi,
pmid=pmid,
authors=metadata_row.authors,
publication=metadata_row.journal,
description=text_row.abstract,
pmcid=pmcid,
authors=metadata_row.authors or None,
publication=metadata_row.journal or None,
description=text_row.abstract or None,
year=year,
level="group",
level=level,
)
else:
# try to update the abstract study if information is missing
study_info = {
"name": metadata_row.title,
"doi": doi,
"pmid": pmid,
"description": text_row.abstract,
"authors": metadata_row.authors,
"publication": metadata_row.journal,
"year": year,
"level": "group",
}
for col, value in study_info.items():
source_attr = getattr(base_study, col)
setattr(base_study, col, source_attr or value)
update_study_info(base_study, metadata_row, text_row, doi, pmcid, year, level)

# append base study to commit
to_commit.append(base_study)

s = all_studies.get(pmid, Study())
update_study_info(s, metadata_row, text_row, doi, pmcid, year, level)

# try to update the study if information is missing
study_info = {
"name": metadata_row.title,
"doi": doi,
"pmid": pmid,
"description": text_row.abstract,
"authors": metadata_row.authors,
"publication": metadata_row.journal,
"year": year,
"level": "group",
"source": "neurosynth",
}
for col, value in study_info.items():
source_attr = getattr(s, col)
setattr(s, col, source_attr or value)

analyses = []
points = []

try:
study_coord_data = coordinates_df.loc[[id_]]
except KeyError:
print(f"pmid: {id_} has no coordinates")
continue
for order, (t_id, df) in enumerate(study_coord_data.groupby("table_id")):
a = (
Analysis.query.filter_by(table_id=str(t_id)).one_or_none()
or Analysis()
)
a.name = df["table_label"][0] or str(t_id)
a.table_id = str(t_id)
a.order = a.order or order
a.description = (
df["table_caption"][0]
if not df["table_caption"].isna()[0]
else None
)
if not a.study:
a.study = s
analyses.append(a)
point_idx = 0
for _, p in df.iterrows():
point = Point(
x=p["x"],
y=p["y"],
z=p["z"],
space=metadata_row.coordinate_space,
kind=(
df["statistic"][0]
if not df["statistic"].isna()[0]
else "unknown"
),
analysis=a,
entities=[Entity(label=a.name, level="group", analysis=a)],
order=point_idx,
)
points.append(point)
point_idx += 1
analyses, points = process_coordinates(pmid, s, metadata_row)
to_commit.extend(points)
to_commit.extend(analyses)
# append study as version of study
base_study.versions.append(s)

db.session.add_all(to_commit)
db.session.flush()
db.session.commit()
for bs in all_base_studies:
bs.update_has_images_and_points()
db.session.add_all(all_base_studies)
db.session.commit()
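For orientation, a hedged sketch of how the two functions above would typically be chained; the ingest_ace wrapper name is illustrative and not part of this commit, while the sample file paths come from the files changed above:

def ingest_ace(coordinates_file, metadata_file, text_file, skip_existing=False):
    # illustrative wrapper around the two functions shown in this diff
    coordinates_df, metadata_df, text_df = load_ace_files(
        coordinates_file, metadata_file, text_file
    )
    ace_ingestion_logic(coordinates_df, metadata_df, text_df, skip_existing=skip_existing)

# ingest_ace(
#     "store/neurostore/data/ace/sample_coordinates.csv",
#     "store/neurostore/data/ace/sample_metadata.csv",
#     "store/neurostore/data/ace/sample_text.csv",
# )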


3 changes: 2 additions & 1 deletion store/neurostore/models/data.py
@@ -82,7 +82,8 @@ class Studyset(BaseMixin, db.Model):
passive_deletes=True,
cascade="all, delete-orphan",
)
__ts_vector__ = db.Column(
_ts_vector = db.Column(
"__ts_vector__",
TSVector(),
db.Computed(
"to_tsvector('english', coalesce(name, '') || ' ' || coalesce(description, ''))",
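The rename above keeps the database column named __ts_vector__ while exposing it on the model as _ts_vector, which fits with the ingest changes earlier in this commit that skip that column by name when merging duplicate records. A standalone toy model sketching the same SQLAlchemy pattern (Doc and __search__ are illustrative, not from the repository):

import sqlalchemy as sa
from sqlalchemy.orm import declarative_base

Base = declarative_base()

class Doc(Base):
    __tablename__ = "doc"
    id = sa.Column(sa.Integer, primary_key=True)
    name = sa.Column(sa.Text)
    # Python attribute _search, database column "__search__"
    _search = sa.Column("__search__", sa.Text)

# skip-by-name filtering, as done in the ingest code above
mergeable = [c.name for c in Doc.__table__.columns if c.name not in ("__search__",)]
print(mergeable)  # ['id', 'name']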
2 changes: 1 addition & 1 deletion store/neurostore/openapi
