In [1]:
import json
import glob
import random
import jsonlines
import pandas as pd

# Create few shot subsets

In [None]:
corpus = "depression-rct"
labels_df = pd.read_csv(f"{corpus}/labels.csv", index_col=0)

In [None]:
# find 5-shot with at least one example of each label
random.seed(42)
pmids = set(labels_df.index)
remaining = {c:set([0,1]) for c in labels_df.columns}
few_shot_pmids = []
while len(few_shot_pmids) < 5 :
    chosen_pmid = None
    if not few_shot_pmids :
        chosen_pmid = pmids.pop()
    else:
        best_score = 0
        for pmid in pmids :
            score = 0
            pmid_labels = labels_df.loc[pmid].to_dict()
            for k in remaining :
                if pmid_labels[k] in remaining[k]:
                    score += 1
            if score >= best_score:
                chosen_pmid = pmid
                best_score = score
    if chosen_pmid in pmids : pmids.remove(chosen_pmid)
    chosen_pmid_labels = labels_df.loc[chosen_pmid].to_dict()
    for k in remaining :
        if chosen_pmid_labels[k] in remaining[k]:
            remaining[k].remove(chosen_pmid_labels[k])
    few_shot_pmids.append(chosen_pmid)

In [None]:
pd.set_option('display.max_columns', None)
train = labels_df.loc[few_shot_pmids]
test = labels_df.drop(index=few_shot_pmids)
train

In [None]:
train.to_csv(f"{corpus}/train_labels.csv")
test.to_csv(f"{corpus}/test_labels.csv")

# Get full instances of each abstract for test and train

In [5]:
corpus="depression-rct"
sentences_df = pd.read_csv(f"{corpus}/sentences.csv")
train_df = pd.read_csv(f"{corpus}/train_labels.csv", index_col="fid")


Unnamed: 0,pmid,secid,sentid,section,sentence_text,pred_consort_relevant
0,10+1007_s10560-018-0584-y,0,0,Title,"Nonviolence Communication to Reduce Stress, An...",[]
1,10+1007_s10560-018-0584-y,1,1,,Stress is one of the important dimensions of s...,[]
2,10+1007_s10560-018-0584-y,1,2,,"In the meantime, adolescents and especially gi...",[]
3,10+1007_s10560-018-0584-y,1,3,,The purpose of this study was to evaluate the ...,[]
4,10+1007_s10560-018-0584-y,1,4,,The target community in this research was 50 p...,['D03b']
...,...,...,...,...,...,...
1040,32847217,1,1,Background,This study evaluated the efficacy of a univers...,[]
1041,32847217,2,2,Method,160 adolescents were randomly assigned to the ...,[]
1042,32847217,3,3,Results,Decreased depressive symptoms for the interven...,[]
1043,32847217,3,4,Results,Significant changes in self-esteem and coping ...,[]


In [8]:
train_df.columns

Index(['D01', 'D02', 'D03a', 'D03b', 'D04a', 'D04b', 'D05', 'D06a', 'D06b',
       'D07', 'D08a', 'D08b', 'D08c', 'D09', 'D10', 'D11', 'D12', 'D13a',
       'D13b', 'D14a', 'D14b', 'D15'],
      dtype='object')

In [13]:
unique_fids = sentences_df["pmid"].unique()
for fid in unique_fids :
    pmid_df = sentences_df[sentences_df["pmid"]==fid]
     # fill prediction context with filtered (or not) abstract
    cons_df = pmid_df.fillna("").groupby('section').agg({
        "sentence_text":lambda x: ' '.join(x),
        "secid":"max",
    }).sort_values(by="secid").reset_index() # must keep secid to keep order of sections , else groupby will mess the order
    cons_df["section"] = cons_df["section"].apply(lambda x: x + " : " if x else x)
    cons_df = cons_df["section"] + cons_df["sentence_text"]
    abstract = '\n'.join(cons_df)
    break
print(fid)
print(abstract)

10+1007_s10560-018-0584-y
Title : Nonviolence Communication to Reduce Stress, Anxiety and Depression in Young Iranian Women: A Randomized Experiment
Stress is one of the important dimensions of social communication that directly and indirectly affect anxiety management or depression. In the meantime, adolescents and especially girls are more likely to be harmed in poor and vulnerable communities. The purpose of this study was to evaluate the effect of group training with non-violence communication (NVC) method on stress, anxiety and depression of young girls. The target community in this research was 50 participants that were randomly selected from the charity centers of Tehran suburbs. Participants were randomly divided and placed equally into the experimental and control groups. The study entry criteria were students aged from 11 to 18 years, the vulnerable socio-economic income and the stress cut off point as 12. Eight training sessions of NVC were presented for the experimental gro

In [None]:
# fill prediction context with filtered (or not) abstract
cons_df = pmid_df.fillna("").groupby('section').agg({
    "sentence_text":lambda x: ' '.join(x),
    "secid":"max",
}).sort_values(by="secid").reset_index() # must keep secid to keep order of sections , else groupby will mess the order
cons_df["section"] = cons_df["section"].apply(lambda x: x + " : " if x else x)
cons_df = cons_df["section"] + cons_df["sentence_text"]
abstract = '\n'.join(cons_df)

# Parse few shot subsets explanations (TRAIN SET)

In [73]:
corpus = "depression-rct"
sentences_df = pd.read_csv(f"{corpus}/sentences.csv")
labels_df = pd.read_csv(f"{corpus}/train_labels.csv",index_col="fid")
questions_df = pd.read_csv(f"{corpus}/questions_and_examples.csv",index_col="id")
if "depression" in corpus :
    original_labels_df =  pd.read_csv(f"{corpus}/original_labels.csv",index_col="fid")
train_corpus = {}
for txt_path in glob.glob(f"../inference_vllm/out/{corpus}_expl-gen_*/txt/*.txt"):
    # parse path
    corpus = txt_path.split('/')[-3].split('_')[0]
    fid, cid = txt_path.split('/')[-1].split('.')[0].rsplit('_',1)
    if corpus == "covid" :
        fid = int(fid)
    # read text
    text_lines = open(txt_path).readlines()
    expl = ""
    for l in text_lines :
        if l.startswith("Explanation : "): 
            expl = l.replace("Explanation : ","").strip()
            expl = expl.replace("<|eot_id|><|start_header_id|>assistant<|end_header_id|>","")
        # we do not break because we want to retrieve last explanation only because the first is the one from the example
    # retrieve abstract
    pmid_df = sentences_df[sentences_df["pmid"]==fid]
    cons_df = pmid_df.fillna("").groupby('section').agg({
        "sentence_text":lambda x: ' '.join(x),
        "secid":"max",
    }).sort_values(by="secid").reset_index() # must keep secid to keep order of sections , else groupby will mess the order
    cons_df["section"] = cons_df["section"].apply(lambda x: x + " : " if x else x)
    cons_df = cons_df["section"] + cons_df["sentence_text"]
    abstract = '\n'.join(cons_df)
    # retrieve answer
    answer_id = labels_df.loc[fid][cid]
    if answer_id == 0 :
        answer_text = "No"
    elif answer_id == 1 :
        answer_text = "Yes"
    # retrieve original answer
    if "depression" in corpus :
        original_answer = original_labels_df.loc[fid]["con."+cid[1:]+".rct"]
    # retrieve question
    question = questions_df.loc[cid]["question"]
    # add to dict
    if fid not in train_corpus:
        train_corpus[fid] = {"context":abstract, "question":{cid:question}, "explanation": {cid:expl}, "answer":{cid:answer_text}}
        if "depression" in corpus :
            train_corpus[fid]["original_answer"] = {cid:original_answer}
            train_corpus[fid]["trial_design"] = "rct"
        assert train_corpus[fid]["context"]
    else :
        train_corpus[fid]["explanation"][cid] = expl
        train_corpus[fid]["question"][cid] = question
        train_corpus[fid]["answer"][cid] = answer_text
        if "depression" in corpus :
            train_corpus[fid]["original_answer"][cid] = original_answer

In [74]:
train_corpus = [ {"fid":fid}|example for fid, example in train_corpus.items()]

In [75]:
train_corpus

[{'fid': '30533601',
  'context': 'Title : Effect of Internet-based Cognitive Behavioral Humanistic and Interpersonal Training vs. Internet-based General Health Education on Adolescent Depression in Primary Care: A Randomized Clinical Trial.\nImportance : Although 13-20% of American adolescents experience a depressive episode annually, no scalable primary care model for adolescent depression prevention is currently available.\nObjective : To study whether CATCH-IT (Competent Adulthood Transition with Cognitive Behavioral Humanistic and Interpersonal Training) reduces the hazard for depression in at-risk adolescents identified in primary care, as compared to a general health education attention control (HE).\nDesign : The Promoting AdolescenT Health (PATH) study compares CATCH-IT and HE in a phase 3 single-blind multicenter randomized attention control trial. Participants were enrolled from 2012 to 2016 and assessed at 2, 6, 12, 18, and 24 months post-randomization.\nSetting : Primary c

In [76]:
with jsonlines.open(f'{corpus}_train.jsonl', mode='w') as writer:
    writer.write_all(train_corpus)

# Reformat test data in one file

In [97]:
test_corpus = []
corpus = "covid"
sentences_df = pd.read_csv(f"{corpus}/sentences.csv")
labels_df = pd.read_csv(f"{corpus}/test_labels.csv",index_col="fid")
questions_dict = pd.read_csv(f"{corpus}/questions_and_examples.csv",index_col="id")["question"].to_dict()
#original_labels_df = pd.read_csv(f"{corpus}/original_labels.csv",index_col="fid")
#id_conv = {old:now for old,now in zip(original_labels_df.columns,labels_df.columns)}
#original_labels_df = original_labels_df.rename(columns=id_conv)
for fid in labels_df.index :
    # retrieve abstract
    pmid_df = sentences_df[sentences_df["pmid"]==fid]
    cons_df = pmid_df.fillna("").groupby('section').agg({
        "sentence_text":lambda x: ' '.join(x),
        "secid":"max",
    }).sort_values(by="secid").reset_index() # must keep secid to keep order of sections , else groupby will mess the order
    cons_df["section"] = cons_df["section"].apply(lambda x: x + " : " if x else x)
    cons_df = cons_df["section"] + cons_df["sentence_text"]
    abstract = '\n'.join(cons_df)
    # parse consort specific
    answer_dict = labels_df.replace(1,"Yes").replace(0,"No").loc[fid].to_dict()
    #original_answers_dict = original_labels_df.loc[fid].to_dict()
    original_answers_dict = answer_dict
    full_dict =  {
        "id":str(fid).replace('+','.').replace("_","/"),
        "corpus":corpus.split('-')[0],
        "trial_design":"rct",
        "context":abstract,
        "questions":questions_dict,
        "explanations":None,
        "answers":answer_dict,
        "original_answers":original_answers_dict,
    }
    test_corpus.append(full_dict)


In [100]:
len(test_corpus)

35

In [101]:
with jsonlines.open(f'{corpus}_test.jsonl', mode='w') as writer:
    writer.write_all(test_corpus)

In [74]:
full_train_set = []
for f in glob.glob("./*_train.jsonl"):
    with jsonlines.open(f) as reader:
        for obj in reader:
            if "disease" in obj :
                obj["corpus"] = obj.pop("disease")
            if "trial_design" not in obj :
                obj["trial_design"] = "rct"
            if "original_answer" not in obj:
                obj["original_answer"] = obj["answer"]
            if "corpus" not in obj :
                obj["corpus"] = "depression"
            obj["questions"] = obj.pop("question")
            obj["explanations"] = obj.pop("explanation")
            obj["answers"] = obj.pop("answer")
            obj["original_answers"] = obj.pop("original_answer")
            if type(obj["fid"]) == str:
                obj["fid"] = obj["fid"].replace('+','.').replace("_","/")
            obj["id"] = str(obj.pop("fid"))
            order = ['id','corpus','trial_design','context',  'questions', 'explanations', 'answers', 'original_answers']
            reordered_dict = {k: obj[k] for k in order}
            full_train_set.append(reordered_dict)

In [75]:
full_train_set

[{'id': '32425006',
  'corpus': 'covid',
  'trial_design': 'rct',
  'context': 'Title : Efficacy of internet-based integrated intervention on depression and anxiety symptoms in patients with COVID-19.\n: Public health crises, such as the outbreak of severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) since Dec. 2019, are widely acknowledged as severe traumatic events that impose threats not only because of physical concerns but also because of the psychological distress of infected patients. We designed an internet-based integrated intervention and evaluated its efficacy on depression and anxiety symptoms in patients infected by SARS-CoV-2.',
  'questions': {'C02': 'Is there a structured summary of the trial design (eg, parallel, crossover, cluster, non-inferiority) ?',
   'C14a': 'Are the benefits and harms balanced in the conclusion ?',
   'C08a': 'Is there only a brief description of blinding (eg, single-blind, double-blind, triple-blind) ?',
   'C03a': 'Are the eligibility

In [76]:
with jsonlines.open('train.jsonl', mode='w') as writer:
    writer.write_all(full_train_set)

In [104]:
full_test_set = []
for f in glob.glob("./*_test.jsonl"):
    with jsonlines.open(f) as reader:
        for obj in reader:
            full_test_set.append(obj)

In [105]:
len(full_test_set)

192

In [106]:
full_test_set

[{'id': '10.1007/s10560-018-0584-y',
  'corpus': 'depression',
  'trial_design': 'rct',
  'context': 'Title : Nonviolence Communication to Reduce Stress, Anxiety and Depression in Young Iranian Women: A Randomized Experiment\nStress is one of the important dimensions of social communication that directly and indirectly affect anxiety management or depression. In the meantime, adolescents and especially girls are more likely to be harmed in poor and vulnerable communities. The purpose of this study was to evaluate the effect of group training with non-violence communication (NVC) method on stress, anxiety and depression of young girls. The target community in this research was 50 participants that were randomly selected from the charity centers of Tehran suburbs. Participants were randomly divided and placed equally into the experimental and control groups. The study entry criteria were students aged from 11 to 18 years, the vulnerable socio-economic income and the stress cut off point 

In [107]:
with jsonlines.open('test.jsonl', mode='w') as writer:
    writer.write_all(full_test_set)

# Reformat consort-abstract original  examples and sentences

In [3]:
rct_lab = pd.read_csv("depression-rct/labels.csv")
crt_lab = pd.read_csv("depression-crt/rct_labels.csv")

In [9]:
pd.concat([rct_lab,crt_lab]).set_index("fid").sort_index().to_csv("labels_depression.csv")

In [2]:
cov_q = pd.read_csv("covid/questions_and_examples.csv")
dep_q = pd.read_csv("depression-rct/questions_and_examples.csv")

In [19]:
text2id = { t:i for i,t in enumerate(pd.concat([cov_q["context"],dep_q["context"]]).unique())}
id2text = {v:k for k,v in text2id.items()}

In [26]:
qdf = pd.concat([cov_q,dep_q])
qdf["fid"]= qdf["context"].apply(lambda x:text2id[x])
qdf = qdf.set_index("id")
qdf.head()

Unnamed: 0_level_0,question,context,cot_explanation,annot_multi_hop,answer,kappa_unweighted,fid
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
C01,Is the study identified as randomized in the t...,Title : Effectiveness of a strategy to improve...,The title of this abstract ends with 'a cluste...,False,Yes,,0
C02,Is there a structured summary of the trial des...,Title : Effectiveness of a strategy to improve...,"In the Design, setting and patients section, '...",False,Yes,,0
C03a,Are the eligibility criteria for participants ...,Title : Effectiveness of a strategy to improve...,"In the Design, setting and patients section, t...",True,Yes,,0
C03b,Are the settings or locations where the data w...,Title : Effectiveness of a strategy to improve...,"In the Design, setting and patients section, t...",False,Yes,,0
C04,Are the interventions sufficiently detailed fo...,Title : Corticosteroids for the prevention of ...,"In the Intervention section, the experimental ...",True,Yes,,1


In [29]:
train_corpus = []

for fid,text in id2text.items() :
    fid_df = qdf[qdf["fid"]==fid]
    questions = fid_df["question"].to_dict()
    explanations = fid_df["cot_explanation"].to_dict()
    answers = fid_df["answer"].to_dict()
    example = {
        "id":None,
        "corpus":"consort-abstract-examples",
        "trial_design":"rct",
        "context":text,
        "questions":questions,
        "explanations":explanations,
        "answers":answers,
        "original_answers":None,
    }
    train_corpus.append(example)

In [None]:
train_corpus

In [37]:
with jsonlines.open('train.jsonl') as reader:
    auto_train_corpus = [l for l in reader.iter()]


In [39]:
full_train_corpus = auto_train_corpus + train_corpus

In [43]:
with jsonlines.open('train.jsonl', mode='w') as writer:
    writer.write_all(full_train_corpus)

# Join questions

In [32]:
question_dict = pd.read_csv("kappas.csv",index_col="id").to_dict(orient="index")
with openquestion_dict

{'D01': {'kappa': 0.96,
  'question': 'Is the study identified as randomized in the title ?',
  'corr_id': 'C01'},
 'D02': {'kappa': 0.38,
  'question': 'Is there a structured summary of the trial design (e.g., parallel, crossover, cluster, non-inferiority) ?',
  'corr_id': 'C02'},
 'D03a': {'kappa': 0.77,
  'question': 'Are the eligibility criteria for participants mentioned?',
  'corr_id': 'C03a'},
 'D03b': {'kappa': 0.81,
  'question': 'Are the settings or locations where the data were collected stated in the abstract ?',
  'corr_id': 'C03b'},
 'D04a': {'kappa': 0.8,
  'question': 'Do the authors report essential features of the experimental intervention (if needed) ?',
  'corr_id': 'C04'},
 'D04b': {'kappa': 0.76,
  'question': 'Do the authors report essential features of the comparison (= control) intervention (if needed) ?',
  'corr_id': 'C04'},
 'D05': {'kappa': 0.73,
  'question': 'Are there specific objectives or hypothesis stated ?',
  'corr_id': 'C05'},
 'D06a': {'kappa': 0.

In [33]:
with open("questions.jsonl","w",encoding="utf-8") as outf:
    json.dump(question_dict,outf)

In [3]:
covdf = pd.read_csv("cov.csv",index_col=0)
depdf = pd.read_csv("dep.csv",index_col=0)

In [4]:
merged_df = pd.merge(depdf, covdf, on='question', how='outer')
merged_df = merged_df[["c_id","d_id","question","kappa_unweighted"]]
merged_df = merged_df.sort_values(by=["c_id","d_id"])
merged_df

Unnamed: 0,c_id,d_id,question,kappa_unweighted
33,C01,D01,Is the study identified as randomized in the t...,0.96
34,C02,D02,Is there a structured summary of the trial des...,0.38
1,C03a,D03a,Are the eligibility criteria for participants ...,0.77
8,C03b,D03b,Are the settings or locations where the data w...,0.81
3,C04,,Are the interventions sufficiently detailed fo...,
10,C05,D05,Are there specific objectives or hypothesis st...,0.73
7,C06,,Are the primary outcomes clearly described for...,
32,C07a,,"Is the random assignment declared (eg, random,...",
30,C07b,D07,If they declared a random allocation to interv...,0.49
12,C07c,,Are they referring to allocation concealment ?,


In [None]:
merged_df.to_csv("questions.csv",index=False)

In [3]:
qdf = pd.read_csv("questions.xlsx")
qdf

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xdd in position 15: invalid continuation byte