# PublicDatasets (Analyser)

## 1. Re-reading

In [1]:
special_separator = '___'

In [2]:
import pandas as pd
import re

In [3]:
research_paper_table = pd.read_csv('data/ResearchPapers.csv',index_col=0)

In [4]:
#Drop ignored
research_paper_table
research_paper_table = research_paper_table[research_paper_table.included] 

In [5]:
research_paper_table.head()

Unnamed: 0,Venue,Title,included
1,CHIL 2022,Data Augmentation for Electrocardiograms,True
2,CHIL 2022,MedMCQA: A Large-scale Multi-Subject Multi-Cho...,True
3,CHIL 2022,Disability prediction in multiple sclerosis us...,True
4,CHIL 2022,Lead-agnostic Self-supervised Learning for Loc...,True
5,CHIL 2022,Context-Sensitive Spelling Correction of Clini...,True


### 1.1 get file names

In [6]:
def get_text_filename(v,t,path='data/texts/'):
    return path+v+special_separator+t+'.txt'

In [7]:
texts = []
titles=[]
venues = []
for index, row in research_paper_table.iterrows():
    texts.append(get_text_filename(row["Venue"],row["Title"]))
    titles.append(row["Title"])
    venues.append(row["Venue"])

In [8]:
# mention_matches


## 2. Get Datasets

### 2.1 Get section function

In [9]:
get_dataset_section = None
#REPLACE THE get_dataset_section function to change behavior

In [10]:
def get_section(contents, section_header="data and code availability", section_end=['1. introduction','© 2022']):
    """
    Get the Data Source section from a research paper.
    
    :param contents: Text contents of the resaerch paper.
    :param section_header: A string indicating the start of the dataset section.
    :param section_end: A list of strings to indicate end of dataset section.  
    :return: returns substring of text region between section_header and a potential section_end. returns "" if it fails to find it.
    
    """
    
    contents = contents.lower()
    contents = contents.replace("\n\n", "$$$" )
    contents = contents.replace("-\n", "" )
    # contents = contents.replace("\n", "" )
    contents = contents.replace("$$$", "\n" )

    idx0=contents.find(section_header)
    contents=contents[idx0:]
    for end in section_end:
        idxend=contents.find(end)
        if idxend==-1: continue
        contents=contents[:idxend]
        return contents
    return ""
    #print(contents)

### 2.2 Get section (CHIL)

In [11]:

def get_dataset_section(contents):
    """Get the Data and Code Availability section from CHIL papers"""
    #CHANGE THIS FUNCTION TO ALTER THE BEHAVIOR OF HOW THE DATASET SECTION IS EXTRACTED FROM THE TEXT
    return get_section(contents, section_header="data and code availability", section_end=['1. introduction','© 2022'])



In [12]:

# items = {text_file:[] for text_file in texts}
text_contents = []

for i in range(len(texts)):
    with open(texts[i], 'r') as f:
        contents = f.read()
        # items[titles[i]] = get_section(contents)
        text_contents.append(get_section(contents))

## 3. Get Mentions

### 3.1 regex patterns

In [13]:
xn = r'\n?' #optional newline
son = r'(?: |\n)' #space or newline

author = fr"(?:[\nA-Za-z'`-]+)"
etal = fr"(?:et{son}al\.?)"
additional = f"(?:{xn},?{son}(?:(?:and{son}|&{son})?{author}|{etal}))"
year_num = f"(?:{xn}19|20)[0-9][0-9]"
page_num = f"(?:{xn},{son}p\.? [0-9]+)?"  # Always optional
year = fr"(?:{xn},{son}*{year_num}{page_num}| *\({year_num}{page_num}\))"
inline_citation = fr'\b(?!(?:Although|Also)\b){xn}{author}{xn}{additional}*{xn}{year}{xn}[a-f]?'
# ADAPTED FROM https://stackoverflow.com/a/63633049/2089784

num_section = r'([0-9]\.)+[0-9]'
url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'  #https://urlregex.com/
split_url_pattern = fr'h{xn}t{xn}t{xn}p{xn}[s]?{xn}:{xn}/{xn}/{xn}(?:[a-zA-Z{xn}]|[0-9]|[$-_@.&+]|[!*\(\{xn}),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

seq_num=r', [0-9]'
contained_num=r'\([0-9]\)'
footnote_pattern = fr'(?:(?:(?<=[a-zA-Z\.]) ?|,)[1-9](?!\d)|{seq_num}|{contained_num})'

# print(inline_citation)

In [14]:

def collect_tokens(pat,str,name,tokens):
    """Collect pat matches from str, remove from str and insert into tokens."""
    matches = re.findall(pat,str)
    str = re.sub(pat, '', str)
    tokens |= {name: matches}
    return str


### 3.2 Post-Processing

In [15]:
# def clean_newlines(dict):
#     for key, val in dict.items():
#         dict[key] = [ s.replace('\n','') for s in val ]
        
# def clean_url_end(dict):
#     dict["URL"] = [ s.rstrip('.') for s in dict["URL"]  ]
    
def clean_tokens(dict):
    dict["Inline Citation"] = [ s.replace('\n',' ') for s in dict["Inline Citation"]  ]
    dict["URL"] = [ s.replace('\n','') for s in dict["URL"]  ]
    dict["URL"] = [ s.rstrip('.') for s in dict["URL"]  ]

## 4. Assembling the mentions table

In [16]:

# for name in mention_matches:
#     merged_dict[name]=mention_matches[name]
# for keyword in keyword_matches:
#     merged_bdict[keyword]=keyword_matches[keyword]

# NEW CSV USING get_dasa function from 3.4

# for item in items:
#     print(item)
#     print(items[item])


# MENTIONS_TABLE = pd.DataFrame()

# MENTIONS_TABLE = pd.DataFrame( columns=('Venue', 'Paper Title', 'Citation Style', 'Citation') )

mention_venues, mention_titles, mention_styles, mentions, notes = [], [], [], [], []

for i in range(len(text_contents)):
    references = {}
    context = block = text_contents[i]
    venue = venues[i]
    title = titles[i]
    
    if (type(block) == type(None)):
        # print("ADD AN ERROR")
        # print(venue, title, "ERROR:TEXT WAS NOT PARSIBLE", '')
        
        # MENTIONS_TABLE = pd.concat([MENTIONS_TABLE, df2], axis=0)

        mention_venues.append(venue)
        mention_titles.append(title)
        mention_styles.append(r)
        mentions.append('')
        notes.append("ERROR:TEXT WAS NOT PARSIBLE")
        
        continue
    # print(titles[i])
    # print(block)
    
    block = collect_tokens(inline_citation,block,"Inline Citation",references)
    block = collect_tokens(split_url_pattern,block,"URL",references)
    block = collect_tokens(num_section,block,"Numbered Section",{})
    # print(block)
    block = collect_tokens(footnote_pattern,block,"Footnote",references) 
    
    clean_tokens(references)
    
    mention_list = []
    first=True
    for r,l in references.items():
        for ll in l:
            mention_venues.append(venue)
            mention_titles.append(title)
            mention_styles.append(r)
            mentions.append(ll)
            if first:
                notes.append(context)
            else:
                notes.append("")
            first=False



MENTIONS_TABLE = pd.DataFrame(
    {
        'Venue': mention_venues,
        'Paper Title': mention_titles,
        'Mention Style': mention_styles,
        'Mention': mentions,
        'Notes' : notes
    }
)

In [17]:
MENTIONS_TABLE

Unnamed: 0,Venue,Paper Title,Mention Style,Mention,Notes
0,CHIL 2022,Data Augmentation for Electrocardiograms,Inline Citation,"wagner et al., 2020",data and code availability we use three\ndatas...
1,CHIL 2022,Data Augmentation for Electrocardiograms,Inline Citation,"goldberger et al., 2000",
2,CHIL 2022,Data Augmentation for Electrocardiograms,URL,https://github.com/aniruddhraghu/ecg_aug,
3,CHIL 2022,Lead-agnostic Self-supervised Learning for Loc...,Inline Citation,"reyna et al., 2021",data and code availability this paper uses\nth...
4,CHIL 2022,Lead-agnostic Self-supervised Learning for Loc...,Inline Citation,"wagner et al., 2020",
...,...,...,...,...,...
72,CHIL 2022,Identification of Subgroups With Similar Benef...,Inline Citation,jiang and li (2016),
73,CHIL 2022,Identification of Subgroups With Similar Benef...,Inline Citation,thomas and brunskill (2016),
74,CHIL 2022,Identification of Subgroups With Similar Benef...,Inline Citation,kallus and uehara (2020),
75,CHIL 2022,Identification of Subgroups With Similar Benef...,Inline Citation,komorowski et al. (2018),


## 5. Saving

In [18]:
# #TODO:
# print("* SWITCH TO MULTI-MATCHING")
MENTIONS_TABLE.to_csv("data/DatasetMentions_Unprocessed.csv")

## **SOME MANUAL ANNOTATION STILL REQUIRED**