In [1]:
###script to extract terms from abstracts and outcome reports
###extracts discipline terms, burning glass skills, and educational program terms
import re
import pandas as pd
import spacy
from spacy.matcher import PhraseMatcher
from spacy.matcher import Matcher
# Load spaCy's small English pipeline once; `nlp` is reused by the cells below
# to tokenize both the search patterns and the grant texts.
nlp = spacy.load("en_core_web_sm")

In [24]:
### import data
## grant data (the 'Abstract' and 'Outcome Report' columns are searched below)
nsf_df = pd.read_csv("../output/grants_1.csv")

## discipline terms list
# Lists generated by the 1a and 1b scripts, then combined; the 'categories'
# column was added manually through a two-rater consensus process.
disc_df = pd.read_csv("../output/discipline_terms.csv")

## program terms list
program_df = pd.read_csv("../input/program_terms.csv")

## Burning Glass skills list
# (originally from burningglassinstitute, downloaded from Börner et al. 2018)
bg_df = pd.read_csv("../input/bg_skills_list.csv")

## Web of Science discipline categories and crosswalk
# WoS categories for the discipline terms
wos_df = pd.read_csv("../input/wos_terms.csv")
# WoS crosswalk from narrow to broad categories
wos_xwalk = pd.read_csv("../input/wos_categories.csv")

## STEAMB categories list
steamb_df = pd.read_csv("../input/steamb_terms.csv")


In [25]:
##create function to remove disciplines contained within other disciplines
def Remove_Subset(List):
    """Return the entries of ``List`` that are not contained in another entry.

    An entry is dropped when it is a substring of some *different* entry,
    e.g. 'science' is dropped when 'computer science' is also present.
    Entries that are equal to each other never remove one another.

    The previous implementation removed items from the very list it was
    iterating over (``ListCopy = List`` is an alias, not a copy), which made
    the loops skip entries and leave some contained terms behind.

    Parameters
    ----------
    List : list of str
        Terms to filter. The list is not modified.

    Returns
    -------
    list of str
        A new list with the surviving terms in their original order.
    """
    return [
        candidate
        for candidate in List
        # keep the candidate only if no other, different entry contains it
        if not any(candidate != other and candidate in other for other in List)
    ]

In [26]:
### extract disciplines
# Discipline terms as plain strings (str() also converts any non-string cell).
terms_list = [str(term) for term in disc_df['terms'].tolist()]

# Tokenize every term once, then register the whole set under a single key
# in a phrase matcher that compares tokens on their lower-cased form.
patterns = list(map(nlp.make_doc, terms_list))
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
matcher.add("Names", patterns)

In [27]:
## go through the abstracts and use the matcher to pull out discipline terms
# Every row starts at the 'none' placeholder and is overwritten in the loop.
nsf_df['fields_raw'] = ['none']*len(nsf_df)

for row in range(0,len(nsf_df)):
    #empty the list
    discipline_list = []
    raw_abstract = nsf_df.loc[row, 'Abstract']

    # Test for a missing value BEFORE converting to str: str(NaN) is the text
    # 'nan', which pd.isna() never reports as missing, so the old check could
    # not fire and empty abstracts were searched as the word 'nan'.
    if pd.isna(raw_abstract):
        disc_col = ""
    else:
        abstract = str(raw_abstract).lower()
        #regex replace sciences > science
        abstract = re.sub('sciences', 'science', abstract)

        doc = nlp(abstract)
        matched_terms = [str(doc[start:end]) for _, start, end in matcher(doc)]

        #remove repeats; dict.fromkeys keeps first-appearance order, so the
        #result is the same on every run (set() order is not)
        discipline_list = list(dict.fromkeys(matched_terms))

        #remove terms contained within other terms
        discipline_list = Remove_Subset(discipline_list)

        #store as one ', '-separated string (same text the old
        #str(list)-then-strip-punctuation step gave for ordinary terms,
        #without mangling terms that contain an apostrophe)
        disc_col = ', '.join(discipline_list)

    #add this row to df
    nsf_df.loc[row, 'fields_raw'] = disc_col

In [29]:
## repeat for outcome reports
# Every row starts at the 'none' placeholder and is overwritten in the loop.
nsf_df['por_fields_raw'] = ['none']*len(nsf_df)

for row in range(0,len(nsf_df)):
    #empty the list
    discipline_list = []
    raw_report = nsf_df.loc[row, 'Outcome Report']

    # Test for a missing value BEFORE converting to str: str(NaN) is the text
    # 'nan', which pd.isna() never reports as missing, so the old check could
    # not fire and empty reports were searched as the word 'nan'.
    if pd.isna(raw_report):
        disc_col = ""
    else:
        report = str(raw_report).lower()
        #regex replace sciences > science
        report = re.sub('sciences', 'science', report)

        doc = nlp(report)
        matched_terms = [str(doc[start:end]) for _, start, end in matcher(doc)]

        #remove repeats; dict.fromkeys keeps first-appearance order, so the
        #result is the same on every run (set() order is not)
        discipline_list = list(dict.fromkeys(matched_terms))

        #remove terms contained within other terms
        discipline_list = Remove_Subset(discipline_list)

        #store as one ', '-separated string (same text the old
        #str(list)-then-strip-punctuation step gave for ordinary terms,
        #without mangling terms that contain an apostrophe)
        disc_col = ', '.join(discipline_list)

    #add this row to df
    nsf_df.loc[row, 'por_fields_raw'] = disc_col

In [30]:
## convert terms extracted from the text into corrected discipline terms
# using the consensus-derived 'categories' column of disc_df
# (e.g. 'computational science' > 'computer science', or 'biological and computational engineering' > 'biological engineering, computational engineering')

##define replace function
def field_replace(String):
    """Translate a ', '-separated string of extracted terms into categories.

    Every term is looked up in ``disc_df['terms']`` and replaced by the
    'categories' value of each matching row. A term with no matching row
    contributes nothing; a term with several matching rows contributes all of
    them. (The old code sent both cases to a bare ``except`` that appended to
    ``discipline_list``, a global left over from another cell, so those
    results were silently lost.)

    Parameters
    ----------
    String : str
        Terms joined by ', ', e.g. one cell of the 'fields_raw' column.

    Returns
    -------
    str
        The unique categories in first-seen order, joined by ', '.
        Returns '' when no term matched. An empty category cell comes
        through as the text 'nan', as it did before.
    """
    cat_list = []
    for term in String.split(', '):
        matches = disc_df.loc[disc_df['terms'] == term, 'categories'].tolist()
        cat_list.extend(str(category) for category in matches)

    #remove duplicates, keeping first-seen order so the output is reproducible
    cat_list = list(dict.fromkeys(cat_list))

    return ', '.join(cat_list)

## new column, pre-filled with the 'none' placeholder
nsf_df['Disciplines'] = 'none'

## convert each row's raw terms; rows still holding the placeholder are skipped
for row in range(len(nsf_df)):
    raw_terms = nsf_df.loc[row, 'fields_raw']
    if raw_terms == 'none':
        continue
    nsf_df.loc[row, 'Disciplines'] = str(field_replace(raw_terms))


In [31]:
## Repeat for outcome reports
## new column, pre-filled with the 'none' placeholder
nsf_df['Disciplines Outcome Reports'] = 'none'

## convert each row's raw terms; rows still holding the placeholder are skipped
for row in range(len(nsf_df)):
    raw_terms = nsf_df.loc[row, 'por_fields_raw']
    if raw_terms == 'none':
        continue
    nsf_df.loc[row, 'Disciplines Outcome Reports'] = str(field_replace(raw_terms))


In [34]:
### add discipline broad categories - Web of Science categories

## add WoS categories (narrow)
# Keep only the two lookup columns and expose 'wos_terms' under the generic
# name 'categories' that the replace function reads.
wos_func_df = wos_df[['terms', 'wos_terms']].rename(columns={'wos_terms': 'categories'})

##define replace function
def wos_replace(String):
    """Translate a ', '-separated string of disciplines into WoS categories.

    Every term is looked up in ``wos_func_df['terms']`` and replaced by the
    whitespace-stripped 'categories' value of each matching row. Terms with
    no matching row, and rows whose category cell is empty, contribute
    nothing. A term with several matching rows contributes all of them.
    (The old code sent the multi-row case to a bare ``except`` that appended
    to ``discipline_list``, a global left over from another cell.)

    Parameters
    ----------
    String : str
        Terms joined by ', ', e.g. one cell of the 'Disciplines' column.

    Returns
    -------
    str
        The unique categories in first-seen order, joined by ', '.
        Returns '' when nothing matched.
    """
    cat_list = []
    for term in String.split(', '):
        matches = wos_func_df.loc[wos_func_df['terms'] == term, 'categories']
        #empty cells are skipped (previously .strip() failed on them and the
        #error was swallowed, which had the same effect)
        cat_list.extend(str(category).strip() for category in matches.dropna())

    #remove duplicates, keeping first-seen order so the output is reproducible
    cat_list = list(dict.fromkeys(cat_list))

    return ', '.join(cat_list)

## new column, pre-filled with the 'none' placeholder
nsf_df['WoS Categories'] = 'none'

## map each row's disciplines to narrow WoS categories
for row in range(len(nsf_df)):
    disciplines = nsf_df.loc[row, 'Disciplines']
    if disciplines == 'none':
        continue
    narrow_categories = wos_replace(disciplines)
    # a result that is just the text 'nan' is not stored; the placeholder stays
    if narrow_categories != 'nan':
        nsf_df.loc[row, 'WoS Categories'] = str(narrow_categories)

In [33]:
### repeat for outcome reports
## new column, pre-filled with the 'none' placeholder
nsf_df['WoS Categories Outcome Reports'] = 'none'

## map each row's disciplines to narrow WoS categories
for row in range(len(nsf_df)):
    disciplines = nsf_df.loc[row, 'Disciplines Outcome Reports']
    if disciplines == 'none':
        continue
    narrow_categories = wos_replace(disciplines)
    # a result that is just the text 'nan' is not stored; the placeholder stays
    if narrow_categories != 'nan':
        nsf_df.loc[row, 'WoS Categories Outcome Reports'] = str(narrow_categories)


In [37]:
### set broad categories (what we actually want)
# Give the crosswalk the generic 'terms' / 'categories' column names that the
# replace function reads.
wos_xwalk = wos_xwalk.rename(columns={'WoS subject category': 'terms', 'Broad area': 'categories'})

## fix non-breaking space characters
# Turn every whitespace character (r'\s' also matches the non-breaking space)
# into a plain space, then drop trailing whitespace.
wos_xwalk['terms'] = (
    wos_xwalk['terms']
    .str.replace(r'\s', ' ', regex=True)
    .str.rstrip()
)
wos_xwalk['categories'] = (
    wos_xwalk['categories']
    .str.replace(r'\s', ' ', regex=True)
    .str.rstrip()
)

##define replace function
def wosx_replace(String):
    """Translate a ', '-separated string of WoS categories into broad areas.

    Every term is looked up in ``wos_xwalk['terms']`` and replaced by the
    'categories' value of each matching row. A term with no matching row
    contributes nothing; a term with several matching rows contributes all of
    them. (The old code sent the multi-row case to a bare ``except`` that
    appended to ``discipline_list``, a global left over from another cell.)

    NOTE(review): the input is split on ', ', so a category whose own name
    contains ', ' is looked up in pieces and cannot match. Confirm whether
    the crosswalk has such names.

    Parameters
    ----------
    String : str
        Terms joined by ', ', e.g. one cell of the 'WoS Categories' column.

    Returns
    -------
    str
        The unique broad areas in first-seen order, joined by ', '.
        Returns '' when nothing matched. An empty category cell comes
        through as the text 'nan', as it did before.
    """
    cat_list = []
    for term in String.split(', '):
        matches = wos_xwalk.loc[wos_xwalk['terms'] == term, 'categories'].tolist()
        cat_list.extend(str(category) for category in matches)

    #remove duplicates, keeping first-seen order so the output is reproducible
    cat_list = list(dict.fromkeys(cat_list))

    return ', '.join(cat_list)

## new column, pre-filled with the 'none' placeholder
nsf_df['WoS Broad Categories'] = 'none'

## map each row's narrow WoS categories to broad areas; placeholder rows are skipped
for row in range(len(nsf_df)):
    narrow_categories = nsf_df.loc[row, 'WoS Categories']
    if narrow_categories == 'none':
        continue
    nsf_df.loc[row, 'WoS Broad Categories'] = str(wosx_replace(narrow_categories))

In [38]:
## repeat for outcome reports
## new column, pre-filled with the 'none' placeholder
nsf_df['WoS Broad Categories Outcome Reports'] = 'none'

## map each row's narrow WoS categories to broad areas; placeholder rows are skipped
for row in range(len(nsf_df)):
    narrow_categories = nsf_df.loc[row, 'WoS Categories Outcome Reports']
    if narrow_categories == 'none':
        continue
    nsf_df.loc[row, 'WoS Broad Categories Outcome Reports'] = str(wosx_replace(narrow_categories))

In [39]:
### add discipline categories - STEAMB
# Keep only the two lookup columns and give them the generic 'terms' /
# 'categories' names that the replace function reads.
steamb_func_df = steamb_df[['consensus categories', 'broad categories']].rename(
    columns={'consensus categories': 'terms', 'broad categories': 'categories'}
)

##define replace function
def steamb_replace(String):
    """Translate a ', '-separated string of disciplines into STEAMB categories.

    Every term is looked up in ``steamb_func_df['terms']`` and replaced by the
    'categories' value of each matching row. A term with no matching row
    contributes nothing; a term with several matching rows contributes all of
    them. (The old code sent the multi-row case to a bare ``except`` that
    appended to ``discipline_list``, a global left over from another cell.)

    Parameters
    ----------
    String : str
        Terms joined by ', ', e.g. one cell of the 'Disciplines' column.

    Returns
    -------
    str
        The unique categories in first-seen order, joined by ', '.
        Returns '' when nothing matched. An empty category cell comes
        through as the text 'nan', as it did before.
    """
    cat_list = []
    for term in String.split(', '):
        matches = steamb_func_df.loc[steamb_func_df['terms'] == term, 'categories'].tolist()
        cat_list.extend(str(category) for category in matches)

    #remove duplicates, keeping first-seen order so the output is reproducible
    cat_list = list(dict.fromkeys(cat_list))

    return ', '.join(cat_list)

## new column, pre-filled with the 'none' placeholder
nsf_df['STEAMB Categories'] = 'none'

## map each row's disciplines to STEAMB categories; placeholder rows are skipped
for row in range(len(nsf_df)):
    disciplines = nsf_df.loc[row, 'Disciplines']
    if disciplines == 'none':
        continue
    nsf_df.loc[row, 'STEAMB Categories'] = str(steamb_replace(disciplines))

In [40]:
## repeat for outcome reports
## new column, pre-filled with the 'none' placeholder
nsf_df['STEAMB Categories Outcome Reports'] = 'none'

## map each row's disciplines to STEAMB categories; placeholder rows are skipped
for row in range(len(nsf_df)):
    disciplines = nsf_df.loc[row, 'Disciplines Outcome Reports']
    if disciplines == 'none':
        continue
    nsf_df.loc[row, 'STEAMB Categories Outcome Reports'] = str(steamb_replace(disciplines))

In [41]:
### program terms extract
# Program terms as plain strings (str() also converts any non-string cell).
terms_list = [str(term) for term in program_df['terms'].tolist()]

#make spacy matcher out of terms list
# The texts searched below are lower-cased, so compare on the LOWER token
# attribute; with the default exact-text matching, any term written with a
# capital letter could never match. make_doc only tokenizes, which is all a
# PhraseMatcher pattern needs.
# NOTE: this rebinds `matcher`, replacing the discipline matcher built earlier.
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
patterns = [nlp.make_doc(name) for name in terms_list]

matcher.add("Names", patterns)

#make new column
nsf_df['Program Terms'] = ['none']*len(nsf_df)

for row in range(0,len(nsf_df)):
    #empty the list
    programs_list = []
    raw_abstract = nsf_df.loc[row, 'Abstract']
    # Check for a missing value BEFORE str(): str(NaN) is the text 'nan',
    # which pd.isna() never reports as missing.
    if not pd.isna(raw_abstract):
        doc = nlp(str(raw_abstract).lower())
        matched_terms = [str(doc[start:end]) for _, start, end in matcher(doc)]
        #remove repeats, keeping first-appearance order (reproducible, unlike set())
        programs_list = list(dict.fromkeys(matched_terms))
    #add list to df (stored as the text form of the list, as before)
    nsf_df.loc[row, 'Program Terms'] = str(programs_list)

In [42]:
## repeat for Outcome Reports
# Uses whatever `matcher` is currently bound; this cell expects the program
# terms matcher built in the preceding cell.
#make new column
nsf_df['Program Terms Outcome Reports'] = ['none']*len(nsf_df)

for row in range(0,len(nsf_df)):
    #empty the list
    programs_list = []
    raw_report = nsf_df.loc[row, 'Outcome Report']
    # Check for a missing value BEFORE str(): str(NaN) is the text 'nan',
    # which pd.isna() never reports as missing.
    if not pd.isna(raw_report):
        doc = nlp(str(raw_report).lower())
        matched_terms = [str(doc[start:end]) for _, start, end in matcher(doc)]
        #remove repeats, keeping first-appearance order (reproducible, unlike set())
        programs_list = list(dict.fromkeys(matched_terms))
    #add list to df (stored as the text form of the list, as before)
    nsf_df.loc[row, 'Program Terms Outcome Reports'] = str(programs_list)

In [43]:
### skills terms extract
# Skill terms as plain strings (str() also converts any non-string cell).
terms_list = [str(term) for term in bg_df['terms'].tolist()]

#make spacy matcher out of terms list
# The texts searched below are lower-cased, so compare on the LOWER token
# attribute; with the default exact-text matching, any skill written with a
# capital letter could never match. make_doc only tokenizes, which is all a
# PhraseMatcher pattern needs.
# NOTE: this rebinds `matcher`, replacing the matcher built by earlier cells.
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
patterns = [nlp.make_doc(name) for name in terms_list]

matcher.add("Names", patterns)

#make new column
nsf_df['BG Skills'] = ['none']*len(nsf_df)

for row in range(0,len(nsf_df)):
    #empty the list
    programs_list = []
    raw_abstract = nsf_df.loc[row, 'Abstract']
    # Check for a missing value BEFORE str(): str(NaN) is the text 'nan',
    # which pd.isna() never reports as missing.
    if not pd.isna(raw_abstract):
        doc = nlp(str(raw_abstract).lower())
        matched_terms = [str(doc[start:end]) for _, start, end in matcher(doc)]
        #remove repeats, keeping first-appearance order (reproducible, unlike set())
        programs_list = list(dict.fromkeys(matched_terms))
    #add list to df (stored as the text form of the list, as before)
    nsf_df.loc[row, 'BG Skills'] = str(programs_list)

In [44]:
## Repeat for outcome reports
# Uses whatever `matcher` is currently bound; this cell expects the skills
# matcher built in the preceding cell.

#make new column
nsf_df['BG Skills Outcome Reports'] = ['none']*len(nsf_df)

for row in range(0,len(nsf_df)):
    #empty the list
    programs_list = []
    raw_report = nsf_df.loc[row, 'Outcome Report']
    # Check for a missing value BEFORE str(): str(NaN) is the text 'nan',
    # which pd.isna() never reports as missing.
    if not pd.isna(raw_report):
        doc = nlp(str(raw_report).lower())
        matched_terms = [str(doc[start:end]) for _, start, end in matcher(doc)]
        #remove repeats, keeping first-appearance order (reproducible, unlike set())
        programs_list = list(dict.fromkeys(matched_terms))
    #add list to df (stored as the text form of the list, as before)
    nsf_df.loc[row, 'BG Skills Outcome Reports'] = str(programs_list)

In [46]:
## clean up columns
# Remove the columns that should not be written to the output file,
# including the raw-term and narrow-WoS helper columns created above.
nsf_df = nsf_df.drop(
    columns=[
        'from_sentences',
        'trimmed_sentences',
        'disciplines_manual',
        'extracted_disc_llm1',
        'extracted_disc_llm2',
        'fields_raw',
        'por_fields_raw',
        'WoS Categories',
        'WoS Categories Outcome Reports',
    ]
)


In [48]:
### save data with new columns
# Write the augmented grants table to CSV; index=False leaves the row index
# out of the file.
nsf_df.to_csv('../output/grants.csv', index=False)