### Abbreviation Expansion

We discovered that abbreviation expansion was extremely important because the text mostly refers to datasets by their abbreviations. The abbreviation expansion module enhanced the accuracy of our sentence classification model by 7%. When there are multiple possible expansions, the true expansion of an abbreviation should also take the context of the text into consideration; otherwise it will introduce bias into the model. An example is WLS, which can be 'Wisconsin Longitudinal Study' or 'Weighted Least Squares'. We did not handle this for now, but it is crucial for any future work.

We developed an abbreviation expansion dictionary from the datasets metadata file. It was constructed using the following approach:
- All the abbreviations were detected using regex matching
- N-grams of the relevant length were extracted from the description and mentions (both with and without stop word removal)
- Abbreviations and the initials of the N-grams were matched to create possible expansions
- Lemmatization (implemented with a Porter stemmer) resolved the multiple expansions caused by variants such as 'X study' and 'X studies'
- Manual pruning resolved the remaining multiple expansions

We also attempted to create an abbreviation dictionary for the whole corpus, but it resulted in a lot of bogus expansions that would induce bias in the model, so we did not use it. It is certainly something that can be worked on in the future.

In [1]:
from string import punctuation
from os import listdir
from collections import Counter
from nltk.corpus import stopwords
import string
import json
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import re

In [13]:
import spacy
# Load the large English NLP model once at module level.
# IsLocation runs text through this pipeline and inspects the named entities
# it produces to filter out abbreviations that are tagged as places ('GPE').
nlp = spacy.load('en_core_web_lg')

In [14]:
# load doc into memory
def load_doc(filename, encoding="utf-8"):
    """Read a whole text file into memory and return its contents.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8,
            which is the encoding JSON files are expected to use.

    Returns:
        The complete file contents as a single string.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the contents are not valid in ``encoding``.
    """
    # The context manager closes the handle even when read() raises,
    # which the manual open()/close() pair did not guarantee.
    with open(filename, 'r', encoding=encoding) as handle:
        return handle.read()

In [15]:
# Name and location of the dataset metadata file.
# NOTE(review): the directory is an absolute, machine-specific path - adjust before running elsewhere.
file = 'data_sets.json'
directory = '/home/urwa/Documents/Coleridge/ProjectFiles/train_test/train_test/'
# The directory string already ends with '/', so plain concatenation yields a valid path.
text = load_doc(directory+file)

# Parse the raw JSON text; the abbreviation-building loop later iterates over
# this object and reads the 'mention_list' entry of every record.
loaded_json = json.loads(text)

In [16]:
## Checking if an abbreviation corresponds to a geographical location
def IsLocation(txt):
    """Tell whether the spaCy pipeline tags ``txt`` as a geopolitical entity.

    Only the first named entity found in ``txt`` is inspected.

    Args:
        txt: The text (typically a single abbreviation) to analyse.

    Returns:
        True when at least one entity is found and the first one carries the
        'GPE' label, False otherwise.
    """
    entities = nlp(txt).ents
    if len(entities) == 0:
        # Nothing recognised at all, so it cannot be a location.
        return False
    return entities[0].label_ == 'GPE'

In [17]:
## Find abbreviation using Regex
def findAbbreviation(mentions):
    """Collect candidate abbreviations from a list of mention strings.

    A candidate is a run of capital letters bounded by word breaks. It is
    kept only if it is longer than two characters, is not one of a few
    generic upper-cased words (e.g. 'STUDY', 'DATA') and is not recognised as
    a geographical location by ``IsLocation``.

    Args:
        mentions: Iterable of mention strings to scan.

    Returns:
        List of abbreviations in order of appearance. Repeated occurrences
        are kept, exactly as they were found in the mentions.
    """
    regex = r"\b[A-Z][A-Z]+\b"
    # Generic words that are sometimes written in capitals but are not abbreviations.
    special_words = {'study', 'survey', 'data', 'research', 'surveys', 'studies', 'ii'}

    candidates = []
    for m in mentions:
        candidates += re.findall(regex, m)

    # Apply the cheap filters first so the expensive NLP check below only
    # sees tokens that could still make it into the result.
    candidates = [
        w for w in candidates
        if len(w) > 2 and w.lower() not in special_words
    ]

    # Running the NLP pipeline is by far the costliest step: ask it only once
    # per distinct token and reuse the verdict for repeated occurrences.
    location_verdict = {}
    abbreviations = []
    for w in candidates:
        if w not in location_verdict:
            location_verdict[w] = IsLocation(w)
        if not location_verdict[w]:
            abbreviations.append(w)

    return abbreviations

In [18]:
# turn a doc into clean tokens
def clean_mention(doc):
    """Normalise a mention into a space-separated string of clean tokens.

    Steps: undo hyphenation ('- ' is removed, any other '-' becomes a space),
    split on whitespace, delete punctuation characters from each token, drop
    tokens that are not purely alphabetic, lower-case, and drop English stop
    words.

    Note that punctuation is deleted rather than replaced by a space, so a
    token such as 'survey/multiple' collapses into 'surveymultiple'.

    Args:
        doc: The raw mention text.

    Returns:
        The surviving tokens joined by single spaces (possibly an empty string).
    """
    dehyphenated = doc.replace('- ', '').replace('-', ' ')

    strip_punctuation = str.maketrans('', '', string.punctuation)
    stop_words = set(stopwords.words('english'))

    kept = []
    for raw_token in dehyphenated.split():
        word = raw_token.translate(strip_punctuation)
        # Discard anything that is empty or contains digits/symbols.
        if not word.isalpha():
            continue
        word = word.lower()
        if word in stop_words:
            continue
        kept.append(word)

    return ' '.join(kept)

In [19]:
### Return N grams in the specified range
def findExpansions(mentions,minL=2,maxL=10):
    """Return every word n-gram found in the cleaned mentions.

    Each mention is cleaned with ``clean_mention``; cleaned mentions with
    fewer than two words are ignored. The vectorizer is used purely as an
    n-gram extractor: only its vocabulary is returned, no weights.

    Args:
        mentions: Iterable of raw mention strings.
        minL: Smallest n-gram length (in words) to extract.
        maxL: Largest n-gram length (in words) to extract.

    Returns:
        List of n-gram strings, or an empty list when no mention has at
        least two words after cleaning.
    """
    # each field as a sentence
    docs = [clean_mention(m) for m in mentions]
    docs = [d for d in docs if len(d.split()) > 1]

    if not docs:
        return []

    vectorizer = TfidfVectorizer(ngram_range=(minL, maxL))
    vectorizer.fit(docs)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer the replacement and fall back only on old installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        # The new API returns an array; callers expect a plain list.
        return vectorizer.get_feature_names_out().tolist()
    return vectorizer.get_feature_names()

In [20]:
# Match initials to abbreviation
def matchInitial(abb,expansion):
    """Check whether the first letters of ``expansion``'s words spell ``abb``.

    The comparison is case-sensitive and uses every whitespace-separated word
    of ``expansion``.

    Args:
        abb: The abbreviation to compare against.
        expansion: Candidate expansion, words separated by whitespace.

    Returns:
        True if the concatenated initials equal ``abb``, else False.
    """
    first_letters = []
    for word in expansion.split():
        first_letters.append(word[0])
    initials = ''.join(first_letters)
    return abb == initials

In [21]:
# Find the abbreviation in possible expansions
def expandAbbreviation(key, worchChunks):
    """Find the word chunks whose initials spell the abbreviation ``key``.

    ``key`` is lower-cased first. A chunk qualifies only if it has exactly as
    many words as ``key`` has letters and ``matchInitial`` confirms that its
    initials equal the lower-cased key.

    Args:
        key: The abbreviation to expand.
        worchChunks: Iterable of candidate word chunks (n-gram strings).

    Returns:
        List of matching chunks, in the order they appear in ``worchChunks``.
    """
    lowered = key.lower()
    wanted_length = len(lowered)
    return [
        chunk for chunk in worchChunks
        # The word-count test runs first so the initials are only compared
        # for chunks of the right length.
        if len(chunk.split()) == wanted_length and matchInitial(lowered, chunk)
    ]

In [22]:
# Build the abbreviation -> candidate expansions dictionary, one metadata
# record at a time. Values are de-duplicated lists of expansion strings.
abbDict = {}
i = 0  # number of records processed so far
for d in loaded_json:
    mentions = d['mention_list']
    if len(mentions) > 0:
        abbs = findAbbreviation(mentions)
        expansions = findExpansions(mentions, minL=2, maxL=10)
        for ab in abbs:
            matches = expandAbbreviation(ab, expansions)
            if len(matches) == 0:
                continue
            # Merge with whatever earlier records contributed for this
            # abbreviation and drop duplicates.
            abbDict[ab] = list(set(abbDict.get(ab, []) + matches))
    i += 1

In [23]:
len(abbDict)

204

In [24]:
abbDict

{'SRC': ['survey research center'],
 'CPS': ['current population studies',
  'center political studies',
  'current population surveys',
  'current population survey',
  'current population study'],
 'NES': ['national election studies', 'national election study'],
 'SIPP': ['survey income program participation',
  'surveys income program participation'],
 'PSID': ['panel study income dynamics'],
 'ANES': ['american national election studies'],
 'HIS': ['health interview survey', 'health interview surveys'],
 'NHIS': ['national health interview surveymultiple',
  'national health interview surveys',
  'national health interview survey'],
 'NCHS': ['national center health statistics'],
 'FBI': ['federal bureau investigation', 'federal bureau investigations'],
 'UCR': ['uniform crime reports',
  'uniform crime report',
  'uniform crime reporting'],
 'NLS': ['national longitudinal survey'],
 'NLSOM': ['national longitudinal survey older men'],
 'MFI': ['monetary financial institutions'],
 

In [2]:
# Load the stemmer
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
# Single shared Porter stemmer instance; stemExpansion calls ps.stem on every word.
ps = PorterStemmer()

def stemExpansion(expansion):
    """Stem every word of an expansion with the shared Porter stemmer.

    Args:
        expansion: Expansion string, words separated by whitespace.

    Returns:
        The stemmed words joined by single spaces.
    """
    stemmed_words = map(ps.stem, expansion.split())
    return " ".join(stemmed_words)

In [26]:
## Create the stemmed dictionary: expansions that differ only by inflection
## (e.g. 'study' / 'studies') collapse into a single stemmed entry.
lemAbbDict = {}
for k, expList in abbDict.items():
    stemmed_unique = {stemExpansion(exp) for exp in expList}
    lemAbbDict[k] = list(stemmed_unique)

In [27]:
lemAbbDict

{'SRC': ['survey research center'],
 'CPS': ['current popul survey', 'current popul studi', 'center polit studi'],
 'NES': ['nation elect studi'],
 'SIPP': ['survey incom program particip'],
 'PSID': ['panel studi incom dynam'],
 'ANES': ['american nation elect studi'],
 'HIS': ['health interview survey'],
 'NHIS': ['nation health interview surveymultipl',
  'nation health interview survey'],
 'NCHS': ['nation center health statist'],
 'FBI': ['feder bureau investig'],
 'UCR': ['uniform crime report'],
 'NLS': ['nation longitudin survey'],
 'NLSOM': ['nation longitudin survey older men'],
 'MFI': ['monetari financi institut'],
 'MIR': ['mfi interest rate'],
 'ITS': ['intern trade servic'],
 'SITS': ['statist intern trade servic'],
 'SHS': ['secur hold statist'],
 'PHF': ['panel household financ'],
 'FDI': ['foreign direct invest'],
 'RDSC': ['research data servic centr'],
 'EPESE': ['establish popul epidemiolog studi elderlyepeseprogram',
  'establish popul epidemiolog studi elderli'],

# Manual Cleaning of the dictionary

## Dictionary with stop word removal

In [4]:
# Manually cleaned abbreviation dictionary from the run with stop word removal.
# Keys are upper-case abbreviations; each value is a one-element list holding
# the stemmed expansion that was kept for that abbreviation.
dict2 = {'SIPP': ['survey incom program particip'],
 'PSID': ['panel studi incom dynam'],
 'NCHS': ['nation center health statist'],
 'FBI': ['feder bureau investig'],
 'NLSOM': ['nation longitudin survey older men'],
 'ITS': ['intern trade servic'],
 'SITS': ['statist intern trade servic'],
 'PHF': ['panel household financ'],
 'RDSC': ['research data servic centr'],
 'EPESE': ['establish popul epidemiolog studi elderli'],
 'HEPESE': ['hispan establish popul epidemiolog studi elderli'],
 'HRS': ['health retir survey'],
 'NHEFS': ['nation health epidemiolog follow studi'],
 'NIA': ['nation institut age'],
 'CDC': ['center diseas control'],
 'NIMH': ['nation institut mental health'],
 'NSFH': ['nation survey famili household'],
 'ALSA': ['australian longitudin studi age'],
 'PHDCN': ['project human develop chicago neighborhood'],
 'CLHNS': ['cebu longitudin health nutrit survey'],
 'ACTIVE': ['advanc cognit train independ vital elderli'],
 'MDICP': ['malawi diffus ideat chang project'],
 'MLSFH': ['malawi longitudin studi famili health'],
 'NSDUH': ['nation survey drug use health'],
 'LAC': ['latin america caribbean'],
 'CILS': ['children immigr longitudin survey'],
 'CNLSY': ['children nation longitudin survey youth'],
 'NLSY': ['nation longitudin survey youth'],
 'NPHS': ['nation pregnanc health survey'],
 'SALSA': ['sacramento area latino studi age'],
 'NSFG': ['nation survey famili growth'],
 'NSAS': ['nation survey ambulatori surgeri'],
 'MCD': ['multipl caus death'],
 'SECCYD': ['studi earli child care youth develop'],
 'SATSA': ['swedish adopt twin studi age'],
 'CLOC': ['chang live older coupl'],
 'LSADT': ['longitudin studi age danish twin'],
 'NIDA': ['nation institut drug abus'],
 'SATHCAP': ['sexual acquisit transmiss hiv cooper agreement program'],
 'SATH': ['sexual acquisit transmiss hiv'],
 'CSDD': ['cambridg studi delinqu develop'],
 'BJS': ['bureau justic statist'],
 'SISFCF': ['survey inmat state feder correct facil'],
 'SISCF': ['survey inmat state correct facil'],
 'SIFCF': ['survey inmat feder correct facil'],
 'NSFEC': ['nation survey famili econom condit'],
 'SYRP': ['survey youth residenti placement'],
 'SEBAS': ['social environ biomark age studi'],
 'WFHS': ['work famili health studi'],
 'DCRP': ['death custodi report program'],
 'RDSL': ['relationship dynam social life'],
 'LEMAS': ['law enforc manag administr statist'],
 'NAS': ['nation academi scienc'],
 'NSFB': ['nation survey fertil barrier'],
 'RISE': ['rehabilit impair studi elderli'],
 'NHSDA': ['nation household survey drug abus'],
 'NSSCA': ['nation survey self care age'],
 'SUPPORT': ['studi understand prognos prefer outcom risk treatment'],
 'QES': ['qualiti employ survey'],
 'NHSLS': ['nation health social life survey'],
 'NMIHS': ['nation matern infant health survey'],
 'NSPO': ['nation survey physician organ'],
 'NSAF': ['nation survey america famili'],
 'GIBS': ['gambl impact behavior studi'],
 'HBSC': ['health behavior school children'],
 'SSATS': ['survey substanc abus treatment servic'],
 'NSDE': ['nation studi daili experi'],
 'USC': ['univers southern california'],
 'IHDP': ['infant health develop program'],
 'CPHHD': ['center popul health health dispar'],
 'RPD': ['research pathway desist'],
 'SPPA': ['survey public particip art'],
 'IIMMLA': ['immigr intergener mobil metropolitan lo angel'],
 'RELATE': ['research earli life age trend effect'],
 'NSAL': ['nation survey american life'],
 'FHWAR': ['fish hunt wildlif associ recreat'],
 'PATH': ['popul assess tobacco health'],
 'STIC': ['servic treatment implement correct'],
 'NSA': ['nation survey adolesc'],
 'CSLLEA': ['censu state local law enforc agenc'],
 'PRMIHS': ['puerto rican matern infant health studi'],
 'HAALSI': ['health age africa longitudin studi indepth'],
 'WTORS': ['wheelchair tiedown occup restraint system'],
 'SQF': ['stop question frisk'],
 'SWAN': ['studi women across nation'],
 'SII': ['studi instruct improv'],
 'NSHAP': ['nation social life health and age project']       }

## Dictionary without stop word removal

In [5]:
# Manually cleaned abbreviation dictionary from the run without stop word
# removal (expansions may therefore contain words such as 'the', 'and', 'of').
# Keys are upper-case abbreviations; each value is a one-element list holding
# the stemmed expansion. This is the dictionary that is later extended with
# the entries of dict2 and written to abbreviations.json.
myDict ={'SRC': ['survey research center'],
 'NES': ['nation elect studi'],
 'CPS': ['current popul survey'],
 'MTF': ['monitor the futur'],
 'ANES': ['american nation elect studi'],
 'HIS': ['health interview survey'],
 'NHIS': ['nation health interview survey'],
 'UCR': ['uniform crime report'],
 'NLS': ['nation longitudin survey'],
 'MTFS': ['monitor the futur survey'],
 'NHANES': ['nation health and nutrit examin survey'],
 'MFI': ['monetari financi institut'],
 'MIR': ['mfi interest rate'],
 'SHS': ['secur hold statist'],
 'FDI': ['foreign direct invest'],
 'NLTCS': ['nation long term care survey'],
 'NLTC': ['nation long term care'],
 'NHANESI': ['nation health and nutrit examin survey I'],
 'NEFS': ['nhane epidemiolog followup survey'],
 'HANES': ['health and nutrit examin survey'],
 'NHDS': ['nation hospit discharg survey'],
 'ECA': ['epidemiolog catchment area'],
 'ACL': ['american chang live'],
 'LSOA': ['longitudin studi on age'],
 'SOA': ['studi on age'],
 'HES': ['health examin survey'],
 'NMES': ['nation medic expenditur survey'],
 'HHANES': ['hispan health and nutrit examin survey'],
 'EVS': ['european valu survey'],
 'WVS': ['world valu survey'],
 'DAWN': ['drug abus warn network'],
 'NHS': ['nation hospic studi'],
 'AHS': ['adolesc health survey'],
 'CHNS': ['china health nation survey'],
 'CTS': ['commun track survey'],
 'CPES': ['collabor psychiatr epidemiolog survey'],
 'NCS': ['nation comorbid survey'],
 'CLHLS': ['chines longitudin health longev survey'],
 'NEISS': ['nation electron injuri surveil system'],
 'AIP': ['all injuri program'],
 'CPSC': ['consum product safeti commiss'],
 'DATOS': ['drug abus treatment outcom studi'],
 'MID': ['militar interst disput'],
 'CHIP': ['china household incom project'],
 'MCOD': ['multipl caus of death'],
 'CMF': ['compress mortal file'],
 'NDI': ['nation death index'],
 'CPFOA': ['commun partnership for older adult'],
 'BJBC': ['better job better care'],
 'HOS': ['health outcom survey'],
 'MHOS': ['medicar health outcom survey'],
 'CES': ['consum expenditur survey'],
 'TED': ['treatment episod data'],
 'TEDS': ['treatment episod data set'],
 'ASAPS': ['adolesc substanc abus prevent studi'],
 'CAP': ['cooper agreement program'],
 'NCRP': ['nation correct report program'],
 'IHDS': ['india human develop survey'],
 'NVDRS': ['nation violent death report system'],
 'TUS': ['tobacco use supplement'],
 'CDS': ['child develop supplement'],
 'WHO': ['world health organ'],
 'NAMCS': ['nation ambulatori medic care survey'],
 'NHAMCS': ['nation hospit ambulatori medic care survey'],
 'MLS': ['matern lifestyl studi'],
 'CAS': ['colleg alcohol survey'],
 'FFCW': ['fragil famili child wellb'],
 'FFCWS': ['fragil famili child wellb studi'],
 'FFS': ['fragil famili survey'],
 'NPS': ['nation prison statist'],
 'FES': ['famili exchang studi'],
 'CHIPS': ['chines household incom project survey'],
 'NIBRS': ['nation incid base report system'],
 'LEOKA': ['law enforc offic kill and'],
 'NRC': ['nation research council'],
 'CVAR': ['commun vulner and respons'],
 'CWS': ['conting worker supplement'],
 'RWJF': ['robert wood johnson foundat'],
 'CCS': ['cancer control supplement'],
 'MCBS': ['medicar current beneficiari survey'],
 'NHES': ['nation health examin survey'],
 'NCI': ['nation cancer institut'],
 'NMFS': ['nation mortal followback survey'],
 'HELP': ['hospit elderli longitudin project'],
 'SBBS': ['small busi benefit survey'],
 'MHS': ['mental health supplement'],
 'SHR': ['supplementari homicid report'],
 'RLMS': ['russian longitudin monitor survey'],
 'CIC': ['commun in charg'],
 'ECLS': ['earli childhood longitudin studi'],
 'NORC': ['nation opinion research center'],
 'MEPS': ['medic expenditur panel survey'],
 'NNHS': ['nation nurs home survey'],
 'CAG': ['commun advisori group'],
 'AFL': ['activ for life'],
 'ACHS': ['alameda counti health studi'],
 'TCHS': ['tecumseh commun health studi'],
 'NHMS': ['nation health measur studi'],
 'GSS': ['gener social survey'],
 'ATUS': ['american time use survey'],
 'FSS': ['food secur supplement'],
 'IFLS': ['indonesian famili life survey'],
 'ACS': ['american commun survey'],
 'ASOC': ['and sens of control'],
 'NTIES': ['nation treatment improv evalu survey'],
 'LSOG': ['longitudin studi of gener'],
 'ETV': ['exposur to violenc'],
 'MTO': ['move to opportun'],
 'ADAM': ['arreste drug abus monitor'],
 'AHEAD': ['addict health evalu and diseas'],
 'CJDATS': ['crimin justic drug abus treatment studi'],
 'NDATSS': ['nation drug abus treatment system survey'],
 'ODATS': ['outpati drug abus treatment studi'],
 'EASS': ['east asian social survey'],
 'SCS': ['school crime supplement'],
 'HEGIS': ['higher educ gener inform survey'],
 'LHD': ['local health depart'],
 'HRMS': ['health reform monitor survey'],
 'NELS': ['nation educ longitudin studi'],
 'CMGPD': ['china multi gener panel dataset'],
 'SAQ': ['self administ questionnair'],
 'MACC': ['minnesota adolesc commun cohort'],
 'OPII': ['organiz process improv intervent'],
 'DATS': ['drug abus treatment studi'],
 'CID': ['citizenship involv democraci'],
 'FAS': ['flint adolesc studi'],
 'WLS': ['wisconsin longitudin studi'],
 'ICVS': ['intern crime victimis survey'],
 'BTG': ['bridg the gap'],
 'NYPD': ['new york polic depart'],
 'SCI': ['spinal cord injuri'],
 'PPCS': ['polic public contact survey'],
 'CZO': ['critic zone observatori'],
 'BCS': ['british crime survey'],
 'BRFSS': ['behavior risk factor surveil system'],
 'YPP': ['youth participatori polit'],
 'CNES': ['cross nation elect studi']}


In [6]:
# Merge dict2 into myDict: an entry is copied over only when myDict has no
# expansion for that abbreviation yet, so existing myDict entries win.
for k, v in dict2.items():
    myDict.setdefault(k, v)

In [7]:
# Checking if any abbreviation still has multiple expansions.
# The merged dictionary (myDict) is inspected because that is what gets saved;
# looking only at dict2 would miss every entry that came from myDict itself.
for k, v in myDict.items():
    if len(v) > 1:
        print(k)

In [8]:
# Persist the merged abbreviation dictionary as JSON in the working directory
serialized = json.dumps(myDict)
with open('abbreviations.json', 'w') as out_file:
    out_file.write(serialized)