This script is to pull from the existing Heliophysics vocabularies to create a database of all terms, whether they are defined, the number of times they are defined, and any gaps

compiling relevant space weather and helio relevant vocabularies [here](https://docs.google.com/spreadsheets/d/15D5tm6TchdZXfwjnDpJMh8ICCsU4C3qGM-amX9YjZkM/edit?usp=sharing)

We will not attempt to bring in all glossaries in this project. The subset we will begin with are: 
- Unified Astronomy Thesaurus
- AGU index terms
- NASA Heliophysics Vocabulary
- SPASE

Stretch glossaries will include: 
- NOAA Space Weather
- ESA Space Weather Glossary
- (Maybe) a very small subset of the AMS Glossary of Meteorology






In [1]:
import os, json
import numpy as np
import pandas as pd

import re

# Compiling Helio Glossaries

In [2]:
directory_vocabs = 'data/'


In [3]:
# Read in existing solar-related terms from UAT
pd_UAT = pd.read_csv(os.path.join(directory_vocabs,'UAT_Solar-related-concepts.csv'))
pd_UAT

Unnamed: 0,level 1,level 2,level 3,level 4,level 5,level 6,level 7,level 8,level 9,level 10,level 11
0,Observational astronomy,Astronomical instrumentation,Observatories,Solar observatories,,,,,,,
1,Observational astronomy,Astronomical instrumentation,Solar instruments,,,,,,,,
2,Observational astronomy,Astronomical instrumentation,Stellar tracking devices,Heliostats,,,,,,,
3,Observational astronomy,Astronomical instrumentation,Telescopes,Optical telescopes,Solar optical telescopes,,,,,,
4,Observational astronomy,Astronomical instrumentation,Telescopes,Radio telescopes,Solar radio telescopes,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
439,Solar system astronomy,Solar system,Space weather,,,,,,,,
440,Solar system astronomy,Solar system,The Moon,,,,,,,,
441,Solar system astronomy,Solar system,The Sun,,,,,,,,
442,Solar system astronomy,Solar system,,,,,,,,,


In [4]:
# Transforming to a new dataframe that can be combined with the other glossaries
list_UAT_terms = []
for c in pd_UAT.columns:
    print(c)
    list_UAT_terms = list_UAT_terms + list(set(pd_UAT[c].values))
    
list_UAT_new = []
for val in list_UAT_terms:
    if isinstance(val,float):
#         list_UAT_terms.remove(val)
        continue
    list_UAT_new = list_UAT_new + [val]
list_UAT_terms = [val.lower() for val in list_UAT_new]

pd_UAT_terms = pd.DataFrame(columns=['term','definition'])
pd_UAT_terms['term'] = list_UAT_terms

pd_UAT_terms['source'] = np.tile('uat',(len(pd_UAT_terms),1))

level 1
level 2
level 3
level 4
level 5
level 6
level 7
level 8
level 9
level 10
level 11


In [34]:
list_UAT_terms

['solar physics',
 'solar system astronomy',
 'observational astronomy',
 'solar particle emission',
 'solar spectral irradiance',
 'solar evolution',
 'solar wind',
 'solar radius',
 'solar magnetic fields',
 'solar mass',
 'solar electromagnetic emission',
 'helioseismology',
 'solar system',
 'solar abundances',
 'solar interior',
 'planetary science',
 'earth-moon system',
 'solar atmosphere',
 'cno anomaly',
 'solar activity',
 'lunar science',
 'sunspots',
 'solar flares',
 'solar radiation',
 'solar oscillations',
 'solar granulation',
 'solar motion',
 'solar surface',
 'astronomical techniques',
 'solar faculae',
 'astronomical instrumentation',
 'solar rotation',
 'natural satellite atmospheres',
 'antapex',
 'solar prominences',
 'lunar phase',
 'kuiper belt',
 'comet origins',
 'optical observation',
 'solar corona',
 'solar convective zone',
 'solar flares',
 'delta sunspots',
 'lunar origin',
 'planetary atmospheres',
 'planetary climates',
 'meteoroids',
 'planetary-disk

In [35]:
pd_UAT_terms

Unnamed: 0,term,definition,source
0,solar physics,,uat
1,solar system astronomy,,uat
2,observational astronomy,,uat
3,solar particle emission,,uat
4,solar spectral irradiance,,uat
...,...,...,...
423,kreutz group,,uat
424,meyer group,,uat
425,gegenschein,,uat
426,oort cloud objects,,uat


In [5]:
# Read in AGU Index Terms
pd_agu = pd.read_excel(os.path.join(directory_vocabs,'agu-index-terms.xlsx'))
pd_agu

# Pull out only the section terms we are interested in
#  - Informatics (1900)
#  - Interplanetary Physics (2100)
#  - Ionosphere (2400)
#  - Magnetosphere (2700)
#  - Space Weather (2101, 2788, 7900, 4305)
#  - Extreme events (1817, 3235, 4313)
#  - Nonlinear Geophysics (3200, 4307, 6944, 7839, 4400)
#  - Radio Science (6900)
#  - Solar Physics, Astrophysics, and Astronomy (7500)
#  - Space Plasma Physics (7800)
#  - (maybe) Mathematical Geophysics (3200)
#  - (maybe) Natural Hazards (4300)

# NOTE: Important to maintain the hierarchy
#  Establish structure that maintains the categories above defines the terms within them (e.g., Space Weather: Geomagnetically induced currents)

# NOTE: removing XX99 terms because they are always 'General or miscellaneous'

# Remove duplicate terms


Unnamed: 0,Code,Description
0,200,GEOHEALTH
1,205,Archaeological Geology
2,210,Coal Geology
3,215,Economic geology
4,216,Engineering geology
...,...,...
1313,9805,Instruments useful in three or more fields
1314,9810,New fields (not classifiable under other headi...
1315,9815,Notices and announcements
1316,9820,Techniques applicable in three or more fields


In [6]:

for r in range(len(pd_agu)): 
    if ~( (pd_agu['Code'][r] >= 1900) & (pd_agu['Code'][r] < 1999) |
         (pd_agu['Code'][r] >= 2100) & (pd_agu['Code'][r] < 2199) |
         (pd_agu['Code'][r] >= 2400) & (pd_agu['Code'][r] < 2499) |
         (pd_agu['Code'][r] >= 2700) & (pd_agu['Code'][r] < 2799) |
         (pd_agu['Code'][r] >= 3200) & (pd_agu['Code'][r] < 3299) |
         (pd_agu['Code'][r] >= 4300) & (pd_agu['Code'][r] < 4399) |
         (pd_agu['Code'][r] >= 6900) & (pd_agu['Code'][r] < 6999) |
         (pd_agu['Code'][r] >= 7500) & (pd_agu['Code'][r] < 7599) |
         (pd_agu['Code'][r] >= 7800) & (pd_agu['Code'][r] < 7899) ):
#         print('Code = {} --> Term = {}'.format(pd_agu['Code'][r],pd_agu['Description'][r]))
        pd_agu = pd_agu.drop([r])

for r in pd_agu.index:
    if '(' in pd_agu['Description'][r]:
        print('prior = {}'.format(pd_agu['Description'][r]))
        pd_agu['Description'][r] = pd_agu['Description'][r][0:pd_agu['Description'][r].find('(')-1]
        print('post = {}'.format(pd_agu['Description'][r]))

pd_agu

prior = Decision analysis (4324, 6309)
post = Decision analysis
prior = Forecasting (2722, 4315, 7924)
post = Forecasting
prior = Machine learning (0555)
post = Machine learning
prior = Modeling (0466, 0545, 0798, 1847, 4255, 4316)
post = Modeling
prior = Real-time and responsive information delivery (4346)
post = Real-time and responsive information delivery
prior = Spatial analysis and representation (0500, 3252)
post = Spatial analysis and representation
prior = Statistical methods: Descriptive (4318)
post = Statistical methods: Descriptive
prior = Statistical methods: Inferential (4318)
post = Statistical methods: Inferential
prior = Temporal analysis and representation (1872, 3270, 4277, 4475)
post = Temporal analysis and representation
prior = Uncertainty (1873, 3275)
post = Uncertainty
prior = Visualization and portrayal (0530)
post = Visualization and portrayal
prior = Coronal mass ejections (4305, 7513)
post = Coronal mass ejections
prior = Discontinuities (7811)
post = Discon

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_agu['Description'][r] = pd_agu['Description'][r][0:pd_agu['Description'][r].find('(')-1]


Unnamed: 0,Code,Description
432,1900,INFORMATICS
433,1902,Community modeling frameworks
434,1904,Community standards
435,1906,"Computational models, algorithms"
436,1908,Cyberinfrastructure
...,...,...
1154,7859,Transport processes
1155,7863,Turbulence
1156,7867,Wave/particle interactions
1157,7868,Wave/wave interactions


In [7]:
# Transforming into a new dataframe that can be combined with other glossaries
pd_agu_terms = pd.DataFrame(columns=['term','definition'])
pd_agu_terms['term'] = pd_agu['Description']
pd_agu_terms

pd_agu_terms['term'] = pd_agu_terms['term'].str.lower()

pd_agu_terms['source'] = np.tile('agu',(len(pd_agu_terms),1))
pd_agu_terms


Unnamed: 0,term,definition,source
432,informatics,,agu
433,community modeling frameworks,,agu
434,community standards,,agu
435,"computational models, algorithms",,agu
436,cyberinfrastructure,,agu
...,...,...,...
1154,transport processes,,agu
1155,turbulence,,agu
1156,wave/particle interactions,,agu
1157,wave/wave interactions,,agu


In [8]:
with open('data/AGU_index_terms_Helio.txt','w') as fp:
    for item in pd_agu_terms['term'].values:
        fp.write(str("{}\n".format(item)))
                 
fp.close()
    

In [9]:
os.path.exists(os.path.join(directory_vocabs,'spase-2.5.0-draft.json'))

True

In [40]:
# Read in SPASE
f = open(os.path.join(directory_vocabs,'spase-2.5.0-draft.json'))
json_spase = json.load(f)
f.close()
# json_spase
json_spase['dictionary']

# pd_spase = pd.read_json(os.path.join(directory_vocabs,'spase-2.5.0-draft.json'))
# pd_spase


{'Absorption': {'version': '2.5.0',
  'since': '1.3.5',
  'term': 'Absorption',
  'type': 'Item',
  'list': '',
  'element': '',
  'attributes': '',
  'definition': 'Decrease of radiant energy (relative to the background continuum spectrum).',
  'usedBy': [],
  'allowedValues': []},
 'AccessInformation': {'version': '2.5.0',
  'since': '1.0.0',
  'term': 'AccessInformation',
  'type': 'Container',
  'list': '',
  'element': '',
  'attributes': '',
  'definition': 'Attributes of the resource which pertain to how to accessing the resource, availability and storage format.',
  'usedBy': ['Catalog', 'DisplayData', 'Document', 'NumericalData'],
  'subElements': ['RepositoryID',
   'Availability',
   'AccessRights',
   'AccessURL',
   'Format',
   'Encoding',
   'DataExtent',
   'Acknowledgement'],
  'allowedValues': []},
 'AccessRights': {'version': '2.5.0',
  'since': '1.0.0',
  'term': 'AccessRights',
  'type': 'Enumeration',
  'list': 'AccessRights',
  'element': '',
  'attributes': '',


In [41]:
json_spase['dictionary']['AccessInformation']

{'version': '2.5.0',
 'since': '1.0.0',
 'term': 'AccessInformation',
 'type': 'Container',
 'list': '',
 'element': '',
 'attributes': '',
 'definition': 'Attributes of the resource which pertain to how to accessing the resource, availability and storage format.',
 'usedBy': ['Catalog', 'DisplayData', 'Document', 'NumericalData'],
 'subElements': ['RepositoryID',
  'Availability',
  'AccessRights',
  'AccessURL',
  'Format',
  'Encoding',
  'DataExtent',
  'Acknowledgement'],
 'allowedValues': []}

In [42]:
for key in json_spase['dictionary']:
    if 

SyntaxError: invalid syntax (<ipython-input-42-8bf50cc6c720>, line 2)

In [43]:

term_list_spase = []
definition_list_spase = []
type_list_spase = []
for key in json_spase['dictionary']:
#     print(key, '->', json_spase['dictionary'][key])
    term_list_spase.append(key)
    definition_list_spase.append(json_spase['dictionary'][key]['definition'])
    type_list_spase.append(json_spase['dictionary'][key]['type'])
        
pd_spase_terms = pd.DataFrame({'term':term_list_spase,'definition':definition_list_spase,'type':type_list_spase})

# separate words in SPASE
for t in range(len(pd_spase_terms['term'])):
    
#     print(re.sub(r'([a-z](?=[A-Z])|[A-Z](?=[A-Z][a-z]))', r'\1 ', spase_pd['term'][t]))
    pd_spase_terms['term'][t] = re.sub(r'([a-z](?=[A-Z])|[A-Z](?=[A-Z][a-z]))', r'\1 ', pd_spase_terms['term'][t])
    
pd_spase_terms

pd_spase_terms['term'] = pd_spase_terms['term'].str.lower()
pd_spase_terms['source'] = np.tile('spase',(len(pd_spase_terms),1))

# Drop all types other than Item and Enumeration
mask_full = ( (pd_spase_terms['type']=='Container') | (pd_spase_terms['type']=='Text') | (pd_spase_terms['type']=='Numeric') | (pd_spase_terms['type']=='Duration') )
pd_spase_terms = pd_spase_terms.drop(pd_spase_terms.index[mask_full])
pd_spase_terms = pd_spase_terms.drop(columns=['type'])

pd_spase_terms

Unnamed: 0,term,definition,source
0,absorption,Decrease of radiant energy (relative to the ba...,spase
2,access rights,Permissions granted or denied by the host of a...,spase
4,ac electric field,Alternating electric field component of a wave.,spase
6,ac magnetic field,Alternating magnetic field component of a wave.,spase
7,active,Exerting an influence or producing a change or...,spase
...,...,...,...
737,white light,Photons with a wavelength in the visible range...,spase
738,white paper,An authoritative report giving information or ...,spase
739,xml,eXtensible Mark-up Language (XML). A structure...,spase
740,x rays,Photons with a wavelength range: 0.001 <= x < ...,spase


In [None]:
"""
I'm adding this here as it might be useful to run prior to, or instead of, the concatenating the frames.
I can also see value in running a similar function AFTER concatenating the frames.  
Obviously you can do what you like with it, or ignore it. :)

Use SequenceMatcher to calcuate a similarity value between two strings

# add to preamble
from difflib import SequenceMatcher

# calculate similarity of two strings
def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

# example
for i in pd_UAT_terms.index:
    for e in pd_spase_terms.index:
        # 90% similarity can generate some signifanct false positives;
        # may want to try 95% first
        if 0.90 <= similar(pd_spase_terms['label'].values[i].lower(), pd_UAT_terms['SWEET label'].values[e].lower()):
            #when I use this, I create separate dfs for matches and singletons
            #but obvs results can sent wherever
            # and example of output dfs (as csv) can be found in the following two SWEET discussion topics:
            #    -https://github.com/ESIPFed/sweet/discussions/267
            #    -https://github.com/ESIPFed/sweet/discussions/268


I'm working on definition comparison metrics (skos:definition text blocks) but that is in progress.
"""

In [45]:
# Compile terms from all glossaries and make a list of the common terms
frames = [pd_UAT_terms,pd_agu_terms,pd_spase_terms,pd_NASAhelio_terms]

pd_terms_total = pd.concat(frames)
pd_terms_total.reset_index(inplace=True)
pd_terms_total = pd_terms_total.drop(columns=['index'])
pd_terms_total


Unnamed: 0,term,definition,source
0,solar physics,,uat
1,solar system astronomy,,uat
2,observational astronomy,,uat
3,solar particle emission,,uat
4,solar spectral irradiance,,uat
...,...,...,...
1313,solar maximum,The time during the 11-year solar cycle when t...,nasa
1314,solar minimum,The time during the 11-year solar cycle when t...,nasa
1315,solar wind,The constant stream of solar coronal material ...,nasa
1316,substorm,A release of magnetic energy that originates i...,nasa


In [46]:
pd_terms_total_unique = pd_terms_total.sort_index().groupby('term').filter(lambda group: len(group) == 1)
pd_terms_total_unique

Unnamed: 0,term,definition,source
0,solar physics,,uat
1,solar system astronomy,,uat
2,observational astronomy,,uat
3,solar particle emission,,uat
4,solar spectral irradiance,,uat
...,...,...,...
1309,radiative zone,"In the radiative zone, energy from the core sl...",nasa
1311,solar energetic particles (sep),"During an eruptive event on the sun, the sun c...",nasa
1313,solar maximum,The time during the 11-year solar cycle when t...,nasa
1314,solar minimum,The time during the 11-year solar cycle when t...,nasa


In [47]:
pd_terms_total_nonunique = pd_terms_total.sort_index().groupby('term').filter(lambda group: len(group) > 1)
pd_terms_total_nonunique

Unnamed: 0,term,definition,source
6,solar wind,,uat
8,solar magnetic fields,,uat
11,helioseismology,,uat
21,sunspots,,uat
22,solar flares,,uat
...,...,...,...
1308,radiation belts,"Two belts of radiation that surround Earth, al...",nasa
1310,solar cycle,The sun goes through 11-year variations or cyc...,nasa
1312,solar flares,A great burst of light and radiation due to th...,nasa
1315,solar wind,The constant stream of solar coronal material ...,nasa


In [None]:
# TODO
# (DONE - June 14, 2022; RMM) Read in NASA Heliophysics Vocabulary 

# Compile terms from all glossaries, normalize the terms, and make a list of the common terms


# Read in NOAA Space Weather Glossary
# Read in ESA Space Weather Glossary
# Read in and subselect AMS Glossary of Meteorology


In [None]:
# Read in NOAA Space Weather Glossary
pd_noaa # Need to read from HTML: https://www.swpc.noaa.gov/content/space-weather-glossary

In [49]:
pd_terms_total_nonunique = pd_terms_total_nonunique.drop_duplicates(subset=['term'])


In [50]:
pd_terms_total_nonunique

Unnamed: 0,term,definition,source
6,solar wind,,uat
8,solar magnetic fields,,uat
11,helioseismology,,uat
21,sunspots,,uat
22,solar flares,,uat
...,...,...,...
733,aurora,An atmospheric phenomenon consisting of bands ...,spase
880,geomagnetic storm,A magnetospheric disturbance typically defined...,spase
989,magnetic field,A region of space near a magnetized body where...,spase
997,magnetosphere,The region of space above the atmosphere or su...,spase


In [51]:
# search for what is in the term list (non-unique)

pd_terms_total_nonunique.loc[pd_terms_total_nonunique['term'].str.contains('iono',case=False)]

Unnamed: 0,term,definition,source
215,planetary ionospheres,,uat
500,ionosphere,,agu


In [52]:
pd_terms_total_unique.loc[pd_terms_total_unique['term'].str.contains('geo',case=False)]

Unnamed: 0,term,definition,source
142,lunar geodesy,,uat
306,lunar geochronology,,uat
393,geocorona,,uat
441,geospatial,,agu
563,mathematical geophysics,,agu
581,geological,,agu
879,geo,Geographic - geocentric corotating - A coordin...,spase
881,geometric factor,A measure of the gathering power of a particl...,spase
1296,geomagnetically induced current (gic),The magnetic fluctuations caused by a geomagne...,nasa


In [53]:
pd_terms_total_unique.to_csv(os.path.join(directory_vocabs,'unique_terms.csv'),index=False)
pd_terms_total_nonunique.to_csv(os.path.join(directory_vocabs,'nonunique_terms.csv'),index=False)


# Exploring ADS synonym augmentation

In [2]:
import numpy as np
import pandas as pd
import os
import math

import urllib.request
import urllib.parse
import re

import matplotlib.pyplot as plt

In [3]:
txt_file = '/Users/ryanmcgranaghan/Documents/Helio_ECIP/dev/Helio-KNOW/ADS_enrichment/data/ads_simple_synonyms.txt'
f = open(txt_file,"r")
txt_data = f.read().split('\n')#.remove('')
f.close()

syns_data = [x.split('=>') for x in txt_data]


In [4]:
pd_syns = pd.DataFrame(syns_data,columns=['words','ADS term'])

In [5]:
pd_syns

Unnamed: 0,words,ADS term
0,"1820-30, 1820-303",1820-30
1,"first, 1st",first
2,"second, 2nd",second
3,"third, 3rd",third
4,"fourth, 4th",fourth
...,...,...
9736,"zt, zts",zt
9737,"zuckerman, zuckermann",zuckerman
9738,"zustandsdiagramm, zustandsdiagramms",zustandsdiagramm
9739,"zwicky, zw",zwicky


In [None]:
for i in range(500,520):
    print('synonyms: {}\n    --> ADS term: {}\n\n'.format(pd_syns.iloc[i].values[0],pd_syns.iloc[i].values[1]))

In [None]:
solar_ads_syns = pd_syns['our' in pd_syns['ADS term']]
solar_ads_syns

In [29]:
mask_term = [str(val).contains('our') for val in pd_syns['ADS term']]

AttributeError: 'str' object has no attribute 'contains'

In [32]:
for val in pd_syns['ADS term']:
#     print(val)
    if 'magnetosphere' in str(val):
        print(val)

 magnetosphere


In [28]:
str(val)

'None'