### Import needed packages

In [1]:
from rdflib import Graph, Literal
from nltk.corpus import wordnet
import pandas as pd
#import numpy as np

### Utilities for interacting with GCMD keywords graph

In [5]:
# load the GCMD keywords graph
def load_gcmd(path="gcmd_keywords.rdf", rdf_format="application/rdf+xml"):
    """Parse the GCMD keywords file into a new rdflib graph.

    Parameters
    ----------
    path : str
        Location of the RDF file to parse. Defaults to ``gcmd_keywords.rdf``
        in the current working directory.
    rdf_format : str
        Serialization format handed to ``Graph.parse``.

    Returns
    -------
    The populated ``Graph`` instance.
    """
    g = Graph()
    g.parse(path, format=rdf_format)
    return g

# query graph to search for term in skos:prefLabel
# return list of terms as strings (should change to return by id)
def simple_label_query(g,term):
    """Return the distinct skos:prefLabel strings that match ``term``.

    ``term`` is used as a case-insensitive regular expression. It is bound
    to the query as a literal through ``initBindings`` instead of being
    pasted into the SPARQL text, so quotes or backslashes in the term cannot
    break or alter the query.

    Parameters
    ----------
    g : graph object exposing ``query`` (the graph built by ``load_gcmd``)
    term : str
        Regular expression to look for in the preferred labels.

    Returns
    -------
    list of str
        One entry per distinct matching label.
    """
    qres = g.query(
    """SELECT DISTINCT (str(?label) as ?strlabel)
       WHERE {
          ?name skos:prefLabel ?label .
          FILTER regex(?label, ?term, "i") .
       }""",
    initBindings={'term': Literal(term)})
    return [str(row[0]) for row in qres]

# query graph to search for term in skos:definition
# return list of terms as strings (should change to return by id)
def simple_definition_query(g,term):
    """Return the prefLabels of concepts whose skos:definition matches ``term``.

    ``term`` is used as a case-insensitive regular expression. It is bound
    to the query as a literal through ``initBindings`` instead of being
    pasted into the SPARQL text, so quotes or backslashes in the term cannot
    break or alter the query.

    Parameters
    ----------
    g : graph object exposing ``query`` (the graph built by ``load_gcmd``)
    term : str
        Regular expression to look for in the definitions.

    Returns
    -------
    list of str
        One entry per distinct preferred label of a matching concept.
    """
    qres = g.query(
    """SELECT DISTINCT (str(?label) as ?strlabel)
       WHERE {
          ?name skos:definition ?definition .
          ?name skos:prefLabel ?label .
          FILTER regex(?definition, ?term, "i") .
       }""",
    initBindings={'term': Literal(term)})
    return [str(row[0]) for row in qres]


### Utilities for constructing a phrase search

Simple, first-pass search algorithm to implement:
 1. take synonyms of all search terms (WordNet--use Wiktionary later)
 2. search for search terms individually in
     1. the prefLabel
     2. the definition
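 3. ranking algorithm (points are summed over all search terms):
       - 1 point for each search term found in the prefLabel
       - 0.3 points for each search term found only in the definition
       - 0.2 points for each search term with a synonym found in the prefLabel
       - 0.1 points for each search term with a synonym found only in the definition
       - a keyword earns at most one synonym bonus per search term (no multiple counting)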

In [11]:
# simple phrase tokenizer
def tokenize_phrase(phrase):
    """Lower-case ``phrase`` and split it into words on whitespace and hyphens."""
    lowered = phrase.lower()
    # Treat every hyphen as a word break, then split on runs of whitespace;
    # leading and trailing whitespace disappears in the split.
    dehyphenated = ' '.join(lowered.split('-'))
    return dehyphenated.split()

# access synonyms via WordNet
# returns a list of strings
def get_synonyms(term):
    """Collect WordNet synonyms of ``term`` in first-seen order.

    Every lemma of every synset returned for ``term`` is considered.
    WordNet writes multi-word lemmas with underscores (``dew_point``); these
    are converted to spaces so they can match space-separated keyword text.
    The term itself and repeated lemmas are dropped, compared without regard
    to case.

    Parameters
    ----------
    term : str
        Word to look up.

    Returns
    -------
    list of str
        Unique synonyms; empty when WordNet has no synsets for ``term``.
    """
    # Seed with the term so it is never reported as its own synonym.
    seen = {term.replace('_', ' ').lower()}
    synonyms = []
    for synset in wordnet.synsets(term):
        for lemma in synset.lemmas():
            name = lemma.name().replace('_', ' ')
            key = name.lower()
            if key not in seen:
                seen.add(key)
                synonyms.append(name)
    return synonyms

# search algorithm, as outlined above
def _collect_scores(g, terms, weights):
    """Sum the rank contributions of ``terms`` and their synonyms per keyword label."""
    label_weight, definition_weight, syn_label_weight, syn_definition_weight = weights
    scores = {}

    def credit(labels, weight):
        for label in labels:
            scores[label] = scores.get(label, 0.0) + weight

    for term in terms:
        # dict.fromkeys de-duplicates while keeping the query order, so a
        # label returned twice by a query is only credited once.
        in_label = dict.fromkeys(simple_label_query(g, term))
        in_definition = dict.fromkeys(simple_definition_query(g, term))
        credit(in_label, label_weight)
        credit([label for label in in_definition if label not in in_label],
               definition_weight)

        # Pool the hits of the whole synonym group before scoring: a keyword
        # gets one bonus per term, and a label hit always wins over a
        # definition hit regardless of the order the synonyms come in.
        syn_in_label = {}
        syn_in_definition = {}
        for syn in get_synonyms(term):
            syn_in_label.update(dict.fromkeys(simple_label_query(g, syn)))
            syn_in_definition.update(dict.fromkeys(simple_definition_query(g, syn)))
        credit(syn_in_label, syn_label_weight)
        credit([label for label in syn_in_definition if label not in syn_in_label],
               syn_definition_weight)
    return scores


def search(g, phrase, min_rank=0.5, weights=(1.0, 0.3, 0.2, 0.1)):
    """Rank GCMD keywords against ``phrase`` and print the best matches.

    The phrase is tokenized; each distinct token is looked up in the keyword
    prefLabels and definitions, and so is each of its WordNet synonyms.
    Keywords are printed as ``LABEL, rank`` in descending rank order; ties
    keep the order in which the keywords were first matched.

    Parameters
    ----------
    g : graph searched by ``simple_label_query`` / ``simple_definition_query``
    phrase : str
        Free-text search phrase.
    min_rank : float
        Only keywords with at least this rank are printed.
    weights : tuple of four numbers
        Points awarded per search term for, in order: term in prefLabel,
        term only in definition, any synonym in prefLabel, any synonym only
        in definition.

    Returns
    -------
    None. Results are written to standard output.
    """
    print('Searching for phrase ... ')
    # A token repeated in the phrase must not be scored twice.
    terms = list(dict.fromkeys(tokenize_phrase(phrase)))

    scores = _collect_scores(g, terms, weights)

    # Rounding removes float noise (e.g. 0.1 + 0.2) before the threshold test
    # and before printing. sorted() is stable, so ties stay in match order.
    ranked = sorted(
        ((label, round(score, 6)) for label, score in scores.items()),
        key=lambda item: item[1],
        reverse=True,
    )

    print('The top search results for {} are:'.format(phrase))
    for label, rank in ranked:
        if rank < min_rank:
            break
        print('{}, {}'.format(label, rank))

### Play around with results.

In [7]:
# Parse the keyword file once; the search cells below all reuse this graph.
g = load_gcmd()

In [12]:
# Three-word query; needs `g` from the load_gcmd() cell.
phrase = 'dew point temperature'
search(g,phrase)

Searching for phrase ... 
The top search results for dew point temperature are:
DEW POINT TEMPERATURE, 3.0
DEW POINT DEPRESSION, 2.4
DEWPOINT DEPRESSION, 2.4
TEMPERATURE TENDENCY, 1.4
LAND SURFACE TEMPERATURE, 1.3
TEMPERATURE-HUMIDITY INDEX, 1.1
UPPER AIR TEMPERATURE, 1.1
TEMPERATURE GRADIENT, 1.1
ATMOSPHERIC TEMPERATURE, 1.1
TEMPERATURE ANOMALIES, 1.1
SEA SURFACE SUBSKIN TEMPERATURE, 1.1
SKIN TEMPERATURE, 1.1
AIR TEMPERATURE, 1.1
RESIDENTIAL ENERGY DEMAND TEMPERATURE INDEX, 1.1
24 HOUR MAXIMUM TEMPERATURE, 1.1
24 HOUR MINIMUM TEMPERATURE, 1.1
BRIGHTNESS TEMPERATURE, 1.1
SOIL TEMPERATURE, 1.1
6 HOUR MAXIMUM TEMPERATURE, 1.1
SEA SURFACE FOUNDATION TEMPERATURE, 1.1
6 HOUR MINIMUM TEMPERATURE, 1.1
SEA SURFACE TEMPERATURE, 1.0
AIR TEMPERATURE RECONSTRUCTION, 1.0
TEMPERATURE GRADIENT RATE, 1.0
PARTICLE TEMPERATURE, 1.0
CLOUD BASE TEMPERATURE, 1.0
ANTENNA TEMPERATURE, 1.0
CLOUD MIDLAYER TEMPERATURE, 1.0
TOTAL TEMPERATURE, 1.0
SINK TEMPERATURE, 1.0
HIGHER MINIMUM NIGHTTIME TEMPERATURES, 1.0
C

In [13]:
# Two-word query; needs `g` from the load_gcmd() cell.
phrase = 'relative humidity'
search(g,phrase)

Searching for phrase ... 
The top search results for relative humidity are:
RELATIVE HUMIDITY, 2.0
STORM RELATIVE WINDS, 1.0
SPECIFIC HUMIDITY, 1.0
TEMPERATURE-HUMIDITY INDEX, 1.0
HUMIDITY, 1.0
HUMIDITY MIXING RATIO, 1.0
HUMIDITY INDICES, 1.0
HUMIDITY INDEX, 1.0
ABSOLUTE HUMIDITY, 1.0
SATURATION SPECIFIC HUMIDITY, 1.0
SUPERSATURATION, 0.6
DROUGHT, 0.6
FIRE WEATHER INDEX, 0.6
HEAT INDEX, 0.6


In [14]:
# Single-word query; needs `g` from the load_gcmd() cell.
phrase = 'airspeed'
search(g,phrase)

Searching for phrase ... 
The top search results for airspeed are:
AIRSPEED/GROUND SPEED, 1.0


In [15]:
# Single-word query; needs `g` from the load_gcmd() cell.
phrase = 'precipitation'
search(g,phrase)

Searching for phrase ... 
The top search results for precipitation are:
EXTREME PRECIPITATION, 1.0
PRECIPITATION PROFILES, 1.0
PRECIPITATION TRENDS, 1.0
PRECIPITATION VARIABILITY, 1.0
PRECIPITATION, 1.0
CENTRAL INDIAN PRECIPITATION INDEX, 1.0
12 HOUR PRECIPITATION AMOUNT, 1.0
24 HOUR PRECIPITATION AMOUNT, 1.0
HOURLY PRECIPITATION AMOUNT, 1.0
ATMOSPHERIC PRECIPITATION INDICES, 1.0
PRECIPITATION RATE, 1.0
STANDARDIZED PRECIPITATION INDEX, 1.0
PRECIPITATION ANOMALIES, 1.0
PRECIPITATION INDICATORS, 1.0
TOTAL SURFACE PRECIPITATION RATE, 1.0
SOLID PRECIPITATION, 1.0
LIQUID PRECIPITATION, 1.0
LIQUID SURFACE PRECIPITATION RATE, 1.0
PRECIPITATION INDICES, 1.0
WEIGHTED ANOMALY STANDARDIZED PRECIPITATION INDEX, 1.0
ACCUMULATIVE CONVECTIVE PRECIPITATION, 1.0
ENSO PRECIPITATION INDEX, 1.0
CONVECTIVE SURFACE PRECIPITATION RATE, 1.0
DROUGHT/PRECIPITATION RECONSTRUCTION, 1.0
PRECIPITATION AMOUNT, 1.0
3 AND 6 HOUR PRECIPITATION AMOUNT, 1.0


In [16]:
# Two-word query; needs `g` from the load_gcmd() cell.
phrase = 'ground velocity'
search(g,phrase)

Searching for phrase ... 
The top search results for ground velocity are:
AIRSPEED/GROUND SPEED, 1.6
HORIZONTAL WIND VELOCITY/SPEED, 1.2
WIND VELOCITY/SPEED PROFILES, 1.2
VERTICAL WIND VELOCITY/SPEED, 1.2
VELOCITY AZIMUTH DISPLAY VERTICAL WIND PROFILES, 1.1
ORBIT VELOCITY, 1.1
ACOUSTIC VELOCITY, 1.1
VELOCITY, 1.1
GROUND WATER FEATURES, 1.1
FLOW VELOCITY, 1.1
BACKGROUND INFORMATION, 1.1
GROUND WATER RECONSTRUCTION, 1.1
FROZEN GROUND, 1.1
GROUND ICE, 1.1
GROUNDWATER CHEMISTRY, 1.0
ICE VELOCITY, 1.0
MEAN RADIAL VELOCITY, 1.0
LINE OF SIGHT VELOCITY, 1.0
SOLAR VELOCITY FIELDS, 1.0
GROUND WATER, 1.0
GROUNDWATER MANAGEMENT, 1.0
DOPPLER VELOCITY, 1.0
SEASONALLY FROZEN GROUND, 1.0
GROUND WATER PROCESSES/MEASUREMENTS, 1.0
CORONAL MASS EJECTION, 0.5
KARST LANDFORMS, 0.5
TSUNAMIS, 0.5
ESTUARINE WETLANDS, 0.5
KARST LANDSCAPE, 0.5
LAND SURFACE TEMPERATURE, 0.5
LAND SUBSIDENCE, 0.5
WIND SPEED, 0.5
BASE FLOW, 0.5


In [17]:
# Two-word query; needs `g` from the load_gcmd() cell.
phrase = 'elastic deformation'
search(g,phrase)

Searching for phrase ... 
The top search results for elastic deformation are:
ICE DEFORMATION, 1
OCEAN CRUST DEFORMATION, 1
