In [1]:
import numpy as np
import requests
import re
from rank_bm25 import BM25Okapi
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from country_list import countries_for_language

In [2]:
"""
    Country list needed later to check if a string is
    a country entity.
"""
# Lower-cased English country names, keyed by whatever key
# countries_for_language pairs with each name (presumably the country code).
countries = {
    code: name.lower()
    for code, name in dict(countries_for_language('en')).items()
}

## Manual copy-paste of all variables listed in the ACTRIS data management plan
link: http://www.actris.eu/Portals/46/Publications/DataCentre/ACTRIS_Data_Management_Plan.pdf 

The variables are listed in Appendix I.

In [3]:
# ACTRIS aerosol particle variables
# In situ aerosol particle variables.
# Names carry no leading/trailing whitespace so that exact string comparison
# against them behaves as expected.
AP_in_situ = [
    "Particle light scattering coefficient",
    "Particle light backscattering coefficient",
    "Particle number size distribution",
    "Particle light absorption coefficient",
    "Particle number concentration",
    "Cloud condensation nuclei number concentration",
    "Hygroscopic growth factor",
    "Particulate organic and elemental carbon mass concentrations (OC/EC)",
    "Particulate size-resolved chemical composition (organic & inorganic size-resolved mass speciation)",
    "Particulate levoglucosan mass concentration",
]
# ACTRIS in situ trace gas variables (summary level).
TG_in_situ = ["NMHCs", "OVOCs", "NO", "NO2", "NOy"]
# ACTRIS aerosol particle variables
# Aerosol remote sensing variables.
# Names carry no leading/trailing whitespace so that exact string comparison
# against them behaves as expected.
AP_remote_sensing = [
    "Aerosol backscatter coefficient profile",
    "Aerosol extinction coefficient profile",
    "Lidar ratio profile",
    "Ångström exponent profile",
    "Backscatter-related Ångström exponent profile",
    "Particle depolarization ratio profile",
    "Particle layer geometrical properties",
    "Particle layer optical properties",
    "Aerosol optical depth",
    "Planetary boundary layer height",
]
# ACTRIS cloud variables

# Cloud remote sensing variables (remote observations from ground)
C_remote_sensing = [
    "cloud/aerosol target classification",
    "drizzle drop size distribution", "drizzle water content", "drizzle water flux",
    "ice water content",
    "liquid water content", "liquid water path",
    "rainrate",
]

# In situ cloud variables
C_in_situ = [
    "Liquid Water Content",
]

# Detailed list of trace gases included in ACTRIS - Alkanes, Alkenes, Alkynes
# Alkanes.
# Names carry no leading/trailing whitespace so that exact string comparison
# against them behaves as expected.
TG_alkanes = [
    "ethane",
    "propane",
    "2-methylpropane",
    "n-butane",
    "2-2-dimethylpropane",
    "2-methylbutane",
    "n-pentane",
    "cyclopentane",
    "methyl-cyclopentane",
    "2-2-dimethylbutane",
    "2-3-dimethylbutane",
    "2-methylpentane",
    "3-methylpentane",
    "cyclohexane",
    "n-hexane",
    "methyl-cyclohexane",
    "2-2-3-trimethylbutane",
    "2-3-dimethylpentane",
    "2-2-dimethylpentane",
    "2-4-dimethylpentane",
    "3-3-dimethylpentane",
    "3-methylhexane",
    "2-methylhexane",
    "n-heptane",
    "2-2-4-trimethylpentane",
    "3-methylheptane",
    "n-octane",
    "n-nonane",
    "n-decane",
    # NOTE(review): "methyl-cyclohexane" already appears above; kept as-is
    # because it may stand in for a different compound in the source
    # document -- TODO confirm against the data management plan.
    "methyl-cyclohexane",
    "n-undecane",
    "n-dodecane",
    "n-tridecane",
    "n-tetradecane",
    "n-pentadecane",
    "n-hexadecane",
]

# Alkenes.
# Every name is its own list element: the separating commas sit outside the
# string literals, otherwise Python concatenates adjacent literals into one.
TG_alkenes = [
    "ethene",
    "propene",
    "trans-2-butene",
    "1-butene",
    "2-methylpropene",
    "cis-2-butene",
    "1-3-butadiene",
    "3-methyl-1-butene",
    "2-methyl-2-butene",
    "trans-2-pentene",
    "cyclopentene",
    "1-pentene",
    "cis-2-pentene",
    "1-hexene",
    "isoprene",
]

# Alkynes
TG_alkynes = [
    "ethyne",
    "propyne",
    "1-butyne",
]

# Detailed list of trace gases included in ACTRIS - OVOCs, Terpenes, Aromatics

# OVOCs
# NOTE(review): "methylethylketon", "methylvinylketon" and "butylacetat" look
# like truncated or non-English spellings; they are kept unchanged here --
# TODO confirm against the data management plan before normalising them.
TG_OVOCs = [
    "methanol",
    "ethanol",
    "isopropanol",
    "n-propanol",
    "n-butanol",
    "methyl-butanol",
    "formaldehyde",
    "acetaldehyde",
    "n-propanal",
    "n-butanal",
    "pentanal",
    "hexanal",
    "heptanal",
    "octanal",
    "decanal",
    "undecanal",
    "benzaldehyde",
    "acrolein",
    "acetone",
    "methylethylketon",
    "methacrolein",
    "methylvinylketon",
    "glyoxal",
    "methylglyoxal",
    "butylacetat",
    "acetonitrile",
]

# Terpenes
TG_terpenes = [
    "alpha-thujene", "tricyclene", "alpha-pinene", "camphene",
    "sabinene", "myrcene", "beta-pinene", "alpha-phellandrene",
    "3-carene", "alpha-terpinene", "m-cymene", "cis-ocimene",
    "p-cymene", "limonene", "beta-phellandrene", "eucalyptol",
    "gamma-terpinene", "terpinolene", "camphor",
]

# Aromatics
TG_aromatics = [
    "benzene", "toluene", "ethylbenzene",
    "m-p-xylene", "o-xylene",
    "1-3-5-trimethylbenzene", "1-2-4-trimethylbenzene", "1-2-3-trimethylbenzene",
]

"""
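    Single flat list of all detailed trace-gas species (alkanes, alkenes,
    alkynes, OVOCs, terpenes and aromatics). The aerosol, cloud and
    summary trace-gas lists above are not included.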
"""
# Concatenation order: alkanes, alkenes, alkynes, OVOCs, terpenes, aromatics.
all_variables = [
    *TG_alkanes,
    *TG_alkenes,
    *TG_alkynes,
    *TG_OVOCs,
    *TG_terpenes,
    *TG_aromatics,
]

## Manually made variable hierarchy, an idea but not used yet in this version
Building this hierarchy by hand is labor-intensive. It may be possible to automate the construction of a variable knowledge base from the infrastructures' metadata schemas.

In [4]:
# Two-level hierarchy: variable family -> sub-category -> list of variable names.
ACTRIS_var_dict = dict(
    aerosol_particles=dict(
        in_situ=AP_in_situ,
        remote_sensing=AP_remote_sensing,
    ),
    cloud_variables=dict(
        remote_sensing=C_remote_sensing,
        in_situ=C_in_situ,
    ),
    trace_gasses=dict(
        alkanes=TG_alkanes,
        alkenes=TG_alkenes,
        alkynes=TG_alkynes,
        ovocs=TG_OVOCs,
        terpenes=TG_terpenes,
        aromatics=TG_aromatics,
    ),
)

In [5]:
def _read_text(path):
    """Return the entire contents of the text file at ``path``.

    The file is opened with the platform default encoding, exactly as a bare
    ``open(path).read()`` would, but the handle is closed as soon as the
    content has been read instead of being left to the garbage collector.
    """
    with open(path) as handle:
        return handle.read()


# aerosol.absorption.coefficient, first result
d1 = _read_text("CH0001G.20070101000000.20181205113000.filter_absorption_photometer.aerosol_absorption_coefficient.aerosol.1y.1h.CH02L_Magee_AE337_JFJ.CH02L_aethalometer_AE337.lev2.nas")

# propene, first result
d2_1 = _read_text("CV0001G.20130701151343.20160823000000.online_gc..air.6mo.1h.CV01L_Agilent_GC-FID_7890A_G3440A.CV01L_Manual_AIR_only.lev0.nas")
d2_2 = _read_text("CV0001G.20140101003925.20160823000000.online_gc..air.1y.1h.CV01L_Agilent_GC-FID_7890A_G3440A.CV01L_Manual_AIR_only.lev0.nas")
d2_3 = _read_text("CV0001G.20150101001139.20160823000000.online_gc..air.1y.1h.CV01L_Agilent_GC-FID_7890A_G3440A.CV01L_Manual_AIR_only.lev0.nas")
d2 = d2_1 + d2_2 + d2_3

# pressure + ACTRIS-INSITU, first result
d3_1 = _read_text("FI0023R.19920101000000.20181114115100.cpc.particle_number_concentration.pm10.1y.1h.FI03L_CPC_VAR_01.FI03L_CPC.lev2.nas")
d3_2 = _read_text("FI0023R.19930101000000.20181114115100.cpc.particle_number_concentration.pm10.1y.1h.FI03L_CPC_VAR_01.FI03L_CPC.lev2.nas")
d3_3 = _read_text("FI0023R.19940101000000.20181114115100.cpc.particle_number_concentration.pm10.1y.1h.FI03L_CPC_VAR_01.FI03L_CPC.lev2.nas")
d3 = d3_1 + d3_2 + d3_3

# 1-pentene, first result
d4 = _read_text("CZ0003R.20150101120000.20170728090000.steel_canister..air.1y.78h.CZ01L_Andersen_instrument.CZ01L_Agilent_VOC_AIR.lev2.nas")

# NOy.concentration, second result
d5_1 = _read_text("FI0096G.19950101000000.20080611000000.chemiluminescence_photolytic.nitrogen_dioxide.air.1y.1h.FI01L_chemilum_96.FI01L_chemilum..nas")
d5_2 = _read_text("FI0096G.19960101000000.20080611000000.chemiluminescence_photolytic.nitrogen_dioxide.air.1y.1h.FI01L_chemilum_96.FI01L_chemilum..nas")
d5_3 = _read_text("FI0096G.19970101000000.20080611000000.chemiluminescence_photolytic.nitrogen_dioxide.air.1y.1h.FI01L_chemilum_96.FI01L_chemilum..nas")
d5 = d5_1 + d5_2 + d5_3

# Each entry pairs a dataset link with the raw text loaded for it; multi-file
# datasets are represented by the concatenation of their files.
docset = [("https://actris.nilu.no/Data/Files/Display/?key=ae0630ef83e44c998b72bbb01b5cdabb", d1),
          ("https://actris.nilu.no/Data/Files/Display/?key=833978ac5be344e6b66d4e5de5e4043b", d2),
          ("https://actris.nilu.no/Data/Files/Display/?key=ec2282cd2ed0492cb6eac39c6a8931a3", d3),
          ("https://actris.nilu.no/Data/Files/Display/?key=7a5e6855f548495fbd3d36f86b29dd79", d4),
          ("https://actris.nilu.no/Data/Files/Display/?key=3fd98d9615084039bfdb17c2640e9997", d5)]

## Preprocessing functions to clean the raw text format

In [6]:
"""
    Checks if a string contains any alphabetic
    characters.
"""
def contains_alpha(string):
    """Return True if at least one character of ``string`` is alphabetic.

    An empty string yields False.
    """
    return any(character.isalpha() for character in string)
    
"""
    Separates raw text strings on ", " (comma followed by a space) and on newlines.
    Removes pure numerics (e.g. 12.35).
    Removes " marks and makes everything lower case.
"""
def clean_document(doc):
    """Split raw document text into cleaned, lower-cased fragments.

    The text is split on ", " (comma followed by a space) and on newlines.
    Fragments without any alphabetic character (empty strings, pure numbers)
    are dropped. Each remaining fragment is lower-cased, has its double-quote
    characters removed and its underscores replaced by spaces.

    Returns the list of cleaned fragments in document order.
    """
    cleaned = []

    for fragment in re.split(', |\n', doc):
        if fragment == '' or not contains_alpha(fragment):
            continue
        cleaned.append(fragment.lower().replace('"', '').replace("_", " "))

    return cleaned

## Fuzzy matching functions used to generalize queries

In [7]:
"""
    Match query words against a vocabulary
    and return top k matches.
"""
def top_k_fuzzy_matches(vocabulary, query_word, k):
    """Return the ``k`` vocabulary entries most similar to ``query_word``.

    Every vocabulary entry is scored with ``fuzz.ratio`` against the query
    word. The result is a list of ``(score, entry)`` tuples sorted in
    descending tuple order (score first, then entry), cut to ``k`` items.
    """
    scored = [(fuzz.ratio(candidate, query_word), candidate)
              for candidate in vocabulary]

    return sorted(scored, reverse=True)[:k]

In [8]:
"""
    Expands an input query by adding fuzzy
    matching words. Fuzzyness defines
    how many of the top k fuzzy matches
    should be used.
"""
def fuzzy_query(vocabulary, query, k):
    """Expand a query with the top ``k`` fuzzy vocabulary matches per word.

    Parameters:
        vocabulary: iterable of strings to match against.
        query: list of query words.
        k: number of best fuzzy matches to consider for each query word.

    Returns:
        A new list holding the original query words followed by every match
        that was not already present. The ``query`` argument itself is left
        untouched.

    Only the words of the original query are expanded. Matches that get added
    are not expanded in turn, so the result cannot cascade through the
    vocabulary when ``k`` is larger than one.
    """
    # Work on a copy: appending to the list being iterated would make the
    # loop also visit (and expand) the freshly added matches.
    expanded = list(query)

    for word in query:
        for _, match in top_k_fuzzy_matches(vocabulary, word, k):
            if match not in expanded:
                expanded.append(match)

    return expanded

## Represent documents by a selection of entities that are common in all datasets 

In [9]:
def make_doc_dict(cleaned_document):
    """Build a dictionary representation of a cleaned document.

    Parameters:
        cleaned_document: list of cleaned text fragments (strings).

    Returns:
        A dict with the keys
          * "country": last single word found that equals a name in the
            module-level ``countries`` mapping, else None. Only single words
            are compared, so multi-word country names are never matched.
          * "station name": token found two positions after the word
            "station" in a fragment containing "name:", else None.
          * "coordinates": dict that may hold "latitude", "longitude" and
            "altitude", each taken two positions after "station" in a
            fragment containing the matching "<key>:" token.
          * "people": second word of every fragment containing the token
            "originator:" or "submitter:".
          * "etc": always an empty list here.
          * "variables": unique words of the LAST fragment of the document
            (arbitrary order), or an empty list for an empty document.

    NOTE(review): "variables" only ever reflected the last fragment, because
    the key was overwritten on every iteration. That behaviour is preserved
    here; presumably the last fragment is a header line naming the measured
    variables -- TODO confirm, or switch to accumulating across fragments.
    """
    document_dict = {
        "country": None,
        "station name": None,
        "coordinates": {},
        "people": [],
        "etc": [],
        "variables": [],
    }

    # Build the lookup set once instead of scanning countries.values() for
    # every single word.
    country_names = set(countries.values())

    for fragment in cleaned_document:
        word_set = fragment.split()

        for i, item in enumerate(word_set):

            if item in country_names:
                document_dict["country"] = item

            elif item == "station":
                # The value is expected two tokens after "station"
                # ("station <key>: <value>"); skip fragments that are too
                # short instead of raising IndexError.
                if i + 2 >= len(word_set):
                    continue
                value = word_set[i + 2]

                if "name:" in word_set:
                    document_dict["station name"] = value

                for axis in ("latitude", "longitude", "altitude"):
                    if axis + ":" in word_set:
                        document_dict["coordinates"][axis] = value

            elif item == "originator:" or item == "submitter:":
                # A fragment consisting of the label alone has no name to record.
                if len(word_set) > 1:
                    document_dict["people"].append(word_set[1])

    if cleaned_document:
        document_dict["variables"] = list(set(cleaned_document[-1].split()))

    return document_dict

In [10]:
"""
    ReRankingSystem class. The idea is to have class instances
    contain specific ranking methods. For now there is only one,
    Okapi BM25.
    
    During initialization the input documents are required to be
    a list of (link, raw text) pairs, where the raw text is one long
    string. This seems to be the most common way datasets are made,
    at least for ACTRIS.
    
    Further work that needs to be done:
        1. Be able to process formats other than raw text (e.g. XML, JSON).
        Not sure if this should be done inside the RRS or maybe as a script
        outside of it.
        
        2. Include more ranking methods. Assuming each ranking method
        requires it's own input format for queries and documents, 
        significant work may have to be done to streamline this.
        
        3. Make a function / functions that process user feedback
        for the learning methods.
        
        4. Increase the test dataset and include data from other
        infrastructures besides ACTRIS.
        
        5. Include ranking method comparison functionality.
        Either as a seperate script or built into the RRS.
        
        6. Make a more sophisticated representation of the
        documents than the current dictionary forms. This
        should use the metadata schema's provided by the
        infrastructures.
        
        7. Make a method to automate document representations
        (such as the dictionaries) from metadata schema's.
"""
class RRS:
    """Re-ranking system that scores a fixed corpus against text queries.

    An instance is defined by three choices made at construction time:
        1. the ranking method (currently only "BM25Okapi" is supported),
        2. how fuzzy queries are: higher ``fuzzyness`` values add more
           fuzzy word matches to each query,
        3. whether documents are ranked on their dictionary representation
           (``make_doc_dict``) or on their cleaned text fragments.

    Attributes:
        fuzzyness: number of fuzzy matches added per query word.
        docs: list of ``(link, cleaned fragments)`` tuples.
        doc_dicts: list of ``(link, document dict)`` tuples.
        vocabulary: unique cleaned fragments over all documents.
        ranker: the ranking model, or None if the requested ranking method
            is not supported.
        ranker_method: internal tag of the active ranker, "None" if absent.
    """

    # Tag stored in ``ranker_method`` once a BM25 Okapi ranker has been built.
    # NOTE(review): spelled with a zero ("BM250kapi"); kept byte-for-byte in
    # case other code compares against this attribute value.
    _BM25_OKAPI_TAG = "BM250kapi"

    def __init__(self, corpus, ranking_method="BM25Okapi", fuzzyness=1, use_dicts=True):
        """Clean the corpus, derive the vocabulary and build the ranker.

        Parameters:
            corpus: sequence of items whose element 0 is the document link
                and whose element 1 is the raw document text.
            ranking_method: name of the ranking model; only "BM25Okapi" is
                recognised. Any other value leaves ``ranker`` as None.
            fuzzyness: number of fuzzy matches added per query word.
            use_dicts: rank on the "variables" entry of the document
                dictionaries when truthy, on the cleaned fragments otherwise.
        """
        self.fuzzyness = fuzzyness

        self.docs = [(d[0], clean_document(d[1])) for d in corpus]
        self.doc_dicts = [(url, make_doc_dict(doc)) for url, doc in self.docs]

        self.ranker = None
        self.ranker_method = "None"

        self.vocabulary = list({fragment for _, doc in self.docs for fragment in doc})

        if ranking_method == "BM25Okapi":
            if use_dicts:
                tokenized_corpus = [doc_dict["variables"] for _, doc_dict in self.doc_dicts]
            else:
                tokenized_corpus = [doc for _, doc in self.docs]

            self.ranker = BM25Okapi(tokenized_corpus)
            self.ranker_method = self._BM25_OKAPI_TAG

    def get_scores(self, query, fuzzy_expansion=True):
        """Score every document against ``query`` with the instance's ranker.

        Parameters:
            query: query string; it is lower-cased and split on whitespace.
            fuzzy_expansion: whether the query words are expanded with fuzzy
                vocabulary matches before scoring.

        Returns:
            Whatever the ranker's ``get_scores`` returns for the query words
            (one score per document).

        Raises:
            ValueError: if no ranker was configured for this instance, i.e.
                the ranking method given at construction is not supported.
        """
        query = query.lower().split()

        if fuzzy_expansion:
            query = fuzzy_query(self.vocabulary, query, self.fuzzyness)
            # Show the expanded query so the effect of the expansion is visible.
            print(query)

        if self.ranker_method == self._BM25_OKAPI_TAG:
            return self.ranker.get_scores(query)

        # Fail with a clear message instead of returning None, which would
        # only surface later as an opaque TypeError in make_ranking.
        raise ValueError(
            "No ranker configured (ranker_method={!r}); "
            "supported ranking_method values: 'BM25Okapi'".format(self.ranker_method)
        )

    def make_ranking(self, query, fuzzy_expansion=True):
        """Rank all documents for ``query``, best match first.

        Parameters:
            query: query string.
            fuzzy_expansion: whether the query words are expanded with fuzzy
                vocabulary matches before scoring.

        Returns:
            List of ``(score, link)`` tuples in descending tuple order, so
            documents with equal scores are ordered by link, descending.

        Raises:
            ValueError: if no ranker was configured for this instance.
        """
        scores = self.get_scores(query, fuzzy_expansion)
        links = (link for link, _ in self.docs)

        return sorted(zip(scores, links), reverse=True)

In [11]:
# Build the re-ranking system over docset, leaving ranking_method, fuzzyness
# and use_dicts at their constructor defaults.
rrs = RRS(corpus=docset)

In [12]:
# Query "aerosol"; the fuzzily expanded query is printed before the ranking.
rrs.make_ranking("aerosol", fuzzy_expansion=True)

['aerosol', 'jaroslav']


[(1.3024784865652848,
  'https://actris.nilu.no/Data/Files/Display/?key=ae0630ef83e44c998b72bbb01b5cdabb'),
 (0.0,
  'https://actris.nilu.no/Data/Files/Display/?key=ec2282cd2ed0492cb6eac39c6a8931a3'),
 (0.0,
  'https://actris.nilu.no/Data/Files/Display/?key=833978ac5be344e6b66d4e5de5e4043b'),
 (0.0,
  'https://actris.nilu.no/Data/Files/Display/?key=7a5e6855f548495fbd3d36f86b29dd79'),
 (0.0,
  'https://actris.nilu.no/Data/Files/Display/?key=3fd98d9615084039bfdb17c2640e9997')]

In [13]:
# Query "c4h10"; the fuzzily expanded query is printed before the ranking.
rrs.make_ranking("c4h10", fuzzy_expansion=True)

['c4h10', 'ch02l']


[(0.35745318440128865,
  'https://actris.nilu.no/Data/Files/Display/?key=833978ac5be344e6b66d4e5de5e4043b'),
 (0.22016675511487616,
  'https://actris.nilu.no/Data/Files/Display/?key=7a5e6855f548495fbd3d36f86b29dd79'),
 (0.0,
  'https://actris.nilu.no/Data/Files/Display/?key=ec2282cd2ed0492cb6eac39c6a8931a3'),
 (0.0,
  'https://actris.nilu.no/Data/Files/Display/?key=ae0630ef83e44c998b72bbb01b5cdabb'),
 (0.0,
  'https://actris.nilu.no/Data/Files/Display/?key=3fd98d9615084039bfdb17c2640e9997')]

In [14]:
# Query "particle"; the fuzzily expanded query is printed before the ranking.
rrs.make_ranking("particle", fuzzy_expansion=True)

['particle', 'martine']


[(1.167116981033096,
  'https://actris.nilu.no/Data/Files/Display/?key=ec2282cd2ed0492cb6eac39c6a8931a3'),
 (0.0,
  'https://actris.nilu.no/Data/Files/Display/?key=ae0630ef83e44c998b72bbb01b5cdabb'),
 (0.0,
  'https://actris.nilu.no/Data/Files/Display/?key=833978ac5be344e6b66d4e5de5e4043b'),
 (0.0,
  'https://actris.nilu.no/Data/Files/Display/?key=7a5e6855f548495fbd3d36f86b29dd79'),
 (0.0,
  'https://actris.nilu.no/Data/Files/Display/?key=3fd98d9615084039bfdb17c2640e9997')]

In [15]:
# Query "ethylbenzene"; the fuzzily expanded query is printed before the ranking.
rrs.make_ranking("ethylbenzene", fuzzy_expansion=True)

['ethylbenzene']


[(0.7188643709634858,
  'https://actris.nilu.no/Data/Files/Display/?key=7a5e6855f548495fbd3d36f86b29dd79'),
 (0.0,
  'https://actris.nilu.no/Data/Files/Display/?key=ec2282cd2ed0492cb6eac39c6a8931a3'),
 (0.0,
  'https://actris.nilu.no/Data/Files/Display/?key=ae0630ef83e44c998b72bbb01b5cdabb'),
 (0.0,
  'https://actris.nilu.no/Data/Files/Display/?key=833978ac5be344e6b66d4e5de5e4043b'),
 (0.0,
  'https://actris.nilu.no/Data/Files/Display/?key=3fd98d9615084039bfdb17c2640e9997')]

In [16]:
# Query "no2"; the fuzzily expanded query is printed before the ranking.
rrs.make_ranking("no2", fuzzy_expansion=True)

['no2', 'no unit']


[(1.473357588301255,
  'https://actris.nilu.no/Data/Files/Display/?key=3fd98d9615084039bfdb17c2640e9997'),
 (0.0,
  'https://actris.nilu.no/Data/Files/Display/?key=ec2282cd2ed0492cb6eac39c6a8931a3'),
 (0.0,
  'https://actris.nilu.no/Data/Files/Display/?key=ae0630ef83e44c998b72bbb01b5cdabb'),
 (0.0,
  'https://actris.nilu.no/Data/Files/Display/?key=833978ac5be344e6b66d4e5de5e4043b'),
 (0.0,
  'https://actris.nilu.no/Data/Files/Display/?key=7a5e6855f548495fbd3d36f86b29dd79')]