In [44]:
import json
import re
import datefinder
from urllib.parse import urlparse

class HorizonParser():
    
    def __init__(self, dmp_title, dmp_description, dmp_created, dmp_modified, dmp_id, pi_name, pi_mail, pi_orcid):
        self.setup_dmp(dmp_title, dmp_description, dmp_created, dmp_modified, dmp_id, pi_name, pi_mail, pi_orcid)
    
    def setup_dmp(self, dmp_title, dmp_description, dmp_created, dmp_modified, dmp_id, pi_name, pi_mail, pi_orcid):
        self.dmp = {
            "dmp":{
                "title": dmp_title,
                "description":"Abstract::"+dmp_description,
                "created":dmp_created,
                "modified":dmp_modified,
                "dmp_id": {
                    "dmp_id": dmp_id,
                    "dmp_id_type": "HTTP-DOI"
                },
                "contact":{
                    "name": pi_name,
                    "mail": pi_mail,
                    "contact_id": {
                        "contact_id": pi_orcid,
                        "contact_id_type": "HTTP-ORCID"
                    }
                },
                "project": {}
            }
        }
    
    def __split_text_based_on_title(self, text):
        rep = {}
        for ds in self.dmp["dmp"]["dataset"]:
            title = ds["title"].lower()
            rep[title]=";"+title

        rep = dict((re.escape(k), v) for k, v in rep.items()) 
        pattern = re.compile("|".join(rep.keys()))
        text = pattern.sub(lambda m: rep[re.escape(m.group(0))], text.lower()).split(";")
        return text
    
    def parse_question_1_1(self, text):
        if "description" in self.dmp["dmp"]["project"]:
            text=";State the purpose of the data collection/generation::"+text
            self.dmp["dmp"]["project"]["description"] += text
        else:
            text="State the purpose of the data collection/generation::"+text
            self.dmp["dmp"]["project"]["description"] = text
        return text
    
    def parse_question_1_2(self, text):
        if "description" in self.dmp["dmp"]:
            text=";Explain the relation to the objectives of the project::"+text
            self.dmp["dmp"]["description"] += text
        else:
            text="Explain the relation to the objectives of the project::"+text
            self.dmp["dmp"]["description"] = text
        return text
    
    def parse_question_1_3(self, text):
        rep = {"Title:": ";Title:", "Description:": ";Description:","Type:": ";Type:","Format:": ";Format:", "Source:":";Source:"} # define desired replacements here

        rep_lower = {}
        for k, v in rep.items():
            rep_lower[k.lower()]=v
        rep.update(rep_lower)    

        rep = dict((re.escape(k), v) for k, v in rep.items()) 
        pattern = re.compile("|".join(rep.keys()))
        text = pattern.sub(lambda m: rep[re.escape(m.group(0))], text)

        datasets = []
        current_dataset = None
        current_format = None
        for line in text.split(";"):
            line = line.replace("\n","")
            if line.startswith("Title:"):

                line = line[7:] if line[6:].startswith(" ") else line[6:]

                if current_dataset != None:
                    if "distribution" not in current_dataset:
                        current_dataset["distribution"] = []
                        current_dataset["distribution"].append({"title":"Project"})
                    else:
                        if current_format != None:
                            for ds in current_dataset["distribution"]:
                                ds["format"] = current_format
                            current_format = None
                    current_dataset["metadata"] = []
                    current_dataset["technical_resource"] = []
                    datasets.append(current_dataset)
                current_dataset = {"title":line}

            if line.startswith("Description:"):
                if current_dataset != None:
                    line = line[13:] if line[12:].startswith(" ") else line[12:]
                    current_dataset["description"]=line

            if line.startswith("Type:"):
                if current_dataset != None:
                    line = line[6:] if line[5:].startswith(" ") else line[5:]
                    current_dataset["type"]=line

            if line.startswith("Format:"):
                if current_dataset != None:
                    line = line[8:] if line[7:].startswith(" ") else line[7:]
                    current_format = line

            if line.startswith("Source:"):
                if current_dataset != None:
                    line = line[8:] if line[7:].startswith(" ") else line[7:]   
                    if line.lower() == "input":
                        current_dataset["distribution"] = []
                        current_dataset["distribution"].append({"title":"Origin"})
                        current_dataset["distribution"].append({"title":"Project"})
                    elif line.lower() == "produced":
                        current_dataset["distribution"] = []
                        current_dataset["distribution"].append({"title":"Project"})

        if current_dataset != None:
            if "distribution" not in current_dataset:
                current_dataset["distribution"] = []
                current_dataset["distribution"].append({"title":"Project"})
            else:
                if current_format != None:
                    for ds in current_dataset["distribution"]:
                        ds["format"] = current_format
                    current_format = None
            current_dataset["metadata"] = []
            current_dataset["technical_resource"] = []
            datasets.append(current_dataset)
        
        self.dmp["dmp"]["dataset"]=datasets
        return datasets
    
    def parse_question_1_4(self, text):
        ret_val = {}
        for ds in self.dmp["dmp"]["dataset"]:
            if ds["title"].lower() in text.lower():
                ds["keyword"] = "re-used"
                ret_val[ds["title"]] = "re-used"
            else:
                ds["keyword"] = "generated"
                ret_val[ds["title"]] = "generated"
        return ret_val
    
    def parse_question_1_5(self, text):
        ret_val = {}
        rep = {}
        for ds in self.dmp["dmp"]["dataset"]:
            
            # check if "input"
            bIsInput = False
            for dist in ds["distribution"]:
                if dist["title"] == "Origin":
                    bIsInput = True
                    break
                    
            
            if bIsInput:
                title = ds["title"].lower()
                rep[title]=";"+title

        rep = dict((re.escape(k), v) for k, v in rep.items()) 
        pattern = re.compile("|".join(rep.keys()))
        text = pattern.sub(lambda m: rep[re.escape(m.group(0))], text.lower()).split(";")

        for ds in self.dmp["dmp"]["dataset"]:
            ret_val[ds["title"]] = []
            for dist in ds["distribution"]:
                if dist["title"] == "Origin":
                    for line in text:
                        if line.startswith(ds["title"].lower()):
                            url = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', line)
                            if len(url) >= 1:
                                url = url[0]
                                if url.endswith(".") or url.endswith(",") or url.endswith(")"):
                                    url = url[:-1]
                                ds["dataset_id"] = {
                                    "dataset_id": url,
                                    "dataset_id_type": "HTTP-URI"
                                }
                                dist["access_url"]=url
                                ret_val[ds["title"]].append({
                                    "dataset_id": url,
                                    "dataset_id_type": "HTTP-URI",
                                    "access_url": url
                                })
                            else:
                                ds["dataset_id"] = {
                                    "dataset_id": ds["title"],
                                    "dataset_id_type": "custom"
                                }
                                ret_val[ds["title"]].append({
                                    "dataset_id": ds["title"],
                                    "dataset_id_type": "custom"
                                })
                            break
                else:
                    if "dataset_id" not in ds:
                        ds["dataset_id"] = {
                            "dataset_id": ds["title"],
                            "dataset_id_type": "custom"
                        }
                        ret_val[ds["title"]].append({
                            "dataset_id": ds["title"],
                            "dataset_id_type": "custom"
                        })
        return ret_val
 
    def parse_question_1_6(self, text):
        ret_val = {}
        order = ['b', 'kb', 'mb', 'gb', 'tb', 'pb']
       
        text = self.__split_text_based_on_title(text)

        for ds in self.dmp["dmp"]["dataset"]:
            for line in text:
                if line.startswith(ds["title"].lower()):
                    regex1  = re.compile(r'(\d+(?:\.\d+)?)\s*([kmgtp]?b)', re.IGNORECASE)
                    regex2 = re.compile(r'(\d+(?:\.\d+)?)',  re.IGNORECASE)
                    size1 = regex1.findall(line)
                    size2 = regex2.findall(line)
                    if len(size1) >= 1:
                        size1 = size1[0]
                        size1 = int(float(size1[0]) * (1024**order.index(size1[1].lower())))
                        
                        for dist in ds["distribution"]:
                            dist["byte_size"]=size1
                        ret_val[ds["title"]] = size1
                    elif len(size2) >= 1:
                        size2 = size2[0]                        
                        for dist in ds["distribution"]:
                            dist["byte_size"]=size2
                        ret_val[ds["title"]] = size2
        return ret_val

    def parse_question_1_7(self, text):
        if "description" in self.dmp["dmp"]:
            text=";Outline the data utility: to whom will it be useful::"+text
            self.dmp["dmp"]["description"] += text
        else:
            text="Outline the data utility: to whom will it be useful::"+text
            self.dmp["dmp"]["description"] = text
        return text
    
    def parse_question_2_1_1(self, text):
        ret_val = []       
        text = self.__split_text_based_on_title(text)

        for ds in self.dmp["dmp"]["dataset"]:
            for line in text:
                if line.startswith(ds["title"].lower()):   
                    
                    url = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', line)
                    filename = re.findall('[\w\d\-.\/:]+\.\w+', line)
                    if len(url) >= 1:
                        url = url[0] 
                        if url.endswith(".") or url.endswith(",")  or url.endswith(")"):
                            url = url[:-1]
                        ds["metadata"].append({
                            "description":"Dataset Metadata",
                            "language":"en",
                            "metadata_id": {
                                "metadata_id": url,
                                "metadata_id_type": "HTTP-URI"
                            }
                        })
                        ret_val.append({
                            "dataset": ds["title"],
                            "description":"Dataset Metadata",
                            "language":"en",
                            "metadata_id": {
                                "metadata_id": url,
                                "metadata_id_type": "HTTP-URI"
                            }
                        })
                    if len(filename) >= 1:
                        new_names = []
                        for f in filename:
                            if not f.startswith("http"):
                                new_names.append(f)
                        filename = new_names[0] 
                        ds["metadata"].append({
                            "description":"Dataset Metadata",
                            "language":"en",
                            "metadata_id": {
                                "metadata_id": filename,
                                "metadata_id_type": "custom"
                            }
                        })
                        ret_val.append({
                            "dataset": ds["title"],
                            "description":"Dataset Metadata",
                            "language":"en",
                            "metadata_id": {
                                "metadata_id": filename,
                                "metadata_id_type": "custom"
                            }
                        })
        return ret_val
    
    def parse_question_2_1_3(self, text):
        ret_val = []
        urls = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)
        filenames = re.findall('[\w\d\-.\/:]+\.\w+', text)
        
        for url in urls:
            if url.endswith(".") or url.endswith(",") or url.endswith(")"):
                url = url[:-1]
            for ds in self.dmp["dmp"]["dataset"]:
                ds["metadata"].append({
                    "description":"Naming Conventions",
                    "language":"en",
                    "metadata_id": {
                        "metadata_id": url,
                        "metadata_id_type": "HTTP-URI"
                    }
                })
                ret_val.append({
                    "dataset": ds["title"],
                    "description":"Naming Conventions",
                    "language":"en",
                    "metadata_id": {
                        "metadata_id": url,
                        "metadata_id_type": "HTTP-URI"
                    }
                })
            
        for filename in filenames:
            if not filename.startswith("http"):
                for ds in self.dmp["dmp"]["dataset"]:
                    ds["metadata"].append({
                        "description":"Naming Conventions",
                        "language":"en",
                        "metadata_id": {
                            "metadata_id": filename,
                            "metadata_id_type": "custom"
                        }
                    })
                    ret_val.append({
                        "dataset": ds["title"],
                        "description":"Naming Conventions",
                        "language":"en",
                        "metadata_id": {
                            "metadata_id": filename,
                            "metadata_id_type": "custom"
                        }
                    })
        return ret_val
    
    def parse_question_2_1_4(self, text):
        if "description" in self.dmp["dmp"]:
            text=";Outline the approach towards search keyword::"+text
            self.dmp["dmp"]["description"] += text
        else:
            text="Outline the approach towards search keyword ::"+text
            self.dmp["dmp"]["description"] = text
        return text
    
    def parse_question_2_1_5(self, text):
        ret_val = []        
        keywords = ["Endevor", "AccuRev SCM", "ClearCase", "Dimensions CM", "IC Manage", "PTC Integrity", "PVCS", "Rational Team Concert", "SCM Anywhere", "StarTeam", "Subversion", "SVN", "Surround SCM", "Vault", "Perforce Helix Core", "Synergy", "Plastic SCM", "Azure DevOps", "BitKeeper", "Code Co-op", "darcs", "Fossil", "Git", "Mercurial", "Monotone", "Pijul", "GNU Bazaar", "Revision Control System", "Source Code Control System", "Team Foundation Server"]
        text = text.lower()
        for keyword in keywords:
            keyword = keyword.lower()
            if keyword in text:
                for ds in self.dmp["dmp"]["dataset"]:
                    for dist in ds["distribution"]:
                        if dist["title"] == "Project":
                            dist["host"] = {
                                "title": keyword,
                                "supports_versioning": "yes"
                            }
                            ret_val.append({
                                "dataset":ds["title"],
                                "supports_versioning": "yes",
                                "title": keyword
                            })
        return ret_val

    def parse_question_2_1_6(self, text):
        ret_val = []
        urls = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)
        filenames = re.findall('[\w\d\-.\/:]+\.\w+', text)
        
        for url in urls:
            if url.endswith(".") or url.endswith(",") or url.endswith(")"):
                url = url[:-1]
            for ds in self.dmp["dmp"]["dataset"]:
                ds["metadata"].append({
                    "description":"Metadata Creation",
                    "language":"en",
                    "metadata_id": {
                        "metadata_id": url,
                        "metadata_id_type": "HTTP-URI"
                    }
                })
                ret_val.append({
                    "dataset": ds["title"],
                    "description":"Metadata Creation",
                    "language":"en",
                    "metadata_id": {
                        "metadata_id": url,
                        "metadata_id_type": "HTTP-URI"
                    }
                })
            
        for filename in filenames:
            if not filename.startswith("http"):
                for ds in self.dmp["dmp"]["dataset"]:
                    ds["metadata"].append({
                        "description":"Metadata Creation",
                        "language":"en",
                        "metadata_id": {
                            "metadata_id": filename,
                            "metadata_id_type": "custom"
                        }
                    })
                    ret_val.append({
                        "dataset": ds["title"],
                        "description":"Metadata Creation",
                        "language":"en",
                        "metadata_id": {
                            "metadata_id": filename,
                            "metadata_id_type": "custom"
                        }
                    })
        return ret_val
    
    def parse_question_2_2_1(self, text):
        ret_val = {}     
        text = self.__split_text_based_on_title(text)

        for ds in self.dmp["dmp"]["dataset"]:
            for line in text:
                if line.startswith(ds["title"].lower()):   
                    
                    state = "closed"
                    if "open" in line.lower():
                        state = "open"
                    if "closed" in line.lower():
                        state = "closed"
                    
                    for dist in ds["distribution"]:
                        dist["data_access"] = state
                        ret_val[ds["title"]] = state
        return ret_val
    
    def parse_question_2_2_2(self, text):
        ret_val = []
        urls = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)
        if len(urls) >= 1:
            url = urls[0]
            if url.endswith(".") or url.endswith(",") or url.endswith(")"):
                url = url[:-1]
            for ds in self.dmp["dmp"]["dataset"]:
                for dist in ds["distribution"]:
                    if dist["title"] == "Project":
                        dist["access_url"] = url
                        tld = re.findall('[^.]*\.[^.]{2,3}(?:\.[^.]{2,3})?$', url)
                        tld = urlparse(url).netloc
                        dist["host"]["title"] = tld
                        ret_val.append({"access_url": url, "host":{"title":tld}})
        return ret_val
    
    def parse_question_2_2_3(self, text):
        ret_val = []
        for ds in self.dmp["dmp"]["dataset"]:
            ds["technical_resource"].append({
                "description": text,
                "technical_resource_id":{
                    "technical_resource_id": "software_tools",
                    "technical_resource_id_type": "custom"
                }
            })
            ret_val.append({
                "dataset": ds["title"],
                "description": text,
                "technical_resource_id":{
                    "technical_resource_id": "software_tools",
                    "technical_resource_id_type": "custom"
                }
            })
            
        return ret_val
    
    def parse_question_2_2_4(self, text):
        return self.parse_question_2_2_2(text)
    
    def parse_question_2_2_5(self, text):
        ret_val = []
        for ds in self.dmp["dmp"]["dataset"]:
            ds["metadata"].append({
                "description": text,
                "metadata_id":{
                    "metadata_id": "restrictions",
                    "metadata_id_type": "custom"
                }
            })
            ret_val.append({
                "dataset": ds["title"],
                "description": text,
                "metadata_id":{
                    "metadata_id": "restrictions",
                    "metadata_id_type": "custom"
                }
            })
            
        return ret_val
    
    def parse_question_2_3_1(self, text):
        ret_val = []
        for ds in self.dmp["dmp"]["dataset"]:
            ds["technical_resource"].append({
                "description": text,
                "technical_resource_id":{
                    "technical_resource_id": "interoperability_standards",
                    "technical_resource_id_type": "custom"
                }
            })
            ret_val.append({
                "dataset": ds["title"],
                "description": text,
                "technical_resource_id":{
                    "technical_resource_id": "interoperability_standards",
                    "technical_resource_id_type": "custom"
                }
            })
            
        return ret_val
    
    def parse_question_2_3_2(self, text):
        ret_val = []
        for ds in self.dmp["dmp"]["dataset"]:
            ds["technical_resource"].append({
                "description": text,
                "technical_resource_id":{
                    "technical_resource_id": "vocabulary_standards",
                    "technical_resource_id_type": "custom"
                }
            })
            ret_val.append({
                "dataset": ds["title"],
                "description": text,
                "technical_resource_id":{
                    "technical_resource_id": "vocabulary_standards",
                    "technical_resource_id_type": "custom"
                }
            })
            
        return ret_val
    
    def parse_question_2_4_1(self, text):
        ret_val = []
        urls = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)
        if len(urls) >= 1:
            url = urls[0]
            if url.endswith(".") or url.endswith(",") or url.endswith(")"):
                url = url[:-1]
            for ds in self.dmp["dmp"]["dataset"]:
                for dist in ds["distribution"]:
                    if dist["title"] == "Project":
                        dist["host"]["license"] = {
                            "license_ref": url
                        }
                        ret_val.append({
                            "dataset": ds["title"],
                            "license_ref": url
                        })
        return ret_val
    
    def parse_question_2_4_2(self, text):
        ret_val = []        
        dates = list(datefinder.find_dates(text))
        if len(dates) >= 1:
            date = dates[0]
            for ds in self.dmp["dmp"]["dataset"]:
                for dist in ds["distribution"]:
                    if dist["title"] == "Project":
                        if "license" in dist["host"]:
                            dist["host"]["license"]["start_date"] = date.strftime("%Y-%d-%m")
                        else:                          
                            dist["host"]["license"] = {
                                "start_date": date.strftime("%d-%m-%Y")
                            }
                        ret_val.append({
                            "dataset": ds["title"],
                            "start_date": date.strftime("%d-%m-%Y")
                        })
        else:
            for ds in self.dmp["dmp"]["dataset"]:
                for dist in ds["distribution"]:
                    if dist["title"] == "Project":
                        if "license" in dist["host"]:
                            dist["host"]["license"]["start_date"] = self.dmp["dmp"]["created"]
                        else:                          
                            dist["host"]["license"] = {
                                "start_date": self.dmp["dmp"]["created"]
                            }
                        ret_val.append({
                            "dataset": ds["title"],
                            "start_date": self.dmp["dmp"]["created"]
                        })  
        return ret_val
    
    def parse_question_2_4_3(self, text):
        ret_val = []
        for ds in self.dmp["dmp"]["dataset"]:
            ds["metadata"].append({
                "description": text,
                "metadata_id":{
                    "metadata_id": "third_party_usability",
                    "metadata_id_type": "custom"
                }
            })
            ret_val.append({
                "dataset": ds["title"],
                "description": text,
                "metadata_id":{
                    "metadata_id": "third_party_usability",
                    "metadata_id_type": "custom"
                }
            })
            
        return ret_val 
    
    def parse_question_2_4_4(self, text):
        ret_val = {}
        for ds in self.dmp["dmp"]["dataset"]:
            ds["data_quality_assurance"] = text
            ret_val[ds["title"]] = text
        return ret_val
    
    def parse_question_2_4_5(self, text):
        ret_val = []        
        dates = list(datefinder.find_dates(text))
        if len(dates) >= 1:
            date = dates[0]
            for ds in self.dmp["dmp"]["dataset"]:
                for dist in ds["distribution"]:
                    if dist["title"] == "Project":
                        if "license" in dist["host"]:
                            dist["available_till"] = date.strftime("%Y-%d-%m")
                        ret_val.append({
                            "dataset": ds["title"],
                            "available_till": date.strftime("%d-%m-%Y")
                        })
        return ret_val
    
    def parse_question_3_1(self, text):
        cost = {
            "title": "Costs for making your data FAIR",
            "description": text
        }
        prices = re.findall('(\$|€|EUR|USD)(\d*[.|,]*\d*)', text)        
        if len(prices) >= 1:
            price = prices[0]
            code = price[0]
            if code.lower().startswith("eur"):
                cost["currency_code"] = "EUR"
                cost["value"] = price[1]
            if code.lower().startswith("usd"):
                cost["currency_code"] = "USD"
                cost["value"] = price[1]
            if code.lower().startswith("$"):
                cost["currency_code"] = "USD"
                cost["value"] = price[1]
            if code.lower().startswith("€"):
                cost["currency_code"] = "EUR"
                cost["value"] = price[1]
            
        self.dmp["dmp"]["cost"] = [cost]
        return cost
    
    def parse_question_3_2(self, text):
        dmp_staff = []
        
        lines = text.split("\n")        
        current_staff = None
        for line in lines:
            # is URL?
            url = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', line)
            if len(url) >= 1:
                current_staff["staff_id"] = {
                    "staff_id":url[0],
                    "staff_id_type": "HTTP-ORCID"
                }
                continue
                
            # is EMAIL?
            email = re.findall(r'[\w\.]+\@[\w]+(?:\.[\w]{3}|\.[\w]{2}\.[\w]{2})\b', line)
            if len(email) >= 1:
                current_staff["mbox"] = email[0]
                continue
                
            # else new staff
            if current_staff != None:
                dmp_staff.append(current_staff)
            current_staff = {}    
            current_staff["name"] = line
            
        if current_staff != None:
            dmp_staff.append(current_staff)
        
        self.dmp["dmp"]["dm_staff"] = dmp_staff
        return dmp_staff
    
    def parse_question_3_3(self, text):
        ret_val = {}
        for ds in self.dmp["dmp"]["dataset"]:
            ds["preservation_statement"] = text
            ret_val[ds["title"]] = text
        return ret_val

    def parse_question_4_1(self, text):
        ret_val = []
        for ds in self.dmp["dmp"]["dataset"]:
            ds["security_and_privacy"] = [{
                "title": "Data Security",
                "text": text
            }]
            ret_val.append({
                "dataset": ds["title"],
                "title": "Data Security",
                "text": text
            })
        return ret_val
    
    def parse_question_5_1(self, text):
        ret_val = {}
        
        self.dmp["dmp"]["ethical_issues_description"] = text
        ret_val["ethical_issues_description"] = text
        
        self.dmp["dmp"]["ethical_issues_exist"] = "no"
        ret_val["ethical_issues_exist"] = "no"
        
        url = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)
        if len(url)>=1:
            url = url[0]
            if url.endswith(".") or url.endswith(",") or url.endswith(")"):
                url = url[:-1]
            self.dmp["dmp"]["ethical_issues_report"] = url
            ret_val["ethical_issues_report"] = url
            
            self.dmp["dmp"]["ethical_issues_exist"] = "yes"
            ret_val["ethical_issues_exist"] = "yes"
            
        return ret_val
    
    def parse_question_6_1(self, text):
        q = "Refer to other national/funder/sectorial/departmental procedures for data management that you are using (if any)"
        if "description" in self.dmp["dmp"]["project"]:
            text=";"+q+"::"+text
            self.dmp["dmp"]["project"]["description"] += text
        else:
            text=q+"::"+text
            self.dmp["dmp"]["project"]["description"] = text
        return text
    
    def generate(self):
        return json.dumps(self.dmp, indent=2, sort_keys=False)

## Test HorizonParser

In [45]:
# Fixture values for the top-level DMP record and its contact person.
# They are passed positionally, in this order, to HorizonParser below.
dmp_title = "Parser Test"
dmp_description = "DMP Abstract is located here"
dmp_created = "2017-01-01" 
dmp_modified = "2018-01-01"
dmp_id = "http://validorcid.com/12345"
pi_name = "Martin Pichler"
pi_mail = "pichler.martin@outlook.at"
pi_orcid = "https://orcid.org/0000-0001-5305-9063"

# One parser instance is shared by all test cells that follow, so the cells
# build on each other's state.
horizon_parser = HorizonParser(dmp_title, dmp_description, dmp_created, dmp_modified, dmp_id, pi_name, pi_mail, pi_orcid)

### Test Question 1.1

In [46]:
# Sample answer for question 1.1 (purpose of the data collection/generation).
q1_1_test_1 = "No new data was collected for this project. Only data from the Austrian and European open data portals has been used. The generated data are images that are used to draw conclusions between the divorce rate of Austria and the EU28."

# The project has no description yet, so the returned fragment has no leading ";".
horizon_parser.parse_question_1_1(q1_1_test_1)

'State the purpose of the data collection/generation::No new data was collected for this project. Only data from the Austrian and European open data portals has been used. The generated data are images that are used to draw conclusions between the divorce rate of Austria and the EU28.'

### Test Question 1.2

In [47]:
# Sample answer for question 1.2 (relation to the objectives of the project).
q1_2_test_1 = "The input data contains the annual number of divorces for Austria and the EU. The produced data is a visual analysis of the correlation between the Austrian and the EU28 divorce rate."

# The DMP description already holds the abstract, so the fragment is appended with a leading ";".
horizon_parser.parse_question_1_2(q1_2_test_1)

';Explain the relation to the objectives of the project::The input data contains the annual number of divorces for Austria and the EU. The produced data is a visual analysis of the correlation between the Austrian and the EU28 divorce rate.'

### Test Question 1.3

In [48]:
# Sample answers for question 1.3 (dataset listing). Both describe five datasets;
# the second dataset is written on a single line with lower-case "title:" and
# "description:" labels, and the fourth uses labels without a following space.
q1_3_test_1 = """Title: Ehescheidungen (Statistik Austria)
Description: Divorce statistic of austria
Type: Dataset
Format: csv
Source: Input

title: Divorces by duration of marriage (Eurostat)description: Divorce statistic of the EUType: DatasetFormat: tsvSource: Input

Title: scatter
Description: Scatterplot
Type: Image
Format: png
Source: Produced

Title:time_change
Description:Time Change Plot
Type:Image
Format:png
Source:Produced

Title: time_corr
Description: Time Correlation Plot
Type: Image
Format: png
Source: Produced"""

# Same listing, but the last dataset has no "Format:" and no "Source:" line.
q1_3_test_2 = """Title: Ehescheidungen (Statistik Austria)
Description: Divorce statistic of austria
Type: Dataset
Format: csv
Source: Input

title: Divorces by duration of marriage (Eurostat)description: Divorce statistic of the EUType: DatasetFormat: tsvSource: Input

Title: scatter
Description: Scatterplot
Type: Image
Format: png
Source: Produced

Title:time_change
Description:Time Change Plot
Type:Image
Format:png
Source:Produced

Title: time_corr
Description: Time Correlation Plot
Type: Image
"""

# parse_question_1_3 replaces the parser's dataset list on every call, so only
# one of the two fixtures is parsed here; later cells work on its datasets.
display(horizon_parser.parse_question_1_3(q1_3_test_1))
#display(horizon_parser.parse_question_1_3(q1_3_test_2))

[{'title': 'Ehescheidungen (Statistik Austria)',
  'description': 'Divorce statistic of austria',
  'type': 'Dataset',
  'distribution': [{'title': 'Origin', 'format': 'csv'},
   {'title': 'Project', 'format': 'csv'}],
  'metadata': [],
  'technical_resource': []},
 {'title': 'Divorces by duration of marriage (Eurostat)',
  'description': 'Divorce statistic of the EU',
  'type': 'Dataset',
  'distribution': [{'title': 'Origin', 'format': 'tsv'},
   {'title': 'Project', 'format': 'tsv'}],
  'metadata': [],
  'technical_resource': []},
 {'title': 'scatter',
  'description': 'Scatterplot',
  'type': 'Image',
  'distribution': [{'title': 'Project', 'format': 'png'}],
  'metadata': [],
  'technical_resource': []},
 {'title': 'time_change',
  'description': 'Time Change Plot',
  'type': 'Image',
  'distribution': [{'title': 'Project', 'format': 'png'}],
  'metadata': [],
  'technical_resource': []},
 {'title': 'time_corr',
  'description': 'Time Correlation Plot',
  'type': 'Image',
  'distr

### Test Question 1.4

In [77]:
# Answer for question 1.4 naming the two input datasets as re-used. In the recorded
# output the datasets that are not mentioned are labelled 'generated'.
# NOTE(review): this cell's execution count (77) is out of sequence with the neighbouring
# cells (48, 50), so its output was recorded after later cells had run; re-run the
# notebook from a fresh kernel to confirm the result.
q1_4_test_1 = "Ehescheidungen (Statistik Austria) and Divorces by duration of marriage (Eurostat) are re-used for this project."

horizon_parser.parse_question_1_4(q1_4_test_1)

{'Ehescheidungen (Statistik Austria)': 're-used',
 'Divorces by duration of marriage (Eurostat)': 're-used',
 'scatter': 'generated',
 'time_change': 'generated',
 'time_corr': 'generated'}

### Test Question 1.5

In [50]:
# Fixture 1 for question 1.5: one provenance sentence per dataset, separated by blank
# lines. The two input datasets carry a download URL, the produced ones do not.
q1_5_test_1 = """Ehescheidungen (Statistik Austria) was downloaded from https://www.data.gv.at/katalog/dataset/2d8ad82c-4730-3354-9971-9406f2ccf72c.

Divorces by duration of marriage (Eurostat) was downloaded from https://ec.europa.eu/eurostat/web/products-datasets/-/demo_ndivdur.

scatter was produced during research.

time_change was produced during research.

time_corr was produced during research."""


# Fixture 2: only three of the five datasets are mentioned.
q1_5_test_2 = """Ehescheidungen (Statistik Austria) was downloaded from https://www.data.gv.at/katalog/dataset/2d8ad82c-4730-3354-9971-9406f2ccf72c.

scatter was produced during research.

time_corr was produced during research."""

# Fixture 3: all datasets on consecutive lines (no blank separators), followed by two
# unrelated log lines.
# Renamed from `q1_5test_3` so it follows the `qX_Y_test_N` pattern of every other fixture.
q1_5_test_3 = """
Ehescheidungen (Statistik Austria) was downloaded from https://www.data.gv.at/katalog/dataset/2d8ad82c-4730-3354-9971-9406f2ccf72c.
Divorces by duration of marriage (Eurostat) was downloaded from https://ec.europa.eu/eurostat/web/products-datasets/-/demo_ndivdur.
scatter was produced during research.
time_change was produced during research.
time_corr was produced during research.
Error while connecting to PostgreSQL ''
PostgreSQL connection is closed
"""

# Only fixture 1 is run. The disabled call for fixture 3 previously read `orizon_parser`
# (missing "h") and would have raised NameError once uncommented.
horizon_parser.parse_question_1_5(q1_5_test_1)
#horizon_parser.parse_question_1_5(q1_5_test_2)
#horizon_parser.parse_question_1_5(q1_5_test_3)

{'Ehescheidungen (Statistik Austria)': [{'dataset_id': 'https://www.data.gv.at/katalog/dataset/2d8ad82c-4730-3354-9971-9406f2ccf72c',
   'dataset_id_type': 'HTTP-URI',
   'access_url': 'https://www.data.gv.at/katalog/dataset/2d8ad82c-4730-3354-9971-9406f2ccf72c'}],
 'Divorces by duration of marriage (Eurostat)': [{'dataset_id': 'https://ec.europa.eu/eurostat/web/products-datasets/-/demo_ndivdur',
   'dataset_id_type': 'HTTP-URI',
   'access_url': 'https://ec.europa.eu/eurostat/web/products-datasets/-/demo_ndivdur'}],
 'scatter': [{'dataset_id': 'scatter', 'dataset_id_type': 'custom'}],
 'time_change': [{'dataset_id': 'time_change', 'dataset_id_type': 'custom'}],
 'time_corr': [{'dataset_id': 'time_corr', 'dataset_id_type': 'custom'}]}

### Test Question 1.6

In [51]:
# Fixture for question 1.6: one size statement per dataset. Four sizes use a KB suffix;
# "scatter" gives a bare number without a unit.
q1_6_test_1 = """Ehescheidungen (Statistik Austria) has a size of 1KB

Divorces by duration of marriage (Eurostat) has a size of 297KB

scatter has a size of 15360

time_change has a size of 30KB

time_corr has a size of 15KB"""

# In the recorded output the KB values are converted to integers (1KB -> 1024).
# NOTE(review): the unit-less value comes back as the string '15360' while the others
# are ints; looks like an inconsistency in parse_question_1_6 - confirm and normalise there.
horizon_parser.parse_question_1_6(q1_6_test_1)

{'Ehescheidungen (Statistik Austria)': 1024,
 'Divorces by duration of marriage (Eurostat)': 304128,
 'scatter': '15360',
 'time_change': 30720,
 'time_corr': 15360}

### Test Question 1.7

In [52]:
# Sample free-text answer for question 1.7 (data utility).
q1_7_test_1 = "Storing the data makes it easy for other people to rerun and validate the correctness of the experiment. It also avoids the possible loss of the original data through the third-party providers and guarantees access to the data."

# The recorded output shows the answer returned with a ';<question label>::' prefix.
horizon_parser.parse_question_1_7(q1_7_test_1)

';Outline the data utility: to whom will it be useful::Storing the data makes it easy for other people to rerun and validate the correctness of the experiment. It also avoids the possible loss of the original data through the third-party providers and guarantees access to the data.'

### Test Question 2.1.1

In [53]:
# Fixture 1 for question 2.1.1: for each input dataset the metadata location is given
# as a notebook file name first and a URL second.
q2_1_1_test_1 = "The metadata description for Ehescheidungen (Statistik Austria) can be found in the jupyter notebook file divorce_analysis.ipynb or can be accessed online at https://www.data.gv.at/katalog/dataset/2d8ad82c-4730-3354-9971-9406f2ccf72c. The metadata description for Divorces by duration of marriage (Eurostat) is stored in the same file \"divorce_analysis.ipynb\" or online at http://data.europa.eu/euodp/en/data/dataset/bRJAS74ZDdIpeU7mnKhMiA."

# Fixture 2: same content, but the first dataset mentions the URL before the file name.
q2_1_1_test_2 = "The metadata description for Ehescheidungen (Statistik Austria) can be accessed online at https://www.data.gv.at/katalog/dataset/2d8ad82c-4730-3354-9971-9406f2ccf72c or can be found in the jupyter notebook file divorce_analysis.ipynb. The metadata description for Divorces by duration of marriage (Eurostat) is stored in the same file \"divorce_analysis.ipynb\" or online at http://data.europa.eu/euodp/en/data/dataset/bRJAS74ZDdIpeU7mnKhMiA."


# Only fixture 1 is run; the recorded output holds one URL entry and one file entry per
# mentioned dataset.
# NOTE(review): the Eurostat URL appears lower-cased in the recorded output
# ('.../brjas74zddipeu7mnkhmia') although the input is mixed case; this probably breaks
# the link - confirm in parse_question_2_1_1.
horizon_parser.parse_question_2_1_1(q2_1_1_test_1)
#horizon_parser.parse_question_2_1_1(q2_1_1_test_2)

[{'dataset': 'Ehescheidungen (Statistik Austria)',
  'description': 'Dataset Metadata',
  'language': 'en',
  'metadata_id': {'metadata_id': 'https://www.data.gv.at/katalog/dataset/2d8ad82c-4730-3354-9971-9406f2ccf72c',
   'metadata_id_type': 'HTTP-URI'}},
 {'dataset': 'Ehescheidungen (Statistik Austria)',
  'description': 'Dataset Metadata',
  'language': 'en',
  'metadata_id': {'metadata_id': 'divorce_analysis.ipynb',
   'metadata_id_type': 'custom'}},
 {'dataset': 'Divorces by duration of marriage (Eurostat)',
  'description': 'Dataset Metadata',
  'language': 'en',
  'metadata_id': {'metadata_id': 'http://data.europa.eu/euodp/en/data/dataset/brjas74zddipeu7mnkhmia',
   'metadata_id_type': 'HTTP-URI'}},
 {'dataset': 'Divorces by duration of marriage (Eurostat)',
  'description': 'Dataset Metadata',
  'language': 'en',
  'metadata_id': {'metadata_id': 'divorce_analysis.ipynb',
   'metadata_id_type': 'custom'}}]

### Test Question 2.1.2

No processing needed

### Test Question 2.1.3

In [54]:
# Fixture 1 for question 2.1.3: naming conventions referenced through a single URL.
q2_1_3_test_1 = """The input files are saved with their original file names as provided by their original source. 

Other files, code and data use naming conventions commonly used with python (https://visualgit.readthedocs.io/en/latest/pages/naming_convention.html)"""

# Fixture 2: same text plus a sentence pointing to a local file (conventions.txt).
q2_1_3_test_2 = """The input files are saved with their original file names as provided by their original source. 

Other files, code and data use naming conventions commonly used with python (https://visualgit.readthedocs.io/en/latest/pages/naming_convention.html)

We also provide them in the file conventions.txt
"""


# Only fixture 1 is run; the recorded output attaches the URL as 'Naming Conventions'
# metadata to every dataset.
horizon_parser.parse_question_2_1_3(q2_1_3_test_1)
#horizon_parser.parse_question_2_1_3(q2_1_3_test_2)

[{'dataset': 'Ehescheidungen (Statistik Austria)',
  'description': 'Naming Conventions',
  'language': 'en',
  'metadata_id': {'metadata_id': 'https://visualgit.readthedocs.io/en/latest/pages/naming_convention.html',
   'metadata_id_type': 'HTTP-URI'}},
 {'dataset': 'Divorces by duration of marriage (Eurostat)',
  'description': 'Naming Conventions',
  'language': 'en',
  'metadata_id': {'metadata_id': 'https://visualgit.readthedocs.io/en/latest/pages/naming_convention.html',
   'metadata_id_type': 'HTTP-URI'}},
 {'dataset': 'scatter',
  'description': 'Naming Conventions',
  'language': 'en',
  'metadata_id': {'metadata_id': 'https://visualgit.readthedocs.io/en/latest/pages/naming_convention.html',
   'metadata_id_type': 'HTTP-URI'}},
 {'dataset': 'time_change',
  'description': 'Naming Conventions',
  'language': 'en',
  'metadata_id': {'metadata_id': 'https://visualgit.readthedocs.io/en/latest/pages/naming_convention.html',
   'metadata_id_type': 'HTTP-URI'}},
 {'dataset': 'time_co

### Test Question 2.1.4

In [55]:
# Minimal answer for question 2.1.4 (approach towards search keywords).
q2_1_4_test_1 = "Not implemented."

# The recorded output shows the answer returned with a ';<question label>::' prefix.
horizon_parser.parse_question_2_1_4(q2_1_4_test_1)

';Outline the approach towards search keyword::Not implemented.'

### Test Question 2.1.5

In [56]:
# Fixture for question 2.1.5: describes the versioning scheme and mentions that the
# project is hosted as a git repository.
q2_1_5_test_1 = """The original release was versioned as v1.0.0. Every further release however will follow a date based versioning pattern <year>.<month>.<day>.<sequence_within_day> (e.g. 2019.04.19.01).

Since the project is hosted as a git repository, keeping track of changes to the data is handled automatically."""

# The recorded output marks every dataset with supports_versioning 'yes' and title 'git'.
horizon_parser.parse_question_2_1_5(q2_1_5_test_1)

[{'dataset': 'Ehescheidungen (Statistik Austria)',
  'supports_versioning': 'yes',
  'title': 'git'},
 {'dataset': 'Divorces by duration of marriage (Eurostat)',
  'supports_versioning': 'yes',
  'title': 'git'},
 {'dataset': 'scatter', 'supports_versioning': 'yes', 'title': 'git'},
 {'dataset': 'time_change', 'supports_versioning': 'yes', 'title': 'git'},
 {'dataset': 'time_corr', 'supports_versioning': 'yes', 'title': 'git'}]

### Test Question 2.1.6

In [57]:
# Fixture 1 for question 2.1.6: mentions the notebook only generically, without a file name.
q2_1_6_test_1 = "The metadata in the jupyter notebook contains information about the original fields of the input data and what part of it actually is used in the project."

# Fixture 2: same sentence, but names the file (divorce_analysis.ipynb) explicitly.
q2_1_6_test_2 = "The metadata in the jupyter notebook (divorce_analysis.ipynb) contains information about the original fields of the input data and what part of it actually is used in the project."


# Only fixture 1 is run and the recorded output is an empty list.
# NOTE(review): presumably the parser needs an explicit file name or URL to emit
# records - confirm with fixture 2.
horizon_parser.parse_question_2_1_6(q2_1_6_test_1)
#horizon_parser.parse_question_2_1_6(q2_1_6_test_2)

[]

### Test Question 2.2.1

In [58]:
# Fixture 1 for question 2.2.1: every dataset is declared open access.
q2_2_1_test_1 = """Ehescheidungen (Statistik Austria) is open access.

Divorces by duration of marriage (Eurostat) is open access.

scatter is open access

time_change is open access

time_corr is open access"""

# Fixture 2: the two input datasets are closed access, the produced ones open.
q2_2_1_test_2 = """Ehescheidungen (Statistik Austria) is closed access.

Divorces by duration of marriage (Eurostat) is closed access.

scatter is open access

time_change is open access

time_corr is open access"""

# Fixture 3: adds two awkward cases - an unknown access value ("undefined") and a
# statement naming both "open and closed".
q2_2_1_test_3 = """Ehescheidungen (Statistik Austria) is closed access.

Divorces by duration of marriage (Eurostat) is closed access.

scatter is open access

time_change is undefined access
 
time_corr is open and closed access"""

# Only fixture 1 is run; the recorded output maps every dataset title to 'open'.
horizon_parser.parse_question_2_2_1(q2_2_1_test_1)
#horizon_parser.parse_question_2_2_1(q2_2_1_test_2)
#horizon_parser.parse_question_2_2_1(q2_2_1_test_3)

{'Ehescheidungen (Statistik Austria)': 'open',
 'Divorces by duration of marriage (Eurostat)': 'open',
 'scatter': 'open',
 'time_change': 'open',
 'time_corr': 'open'}

### Test Question 2.2.2

In [59]:
# Fixture for question 2.2.2: the repository URL is wrapped in parentheses.
q2_2_2_test_1 = "The data can be accessed through Github (https://github.com/martinpichler/data_stewardship_ex1)."

# The recorded output repeats the same access_url/host entry once per dataset.
# NOTE(review): the recorded access_url still ends with ')' - looks like the URL
# extraction does not strip the closing parenthesis; confirm in parse_question_2_2_2.
horizon_parser.parse_question_2_2_2(q2_2_2_test_1)

[{'access_url': 'https://github.com/martinpichler/data_stewardship_ex1)',
  'host': {'title': 'github.com'}},
 {'access_url': 'https://github.com/martinpichler/data_stewardship_ex1)',
  'host': {'title': 'github.com'}},
 {'access_url': 'https://github.com/martinpichler/data_stewardship_ex1)',
  'host': {'title': 'github.com'}},
 {'access_url': 'https://github.com/martinpichler/data_stewardship_ex1)',
  'host': {'title': 'github.com'}},
 {'access_url': 'https://github.com/martinpichler/data_stewardship_ex1)',
  'host': {'title': 'github.com'}}]

### Test Question 2.2.3

In [60]:
# Fixture for question 2.2.3: multi-paragraph description of how the data is obtained
# and which software is needed to open it.
q2_2_3_test_1 = """The data can be downloaded either from Github directly (single files or zip) or can be downloaded with git.

The input data files are stored as .csv and .tsv files and need no additional software. 

The produced data are .png images and an image viewer is needed to open them."""

# The recorded output attaches the whole answer, unmodified, to each dataset as a
# 'software_tools' technical resource.
horizon_parser.parse_question_2_2_3(q2_2_3_test_1)

[{'dataset': 'Ehescheidungen (Statistik Austria)',
  'description': 'The data can be downloaded either from Github directly (single files or zip) or can be downloaded with git.\n\nThe input data files are stored as .csv and .tsv files and need no additional software. \n\nThe produced data are .png images and an image viewer is needed to open them.',
  'technical_resource_id': {'technical_resource_id': 'software_tools',
   'technical_resource_id_type': 'custom'}},
 {'dataset': 'Divorces by duration of marriage (Eurostat)',
  'description': 'The data can be downloaded either from Github directly (single files or zip) or can be downloaded with git.\n\nThe input data files are stored as .csv and .tsv files and need no additional software. \n\nThe produced data are .png images and an image viewer is needed to open them.',
  'technical_resource_id': {'technical_resource_id': 'software_tools',
   'technical_resource_id_type': 'custom'}},
 {'dataset': 'scatter',
  'description': 'The data can 

### Test Question 2.2.4

In [61]:
# Fixture for question 2.2.4: the repository URL is wrapped in parentheses.
q2_2_4_test_1 = "As stated in previous sections, the complete project is available on Github (https://github.com/martinpichler/data_stewardship_ex1)."

# The recorded output repeats the same access_url/host entry once per dataset.
# NOTE(review): the recorded access_url still ends with ')' - looks like the URL
# extraction does not strip the closing parenthesis; confirm in parse_question_2_2_4.
horizon_parser.parse_question_2_2_4(q2_2_4_test_1)

[{'access_url': 'https://github.com/martinpichler/data_stewardship_ex1)',
  'host': {'title': 'github.com'}},
 {'access_url': 'https://github.com/martinpichler/data_stewardship_ex1)',
  'host': {'title': 'github.com'}},
 {'access_url': 'https://github.com/martinpichler/data_stewardship_ex1)',
  'host': {'title': 'github.com'}},
 {'access_url': 'https://github.com/martinpichler/data_stewardship_ex1)',
  'host': {'title': 'github.com'}},
 {'access_url': 'https://github.com/martinpichler/data_stewardship_ex1)',
  'host': {'title': 'github.com'}}]

### Test Question 2.2.5

In [62]:
# Sample free-text answer for question 2.2.5 (access restrictions).
q2_2_5_test_1 = "No restrictions, access is public and open to everyone."

# The recorded output attaches the answer to every dataset as 'restrictions' metadata.
horizon_parser.parse_question_2_2_5(q2_2_5_test_1)

[{'dataset': 'Ehescheidungen (Statistik Austria)',
  'description': 'No restrictions, access is public and open to everyone.',
  'metadata_id': {'metadata_id': 'restrictions',
   'metadata_id_type': 'custom'}},
 {'dataset': 'Divorces by duration of marriage (Eurostat)',
  'description': 'No restrictions, access is public and open to everyone.',
  'metadata_id': {'metadata_id': 'restrictions',
   'metadata_id_type': 'custom'}},
 {'dataset': 'scatter',
  'description': 'No restrictions, access is public and open to everyone.',
  'metadata_id': {'metadata_id': 'restrictions',
   'metadata_id_type': 'custom'}},
 {'dataset': 'time_change',
  'description': 'No restrictions, access is public and open to everyone.',
  'metadata_id': {'metadata_id': 'restrictions',
   'metadata_id_type': 'custom'}},
 {'dataset': 'time_corr',
  'description': 'No restrictions, access is public and open to everyone.',
  'metadata_id': {'metadata_id': 'restrictions',
   'metadata_id_type': 'custom'}}]

### Test Question 2.3.1

In [63]:
# Fixture for question 2.3.1: two-paragraph answer about data formats and vocabulary.
q2_3_1_test_1 = """All the data used and produced is based on non-restricted or proprietary data formats (e.g. .csv, .tsv, png, python, jupyter, git).

There is no domain-specific vocabulary to take in consideration."""

# The recorded output attaches the whole answer to each dataset as an
# 'interoperability_standards' technical resource.
horizon_parser.parse_question_2_3_1(q2_3_1_test_1)

[{'dataset': 'Ehescheidungen (Statistik Austria)',
  'description': 'All the data used and produced is based on non-restricted or proprietary data formats (e.g. .csv, .tsv, png, python, jupyter, git).\n\nThere is no domain-specific vocabulary to take in consideration.',
  'technical_resource_id': {'technical_resource_id': 'interoperability_standards',
   'technical_resource_id_type': 'custom'}},
 {'dataset': 'Divorces by duration of marriage (Eurostat)',
  'description': 'All the data used and produced is based on non-restricted or proprietary data formats (e.g. .csv, .tsv, png, python, jupyter, git).\n\nThere is no domain-specific vocabulary to take in consideration.',
  'technical_resource_id': {'technical_resource_id': 'interoperability_standards',
   'technical_resource_id_type': 'custom'}},
 {'dataset': 'scatter',
  'description': 'All the data used and produced is based on non-restricted or proprietary data formats (e.g. .csv, .tsv, png, python, jupyter, git).\n\nThere is no doma

### Test Question 2.3.2

In [64]:
# Sample free-text answer for question 2.3.2 (vocabulary mapping).
q2_3_2_test_1 = "Standard vocabulary is used, no mapping required."

# The recorded output attaches the answer to each dataset as a 'vocabulary_standards'
# technical resource.
horizon_parser.parse_question_2_3_2(q2_3_2_test_1)

[{'dataset': 'Ehescheidungen (Statistik Austria)',
  'description': 'Standard vocabulary is used, no mapping required.',
  'technical_resource_id': {'technical_resource_id': 'vocabulary_standards',
   'technical_resource_id_type': 'custom'}},
 {'dataset': 'Divorces by duration of marriage (Eurostat)',
  'description': 'Standard vocabulary is used, no mapping required.',
  'technical_resource_id': {'technical_resource_id': 'vocabulary_standards',
   'technical_resource_id_type': 'custom'}},
 {'dataset': 'scatter',
  'description': 'Standard vocabulary is used, no mapping required.',
  'technical_resource_id': {'technical_resource_id': 'vocabulary_standards',
   'technical_resource_id_type': 'custom'}},
 {'dataset': 'time_change',
  'description': 'Standard vocabulary is used, no mapping required.',
  'technical_resource_id': {'technical_resource_id': 'vocabulary_standards',
   'technical_resource_id_type': 'custom'}},
 {'dataset': 'time_corr',
  'description': 'Standard vocabulary is us

### Test Question 2.4.1

In [65]:
# Fixture for question 2.4.1: licence statement with the licence URL in parentheses.
q2_4_1_test_1 = "The project is licenced under the MIT licence (https://opensource.org/licenses/MIT) and open for re-use by anyone."

# The recorded output sets the URL (without the surrounding parentheses) as license_ref
# for every dataset.
horizon_parser.parse_question_2_4_1(q2_4_1_test_1)

[{'dataset': 'Ehescheidungen (Statistik Austria)',
  'license_ref': 'https://opensource.org/licenses/MIT'},
 {'dataset': 'Divorces by duration of marriage (Eurostat)',
  'license_ref': 'https://opensource.org/licenses/MIT'},
 {'dataset': 'scatter', 'license_ref': 'https://opensource.org/licenses/MIT'},
 {'dataset': 'time_change',
  'license_ref': 'https://opensource.org/licenses/MIT'},
 {'dataset': 'time_corr',
  'license_ref': 'https://opensource.org/licenses/MIT'}]

### Test Question 2.4.2

In [66]:
# Fixtures for question 2.4.2: the first contains no date, the second an explicit ISO date.
q2_4_2_test_1 = "Already openly available."
q2_4_2_test_2 = "The data will be made available on 2017-06-06"


# Only the date-less fixture is run; the recorded output gives every dataset the
# start_date '2017-01-01'.
# NOTE(review): that value equals the DMP "created" date shown in the generated JSON,
# so it looks like a fallback - confirm in parse_question_2_4_2.
horizon_parser.parse_question_2_4_2(q2_4_2_test_1)
#horizon_parser.parse_question_2_4_2(q2_4_2_test_2)

[{'dataset': 'Ehescheidungen (Statistik Austria)', 'start_date': '2017-01-01'},
 {'dataset': 'Divorces by duration of marriage (Eurostat)',
  'start_date': '2017-01-01'},
 {'dataset': 'scatter', 'start_date': '2017-01-01'},
 {'dataset': 'time_change', 'start_date': '2017-01-01'},
 {'dataset': 'time_corr', 'start_date': '2017-01-01'}]

### Test Question 2.4.3

In [67]:
# Sample free-text answer for question 2.4.3 (usability by third parties).
q2_4_3_test_1 = "No restrictions, data and source code can be used by anyone."

# The recorded output attaches the answer to every dataset as 'third_party_usability'
# metadata.
horizon_parser.parse_question_2_4_3(q2_4_3_test_1)

[{'dataset': 'Ehescheidungen (Statistik Austria)',
  'description': 'No restrictions, data and source code can be used by anyone.',
  'metadata_id': {'metadata_id': 'third_party_usability',
   'metadata_id_type': 'custom'}},
 {'dataset': 'Divorces by duration of marriage (Eurostat)',
  'description': 'No restrictions, data and source code can be used by anyone.',
  'metadata_id': {'metadata_id': 'third_party_usability',
   'metadata_id_type': 'custom'}},
 {'dataset': 'scatter',
  'description': 'No restrictions, data and source code can be used by anyone.',
  'metadata_id': {'metadata_id': 'third_party_usability',
   'metadata_id_type': 'custom'}},
 {'dataset': 'time_change',
  'description': 'No restrictions, data and source code can be used by anyone.',
  'metadata_id': {'metadata_id': 'third_party_usability',
   'metadata_id_type': 'custom'}},
 {'dataset': 'time_corr',
  'description': 'No restrictions, data and source code can be used by anyone.',
  'metadata_id': {'metadata_id': '

### Test Question 2.4.4

In [68]:
# Sample free-text answer for question 2.4.4 (data quality assurance).
q2_4_4_test_1 = "The project only uses already available data and no new data will be generated in the future. The used data was checked for completeness and correctness."

# The recorded output maps every dataset title to the unmodified answer text.
horizon_parser.parse_question_2_4_4(q2_4_4_test_1)

{'Ehescheidungen (Statistik Austria)': 'The project only uses already available data and no new data will be generated in the future. The used data was checked for completeness and correctness.',
 'Divorces by duration of marriage (Eurostat)': 'The project only uses already available data and no new data will be generated in the future. The used data was checked for completeness and correctness.',
 'scatter': 'The project only uses already available data and no new data will be generated in the future. The used data was checked for completeness and correctness.',
 'time_change': 'The project only uses already available data and no new data will be generated in the future. The used data was checked for completeness and correctness.',
 'time_corr': 'The project only uses already available data and no new data will be generated in the future. The used data was checked for completeness and correctness.'}

### Test Question 2.4.5

In [69]:
# Fixtures for question 2.4.5: the first states no limit, the second an explicit end date.
q2_4_5_test_1 = "There is no limit on how long the data will remain re-usable."
q2_4_5_test_2 = "The data will be usable till 2019-01-01"


# Only the date-less fixture is run; the recorded output is an empty list.
horizon_parser.parse_question_2_4_5(q2_4_5_test_1)
#horizon_parser.parse_question_2_4_5(q2_4_5_test_2)

[]

### Test Question 3.1

In [70]:
# Fixtures for question 3.1: a prose answer without an amount, and two short answers
# with an amount written with a currency symbol ($100) and a currency code (EUR100).
q3_1_test_1 = "Since the data is stored in a public Github repository, no additional cost has to be covered."
q3_1_test_2 = "cost are $100"
q3_1_test_3 = "costs are EUR100"

# Only the prose fixture is run; the recorded output is a cost entry holding just a
# title and the answer as description.
horizon_parser.parse_question_3_1(q3_1_test_1)
#horizon_parser.parse_question_3_1(q3_1_test_2)
#horizon_parser.parse_question_3_1(q3_1_test_3)

{'title': 'Costs for making your data FAIR',
 'description': 'Since the data is stored in a public Github repository, no additional cost has to be covered.'}

### Test Question 3.2

In [71]:
# Fixture 1 for question 3.2: one person given as three lines (name, e-mail, ORCID URL).
q3_2_test_1 = """Martin Pichler
mpichler.dev@gmail.com
https://orcid.org/0000-0001-5305-9063"""

# Fixture 2: two people back to back in the same three-line layout (both use the same
# ORCID URL).
q3_2_test_2 = """Martin Pichler
mpichler.dev@gmail.com
https://orcid.org/0000-0001-5305-9063
Other Person
other.mail@mail.com
https://orcid.org/0000-0001-5305-9063"""

# Only fixture 1 is run; the recorded output is a one-element list with name, mbox and
# an HTTP-ORCID staff_id.
horizon_parser.parse_question_3_2(q3_2_test_1)
#horizon_parser.parse_question_3_2(q3_2_test_2)

[{'name': 'Martin Pichler',
  'mbox': 'mpichler.dev@gmail.com',
  'staff_id': {'staff_id': 'https://orcid.org/0000-0001-5305-9063',
   'staff_id_type': 'HTTP-ORCID'}}]

### Test Question 3.3

In [72]:
# Fixture for question 3.3; the text is identical to q3_1_test_1.
q3_3_test_1 = "Since the data is stored in a public Github repository, no additional cost has to be covered."

# The recorded output maps every dataset title to the unmodified answer text.
horizon_parser.parse_question_3_3(q3_3_test_1)

{'Ehescheidungen (Statistik Austria)': 'Since the data is stored in a public Github repository, no additional cost has to be covered.',
 'Divorces by duration of marriage (Eurostat)': 'Since the data is stored in a public Github repository, no additional cost has to be covered.',
 'scatter': 'Since the data is stored in a public Github repository, no additional cost has to be covered.',
 'time_change': 'Since the data is stored in a public Github repository, no additional cost has to be covered.',
 'time_corr': 'Since the data is stored in a public Github repository, no additional cost has to be covered.'}

### Test Question 4.1

In [73]:
# Sample free-text answer for question 4.1 (data security).
q4_1_test_1 = "Data is stored in a git repository on Github and can only be modified by the owner or an administrator. If data is lost locally, the original data can be downloaded from the Github repository. No sensitive data is stored and thus the repository does not need to be addressed."

# The recorded output creates one 'Data Security' entry per dataset holding the answer text.
horizon_parser.parse_question_4_1(q4_1_test_1)

[{'dataset': 'Ehescheidungen (Statistik Austria)',
  'title': 'Data Security',
  'text': 'Data is stored in a git repository on Github and can only be modified by the owner or an administrator. If data is lost locally, the original data can be downloaded from the Github repository. No sensitive data is stored and thus the repository does not need to be addressed.'},
 {'dataset': 'Divorces by duration of marriage (Eurostat)',
  'title': 'Data Security',
  'text': 'Data is stored in a git repository on Github and can only be modified by the owner or an administrator. If data is lost locally, the original data can be downloaded from the Github repository. No sensitive data is stored and thus the repository does not need to be addressed.'},
 {'dataset': 'scatter',
  'title': 'Data Security',
  'text': 'Data is stored in a git repository on Github and can only be modified by the owner or an administrator. If data is lost locally, the original data can be downloaded from the Github repositor

### Test Question 5.1

In [74]:
# Fixture 1 for question 5.1: states that no ethical issues exist.
q5_1_test_1 = "No ethical issues exist"

# Fixture 2: points to a report URL for ethical issues.
q5_1_test_2 = "Ethical issues are reported at http://www.issues.com"

# Only fixture 1 is run; the recorded output sets ethical_issues_exist to 'no' and keeps
# the answer as the description.
horizon_parser.parse_question_5_1(q5_1_test_1)
#horizon_parser.parse_question_5_1(q5_1_test_2)

{'ethical_issues_description': 'No ethical issues exist',
 'ethical_issues_exist': 'no'}

### Test Question 6.1

In [75]:
# Sample free-text answer for question 6.1 (other data management procedures).
q6_1_test_1 = "No additional procedures are used."

# The recorded output shows the answer returned with a ';<question label>::' prefix.
horizon_parser.parse_question_6_1(q6_1_test_1)

';Refer to other national/funder/sectorial/departmental procedures for data management that you are using (if any)::No additional procedures are used.'

### Test Generate

In [76]:
# Emit the DMP assembled by the preceding parse_question_* calls. The recorded output is
# indented JSON, so generate() evidently returns text; print() keeps its line breaks
# readable instead of showing an escaped repr.
print(horizon_parser.generate())

{
  "dmp": {
    "title": "Parser Test",
    "description": "Abstract::DMP Abstract is located here;Explain the relation to the objectives of the project::The input data contains the annual number of divorces for Austria and the EU. The produced data is a visual analysis of the correlation between the Austrian and the EU28 divorce rate.;Outline the data utility: to whom will it be useful::Storing the data makes it easy for other people to rerun and validate the correctness of the experiment. It also avoids the possible loss of the original data through the third-party providers and guarantees access to the data.;Outline the approach towards search keyword::Not implemented.",
    "created": "2017-01-01",
    "modified": "2018-01-01",
    "dmp_id": {
      "dmp_id": "http://validorcid.com/12345",
      "dmp_id_type": "HTTP-DOI"
    },
    "contact": {
      "name": "Martin Pichler",
      "mail": "pichler.martin@outlook.at",
      "contact_id": {
        "contact_id": "https://orcid.org/