#!/usr/bin/python
"""Export subject/object associations and subject labels from Biolink as TSV.

For every requested species prefix the script downloads the JSON document
served under ``<biolink>/mart/<subject>/<object>/NCBITaxon:<id>`` and derives
two sorted, de-duplicated TSV files from it inside a directory named after
the prefix:

* ``<subject>-<object>.tsv`` -- one ``subject<TAB>object`` row per association
* ``<subject>-label.tsv``    -- one ``subject<TAB>subject_label`` row
"""

import os
import getopt
import urllib
import json
import csv
import shutil
import argparse

# ``urlretrieve`` lives in ``urllib`` on Python 2 and in ``urllib.request`` on
# Python 3; resolve it once so the rest of the script works on both.
try:
    from urllib.request import urlretrieve
except ImportError:
    from urllib import urlretrieve

# Base URL of the Biolink API, e.g. <base>/mart/gene/phenotype/NCBITaxon:7955
BIOLINK_URL = "http://localhost:5000/api"

# Species prefix accepted on the command line -> NCBI Taxonomy identifier.
TAXON_MAP = {
    'Hs': 9606,
    'Mm': 10090,
    'Dr': 7955,
    'Dm': 7227,
    'Ce': 6239,
}


def uniqAndSort(output):
    """Rewrite the file *output* in place with its lines sorted and unique.

    The result is first written to ``output + '.sorted2'`` and then moved over
    the original, so *output* is only replaced once the sorted copy is
    complete.
    """
    sorted_path = output + '.sorted2'

    with open(output, "r") as source:
        unique_lines = sorted(set(source.readlines()))

    # Close (and therefore flush) the sorted file before moving it; moving a
    # file that still has an open, buffered writer can lose data.
    with open(sorted_path, "w") as target:
        target.writelines(unique_lines)

    shutil.move(sorted_path, output)


def _load_entries(path):
    """Return the parsed JSON content of *path*."""
    with open(path) as data_file:
        return json.load(data_file)


def _write_tsv(path, rows):
    """Write *rows* to *path* as tab-separated values, then sort/uniq it."""
    with open(path, 'w') as tsvfile:
        csv.writer(tsvfile, delimiter='\t').writerows(rows)
    uniqAndSort(path)


def transformLabel(input, output):
    """Write ``subject<TAB>subject_label`` rows for every entry of *input*.

    *input* is the path of a JSON file holding a list of objects with the
    keys ``"subject"`` and ``"subject_label"``; *output* is the TSV file to
    create. The resulting file is sorted and free of duplicate lines.

    Raises ``KeyError`` if an entry lacks one of the expected keys.
    """
    entries = _load_entries(input)
    _write_tsv(
        output,
        ([entry["subject"], entry["subject_label"]] for entry in entries),
    )


def transformAssociation(input, output):
    """Write one ``subject<TAB>object`` row per association found in *input*.

    *input* is the path of a JSON file holding a list of objects with the
    keys ``"subject"`` and ``"objects"`` (an iterable of object identifiers);
    *output* is the TSV file to create. The resulting file is sorted and free
    of duplicate lines.

    Raises ``KeyError`` if an entry lacks one of the expected keys.
    """
    entries = _load_entries(input)
    _write_tsv(
        output,
        (
            [entry["subject"], obj]
            for entry in entries
            for obj in entry["objects"]
        ),
    )


def main():
    """Parse the command line and export the TSV files for each species.

    Without ``--taxon`` every prefix in ``TAXON_MAP`` is processed; otherwise
    the comma-separated prefixes given by the user are used. Unknown prefixes
    abort with a usage error before anything is created or downloaded.
    """
    parser = argparse.ArgumentParser(
        description='Fetcher from biolink for monarch data',
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('-t', '--taxon', type=str, required=False,
                        help='species prefix: ' + ",".join(TAXON_MAP.keys()))
    args = parser.parse_args()

    if args.taxon is None:
        tax_list = list(TAXON_MAP)
    else:
        tax_list = args.taxon.split(',')

    # Reject unknown prefixes up front: otherwise a directory would be created
    # and a meaningless ".../NCBITaxon:None" URL would be requested.
    unknown = [tax for tax in tax_list if tax not in TAXON_MAP]
    if unknown:
        parser.error("unknown species prefix: " + ",".join(unknown))

    print("Running for: " + ",".join(tax_list))
    for tax in tax_list:
        if not os.path.exists(tax):
            os.makedirs(tax)

        obj = "phenotype"
        # Human data is exported per disease and per case instead of per gene.
        subjs = ["disease", "case"] if tax == "Hs" else ["gene"]

        for subj in subjs:
            assocFileJson = tax + "/" + subj + "-" + obj + ".json"
            assocFileTsv = tax + "/" + subj + "-" + obj + ".tsv"
            labelFileTsv = tax + "/" + subj + "-label.tsv"

            assocURL = (BIOLINK_URL + "/mart/" + subj + "/" + obj
                        + "/NCBITaxon:" + str(TAXON_MAP[tax]))
            print("fetching " + assocURL)
            try:
                urlretrieve(assocURL, assocFileJson)
                transformAssociation(assocFileJson, assocFileTsv)
                transformLabel(assocFileJson, labelFileTsv)
            finally:
                # The JSON download is only an intermediate artefact; remove
                # it even when a transformation fails.
                if os.path.exists(assocFileJson):
                    os.remove(assocFileJson)


if __name__ == "__main__":
    main()