- Request for API Access to OMIM – https://omim.org/api

- A set of scripts to extract textual information and much more from OMIM – https://github.com/amirieb/omim-api

In [1]:
import gzip
import json
import os
import sys
import time
from random import randint

import requests

# sys.path.append('../../')

# from scripts.utils import read_data

In [3]:
def read_data(jsonfilename):
    """Load a JSON document from a plain or gzip-compressed file.

    Parameters
    ----------
    jsonfilename : str or path-like
        Path of the file to read. A name ending in ``.gz`` is decompressed
        with gzip and decoded as UTF-8; anything else is opened as plain text.

    Returns
    -------
    The deserialized JSON data (whatever ``json.load`` produces).

    The elapsed wall-clock time is printed once loading has finished.
    """
    # `time` is imported as a module in this notebook, so the clock is time.time().
    t0 = time.time()
    if str(jsonfilename).endswith('.gz'):
        # Text mode decompresses and decodes in one step.
        with gzip.open(jsonfilename, 'rt', encoding='utf-8') as fin:
            data = json.load(fin)
    else:
        # Context manager guarantees the handle is closed, even on a parse error.
        with open(jsonfilename, 'r') as fin:
            data = json.load(fin)
    print("done in %0.3fs" % (time.time() - t0))
    return data


In [3]:
# Default location of the JSON cache holding every OMIM entry downloaded so far.
omim_full_path = 'OMIM_FULL_May_2023.json'

def dump_data(data, path=None):
    """Write ``data`` as JSON to ``path``.

    Parameters
    ----------
    data : JSON-serializable object
        The object to persist.
    path : str or path-like, optional
        Destination file. Defaults to the module-level ``omim_full_path``.

    The JSON is written to a sibling ``.tmp`` file first and then moved into
    place with ``os.replace``, so an interruption in the middle of a write
    cannot leave a truncated file at the destination.
    """
    target = str(omim_full_path if path is None else path)
    tmp_path = target + '.tmp'
    with open(tmp_path, 'w') as outfile:
        json.dump(data, outfile)
    os.replace(tmp_path, target)

___

In [2]:
# Set your OMIM API key here, see https://omim.org/api
# Preferred: export it as the OMIM_API_KEY environment variable before starting
# Jupyter so the key never ends up saved inside the notebook. If the variable
# is not set, API_KEY falls back to an empty string.
API_KEY = os.environ.get('OMIM_API_KEY', '')

___
__Phenotypic Series Titles__

- Manually download the Phenotypic Series info as a `tsv` file from https://www.omim.org/phenotypicSeriesTitles/all/

In [2]:
!cat OMIM-Phenotypic-Series-Titles-all.tsv | head -n 20

OMIM Phenotypic Series Titles 
Downloaded:	May 27, 2023
Copyright (c) 1966-2023 Johns Hopkins University OMIM, data are provided for research purposes only.

Phenotypic Series Title	Phenotypic Series number
Abdominal obesity-metabolic syndrome	PS605552
Achondrogenesis	PS200600
Acne inversa	PS142690
Acrodysostosis	PS101800
Acromesomelic dysplasia	PS602875
Adams-Oliver syndrome	PS100300
Advanced sleep phase syndrome	PS604348
Agammaglobulinemia	PS601495
Aicardi-Goutieres syndrome	PS225750
Alagille syndrome	PS118450
Alopecia, isolated	PS203655
Alopecia-intellectual disability syndrome	PS203650
Alport syndrome	PS301050
Alternating hemiplegia of childhood	PS104290
Amelogenesis imperfecta	PS104500


___
- Then download these files – `https://omim.org/downloads/<your-api-key>`

In [3]:
# Fetch the most recent list of MIM ids to download from omim.org
# and store it next to the notebook as mim2gene.txt.
mim2gene_url = 'https://omim.org/static/omim/data/mim2gene.txt'
os.system('curl -o ./mim2gene.txt ' + mim2gene_url)

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  920k  100  920k    0     0  1104k      0 --:--:-- --:--:-- --:--:-- 1114k


0

In [6]:
# Download the key-protected OMIM files into the working directory.
# The key must come from API_KEY (the name defined earlier in this notebook);
# the previous lowercase `api_key` was never defined and raised NameError on a
# fresh kernel.
download_status = {
    fname: os.system(f'curl -o ./{fname} https://data.omim.org/downloads/{API_KEY}/{fname}')
    for fname in ('mimTitles.txt', 'genemap2.txt', 'morbidmap.txt')
}
# Exit status of each curl call (0 means curl itself reported success).
download_status

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 2887k  100 2887k    0     0  2342k      0  0:00:01  0:00:01 --:--:-- 2355k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 3235k  100 3235k    0     0  2906k      0  0:00:01  0:00:01 --:--:-- 2922k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  729k  100  729k    0     0   816k      0 --:--:-- --:--:-- --:--:--  820k


0

___

__Extracting description of OMIM phenotype and gene entries.__

Here I aim to use scripts from https://github.com/amirieb/omim-api to get the most up-to-date OMIM data.

In [4]:
# Build the work list for the downloader:
#   omim_data['omim'] - entries already on disk (reloaded from a previous run)
#   mims              - MIM ids (as strings) that still have to be fetched
omim_data = {'omim': []}

# The first tab-separated column of every non-comment line is taken as a MIM id.
with open('mim2gene.txt', 'r') as mim2gene_file:
    mims = [line.split('\t')[0].strip() for line in mim2gene_file if not line.startswith('#')]
mims = [mim for mim in mims if mim]  # ignore blank lines

# if any, reload entries that have been downloaded so far.
mim_ids_down = set()
if os.path.exists(omim_full_path):
    omim_down = read_data(omim_full_path)
    for row in omim_down['omim']:
        mim = row['entry']['mimNumber']
        if mim not in mim_ids_down:
            mim_ids_down.add(mim)
            omim_data['omim'].append(row)
    # Filter once against a set instead of list.index()/del per entry: linear
    # rather than quadratic, and a cached id that is no longer listed in
    # mim2gene.txt no longer raises ValueError.
    already_done = {str(mim) for mim in mim_ids_down}
    mims = [mim for mim in mims if mim not in already_done]
print('#downloaded mims: ', len(mim_ids_down))
print('#to download mims: ', len(mims))

done in 3.250s
#downloaded mims:  28302
#to download mims:  0


In [None]:
# Send queries to the API in batches and collect the returned entries in
# omim_data['omim'].
# cnt_item: number of items to download per request.. double check, THERE IS A LIMIT IMPOSED BY THE API
i, cnt_item, mim_errors = 0, 19, []
while i < len(mims):
    time.sleep(randint(5, 15))  # random pause before every request
    batch = mims[i:i + cnt_item]
    query = 'https://api.omim.org/api/entry?mimNumber=' + ','.join(batch) + '&format=json&include=all'
    # Log progress without the apiKey parameter so the key is not stored in the cell output.
    print(i + cnt_item, query)
    try:
        r = requests.get(query + '&apiKey=' + API_KEY, timeout=60)
        r.raise_for_status()  # turn HTTP error responses into exceptions
        data = r.json()
        omim_data['omim'].extend(data['omim']['entryList'])
    except Exception as exc:
        # `except ():` (an empty tuple) matches no exception at all, so the old
        # handler never ran. Record the failed batch, save what we have so far,
        # and move on; the failed ids are picked up again when the notebook is
        # re-run, because they are still missing from the saved file.
        mim_errors.append(batch)
        # Print only the exception class: the message may contain the full URL with the key.
        print("Unexpected error:", type(exc))
        dump_data(omim_data)
    # Always advance, otherwise a persistently failing batch would loop forever.
    i += cnt_item

In [None]:
# HTTP status of the most recent API response; 200 means the request succeeded.
# `r` only exists once the download loop has sent at least one request, so show
# None instead of raising NameError when there was nothing left to download.
r.status_code if 'r' in globals() else None

In [None]:
# Persist everything collected so far, then report whether any batch failed.
dump_data(omim_data)
if mim_errors:
    print('\n errors for these mim ids: ', mim_errors)
    print('\n re-run the code to restart downloading from the last downloaded mim id.')
else:
    print('Done!')

In [9]:
ls -lh OMIM_FULL_May_2023.json

-rw-r--r--  1 abearab  staff   362M May 29 21:46 OMIM_FULL_May_2023.json


### Compress the downloaded data

In [5]:
# Compress the JSON dump with gzip.
!gzip OMIM_FULL_May_2023.json

In [6]:
ls -lh OMIM_FULL_May_2023.json.gz

-rw-r--r--  1 abearab  staff   101M May 29 21:46 OMIM_FULL_May_2023.json.gz


In [7]:
# Print the current date and time as a timestamp for this run.
!date

Mon May 29 21:47:19 PDT 2023
