In [1]:
# Read files
import requests
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
from ratelimit import limits, sleep_and_retry

# Load the cis and trans result tables, then stack them into a single frame
# with a fresh 0..n-1 index.
cis, trans = (pd.read_csv(csv_path) for csv_path in ("sig-cis.csv", "sig-trans.csv"))
data = pd.concat((cis, trans), ignore_index=True)

In [2]:
# Set the Ensembl API endpoint and species
# `server` is the base URL; the lookup path is appended to it for each request.
server = "https://rest.ensembl.org"
# `species` fills the species segment of the symbol-lookup path.
species = "human"

# Define a function to query the Ensembl API for gene coordinates
@sleep_and_retry
@limits(calls=15, period=1)
def get_gene_coordinates(gene_name, timeout=30, max_retries=2):
    """Look up the start and end coordinates of a gene symbol via the Ensembl REST API.

    The call is rate limited to 15 invocations per second; when the budget is
    exhausted the decorators sleep and retry instead of raising.

    Parameters
    ----------
    gene_name : str
        Gene symbol to look up for the module-level ``species``.
    timeout : float, optional
        Seconds to wait for the server before giving up on one attempt.
    max_retries : int, optional
        Extra attempts made after a transient failure (network error,
        HTTP 429 or HTTP 5xx).

    Returns
    -------
    tuple
        ``(start, end)`` as reported by the API, or ``("N/A", "N/A")`` when the
        symbol is missing/blank, unknown to the server, or the lookup keeps
        failing.
    """
    import time
    from urllib.parse import quote

    not_found = ("N/A", "N/A")

    # pandas hands missing symbols over as NaN (a float); there is nothing to
    # look up for those or for blank strings, so skip the request entirely.
    if not isinstance(gene_name, str) or not gene_name.strip():
        return not_found

    # Escape the symbol so characters such as "/" or spaces cannot alter the
    # request path. Only start/end are read, so the expanded transcript
    # payload (expand=1) is not requested.
    url = f"{server}/lookup/symbol/{species}/{quote(gene_name.strip(), safe='')}"

    for attempt in range(max_retries + 1):
        try:
            response = requests.get(
                url,
                headers={"Content-Type": "application/json"},
                timeout=timeout,  # without a timeout one stalled socket blocks the whole run
            )
        except requests.RequestException:
            # Network-level failure: treat as transient and retry below.
            response = None

        if response is not None:
            if response.ok:
                try:
                    gene_data = response.json()
                except ValueError:
                    return not_found
                if not isinstance(gene_data, dict):
                    return not_found
                return (gene_data.get("start", "N/A"), gene_data.get("end", "N/A"))
            # Any client error other than 429 is definitive (e.g. unknown
            # symbol); retrying would not change the answer.
            if response.status_code != 429 and response.status_code < 500:
                return not_found

        if attempt < max_retries:
            # Honour the server's Retry-After hint when present, otherwise
            # back off exponentially (1s, 2s, ...).
            delay = 2 ** attempt
            if response is not None:
                try:
                    delay = float(response.headers.get("Retry-After", delay))
                except (TypeError, ValueError):
                    pass
            time.sleep(delay)

    return not_found

# Add the gene start/end columns, initialised to empty strings
data["GeneStart"] = data["GeneEnd"] = ""

# Define a function to query gene coordinates for a batch of gene symbols
def query_gene_coordinates_batch(gene_symbols, max_workers=8):
    """Look up coordinates for a batch of gene symbols.

    Lookups are issued from a thread pool so network latency overlaps; the
    per-call rate limiting applied to ``get_gene_coordinates`` still governs
    how many requests are made per second.

    Parameters
    ----------
    gene_symbols : iterable
        Gene symbols to look up.
    max_workers : int, optional
        Upper bound on concurrent lookups.

    Returns
    -------
    list
        One ``get_gene_coordinates`` result per input symbol, in input order.
    """
    symbols = list(gene_symbols)
    if not symbols:
        return []
    # Never start more threads than there are symbols, and always at least one.
    workers = max(1, min(max_workers, len(symbols)))
    with ThreadPoolExecutor(max_workers=workers) as pool:
        # Executor.map yields results in the order of its inputs.
        return list(pool.map(get_gene_coordinates, symbols))

# Query gene coordinates for all unique gene symbols in batches
# Missing symbols (NaN) cannot be looked up, so they are excluded up front;
# their rows keep whatever GeneStart/GeneEnd already hold.
gene_symbols = data["GeneSymbol"].dropna().unique().tolist()
batch_size = 1000
total_symbols = len(gene_symbols)

# Collect symbol -> coordinate lookups first and write them into the frame once
# at the end, instead of scanning the whole frame twice for every gene.
start_by_symbol = {}
end_by_symbol = {}
for i in range(0, total_symbols, batch_size):
    batch_symbols = gene_symbols[i:i+batch_size]
    # Report the real upper bound: the final batch is usually shorter.
    print(f"Querying genes {i+1}-{i+len(batch_symbols)} of {total_symbols}")
    batch_results = query_gene_coordinates_batch(batch_symbols)
    for gene_symbol, (gene_start, gene_end) in zip(batch_symbols, batch_results):
        start_by_symbol[gene_symbol] = gene_start
        end_by_symbol[gene_symbol] = gene_end

# Fill both columns in one pass each, only for rows that have a symbol.
has_symbol = data["GeneSymbol"].notna()
data.loc[has_symbol, "GeneStart"] = data.loc[has_symbol, "GeneSymbol"].map(start_by_symbol)
data.loc[has_symbol, "GeneEnd"] = data.loc[has_symbol, "GeneSymbol"].map(end_by_symbol)

Querying genes 1-1000 of 17405
Querying genes 1001-2000 of 17405
Querying genes 2001-3000 of 17405
Querying genes 3001-4000 of 17405
Querying genes 4001-5000 of 17405
Querying genes 5001-6000 of 17405
Querying genes 6001-7000 of 17405
Querying genes 7001-8000 of 17405
Querying genes 8001-9000 of 17405
Querying genes 9001-10000 of 17405
Querying genes 10001-11000 of 17405
Querying genes 11001-12000 of 17405
Querying genes 12001-13000 of 17405
Querying genes 13001-14000 of 17405
Querying genes 14001-15000 of 17405
Querying genes 15001-16000 of 17405
Querying genes 16001-17000 of 17405
Querying genes 17001-18000 of 17405


In [6]:
# Display the column index to check that GeneStart and GeneEnd are present.
data.columns

Index(['Pvalue', 'SNP', 'SNPChr', 'SNPPos', 'AssessedAllele', 'OtherAllele',
       'Zscore', 'Gene', 'GeneSymbol', 'GeneChr', 'GenePos', 'NrCohorts',
       'NrSamples', 'FDR', 'BonferroniP', 'GeneStart', 'GeneEnd'],
      dtype='object')

In [4]:
# Write the combined table to CSV; index=False leaves the row index out of the file.
data.to_csv('sig-combined-with-genes.csv', index=False)