In [1]:
!pip install biopython
from Bio import Entrez
from Bio import SeqIO

# Set your email address (required by NCBI Entrez)
Entrez.email = "aleksandra.m.karolak@gmail.com"

# Define the gene symbol
gene_symbol = "USP28"

# Search NCBI Gene database for the gene symbol
handle = Entrez.esearch(db="gene", term=f"{gene_symbol}[Gene Name] AND Homo sapiens[Organism]")
record = Entrez.read(handle)

Collecting biopython
  Downloading biopython-1.83-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: biopython
Successfully installed biopython-1.83


In [2]:
if record["Count"] == "0":
    print(f"No gene record found for {gene_symbol}")
else:
    # Get the first gene ID from the search results
    gene_id = record["IdList"][0]

    # Fetch the gene information using the gene ID
    gene_handle = Entrez.efetch(db="gene", id=gene_id, retmode="xml")
    gene_record = Entrez.read(gene_handle)

    # Extract chromosome location information
    chromosome = gene_record[0]["Entrezgene_locus"][0]["Gene-commentary_products"][0]["Gene-commentary_accession"]

    print(f"The chromosome location of {gene_symbol} gene is: {chromosome}")

    gene_handle.close()
handle.close()

The chromosome location of USP28 gene is: NR_174609


In [4]:
def get_gene_info(gene_symbol):
    try:
        # Search NCBI Gene database for the gene symbol
        handle = Entrez.esearch(db="gene", term=f"{gene_symbol}[Gene Name] AND Homo sapiens[Organism]")
        record = Entrez.read(handle)

        if record["Count"] == "0":
            print(f"No gene record found for {gene_symbol}")
            return None

        # Get the first gene ID from the search results
        gene_id = record["IdList"][0]

        # Fetch the gene information using the gene ID
        gene_handle = Entrez.efetch(db="gene", id=gene_id, retmode="xml")
        gene_record = Entrez.read(gene_handle)

        # Extract chromosome location information
        chromosome = gene_record[0]["Entrezgene_locus"][0]["Gene-commentary_products"][0]["Gene-commentary_accession"]
        gene_start = gene_record[0]["Entrezgene_locus"][0]["Gene-commentary_products"][0]["Gene-commentary_intervals"][0]["Seq-interval_from"]
        gene_end = gene_record[0]["Entrezgene_locus"][0]["Gene-commentary_products"][0]["Gene-commentary_intervals"][0]["Seq-interval_to"]

        print(f"Gene Symbol: {gene_symbol}")
        print(f"Chromosome: {chromosome}")
        print(f"Gene Start Position: {gene_start}")
        print(f"Gene End Position: {gene_end}")

        gene_handle.close()
        handle.close()

    except Exception as e:
        print(f"Error: {e}")

# Call the function with the gene symbol
get_gene_info(gene_symbol)

Error: 'Gene-commentary_intervals'


In [8]:
import re

def parse_variant(variant_str):
    # Define a regular expression pattern to extract information
    pattern = r'^(ENST\d+)\((\w+)\):c\.(-?\d+)([ACGTNacgtn])>([ACGTNacgtn])$'

    # Use regex to match the pattern in the variant string
    match = re.match(pattern, variant_str)

    if match:
        enst = match.group(1)  # ENST ID
        gene_name = match.group(2)  # Gene name
        position = match.group(3)  # Position
        ref_base = match.group(4).upper()  # Reference base
        alt_base = match.group(5).upper()  # Alternate base

        return enst, gene_name, position, ref_base, alt_base
    else:
        raise ValueError("Invalid variant string format")

# Example usage:
variant_str = "ENST00000608377(DNAH9):c.-237A>T"
enst, gene_name, position, ref_base, alt_base = parse_variant(variant_str)

print("ENST:", enst)
print("Gene Name:", gene_name)
print("Position:", position)
print("Reference Base:", ref_base)
print("Alternate Base:", alt_base)


ENST: ENST00000608377 , DNAH9

Position: -237
Reference Base: A
Alternate Base: T
