<a href="https://colab.research.google.com/github/mouktik05/research/blob/main/mouktik_cdna.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install biopython requests


Collecting biopython
  Downloading biopython-1.83-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: biopython
Successfully installed biopython-1.83


In [None]:
import requests

In [None]:
def get_transcript_exons(transcript_id):
    """Fetch exon information for a given Ensembl transcript ID."""
    #url = f"https://rest.ensembl.org/overlap/id/{transcript_id}?feature=gene;content-type=application/json"
    url = f"https://grch37.rest.ensembl.org/overlap/id/{transcript_id}?feature=gene;content-type=application/json"
    #https://grch37.rest.ensembl.org/overlap/id/ENST00000025008?feature=gene;content-type=application/json
    #https://rest.ensembl.org/overlap/id/ENST00000025008?feature=gene;content-type=application/json
    #response = requests.get(url, headers={"Content-Type": "application/json"})
    response = requests.get(url, json={"start": "value"})
    if response.status_code != 200:
        raise Exception(f"API request failed with status code {response.status_code}")
    return response.json()

def map_cdna_to_genomic(transcript_id, cdna_position):
    """Map a cDNA position to a genomic coordinate using Ensembl."""
    exons = get_transcript_exons(transcript_id)
    total_cdna_len = 0
    for exon in sorted(exons, key=lambda x: x['start']):
        exon_cdna_start = total_cdna_len + 1
        exon_cdna_end = total_cdna_len + exon['end'] - exon['start'] + 1
        total_cdna_len = exon_cdna_end

        if exon_cdna_start <= cdna_position <= exon_cdna_end:
            genomic_pos = exon['start'] + (cdna_position - exon_cdna_start)
            return exon['seq_region_name'], genomic_pos, exon['strand']

    raise ValueError(f"cDNA position {cdna_position} out of range for transcript {transcript_id}")


In [None]:

transcript_id = "ENST00000003302"  # Replace with your transcript ID for USP28
cdna_position = 2194  # Replace with your cDNA position
chromosome, genomic_position, strand = map_cdna_to_genomic(transcript_id, cdna_position)
print(f"Chromosome: {chromosome}, Genomic Position: {genomic_position}, Strand: {strand}")



Chromosome: 11, Genomic Position: 113670789, Strand: -1


In [None]:
# prompt: in the below cell, ignore exception in with loop and continue with other records

transcript_id = "ENST00000003302"  # Replace with your transcript ID for USP28
cdna_position = 2194  # Replace with your cDNA position

try:
    chromosome, genomic_position, strand = map_cdna_to_genomic(transcript_id, cdna_position)
    print(f"Chromosome: {chromosome}, Genomic Position: {genomic_position}, Strand: {strand}")
except Exception as e:
    print(f"Error processing transcript {transcript_id}: {e}")
    # Continue with other records
    pass


In [13]:
# prompt: python program to read from a CSV, extract column coding_region_effect and write to another file
import re
import csv
import requests

def get_transcript_exons(transcript_id):
    """Fetch exon information for a given Ensembl transcript ID."""
    #url = f"https://rest.ensembl.org/overlap/id/{transcript_id}?feature=gene;content-type=application/json"
    url = f"https://grch37.rest.ensembl.org/overlap/id/{transcript_id}?feature=gene;content-type=application/json"
    #https://grch37.rest.ensembl.org/overlap/id/ENST00000025008?feature=gene;content-type=application/json
    #https://rest.ensembl.org/overlap/id/ENST00000025008?feature=gene;content-type=application/json
    #response = requests.get(url, headers={"Content-Type": "application/json"})
    response = requests.get(url, json={"start": "value"})
    if response.status_code != 200:
        raise Exception(f"API request failed with status code {response.status_code}")
    return response.json()

def map_cdna_to_genomic(transcript_id, cdna_position):
    """Map a cDNA position to a genomic coordinate using Ensembl."""
    exons = get_transcript_exons(transcript_id)
    total_cdna_len = 0
    for exon in sorted(exons, key=lambda x: x['start']):
        exon_cdna_start = total_cdna_len + 1
        exon_cdna_end = total_cdna_len + exon['end'] - exon['start'] + 1
        total_cdna_len = exon_cdna_end

        if exon_cdna_start <= cdna_position <= exon_cdna_end:
            genomic_pos = exon['start'] + (cdna_position - exon_cdna_start)
            return exon['seq_region_name'], genomic_pos, exon['strand']

    raise ValueError(f"cDNA position {cdna_position} out of range for transcript {transcript_id}")


def parse_variant(variant_str):
    # Define a regular expression pattern to extract information
    pattern = r'^(ENST\d+)\((\w+)\):c\.(-?\d+)([ACGTNacgtn])>([ACGTNacgtn])$'

    # Use regex to match the pattern in the variant string
    match = re.match(pattern, variant_str)

    if match:
        enst = match.group(1)  # ENST ID
        gene_name = match.group(2)  # Gene name
        position = match.group(3)  # Position
        ref_base = match.group(4).upper()  # Reference base
        alt_base = match.group(5).upper()  # Alternate base

        return enst, gene_name, position, ref_base, alt_base
    else:
        return 'NA', 'NA', 'NA', 'NA', 'NA'

# Open the input CSV file
with open('odbfile.csv', 'r') as csv_file:
    csv_reader = csv.reader(csv_file)

    # Open the output file for writing
    with open('output.csv', 'w', newline='') as output_file:
        csv_writer = csv.writer(output_file)

        # Write the header row
        csv_writer.writerow(['coding_region_effect','enst','gene_name','position','ref_base','alt_base','chromosome', 'genomic_position', 'strand'])

        # Iterate through the input rows
        for row in csv_reader:
            # Extract the 'coding_region_effect' column value
            coding_region_effect = row[5]
            enst, gene_name, position, ref_base, alt_base = parse_variant(coding_region_effect)




            if enst != 'NA':
              print("position value is ", position)
              try:
                chromosome, genomic_position, strand = map_cdna_to_genomic(enst, int(position))

              except Exception as e:
                print(f"Error processing transcript {enst}: {e}")
                # Continue with other records
                pass

            else:
              chromosome = 'NA'
              genomic_position = 'NA'
              strand = 'NA'


            # Write the coding region effect value to the output file
            csv_writer.writerow([coding_region_effect,enst,gene_name,position,ref_base,alt_base,chromosome, genomic_position, strand])

position value is  2194
position value is  474
position value is  1589
position value is  1277
position value is  612
position value is  958
position value is  3829
position value is  243
position value is  3630
position value is  2382
position value is  629
position value is  5374
position value is  212
position value is  530
position value is  2278
position value is  2530
position value is  2530
position value is  -47
Error processing transcript ENST00000078445: cDNA position -47 out of range for transcript ENST00000078445
position value is  650
position value is  3031
position value is  2889
position value is  2889
position value is  2397
position value is  2258
position value is  2375
position value is  1518
position value is  1268
position value is  1144
position value is  362
position value is  1780
position value is  472
position value is  18184
position value is  3254
position value is  3469
position value is  2058
position value is  2219
position value is  89
position value is

KeyboardInterrupt: 