### Exercise 1: Finding Mouse Homologs of Human Genes using Ensembl BioMart

In [5]:
import pandas as pd
from pybiomart import Dataset
import mygene
import requests

Input Genes (Human GRCh38):

VEPH1 (ventral anterior homeobox 1)

UGDH (UDP-glucose 6-dehydrogenase)

FRAT2 (frequently rearranged in advanced T-cell lymphomas 2)

RNU6-686P (RNA, U6 small nuclear 686, pseudogene)

XYZ (example gene not in dataset)

In [6]:
input_genes = ["VEPH1", "UGDH", "FRAT2", "RNU6-686P", "XYZ"]
mg = mygene.MyGeneInfo()

Fetch Ensemble datasets (DOES NOT WORK) I try with REST API

In [None]:

# Input genes list
input_genes = ["VEPH1", "UGDH", "FRAT2", "RNU6-686P", "XYZ"]

# Function to get human gene data from Ensembl REST API
def get_gene_data(gene_symbol):
    url = f"https://rest.ensembl.org/lookup/symbol/homo_sapiens/{gene_symbol}?content-type=application/json"
    response = requests.get(url)
    if response.ok:
        return response.json()
    else:
        print(f"Failed to retrieve data for {gene_symbol}. Status Code: {response.status_code}")
        return None

# Function to get mouse orthologs for a human gene using Ensembl REST API
def get_human_mouse_orthologs(gene_symbol):
    url = f"https://rest.ensembl.org/homology/symbol/homo_sapiens/{gene_symbol}?target_species=mus_musculus&content-type=application/json"
    response = requests.get(url)
    if response.ok:
        ortholog_data = response.json()
        print(f"Response for {gene_symbol}:")
        print(ortholog_data)  # Print the full response for debugging
        return ortholog_data
    else:
        print(f"Failed to retrieve ortholog data for gene symbol: {gene_symbol}. Status Code: {response.status_code}")
        return None

# Function to get detailed mouse gene data using Ensembl REST API
def get_mouse_gene_data(mouse_gene_id):
    url = f"https://rest.ensembl.org/lookup/id/{mouse_gene_id}?content-type=application/json"
    response = requests.get(url)
    if response.ok:
        return response.json()
    else:
        print(f"Failed to retrieve data for mouse gene ID: {mouse_gene_id}. Status Code: {response.status_code}")
        return None

# Initialize a list to store results
results = []
ortholog_count = 0
# Loop through each input gene, get human data, and mouse orthologs
for gene_symbol in input_genes:
    # Get human gene data
    human_data = get_gene_data(gene_symbol)
    
    if human_data:
        # Extract human gene information
        human_gene_id = human_data['id']
        human_chromosome = human_data.get('seq_region_name', 'N/A')
        
        # Get mouse ortholog data using gene symbol
        ortholog_info = get_human_mouse_orthologs(gene_symbol)


        if ortholog_info and 'data' in ortholog_info and len(ortholog_info['data']) > 0:
            # Loop through each homology and create a new row for each ortholog found
            for homology in ortholog_info['data'][0]['homologies']:   
                print("Here it is the homology", ortholog_info['data'][0]['homologies'])            #Here there is something missing
                if homology['target']['species'] == 'mus_musculus':
                    # Extract mouse ortholog basic information
                    mouse_gene_id = homology['target']['id']
                    mouse_chromosome = homology['target'].get('location', 'N/A').split(':')[0] if 'location' in homology['target'] else 'N/A'
                    percent_identity = homology['target'].get('perc_id', 0.0)

                    # Get additional mouse gene information
                    mouse_gene_data = get_mouse_gene_data(mouse_gene_id)
                    if mouse_gene_data:
                        mouse_gene_symbol = mouse_gene_data.get('display_name', 'Unknown')
                        mouse_gene_description = mouse_gene_data.get('description', 'N/A')
                        mouse_chromosome = mouse_gene_data.get('seq_region_name', 'N/A') 
                    else:
                        mouse_gene_symbol = "Unknown"
                        mouse_gene_description = "N/A"

                    # Append data to results list (create a new row for each ortholog)
                    results.append({
                        "Human_Gene_Symbol": gene_symbol,
                        "Human_Ensembl_ID": human_gene_id,
                        "Human_Chromosome": human_chromosome,
                        "Mouse_Gene_Symbol": mouse_gene_symbol,
                        "Mouse_Ensembl_ID": mouse_gene_id,
                        "Mouse_Chromosome": mouse_chromosome,
                        "Percent_Identity": percent_identity,
                        "Mouse_Gene_Description": mouse_gene_description
                    })
                    ortholog_count += 1
        else:
            # If ortholog info couldn't be retrieved, append basic human data
            results.append({
                "Human_Gene_Symbol": gene_symbol,
                "Human_Ensembl_ID": human_gene_id,
                "Human_Chromosome": human_chromosome,
                "Mouse_Gene_Symbol": "No Ortholog",
                "Mouse_Ensembl_ID": "No Ortholog",
                "Mouse_Chromosome": "N/A",
                "Percent_Identity": 0.0,
                "Mouse_Gene_Description": "N/A"
            })
    else:
        # If human gene data is not available, append empty data
        results.append({
            "Human_Gene_Symbol": gene_symbol,
            "Human_Ensembl_ID": "Not Found",
            "Human_Chromosome": "N/A",
            "Mouse_Gene_Symbol": "No Ortholog",
            "Mouse_Ensembl_ID": "No Ortholog",
            "Mouse_Chromosome": "N/A",
            "Percent_Identity": 0.0,
            "Mouse_Gene_Description": "N/A"
        })

# Convert the list of dictionaries to a DataFrame
results_df = pd.DataFrame(results)

# Display the resulting DataFrame
print(results_df)

# Save the results to a CSV file
output_file = "human_mouse_orthologs.csv"
results_df.to_csv(output_file, index=False)
print(f"Results saved to {output_file}")
print("ortholog_count = ",ortholog_count)


Response for VEPH1:
{'data': [{'homologies': [{'target': {'cigar_line': '833M', 'taxon_id': 10090, 'perc_pos': 91.2365, 'align_seq': 'MHQLFRLVLGQKDLSKAGDLFSLDDAEIEDSLTEALEQIKVISSSLDYQTNNNDQAVVEICITRITTAIRETESIEKHARALVGLWDSCLEHNLRPAGKDEDTPHAKIASDIMSCILQNYNRTPVMVLAVPIAVKFLHRGSKELCRNMSNYLSLAAITKADLLADHTEGIIKSILQGNAMLLRVLPAVYEKQPQPINRHLAELLALMSQLEQTEQYHLLRLLHVAAKRKDVEVVQKCVPFLIRNLKDSTYNDIILNILIEIAGHEPLALNSFLPMLKEIAEQFPYLTGQMARIFGAVGHVDEERARSCLRYLVSQLANMEHPFHHILLLEIKSITDAFSSILGPHSRDIFRMSNSFTNIAKLLSRQLENSKADSSRRKTSTEVSIPEKLRELNSMEPESEDHEKLQVKIQAFEDKINAESNTPGSGRRYSLDHISKEERKSIRFSRSRSLALNTVLTNGVSVEDNEVEEKAGMHASISLSQIDPLSHGIGKLPFKTDTHGSPLRNSSASHPSIIHTEPETMPETFKENIQEEILEAATSPIEYQDKLYLHLRENLSKVKAYALEIAKKVPIPDQCTIEDTMRSCVAKLFFTCSLKGHYCLYSKSSFILVSQAPQPWIQVMFLSQQSLFPEPLSIQSGSVQFLKALWEKTQDTGAHSFEVAMTESTFPQQKDLEQLQLHLEEVRFFDVFGFSETAGAWQCFMCNNPEKATVVNQDGQPLIEGKLKEKQVRWKFIKRWKTHYFTLAGNQLLFQKGKSKDDPDDSPIELSKVQSVKAVAKKRRDRSLPRAFEIFTDSKTYVFKAKDEKNAEEWLQCINVALAQAKERESREVTTYL', 'protein_id': 'ENSMUSP00000029