## Libraries / Setup

In [16]:
from biomart import BiomartServer
import os
import pandas as pd
import sys
import gtfparse


In [17]:
ref_sheet=pd.read_csv('/home/matthew.schmitz/Reference_Genome_tracking.csv')
ref_sheet=ref_sheet.loc[~ref_sheet['English Name'].isna(),:]

In [20]:
ref_sheet['CR6 ARC 2.0 reference in BICore folder'][0]

'/allen/programs/celltypes/workgroups/rnaseqanalysis/references/ferret/ncbi/asm1176430v1.1/genome/'

In [18]:
for i in ref_sheet['CR6 ARC 2.0 reference in BICore folder']:
    print(os.listdir(os.path.join(i,'regions')))

['transcripts.bed', 'tss.bed']
[]
['transcripts.bed', 'tss.bed']
['transcripts.bed', 'tss.bed']
['transcripts.bed', 'tss.bed']
['transcripts.bed', 'tss.bed']
['transcripts.bed', 'tss.bed']
['transcripts.bed', 'tss.bed']


FileNotFoundError: [Errno 2] No such file or directory: '<- to move over/regions'

In [15]:
!nvidia-smi

Thu Oct 12 17:24:51 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.30.02              Driver Version: 530.30.02    CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                  Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-PCIE-40GB           On | 00000000:64:00.0 Off |                    0 |
| N/A   27C    P0               31W / 250W|      0MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

## Map gene symbol to ensemble ID using gtf files

For later...

In [153]:
species_to_codes = {"gorilla": "ggorilla",
            "chimp": "ptroglodytes",
            "marmoset": "cjacchus",
            "rhesus": "mmulatta",
            "human": "hsapiens"
             }

## Locations of gtf files to map symbole to ensemble ID
gtf_paths = {"gorilla": "/allen/programs/celltypes/workgroups/rnaseqanalysis/EvoGen/great_apes/species/inputs/Gorilla_gorilla.gorGor4.110.gtf",
            "chimp": "/allen/programs/celltypes/workgroups/rnaseqanalysis/references/chimp/ncbi/pantro/premrna/genes/genes.gtf",
            "marmoset": "/allen/programs/celltypes/workgroups/rnaseqanalysis/references/marmoset/ncbi/mcalja1.2.pat.x/genome/genes/genes.gtf",
            "rhesus": "/allen/programs/celltypes/workgroups/rnaseqanalysis/references/macaque/ncbi/mmul10/genome/genes/genes.gtf",
            "human": "/allen/programs/celltypes/workgroups/hct/SEA-AD/RNAseq/cellxgene/input/genes.gtf"}

gtfs={}
for s in gtf_paths.keys():
    print(s)
    gtfs[s]=gtfparse.parse_gtf_and_expand_attributes(gtf_paths[s])

#Just take a peek
for s in gtfs.keys():
    print(s,species_to_codes[s])
    print(gtfs[s])


## Load tables of unmapped genes

In [4]:
gene_path='/home/matthew.schmitz/nhp_unmapped_genes'
unmapped={}
for p in os.listdir(gene_path):
    if '.csv' in p:
        s=p.split('_')[0]
        unmapped[s]=pd.read_csv(os.path.join(gene_path,p))

In [196]:
for s in unmapped.keys():
    print(s)
    print(len(unmapped[s]['gene'].unique()))


human
50242
chimp
56936
marmoset
27125
rhesus
40278
gorilla
55217


# Fetch most current biomart tables

Also appears that many missing IDs are symbols mixed in, match across several columns table

In [190]:
def download_biomart_table(dataset, filename):
    #all_attributes = [x for x in dataset.attributes.keys() if 'hsapiens' in x]
    all_attributes = [
        'ensembl_gene_id',
        'external_gene_name',
        'hsapiens_homolog_associated_gene_name',
        'hsapiens_homolog_ensembl_gene',
        'hsapiens_homolog_orthology_confidence'
    ]

    response = dataset.search({
        'attributes': all_attributes
    }, header=1)  # header=1 will include the column names

    with open(filename, 'wb') as f:
        f.write('\t'.join(all_attributes).encode('ascii')+b'\n')
        for line in response.iter_lines():
            f.write(line + b'\n')

def find_matching_rows(strings, df,colnames=None):
    matches = {}
    if colnames is None:
        colnames=df.columns
    use_df=df.loc[:,colnames]
    for s in tqdm.tqdm(strings):
        mask = use_df.isin([s]).any(axis=1)
        matches[s] = df[mask]
    return matches

def get_human_orthologs(species_name, identifiers, cache_path):
    #Returns a dictionary of {original_key: dataframe of matched rows}
    server = BiomartServer("http://www.ensembl.org/biomart")
    dataset_name = f"{species_name.lower()}_gene_ensembl"
    
    # Construct cache_path based on dataset_name
    cache_name = os.path.join(cache_path,dataset_name + "_table.txt")
    
    dataset = server.datasets[dataset_name]

    # Check cache and download if necessary
    if not os.path.exists(cache_name):
        print("Cache not found. Downloading BioMart table...")
        download_biomart_table(dataset, cache_name)
        print(f"Downloaded and saved to {cache_name}")

    # Read the table into a DataFrame
    df = pd.read_csv(cache_name, sep='\t', dtype=str)
    
    # Filter using Ensembl IDs and gene symbols and add 'original_identifier' column
    #filtered_df = df[df['ensembl_gene_id'].isin(identifiers) | df['external_gene_name'].isin(identifiers)].copy()
    #filtered_df['original_identifier'] = filtered_df.apply(lambda row: row['ensembl_gene_id'] if row['ensembl_gene_id'] in identifiers else row['external_gene_name'], axis=1)
    #orthologs = {row['original_identifier']: row.drop('original_identifier').to_dict() for _, row in filtered_df.iterrows()}

    orthologs=find_matching_rows(identifiers, df,df.columns[df.columns.str.contains('gene')])        
    return orthologs


In [None]:
mapped={}
for s in unmapped.keys():
    print(s)
    if s=='human':
        continue
    mapped[s]=get_human_orthologs(species_to_codes[s],unmapped[s]['gene'].unique(),cache_path='/home/matthew.schmitz/cache/')


In [156]:
def concatenate_dataframes(dataframes,key_name='key'):
    concatenated = pd.concat(dataframes.values(), keys=dataframes.keys(), axis=0).reset_index(level=1, drop=True).reset_index()
    concatenated = concatenated.rename(columns={'index': key_name})
    return concatenated

mapped_dfs={}
for s in mapped.keys():
    mapped_dfs[s]=concatenate_dataframes(mapped[s],'original_id')

In [188]:
for s in mapped_dfs.keys():
    mapped_dfs[s].to_csv(os.path.join('/home/matthew.schmitz/nhp_unmapped_genes',s+'_mapped.txt'),sep='\t',header=True)

In [159]:
for s in mapped.keys():
    print(unmapped[s].shape)
    print(mapped_dfs[s].shape)

(56954, 3)
(22267, 6)
(27126, 3)
(15255, 6)
(40280, 3)
(17005, 6)
(55218, 3)
(31768, 6)


In [184]:
#Cases where one chimp symbol maps to multiple chimp ensids
mapped_dfs['chimp'].loc[mapped_dfs['chimp']['original_id'].duplicated(keep=False),:]

Unnamed: 0,original_id,ensembl_gene_id,external_gene_name,hsapiens_homolog_associated_gene_name,hsapiens_homolog_ensembl_gene,hsapiens_homolog_orthology_confidence
203,ACTL10,ENSPTRG00000044260,,ACTL10,ENSG00000288649,1
204,ACTL10,ENSPTRG00000047541,,ACTL10,ENSG00000288649,1
218,ACTR2,ENSPTRG00000011990,,ACTR2,ENSG00000138071,1
219,ACTR2,ENSPTRG00000033914,,ACTR2,ENSG00000138071,0
287,ADAP1,ENSPTRG00000041621,,ADAP1,ENSG00000105963,1
...,...,...,...,...,...,...
22199,ZNF91,ENSPTRG00000043231,,ZNF91,ENSG00000167232,0
22200,ENSPTRG00000019240,ENSPTRG00000019240,ZNF92,ZNF723,ENSG00000268696,0
22201,ENSPTRG00000019240,ENSPTRG00000019240,ZNF92,ZNF737,ENSG00000237440,0
22257,ZXDA,ENSPTRG00000021962,,ZXDA,ENSG00000198205,1


In [185]:
#cases where multiple chimp symbols map to a single ensid
mapped_dfs['chimp'].loc[mapped_dfs['chimp']['ensembl_gene_id'].duplicated(keep=False),:].sort_values('ensembl_gene_id')

Unnamed: 0,original_id,ensembl_gene_id,external_gene_name,hsapiens_homolog_associated_gene_name,hsapiens_homolog_ensembl_gene,hsapiens_homolog_orthology_confidence
10717,NBPF12,ENSPTRG00000000232,,NBPF12,ENSG00000268043,0
10715,NBPF10,ENSPTRG00000000232,,NBPF10,ENSG00000271425,0
10714,NBPF1,ENSPTRG00000000232,,NBPF1,ENSG00000219481,0
10718,NBPF14,ENSPTRG00000000232,,NBPF14,ENSG00000270629,0
10728,NBPF9,ENSPTRG00000000232,,NBPF9,ENSG00000269713,0
...,...,...,...,...,...,...
20588,TSPY2,ENSPTRG00000052575,,TSPY2,ENSG00000168757,0
20582,TSPY1,ENSPTRG00000052575,,TSPY1,ENSG00000258992,0
12282,PGA3,ENSPTRG00000052811,,PGA3,ENSG00000229859,0
12288,PGA5,ENSPTRG00000052811,,PGA5,ENSG00000256713,0


In [209]:
gtf_outs={}
for s in gtfs.keys():
    print(s,species_to_codes[s])
    if s=='human':
        continue
    intermediate=get_human_orthologs(species_to_codes[s],gtfs[s]['gene_id'].unique(),cache_path='/home/matthew.schmitz/cache/')
    gtf_outs[s]=concatenate_dataframes(intermediate,'original_id')
    gtf_outs[s].to_csv(os.path.join('/home/matthew.schmitz/nhp_unmapped_genes',s+'_mapped_whole_GTF.txt'),sep='\t',header=True)

gorilla ggorilla


100%|██████████| 30084/30084 [02:10<00:00, 230.29it/s]


chimp ptroglodytes


100%|██████████| 59133/59133 [03:56<00:00, 250.38it/s]


marmoset cjacchus


100%|██████████| 35750/35750 [03:37<00:00, 164.17it/s]


rhesus mmulatta


100%|██████████| 35219/35219 [04:16<00:00, 137.14it/s]
