In [10]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
from ncbi.datasets.openapi import ApiClient as DatasetsApiClient
from ncbi.datasets.openapi import ApiException as DatasetsApiException
from ncbi.datasets import GeneApi
from ncbi.datasets.openapi.model.v1_gene_dataset_request import V1GeneDatasetRequest

In [2]:
def get_gene_accessions_dict(genes_metadata):
    genes_metadata = genes_metadata.to_dict()
    accessions_dict = {}
    genes_dict = {}
    for i in range(len(genes_metadata['genes'])):
        gene_symbol = genes_metadata['genes'][i]['gene']['symbol']
        accessions = []
        for access in range(len(genes_metadata['genes'][i]['gene']['transcripts'])):
            accession = genes_metadata['genes'][i]['gene']['transcripts'][access]['accession_version'].split('.')[0]
            accessions.append(accession)
            genes_dict[accession] = gene_symbol
        accessions_dict[gene_symbol] = accessions
        
    return accessions_dict, genes_dict

def access_to_gene_symbol(accession):
    gene_symbol = ''
    try:
        gene_symbol = genes_dict[accession]
    except KeyError:
        pass
    return gene_symbol

In [21]:
mirdb = pd.read_csv('data/miRDB_v6.0_prediction_result.txt', sep = '\t', header=None, names = ['mirna', 'gene', 'target_value'])
rows_to_drop = mirdb['gene'].str.contains('XM')
mirdb.drop(mirdb.index[rows_to_drop], inplace=True)
print(len(mirdb['gene'].unique()))
transcripts = mirdb['gene'].unique()

In [4]:
ini = 6094
fin = 46278
homo_sapiens_transcripts = transcripts[ini:fin]
hs_mirdb = mirdb[mirdb['gene'].isin(homo_sapiens_transcripts)]
hs_mirdb.to_csv('~/homo_sapiens_transcripts_mirdb.csv', index = False)

In [25]:
geneapi = GeneApi()
req = V1GeneDatasetRequest(accessions=list(homo_sapiens_transcripts))
genes_str = ''
with open('network_genes.txt', 'r') as ng:
    for line in ng:
        genes_str += line
    ng.close()
network_genes = genes_str.split(',')
resp = geneapi.gene_metadata_by_tax_and_symbol(network_genes, taxon = 'Homo sapiens')
genes_metadata = resp.to_dict()

In [37]:
accessions_dict, genes_dict = get_gene_accessions_dict(resp)
accessions_dict

{'AKT3': ['NM_181690',
  'NM_001370074',
  'NM_005465',
  'NM_001206729',
  'XM_016999985',
  'XM_011544014',
  'XM_024446000',
  'XM_024446892',
  'XM_024447938'],
 'LHFPL6': ['NM_005780', 'XM_011534861'],
 'SLC17A2': ['XM_006714949',
  'XM_006714951',
  'NM_005835',
  'NM_001286123',
  'NM_001286125',
  'XM_006714950',
  'XM_017010159',
  'XM_017010160'],
 'KHDRBS1': ['NR_073499', 'NR_073498', 'NM_006559', 'NM_001271878'],
 'GNA13': ['NM_006572', 'NM_001282425', 'XM_011524202'],
 'COL1A1': ['XM_005257058', 'XM_005257059', 'XM_011524341', 'NM_000088'],
 'COL1A2': ['NM_000089'],
 'CRK': ['NM_005206', 'NM_016823'],
 'CLEC14A': ['NM_175060'],
 'DCN': ['NM_001920',
  'NM_133503',
  'NM_133507',
  'NM_133505',
  'NM_133504',
  'NM_133506'],
 'S1PR1': ['NM_001320730', 'NM_001400'],
 'S1PR3': ['NR_172883', 'NR_172882', 'NM_001395848', 'NM_005226'],
 'GPC4': ['NM_001448'],
 'NTNG1': ['XM_017000686',
  'NM_001312688',
  'NM_001372167',
  'NM_014917',
  'XM_017000683',
  'XM_011541025',
  'NM_0

In [34]:
genes_dict

{'NM_181690': 'AKT3',
 'NM_001370074': 'AKT3',
 'NM_005465': 'AKT3',
 'NM_001206729': 'AKT3',
 'XM_016999985': 'AKT3',
 'XM_011544014': 'AKT3',
 'XM_024446000': 'AKT3',
 'XM_024446892': 'AKT3',
 'XM_024447938': 'AKT3',
 'NM_005780': 'LHFPL6',
 'XM_011534861': 'LHFPL6',
 'XM_006714949': 'SLC17A2',
 'XM_006714951': 'SLC17A2',
 'NM_005835': 'SLC17A2',
 'NM_001286123': 'SLC17A2',
 'NM_001286125': 'SLC17A2',
 'XM_006714950': 'SLC17A2',
 'XM_017010159': 'SLC17A2',
 'XM_017010160': 'SLC17A2',
 'NR_073499': 'KHDRBS1',
 'NR_073498': 'KHDRBS1',
 'NM_006559': 'KHDRBS1',
 'NM_001271878': 'KHDRBS1',
 'NM_006572': 'GNA13',
 'NM_001282425': 'GNA13',
 'XM_011524202': 'GNA13',
 'XM_005257058': 'COL1A1',
 'XM_005257059': 'COL1A1',
 'XM_011524341': 'COL1A1',
 'NM_000088': 'COL1A1',
 'NM_000089': 'COL1A2',
 'NM_005206': 'CRK',
 'NM_016823': 'CRK',
 'NM_175060': 'CLEC14A',
 'NM_001920': 'DCN',
 'NM_133503': 'DCN',
 'NM_133507': 'DCN',
 'NM_133505': 'DCN',
 'NM_133504': 'DCN',
 'NM_133506': 'DCN',
 'NM_0013

In [38]:
hs_mirdb['gene_symbol'] = hs_mirdb['gene'].apply(access_to_gene_symbol)
hs_mirdb['gene_symbol'].unique()
network_mirdb = hs_mirdb[~(hs_mirdb['gene_symbol'] == '')]
network_mirdb.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hs_mirdb['gene_symbol'] = hs_mirdb['gene'].apply(access_to_gene_symbol)


Unnamed: 0,mirna,gene,target_value,gene_symbol
1776197,hsa-let-7a-2-3p,NM_003371,70.8346,VAV2
1776206,hsa-let-7a-2-3p,NM_006749,82.3439,SLC20A2
1776369,hsa-let-7a-2-3p,NM_001288718,57.9932,STAT5A
1777067,hsa-let-7a-2-3p,NM_002037,61.6208,FYN
1777273,hsa-let-7a-2-3p,NM_001134398,93.21992,VAV2


In [None]:
# network_mirdb.to_csv('~/network_mirdb.csv', index = False)