# AddDB Updater

This is a Jupyter notebook for building a Python script that parses several gene-level databases and creates a fully annotated TSV file to replace gene_xref for ANNOVAR. The script could be used to update these databases and could be added to the Achabilarity container.

# Import library

In [82]:
import pandas as pd

# Parser functions

These functions parse each source database into a DataFrame whose gene-symbol column is named `#Gene_name`, so that the tables can be merged.

## HGNC

In [83]:
def hgnc(file):
    """Load a tab-separated HGNC export as a DataFrame.

    Args:
        file: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        A DataFrame holding every column of the input. The
        'Approved symbol' column, when present, is renamed to
        '#Gene_name'.
    """
    table = pd.read_csv(file, sep='\t')
    # '#Gene_name' is the column that merge_db joins on.
    return table.rename(columns={'Approved symbol': '#Gene_name'})

In [84]:
# Try the HGNC parser on the local data file and preview the first rows.
hgnc_list = hgnc(
    '~/Kevin/AddDB_updater/data/hgnc.tsv',
)
print(hgnc_list.head(5))

  #Gene_name                   Approved name
0       A1BG          alpha-1-B glycoprotein
1   A1BG-AS1            A1BG antisense RNA 1
2       A1CF  APOBEC1 complementation factor
3        A2M           alpha-2-macroglobulin
4    A2M-AS1             A2M antisense RNA 1


## GnomAD constraint score

In [85]:
def gnomad_score(file):
    """Load a tab-separated gnomAD per-gene metrics table and keep the o/e columns.

    Args:
        file: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        A DataFrame restricted to the gene symbol (renamed from 'gene'
        to '#Gene_name'), the observed/expected columns listed below
        and 'constraint_flag'.

    Raises:
        KeyError: If any of the selected columns is missing from the input.
    """
    wanted_columns = [
        '#Gene_name',
        'oe_lof_upper_rank',
        'oe_lof_upper_bin',
        'oe_lof',
        'oe_lof_lower',
        'oe_lof_upper',
        'oe_mis',
        'oe_mis_lower',
        'oe_mis_upper',
        'oe_syn',
        'oe_syn_lower',
        'oe_syn_upper',
        'constraint_flag',
    ]
    table = pd.read_csv(file, sep='\t')
    # '#Gene_name' is the column that merge_db joins on.
    renamed = table.rename(columns={'gene': '#Gene_name'})
    return renamed[wanted_columns]

In [86]:
# Try the gnomAD parser on the local data file and preview the first rows.
gnomad_score_list = gnomad_score(
    '~/Kevin/AddDB_updater/data/gnomad.v2.1.1.lof_metrics.by_gene.txt',
)
print(gnomad_score_list.head(5))

  #Gene_name  oe_lof_upper_rank  oe_lof_upper_bin    oe_lof  oe_lof_lower  \
0      MED13                0.0               0.0  0.000000         0.000   
1      NIPBL                1.0               0.0  0.006653         0.001   
2       SMC3                2.0               0.0  0.000000         0.000   
3      CNOT1                3.0               0.0  0.007998         0.002   
4        RLF                4.0               0.0  0.000000         0.000   

   oe_lof_upper   oe_mis  oe_mis_lower  oe_mis_upper  oe_syn  oe_syn_lower  \
0         0.030  0.77921         0.736         0.824  1.0890         1.005   
1         0.032  0.58688         0.554         0.621  1.0020         0.930   
2         0.037  0.28251         0.249         0.320  1.0578         0.946   
3         0.038  0.43290         0.403         0.464  1.0306         0.955   
4         0.040  0.68766         0.645         0.733  1.0153         0.930   

   oe_syn_upper constraint_flag  
0         1.180             NaN  


## OMIM genemap2

In [87]:
def omim(file):
    """Load an OMIM genemap2 table and keep the gene symbol and phenotypes.

    The first three lines of the input are skipped before the header row
    is read (presumably the genemap2 comment preamble; confirm against
    the file version in use).

    Args:
        file: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        A DataFrame with the columns '#Gene_name' (renamed from
        'Approved Symbol') and 'Phenotypes'. Rows without a gene symbol
        are dropped; the original row index is preserved.

    Raises:
        KeyError: If 'Approved Symbol' or 'Phenotypes' is missing.
    """
    genemap = pd.read_csv(file, sep='\t', skiprows=3)
    # Rename into a new frame rather than in place on a column slice:
    # the in-place form is chained assignment and can emit
    # SettingWithCopyWarning.
    selected = genemap[['Approved Symbol', 'Phenotypes']].rename(
        columns={'Approved Symbol': '#Gene_name'}
    )
    # Entries with no approved symbol cannot be joined on the gene name.
    return selected.dropna(subset=['#Gene_name'])

In [89]:
# Try the OMIM parser on the local data file and preview the first ten rows.
omim_list = omim(
    '~/Kevin/AddDB_updater/data/genemap2.txt',
)
print(omim_list.head(10))

   #Gene_name                                         Phenotypes
5         CMM  {Melanoma, cutaneous malignant, 1}, 155600 (2)...
6         CCV  Cataract 8, multiple types, 115665 (2), Autoso...
8        DYX8  {Dyslexia, susceptibility to, 8}, 608995 (2), ...
10       IBD7         {Inflammatory bowel disease 7}, 605225 (2)
12      MYP14                              Myopia 14, 610320 (2)
13     PSORS7           {Psoriasis susceptibility 7}, 605606 (2)
14     PTPRZ2                                                NaN
15       SAI1                                                NaN
16     SAMD11                                                NaN
17      NOC2L                                                NaN


## UniProt database

In [80]:
def uniprot(file):
    """Load a tab-separated UniProt export, dropping its first three columns.

    Args:
        file: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        A DataFrame holding the columns from the fourth one onwards.
        The 'Gene names  (primary )' column, when present among them,
        is renamed to '#Gene_name'.
    """
    table = pd.read_csv(file, sep='\t')
    # Rename into a new frame rather than in place on an iloc slice:
    # the in-place form is chained assignment and can emit
    # SettingWithCopyWarning.
    return table.iloc[:, 3:].rename(
        columns={'Gene names  (primary )': '#Gene_name'}
    )

In [81]:
# Try the UniProt parser on the local data file and preview the first rows.
uniprot_list = uniprot(
    '~/Kevin/AddDB_updater/data/uniprot.tsv',
)
print(uniprot_list.head(5))

  #Gene_name                                      Function [CC]  \
0   TRBV11-2  FUNCTION: V region of the variable domain of T...   
1     TEX13A                                                NaN   
2      LARS2                                                NaN   
3    TXNDC11  FUNCTION: May act as a redox regulator involve...   
4        TXK  FUNCTION: Non-receptor tyrosine kinase that pl...   

                                  Tissue specificity  \
0                                                NaN   
1               TISSUE SPECIFICITY: Testis specific.   
2  TISSUE SPECIFICITY: Ubiquitously expressed, bu...   
3  TISSUE SPECIFICITY: Widely expressed at low le...   
4  TISSUE SPECIFICITY: Expressed in T-cells and s...   

                              Involvement in disease  
0                                                NaN  
1                                                NaN  
2  DISEASE: Perrault syndrome 4 (PRLTS4) [MIM:615...  
3                                       

# Merge into one file

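This function merges all databases into the HGNC table on `#Gene_name`. The merges use pandas' default inner join, so only genes present in every database are kept.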

In [98]:
def merge_db(hgnc_file, omim_file, gnomad_score_file, uniprot_file, *, how='inner'):
    """Merge the OMIM, gnomAD and UniProt tables onto the HGNC table.

    Each source is parsed with its dedicated parser (``hgnc``, ``omim``,
    ``gnomad_score``, ``uniprot``) and joined on '#Gene_name', in that
    order.

    Args:
        hgnc_file: Input passed to ``hgnc``.
        omim_file: Input passed to ``omim``.
        gnomad_score_file: Input passed to ``gnomad_score``.
        uniprot_file: Input passed to ``uniprot``.
        how: Join type forwarded to every ``DataFrame.merge`` call.
            The default 'inner' keeps only genes found in all four
            sources; 'left' keeps every HGNC gene and leaves the
            annotations missing from a source empty.

    Returns:
        The merged DataFrame with every missing value replaced by an
        empty string.
    """
    hgnc_list = hgnc(hgnc_file)
    # Annotation tables, in the order they are joined onto HGNC.
    annotations = (
        omim(omim_file).reset_index(drop=True),
        gnomad_score(gnomad_score_file),
        uniprot(uniprot_file),
    )
    gene_fullxref = hgnc_list
    for annotation in annotations:
        gene_fullxref = gene_fullxref.merge(annotation, on='#Gene_name', how=how)
    # Missing values become empty strings in the returned table.
    return gene_fullxref.fillna('')

In [100]:
# Build the merged table from the local data files, export it as TSV
# and preview the first rows.
gene_fullxref_list = merge_db(
    hgnc_file='~/Kevin/AddDB_updater/data/hgnc.tsv',
    omim_file='~/Kevin/AddDB_updater/data/genemap2.txt',
    gnomad_score_file='~/Kevin/AddDB_updater/data/gnomad.v2.1.1.lof_metrics.by_gene.txt',
    uniprot_file='~/Kevin/AddDB_updater/data/uniprot.tsv',
)
gene_fullxref_list.to_csv(
    '~/Kevin/AddDB_updater/data/test.txt',
    sep='\t',
    index=False,
)
print(gene_fullxref_list.head(5))

  #Gene_name                                    Approved name  \
0       A1BG                           alpha-1-B glycoprotein   
1       A1CF                   APOBEC1 complementation factor   
2        A2M                            alpha-2-macroglobulin   
3      A2ML1                     alpha-2-macroglobulin like 1   
4     A4GALT  alpha 1,4-galactosyltransferase (P blood group)   

                                          Phenotypes oe_lof_upper_rank  \
0                                                                13015   
1                                                                 9254   
2  Alpha-2-macroglobulin deficiency, 614036 (1), ...              5366   
3  {Otitis media, susceptibility to}, 166760 (3),...             10116   
4  [Blood group, P1Pk system, P(2) phenotype], 11...             16517   

  oe_lof_upper_bin   oe_lof oe_lof_lower oe_lof_upper   oe_mis  oe_mis_lower  \
0                6  0.78457        0.524        1.208   1.0141         0.922   
1   