In [1]:
# import modules
import os
import pandas as pd
import numpy as np
from prettytable import PrettyTable
import sys
sys.getdefaultencoding()

'utf-8'

In [6]:
# Directory containing the per-database summary tables (*.tab) read by the
# cells below. Set the ABRICATE_SUMMARIES_DIR environment variable to run the
# notebook on another machine; the original author's path is kept as the
# default so existing behaviour is unchanged.
SUMMARIES_DIR = os.environ.get(
    'ABRICATE_SUMMARIES_DIR',
    '/home/liubov/PycharmProjects/ExplorePipolin/data/new_analysis/abricate/summaries',
)

def make_megares_tidy(megares_df):
    """Reshape a wide MEGARes summary table into one row per (strain, gene) hit.

    Parameters
    ----------
    megares_df : pandas.DataFrame
        Wide table with a ``'#FILE'`` column. The first two columns are
        skipped; every remaining column is treated as one database entry whose
        header has the form ``ID|TYPE|CLASS|MECHANISM|GROUP``. A non-missing
        cell means the entry was found for that row.

    Returns
    -------
    pandas.DataFrame
        Columns ``STRAIN, ID, TYPE, CLASS, MECHANISM, GROUP`` with a fresh
        0..n-1 index. ``STRAIN`` holds the row's ``'#FILE'`` value. The frame
        is empty (but keeps its columns) when there are no hits.

    Raises
    ------
    KeyError
        If ``megares_df`` has rows but no ``'#FILE'`` column.
    IndexError
        If a hit column header has fewer than five ``|``-separated fields.
    """
    columns = ['STRAIN', 'ID', 'TYPE', 'CLASS', 'MECHANISM', 'GROUP']
    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0, and appending inside the loop was quadratic.
    records = []
    for _, row in megares_df.iterrows():
        strain = row['#FILE']
        hits = row.iloc[2:]  # skip the two leading non-gene columns
        for header in hits.index[hits.notna().to_numpy()]:
            levels = header.split(sep='|')
            records.append({
                'STRAIN': strain,
                'ID': levels[0],
                'TYPE': levels[1],
                'CLASS': levels[2],
                'MECHANISM': levels[3],
                'GROUP': levels[4],
            })

    # Passing `columns` fixes the column order and keeps the header when empty.
    return pd.DataFrame(records, columns=columns)

def make_tidy(df):
    """Reshape a wide summary table into one row per (strain, gene) hit.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table with a ``'#FILE'`` column. The first two columns are
        skipped; every remaining column header is taken as a gene name, and a
        non-missing cell means the gene was found for that row.

    Returns
    -------
    pandas.DataFrame
        Columns ``STRAIN`` (the row's ``'#FILE'`` value) and ``GENE`` (the
        column header), with a fresh 0..n-1 index. The frame is empty (but
        keeps its columns) when there are no hits.

    Raises
    ------
    KeyError
        If ``df`` has rows but no ``'#FILE'`` column.
    """
    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0, and appending inside the loop was quadratic.
    records = []
    for _, row in df.iterrows():
        strain = row['#FILE']
        hits = row.iloc[2:]  # skip the two leading non-gene columns
        records.extend(
            {'STRAIN': strain, 'GENE': gene}
            for gene in hits.index[hits.notna().to_numpy()]
        )

    # Passing `columns` fixes the column order and keeps the header when empty.
    return pd.DataFrame(records, columns=['STRAIN', 'GENE'])

# AMR genes in pipolins

### MEGARes

`megares_noSNPs` is a database that was downloaded from the MEGARes repository (https://megares.meglab.org/download/index.php) -- Version 2.0.0 (14 October 2019). Genes that require SNP confirmation were excluded (`ExplorePipolin/scripts/prepare_megares_nosnps.py`).

**MEGARes is non-redundant and includes entities from 5 databases: BacMet, ResFinder, ARG-ANNOT, CARD and NCBI AMR.**

Each entry (gene) in this database is represented by its unique ID and 4 hierarchical levels: TYPE, CLASS, MECHANISM and GROUP (by analogy with the Gene Ontology, GO).

 * **Type** of compound, to which the accession confers resistance (e.g. drug, biocide, metal, multi-compound),
 * **Class** of antimicrobial compounds to which a gene confers resistance (e.g. betalactams),
 * **Mechanism** by which this resistance is conferred (e.g. betalactamases),
 * **Group** name of the genes (e.g. Group A betalactamases),
 * and **ID** for each individual gene accession.

In [7]:
# Load the MEGARes summary table ('.' cells are parsed as missing values),
# then expand it to one row per strain/gene hit with the hierarchy levels.
megares_summary = pd.read_csv(
    os.path.join(SUMMARIES_DIR, 'megares_noSNPs.tab'),
    sep='\t',
    na_values='.',
)
megares_tidy = make_megares_tidy(megares_summary)
print(megares_tidy)

            STRAIN        ID    TYPE                   CLASS  \
0         2452.tab  MEG_2435   Drugs             betalactams   
1         CP53.tab  MEG_2435   Drugs             betalactams   
2      CRE1540.tab  MEG_2435   Drugs             betalactams   
3   CVM_N62675.tab  MEG_2435   Drugs             betalactams   
4          L37.tab  MEG_2435   Drugs             betalactams   
5          L53.tab  MEG_2435   Drugs             betalactams   
6      LREC242.tab  MEG_2378   Drugs             betalactams   
7      LREC242.tab  MEG_2401   Drugs             betalactams   
8      LREC242.tab  MEG_2434   Drugs             betalactams   
9      LREC242.tab  MEG_2435   Drugs             betalactams   
10     LREC244.tab  MEG_2378   Drugs             betalactams   
11     LREC244.tab  MEG_2435   Drugs             betalactams   
12     LREC245.tab  MEG_2378   Drugs             betalactams   
13     LREC245.tab  MEG_2435   Drugs             betalactams   
14     LREC248.tab  MEG_3999  Metals  Mu

### ARG-ANNOT (abricate DB)

In [8]:
# Load the ARG-ANNOT summary table ('.' cells are parsed as missing values),
# then list every strain/gene pair that has a value.
argannot_summary = pd.read_csv(
    os.path.join(SUMMARIES_DIR, 'argannot.tab'),
    sep='\t',
    na_values='.',
)
argannot_tidy = make_tidy(argannot_summary)
print(argannot_tidy)

        STRAIN              GENE
0  LREC242.tab  (Bla)blaCTX-M-14
1  LREC242.tab        (Phe)catB4


### NCBI AMR (abricate DB)

In [9]:
# Load the NCBI AMR summary table ('.' cells are parsed as missing values),
# then list every strain/gene pair that has a value.
ncbi_summary = pd.read_csv(
    os.path.join(SUMMARIES_DIR, 'ncbi.tab'),
    sep='\t',
    na_values='.',
)
ncbi_tidy = make_tidy(ncbi_summary)
print(ncbi_tidy)

        STRAIN         GENE
0  LREC242.tab  blaCTX-M-14


### CARD (abricate DB)

In [10]:
# Load the CARD summary table ('.' cells are parsed as missing values),
# then list every strain/gene pair that has a value.
card_summary = pd.read_csv(
    os.path.join(SUMMARIES_DIR, 'card.tab'),
    sep='\t',
    na_values='.',
)
card_tidy = make_tidy(card_summary)
print(card_tidy)

        STRAIN      GENE
0  LREC242.tab  CTX-M-14


### ResFinder (abricate DB)

In [11]:
# Load the ResFinder summary table ('.' cells are parsed as missing values),
# then list every strain/gene pair that has a value.
resfinder_summary = pd.read_csv(
    os.path.join(SUMMARIES_DIR, 'resfinder.tab'),
    sep='\t',
    na_values='.',
)
resfinder_tidy = make_tidy(resfinder_summary)
print(resfinder_tidy)

        STRAIN           GENE
0  LREC242.tab  blaCTX-M-14_1


# Virulence genes in pipolins

### VFDB (http://www.mgc.ac.cn/VFs/ updated 17-Oct-2019)

In [12]:
# Load the VFDB summary table ('.' cells are parsed as missing values),
# then list every strain/gene pair that has a value.
vfdb_summary = pd.read_csv(
    os.path.join(SUMMARIES_DIR, 'vfdb.tab'),
    sep='\t',
    na_values='.',
)
vfdb_tidy = make_tidy(vfdb_summary)
print(vfdb_tidy)

        STRAIN  GENE
0  LREC252.tab  cheD
1  LREC252.tab  cheY
2  LREC252.tab  entD
3  LREC252.tab  flhA
4     P12b.tab  fliC


### Ecoli_VF (abricate DB)

In [13]:
# Load the Ecoli_VF summary table ('.' cells are parsed as missing values),
# then list every strain/gene pair that has a value.
ecvf_summary = pd.read_csv(
    os.path.join(SUMMARIES_DIR, 'ecoli_vf.tab'),
    sep='\t',
    na_values='.',
)
ecvf_tidy = make_tidy(ecvf_summary)
print(ecvf_tidy)

                 STRAIN      GENE
0             10942.tab      fimD
1            112648.tab      fimD
2            122715.tab      fimD
3        2015C-3125.tab      fimD
4             FHI82.tab      fimD
5             FHI87.tab      fimD
6         FWSEC0002.tab      fimD
7   GCA_009002655.1.tab      fimD
8   GCF_000941935.1.tab      fimD
9               L73.tab      fimD
10          LREC249.tab      fimD
11          LREC251.tab      fimD
12          LREC252.tab      cheA
13          LREC252.tab      cheB
14          LREC252.tab      cheR
15          LREC252.tab      cheW
16          LREC252.tab      cheY
17          LREC252.tab      cheZ
18          LREC252.tab      entD
19          LREC252.tab      flhA
20          LREC252.tab      flhB
21          LREC252.tab      flhC
22          LREC252.tab      flhD
23          LREC252.tab      flhE
24          LREC252.tab      fliA
25          LREC252.tab      fliC
26          LREC252.tab      fliY
27          LREC252.tab      fliZ
28          LR

### Ecoli_VirFinder (https://bitbucket.org/genomicepidemiology/virulencefinder_db/src/master/)

In [14]:
# Load the Ecoli_VirFinder summary table ('.' cells are parsed as missing
# values), then list every strain/gene pair that has a value.
ecbb_summary = pd.read_csv(
    os.path.join(SUMMARIES_DIR, 'ecoli_virfinder.tab'),
    sep='\t',
    na_values='.',
)
ecbb_tidy = make_tidy(ecbb_summary)
print(ecbb_tidy)

Empty DataFrame
Columns: [STRAIN, GENE]
Index: []
