In [1]:
# import modules
import os
import pandas as pd
import numpy as np
from prettytable import PrettyTable
import sys
sys.getdefaultencoding()

'utf-8'

In [2]:
# Directory holding the abricate summary tables (*.tab) read by the cells below.
# The default is the original author's local path; set the ABRICATE_SUMMARIES_DIR
# environment variable to run the notebook against a different location.
SUMMARIES_DIR = os.environ.get(
    'ABRICATE_SUMMARIES_DIR',
    '/home/liubov/Documents/tfm/the_whole_analysis/abricate_pipolins/summaries',
)

def make_megares_tidy(megares_df):
    """Reshape a wide MEGARes summary table into one row per (strain, gene) hit.

    Parameters
    ----------
    megares_df : pandas.DataFrame
        Wide table with a '#FILE' column. Every column from position 2 onwards
        is treated as a gene column whose header has the '|'-separated form
        ``ID|TYPE|CLASS|MECHANISM|GROUP``. A non-null cell counts as a hit.

    Returns
    -------
    pandas.DataFrame
        Columns STRAIN, ID, TYPE, CLASS, MECHANISM, GROUP; one row per hit,
        ordered by input row and then by input column, with a 0..n-1 index.
        Empty (but with the same columns) when there are no hits.

    Raises
    ------
    KeyError
        If ``megares_df`` has no '#FILE' column.
    IndexError
        If the header of a column with a hit has fewer than five fields.
    """
    columns = ['STRAIN', 'ID', 'TYPE', 'CLASS', 'MECHANISM', 'GROUP']
    gene_headers = megares_df.columns[2:]
    # Boolean (rows x gene columns) matrix: True where a gene was reported.
    hit_mask = megares_df.iloc[:, 2:].notna().to_numpy()

    # Collect plain records and build the frame once; DataFrame.append was
    # removed in pandas 2.0 and growing a frame row by row is quadratic.
    records = []
    for strain, row_hits in zip(megares_df['#FILE'], hit_mask):
        for header in gene_headers[row_hits]:
            levels = header.split('|')
            records.append({'STRAIN': strain, 'ID': levels[0], 'TYPE': levels[1],
                            'CLASS': levels[2], 'MECHANISM': levels[3],
                            'GROUP': levels[4]})

    return pd.DataFrame(records, columns=columns)

def make_tidy(df):
    """Reshape a wide summary table into one row per (strain, gene) hit.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table with a '#FILE' column. Every column from position 2 onwards
        is treated as a gene column; a non-null cell counts as a hit.

    Returns
    -------
    pandas.DataFrame
        Columns STRAIN (the '#FILE' value) and GENE (the column header); one
        row per hit, ordered by input row and then by input column, with a
        0..n-1 index. Empty (but with the same columns) when there are no hits.

    Raises
    ------
    KeyError
        If ``df`` has no '#FILE' column.
    """
    gene_headers = df.columns[2:]
    # Boolean (rows x gene columns) matrix: True where a gene was reported.
    hit_mask = df.iloc[:, 2:].notna().to_numpy()

    # Collect plain records and build the frame once; DataFrame.append was
    # removed in pandas 2.0 and growing a frame row by row is quadratic.
    records = [
        {'STRAIN': strain, 'GENE': gene}
        for strain, row_hits in zip(df['#FILE'], hit_mask)
        for gene in gene_headers[row_hits]
    ]

    return pd.DataFrame(records, columns=['STRAIN', 'GENE'])

# AMR genes in pipolins

### MEGARes

`megares_noSNPs` is a database that was downloaded from the MEGARes repository (https://megares.meglab.org/download/index.php) -- Version 2.0.0 (14 October 2019). Genes that require SNP confirmation were excluded (`ExplorePipolin/scripts/prepare_megares_nosnps.py`).

**MEGARes is non-redundant and includes entities from 5 databases: BacMet, ResFinder, ARG-ANNOT, CARD and NCBI AMR.**

Each entry (gene) in this database is represented by its unique ID and 4 hierarchical levels: TYPE, CLASS, MECHANISM and GROUP (by analogy with the Gene Ontology).

 * **Type** of compound, to which the accession confers resistance (e.g. drug, biocide, metal, multi-compound),
 * **Class** of antimicrobial compounds to which a gene confers resistance (e.g. betalactams),
 * **Mechanism** by which this resistance is conferred (e.g. betalactamases),
 * **Group** name of the genes (e.g. Group A betalactamases),
 * and **ID** for each individual gene accession.

In [4]:
# Read the MEGARes summary ('.' cells are parsed as missing values),
# reshape it to one row per strain/gene hit and show the result.
megares_summary = pd.read_csv(
    os.path.join(SUMMARIES_DIR, 'megares_noSNPs.tab'),
    na_values='.',
    sep='\t',
)
megares_tidy = make_megares_tidy(megares_summary)
print(megares_tidy)

             STRAIN        ID    TYPE                   CLASS  \
0    CP019051.1.tab  MEG_2435   Drugs             betalactams   
1    CP031833.1.tab  MEG_2435   Drugs             betalactams   
2    CP033096.1.tab  MEG_2435   Drugs             betalactams   
3    CP034589.1.tab  MEG_2435   Drugs             betalactams   
4    CP034734.1.tab  MEG_2435   Drugs             betalactams   
5    CP040927.1.tab  MEG_2435   Drugs             betalactams   
6    CP043750.1.tab  MEG_2435   Drugs             betalactams   
7   chr_LREC242.tab  MEG_2378   Drugs             betalactams   
8   chr_LREC242.tab  MEG_2401   Drugs             betalactams   
9   chr_LREC242.tab  MEG_2434   Drugs             betalactams   
10  chr_LREC242.tab  MEG_2435   Drugs             betalactams   
11  chr_LREC245.tab  MEG_2378   Drugs             betalactams   
12  chr_LREC245.tab  MEG_2435   Drugs             betalactams   
13  chr_LREC248.tab  MEG_3999  Metals  Multi-metal_resistance   
14  chr_LREC252.tab  MEG_

### ARG-ANNOT (abricate DB)

In [5]:
# Read the ARG-ANNOT summary ('.' cells are parsed as missing values),
# reshape it to one row per strain/gene hit and show the result.
argannot_summary = pd.read_csv(
    os.path.join(SUMMARIES_DIR, 'argannot.tab'),
    na_values='.',
    sep='\t',
)
argannot_tidy = make_tidy(argannot_summary)
print(argannot_tidy)

            STRAIN              GENE
0  chr_LREC242.tab  (Bla)blaCTX-M-14
1  chr_LREC242.tab        (Phe)catB4


### NCBI AMR (abricate DB)

In [6]:
# Read the NCBI AMR summary ('.' cells are parsed as missing values),
# reshape it to one row per strain/gene hit and show the result.
ncbi_summary = pd.read_csv(
    os.path.join(SUMMARIES_DIR, 'ncbi.tab'),
    na_values='.',
    sep='\t',
)
ncbi_tidy = make_tidy(ncbi_summary)
print(ncbi_tidy)

            STRAIN         GENE
0  chr_LREC242.tab  blaCTX-M-14


### CARD (abricate DB)

In [7]:
# Read the CARD summary ('.' cells are parsed as missing values),
# reshape it to one row per strain/gene hit and show the result.
card_summary = pd.read_csv(
    os.path.join(SUMMARIES_DIR, 'card.tab'),
    na_values='.',
    sep='\t',
)
card_tidy = make_tidy(card_summary)
print(card_tidy)

            STRAIN      GENE
0  chr_LREC242.tab  CTX-M-14


### ResFinder (abricate DB)

In [8]:
# Read the ResFinder summary ('.' cells are parsed as missing values),
# reshape it to one row per strain/gene hit and show the result.
resfinder_summary = pd.read_csv(
    os.path.join(SUMMARIES_DIR, 'resfinder.tab'),
    na_values='.',
    sep='\t',
)
resfinder_tidy = make_tidy(resfinder_summary)
print(resfinder_tidy)

            STRAIN           GENE
0  chr_LREC242.tab  blaCTX-M-14_1


# Virulence genes in pipolins

### VFDB (http://www.mgc.ac.cn/VFs/ updated 17-Oct-2019)

In [9]:
# Read the VFDB summary ('.' cells are parsed as missing values),
# reshape it to one row per strain/gene hit and show the result.
vfdb_summary = pd.read_csv(
    os.path.join(SUMMARIES_DIR, 'vfdb.tab'),
    na_values='.',
    sep='\t',
)
vfdb_tidy = make_tidy(vfdb_summary)
print(vfdb_tidy)

            STRAIN  GENE
0   CP002291.1.tab  fliC
1  chr_LREC252.tab  cheD
2  chr_LREC252.tab  cheY
3  chr_LREC252.tab  entD
4  chr_LREC252.tab  flhA


### Ecoli_VF (abricate DB)

In [10]:
# Read the Ecoli_VF summary ('.' cells are parsed as missing values),
# reshape it to one row per strain/gene hit and show the result.
ecvf_summary = pd.read_csv(
    os.path.join(SUMMARIES_DIR, 'ecoli_vf.tab'),
    na_values='.',
    sep='\t',
)
ecvf_tidy = make_tidy(ecvf_summary)
print(ecvf_tidy)

                 STRAIN      GENE
0        AP019703.1.tab      fimD
1        AP019706.1.tab      fimD
2        AP019708.1.tab      fimD
3        CP002291.1.tab      fliC
4        CP006262.1.tab      fimD
5        CP007133.1.tab      fimD
6        CP017631.1.tab      fimD
7        CP027763.1.tab      fimD
8        CP028379.1.tab      fimD
9        CP031919.1.tab      fimD
10       CP033378.1.tab      fimD
11  GCA_000941935.1.tab      fimD
12  GCF_000937095.2.tab      fimD
13      chr_LREC249.tab      fimD
14      chr_LREC251.tab      fimD
15      chr_LREC252.tab      cheA
16      chr_LREC252.tab      cheB
17      chr_LREC252.tab      cheR
18      chr_LREC252.tab      cheW
19      chr_LREC252.tab      cheY
20      chr_LREC252.tab      cheZ
21      chr_LREC252.tab      entD
22      chr_LREC252.tab      flhA
23      chr_LREC252.tab      flhB
24      chr_LREC252.tab      flhC
25      chr_LREC252.tab      flhD
26      chr_LREC252.tab      flhE
27      chr_LREC252.tab      fliA
28      chr_LR

### Ecoli_VirFinder (https://bitbucket.org/genomicepidemiology/virulencefinder_db/src/master/)

In [11]:
# Read the Ecoli_VirFinder summary ('.' cells are parsed as missing values),
# reshape it to one row per strain/gene hit and show the result.
ecbb_summary = pd.read_csv(
    os.path.join(SUMMARIES_DIR, 'ecoli_virfinder.tab'),
    na_values='.',
    sep='\t',
)
ecbb_tidy = make_tidy(ecbb_summary)
print(ecbb_tidy)

Empty DataFrame
Columns: [STRAIN, GENE]
Index: []
