# Analysis of virulence genes

#### Author: Liubov Chuprikova

In [1]:
# import modules
import os
import pandas as pd
import numpy as np
import sys
sys.getdefaultencoding()

'utf-8'

In [2]:
# Directory holding the summary tables read below (one .tab file per database).
# Defaults to the original author's location; set the ABRICATE_SUMMARIES_DIR
# environment variable to run the notebook elsewhere without editing the code.
SUMMARIES_DIR = os.environ.get(
    'ABRICATE_SUMMARIES_DIR',
    '/home/liubov/Documents/tfm/the_whole_analysis/abricate_analysis/summaries',
)

def make_tidy(df):
    """Convert a wide summary table into a tidy (STRAIN, GENE) table.

    The input is expected to have a ``'#FILE'`` column identifying the
    strain, one further leading column that is ignored, and then one
    column per gene starting at position 2.  A gene counts as present in
    a strain when its cell is not NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table with one row per strain.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``'STRAIN'`` and ``'GENE'``, with one row for every
        non-missing gene cell, ordered by input row and then by column.
    """
    gene_columns = df.columns[2:]
    records = []
    for _, row in df.iterrows():
        strain = row['#FILE']
        present = row.iloc[2:].notna().to_numpy()
        records.extend((strain, gene) for gene in gene_columns[present])

    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # appending row by row copies the whole frame on every iteration.
    return pd.DataFrame(records, columns=['STRAIN', 'GENE'])

def get_sheared_entities(entities, entity_name, grouped_data, threshold=0.95):
    """Return the entities that occur in more than ``threshold`` of the groups.

    Parameters
    ----------
    entities : iterable
        Candidate values to test (e.g. gene names).
    entity_name : str
        Column of each group in which the entities are looked up.
    grouped_data : iterable of (key, group)
        Typically a pandas ``GroupBy`` object; each ``group`` must support
        ``group[entity_name]``.
    threshold : float, optional
        Fraction of groups an entity must strictly exceed to be kept.
        Defaults to 0.95.

    Returns
    -------
    list
        The qualifying entities, in the order they appear in ``entities``.
        The denominator is the number of groups in ``grouped_data``.
    """
    # Collect each group's distinct values once instead of recomputing them
    # for every (entity, group) pair.
    group_members = [set(group[entity_name]) for _, group in grouped_data]
    min_count = len(group_members) * threshold

    sheared_entities = []
    for entity in entities:
        count = sum(1 for members in group_members if entity in members)
        if count > min_count:
            sheared_entities.append(entity)

    return sheared_entities

### VFDB (http://www.mgc.ac.cn/VFs/ updated 17-Oct-2019)

In [3]:
# Load the VFDB summary table (tab-separated); '.' cells are read as NaN.
vfdb_summary = pd.read_csv(
    os.path.join(SUMMARIES_DIR, 'vfdb.tab'),
    na_values='.',
    sep='\t',
)

In [4]:
# Reshape the wide VFDB summary into one (STRAIN, GENE) row per non-missing gene cell.
vfdb_tidy = make_tidy(vfdb_summary)

In [5]:
# Group the tidy VFDB table by strain, then keep the genes that are present
# in more than 95% of strains (89-93).
vfdb_tidy_grouped = vfdb_tidy.groupby(['STRAIN'])
vfdb_total_genes = vfdb_tidy['GENE'].unique().tolist()
vfdb_sheared_genes = get_sheared_entities(
    entities=vfdb_total_genes,
    entity_name='GENE',
    grouped_data=vfdb_tidy_grouped,
)

In [6]:
# Show how many VFDB genes passed the threshold, followed by the genes themselves.
print(len(vfdb_sheared_genes), vfdb_sheared_genes, sep='\n')

38
['algW', 'aslA', 'cheD', 'cheY', 'csgB', 'csgD', 'csgE', 'csgF', 'csgG', 'entA', 'entB', 'entC', 'entD', 'entE', 'entF', 'entS', 'espL1', 'espL4', 'espR1', 'espX4', 'espX5', 'fepA', 'fepB', 'fepC', 'fepD', 'fepG', 'fes', 'fimH', 'flgE', 'flgG', 'flgH', 'flgM', 'fliG', 'fliI', 'fliM', 'fliN', 'fliP', 'ompA']


### Ecoli_VF (abricate DB)

In [7]:
# Load the Ecoli_VF summary table (tab-separated); '.' cells are read as NaN.
ecvf_summary = pd.read_csv(
    os.path.join(SUMMARIES_DIR, 'ecoli_vf.tab'),
    na_values='.',
    sep='\t',
)

In [8]:
# Reshape the wide Ecoli_VF summary into one (STRAIN, GENE) row per non-missing gene cell.
ecvf_tidy = make_tidy(ecvf_summary)

In [9]:
# Group the tidy Ecoli_VF table by strain, then keep the genes that are
# present in more than 95% of strains (89-93).
ecvf_tidy_grouped = ecvf_tidy.groupby(['STRAIN'])
ecvf_total_genes = ecvf_tidy['GENE'].unique().tolist()
ecvf_sheared_genes = get_sheared_entities(
    entities=ecvf_total_genes,
    entity_name='GENE',
    grouped_data=ecvf_tidy_grouped,
)

In [10]:
# Show how many Ecoli_VF genes passed the threshold, followed by the genes themselves.
print(len(ecvf_sheared_genes), ecvf_sheared_genes, sep='\n')

103
['ECS88_3547', 'Z1307', 'Z2206', 'aec15', 'artj', 'aslA', 'cheA', 'cheB', 'cheR', 'cheW', 'cheY', 'cheZ', 'clpV', 'csgA', 'csgB', 'csgC', 'csgD', 'csgE', 'csgF', 'csgG', 'entA', 'entB', 'entC', 'entD', 'entE', 'entF', 'entS', 'espL1', 'espL3', 'espL4', 'espR1', 'espX4', 'espX5', 'fepA', 'fepB', 'fepC', 'fepD', 'fepE', 'fepG', 'fes', 'fimD', 'fimF', 'fimG', 'fimH', 'flgA', 'flgB', 'flgC', 'flgD', 'flgE', 'flgF', 'flgG', 'flgH', 'flgI', 'flgJ', 'flgK', 'flgL', 'flgN', 'flhA', 'flhB', 'flhC', 'flhD', 'flhE', 'fliA', 'fliE', 'fliF', 'fliG', 'fliH', 'fliI', 'fliJ', 'fliK', 'fliL', 'fliM', 'fliN', 'fliO', 'fliP', 'fliQ', 'fliR', 'fliS', 'fliT', 'fliY', 'fliZ', 'flk', 'gadX', 'hlyE', 'hofB', 'hofC', 'hofq', 'ibeB', 'ibeC', 'motA', 'motB', 'nada', 'nadb', 'ppdD', 'ppda', 'ppdb', 'ppdc', 'tar/cheM', 'ycbF', 'ycbU', 'ycbV', 'ygdb', 'yggr']


#### Genes shared between the VFDB and Ecoli_VF results

In [11]:
# Genes that passed the threshold in both the VFDB and the Ecoli_VF results.
shared = set(vfdb_sheared_genes) & set(ecvf_sheared_genes)
print(len(shared), shared, sep='\n')

34
{'espX5', 'fepA', 'entA', 'fepD', 'fliG', 'csgE', 'espX4', 'fepG', 'csgG', 'csgB', 'flgG', 'aslA', 'entE', 'csgD', 'csgF', 'espL1', 'fliI', 'fliM', 'fliN', 'fepB', 'espR1', 'entC', 'cheY', 'entB', 'entF', 'flgE', 'flgH', 'fliP', 'espL4', 'fimH', 'entS', 'entD', 'fes', 'fepC'}


### Ecoli_VirFinder (https://bitbucket.org/genomicepidemiology/virulencefinder_db/src/master/)

In [12]:
# Load the Ecoli_VirFinder summary table (tab-separated); '.' cells are read as NaN.
ecbb_summary = pd.read_csv(
    os.path.join(SUMMARIES_DIR, 'ecoli_virfinder.tab'),
    na_values='.',
    sep='\t',
)

In [13]:
# Reshape the wide Ecoli_VirFinder summary into one (STRAIN, GENE) row per non-missing gene cell.
ecbb_tidy = make_tidy(ecbb_summary)

In [14]:
# Group the tidy Ecoli_VirFinder table by strain, then keep the genes that
# are present in more than 95% of strains (89-93).
ecbb_tidy_grouped = ecbb_tidy.groupby(['STRAIN'])
ecbb_total_genes = ecbb_tidy['GENE'].unique().tolist()
ecbb_sheared_genes = get_sheared_entities(
    entities=ecbb_total_genes,
    entity_name='GENE',
    grouped_data=ecbb_tidy_grouped,
)

In [15]:
# Number of Ecoli_VirFinder genes that passed the >95% threshold.
print(len(ecbb_sheared_genes))

0


### Save the results

In [16]:
# Write the three gene lists side by side, one column per database.
# DataFrame columns must be equally long, so every list is padded with NaN
# up to the longest of the three (not just assumed to be the Ecoli_VF list).
max_lenght = max(len(vfdb_sheared_genes),
                 len(ecvf_sheared_genes),
                 len(ecbb_sheared_genes))

vfdb_col = vfdb_sheared_genes + [np.nan] * (max_lenght - len(vfdb_sheared_genes))
ecvf_col = ecvf_sheared_genes + [np.nan] * (max_lenght - len(ecvf_sheared_genes))
ecbb_col = ecbb_sheared_genes + [np.nan] * (max_lenght - len(ecbb_sheared_genes))

virulence_results = pd.DataFrame({'vfdb': vfdb_col,
                                  'ecoli_vf': ecvf_col,
                                  'ecoli_virfinder': ecbb_col})
# The CSV is saved next to the summaries directory (in its parent folder).
virulence_results.to_csv(os.path.join(os.path.dirname(SUMMARIES_DIR), 'virulence_sheared_genes.csv'),
                         header=True, index=False)