# Abricate analysis of pipolin-containing strains

### Author: Liubov Chuprikova

In [2]:
# import modules
import os
import subprocess
import pandas
import numpy
from utilities import check_dir
import sys
sys.getdefaultencoding()

'utf-8'

In [14]:
# functions and variables
# Path of the abricate executable that the functions below launch via subprocess.
ABRICATE_PATH = '/home/liubov/repos/abricate/bin/abricate'
# Directory whose entries are screened as input genomes (one abricate run per entry).
GENOMES_PATH = '/home/liubov/Documents/tfm/the_whole_analysis/all_genomes'
# Output root: holds one sub-directory per database, the 'summaries' directory
# and the final short_summary.csv.
WORKING_DIR = '/home/liubov/Documents/tfm/the_whole_analysis/abricate_analysis'

def get_abricate_databases():
    """
    Run `abricate --list` and parse available databases from stdout.

    Returns a list with the first tab-separated field of every non-empty
    output line, leaving out the `DATABASE` header line.

    Raises subprocess.CalledProcessError if abricate exits with a
    non-zero status.
    """
    # Capture stdout only: if stderr were merged into it, every warning or
    # error line would be parsed as if it were a database name.
    # check=True turns a failed run into an exception instead of an empty
    # (or garbage) database list.
    listing = subprocess.run([ABRICATE_PATH, '--list'],
                             stdout=subprocess.PIPE,
                             check=True).stdout.decode('utf-8')
    first_fields = (line.split('\t')[0] for line in listing.splitlines())
    return [field for field in first_fields if field not in ('DATABASE', '')]

def run_abricate(database_dir, genomes_dir):
    """
    Run abricate once per entry of genomes_dir and store each result table.

    The database name is the last path component of database_dir. That
    directory is created here (os.mkdir fails if it already exists) and
    receives one `<name>.tab` file per genome, where `<name>` is the genome
    file name without its last three characters.
    """
    db_name = os.path.basename(database_dir)
    # The SNP-typing databases are screened with 100% identity and coverage;
    # every other database uses the relaxed thresholds.
    if db_name in ['fumC', 'fimH', 'pointFinder_Ecoli']:
        identity, coverage = '100', '100'
    else:
        identity, coverage = '75', '0'

    genome_files = os.listdir(genomes_dir)
    os.mkdir(database_dir)
    for genome_file in genome_files:
        table_path = os.path.join(database_dir, genome_file[:-3] + '.tab')
        command = [ABRICATE_PATH, '--db', db_name,
                   '--minid', identity, '--mincov', coverage,
                   '--threads', '4', '--nopath',
                   os.path.join(genomes_dir, genome_file)]
        # abricate's stdout is redirected straight into the result table.
        with open(table_path, 'w') as table:
            subprocess.run(command, stdout=table)

def abricate_summarize(databases, out_dir):
    """
    Write one `abricate --summary` table per database.

    For every name in `databases` that has a result directory
    `<out_dir>/<db>`, all entries of that directory are passed to
    `abricate --nopath --summary` and its stdout is written to
    `<out_dir>/summaries/<db>.tab`. A database is skipped when its result
    directory is missing or when its summary file already exists.
    """
    summaries_dir = os.path.join(out_dir, 'summaries')
    check_dir(summaries_dir)   # create if not exists
    for db in databases:
        db_out_path = os.path.join(out_dir, db)
        summary_path = os.path.join(summaries_dir, f'{db}.tab')
        if not os.path.isdir(db_out_path) or os.path.isfile(summary_path):
            continue
        db_out_files = [os.path.join(db_out_path, name)
                        for name in os.listdir(db_out_path)]
        with open(summary_path, 'w') as ouf:
            # Pass an argument list rather than a shell string: paths that
            # contain spaces or shell metacharacters stay intact, and the
            # command is not squeezed into one huge `sh -c` argument.
            subprocess.run([ABRICATE_PATH, '--nopath', '--summary', *db_out_files],
                           stdout=ouf)

def parse_summaries(databases, summaries_dir):
    """
    Collect the second column of every summary file into one table.

    The returned DataFrame has one column per database and one row per
    entry of GENOMES_PATH, labelled with the entry name whose last three
    characters are replaced by `.tab`. For each existing
    `<summaries_dir>/<db>.tab`, every line not starting with `#` supplies
    a row label (first field) and a value (second field, kept as a
    string). Databases without a summary file keep empty cells.
    """
    row_labels = [f'{file[:-3]}.tab' for file in os.listdir(GENOMES_PATH)]
    table = pandas.DataFrame(index=row_labels, columns=databases)

    for db_name in databases:
        summary_path = os.path.join(summaries_dir, f'{db_name}.tab')
        if os.path.isfile(summary_path):
            with open(summary_path) as summary:
                # Lines starting with '#' (the header) carry no data.
                data_lines = (ln for ln in summary if not ln.startswith('#'))
                for ln in data_lines:
                    fields = ln.strip().split('\t')
                    genome, number = fields[:2]
                    table.at[genome, db_name] = number

    return table

def set_chtype(from_df, to_df, chtype):
    """
    Record, for every row of from_df, the column that holds the value 100.

    Each row is identified by its `#FILE` value. The label of the single
    column equal to 100 is written to to_df at [`#FILE` value, chtype];
    NaN is written when no column equals 100. to_df is modified in place.

    Raises AssertionError when more than one column of a row equals 100.
    """
    for _, record in from_df.iterrows():
        target = record['#FILE']
        exact_positions = numpy.flatnonzero(record == 100)
        n_exact = len(exact_positions)
        if n_exact > 1:
            raise AssertionError('More than a single variation locus was defined!')
        # Exactly one exact match names the type; none leaves the cell as NaN.
        assigned = record.index[exact_positions[0]] if n_exact == 1 else numpy.nan
        to_df.at[target, chtype] = assigned

## The analysis
### 1. Run abricate with all available databases and summarize the output

In [4]:
# Names of the databases reported by `abricate --list`.
abricate_dbs = get_abricate_databases()

In [5]:
# Screen the genomes against every database that has no output directory yet;
# the REL-DB and RIP-DB databases are never run.
for database in abricate_dbs:
    db_dir = os.path.join(WORKING_DIR, database)
    if os.path.isdir(db_dir) or database in ('REL-DB', 'RIP-DB'):
        continue
    run_abricate(db_dir, GENOMES_PATH)

In [6]:
# Write the per-database summary files, then gather them into a single table
# (one row per genome table, one column per database).
abricate_summarize(abricate_dbs, WORKING_DIR)
summaries = parse_summaries(abricate_dbs, os.path.join(WORKING_DIR, 'summaries'))

### 2. Define *fimH* and *fumC* types

In [15]:
# Load the fimH summary as a table; '.' cells are read as NaN
# (presumably abricate's marker for an absent gene -- confirm against its docs).
fimh_summary = pandas.read_csv(os.path.join(WORKING_DIR, 'summaries', 'fimH.tab'), 
                               sep='\t', na_values='.')

# Replace the 'fimH' column of `summaries` with the name of the column that equals 100.
set_chtype(fimh_summary, summaries, 'fimH')

In [16]:
# Load the fumC summary as a table; '.' cells are read as NaN
# (presumably abricate's marker for an absent gene -- confirm against its docs).
fumc_summary = pandas.read_csv(os.path.join(WORKING_DIR, 'summaries', 'fumC.tab'),
                               sep='\t', na_values='.')

# Replace the 'fumC' column of `summaries` with the name of the column that equals 100.
set_chtype(fumc_summary, summaries, 'fumC')

In [18]:
# Save the combined table, including the fimH/fumC types set above, as CSV.
summaries.to_csv(os.path.join(WORKING_DIR, 'short_summary.csv'))