# Abricate analysis of pipolin-containing strains

### Author: Liubov Chuprikova

In [5]:
# import modules
import os
import subprocess
import pandas
import sys
sys.getdefaultencoding()

In [12]:
# functions and variables
# Path to the abricate executable that the helper functions run via subprocess.
ABRICATE_PATH = '/home/liubov/repos/abricate/bin/abricate'
# Directory whose entries are used as the input genomes (and as the row labels
# of the summary table built by parse_summaries).
GENOMES_PATH = '/home/liubov/Documents/tfm/the_whole_analysis/all_genomes'
# Directory that receives one sub-directory of abricate reports per database,
# the 'summaries' sub-directory and the final CSV file.
WORKING_DIR = '/home/liubov/Documents/tfm/the_whole_analysis/abricate_analysis'

def get_abricate_databases():
    """Return the database names reported by ``abricate --list``.

    The command output (stdout and stderr combined) is decoded as UTF-8.
    The first tab-separated field of every output line is collected, except
    for the ``DATABASE`` header field and empty fields.

    Returns:
        list[str]: database names in the order they appear in the output.
    """
    listing = subprocess.run(
        [ABRICATE_PATH, '--list'],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
    )
    rows = listing.stdout.decode('utf-8').split('\n')
    first_fields = (row.split('\t')[0] for row in rows)
    return [name for name in first_fields if name not in ('DATABASE', '')]

def run_abricate(database_dir, genomes_dir, threads=4):
    """Run abricate on every entry of *genomes_dir* against one database.

    The database name passed to abricate is the last path component of
    *database_dir*. For each genome, abricate's standard output is written to
    ``<database_dir>/<genome name without its extension>.tab``.

    Args:
        database_dir: Output directory to create; its basename is the
            database name. Missing parent directories are created as well.
        genomes_dir: Directory whose entries are passed to abricate one by one.
        threads: Value given to abricate's ``--threads`` option.

    Raises:
        FileExistsError: If *database_dir* already exists.
    """
    in_genomes = os.listdir(genomes_dir)
    # makedirs (without exist_ok) also creates missing parents but, like
    # mkdir, still refuses to reuse an existing results directory.
    os.makedirs(database_dir)
    db_name = os.path.basename(database_dir)
    for genome in in_genomes:
        # splitext removes the real extension whatever its length, instead of
        # blindly cutting the last three characters ('.fa' only).
        report_name = f'{os.path.splitext(genome)[0]}.tab'
        with open(os.path.join(database_dir, report_name), 'w') as ouf:
            subprocess.run([ABRICATE_PATH, '--db', db_name,
                            '--threads', str(threads), '--nopath',
                            os.path.join(genomes_dir, genome)], stdout=ouf)

def abricate_summarize(databases, out_dir):
    """Write one ``abricate --summary`` table per database.

    For every name in *databases* that has a directory ``<out_dir>/<name>``,
    all entries of that directory are passed to ``abricate --nopath --summary``
    and its standard output is saved as ``<out_dir>/summaries/<name>.tab``.

    Args:
        databases: Iterable of database names.
        out_dir: Directory holding the per-database result directories; the
            ``summaries`` sub-directory is created inside it if missing.
    """
    summaries_dir = os.path.join(out_dir, 'summaries')
    # The summary files are opened for writing below, so their directory must exist.
    os.makedirs(summaries_dir, exist_ok=True)
    for db in databases:
        db_out_path = os.path.join(out_dir, db)
        if not os.path.isdir(db_out_path):
            continue
        # Sorted so the row order of the summary is reproducible between runs.
        db_out_files = [os.path.join(db_out_path, name)
                        for name in sorted(os.listdir(db_out_path))]
        if not db_out_files:
            # Nothing to summarise; do not call abricate without input files.
            continue
        with open(os.path.join(summaries_dir, f'{db}.tab'), 'w') as ouf:
            # An argument list (no shell) keeps paths containing spaces or
            # shell metacharacters intact.
            subprocess.run([ABRICATE_PATH, '--nopath', '--summary', *db_out_files],
                           stdout=ouf)

def parse_summaries(databases, summaries_dir):
    """Collect the second column of every summary file into one table.

    The table has one row per entry of ``GENOMES_PATH`` and one column per
    database. For each ``<summaries_dir>/<db>.tab`` that exists, every line
    that is neither blank nor starts with ``#`` contributes the value of its
    second tab-separated field, stored as a string.

    The first field of a summary line names a report file (``x.tab``) while
    the rows are labelled with genome file names (``x.fa``), so the two are
    matched by their name without extension. A report that matches no genome
    keeps its own name as a new row label.

    Args:
        databases: Database names; they become the table columns.
        summaries_dir: Directory containing the ``<db>.tab`` summary files.

    Returns:
        pandas.DataFrame: the table; cells without a value are left as NaN.
    """
    genomes = os.listdir(GENOMES_PATH)
    summaries_data = pandas.DataFrame(columns=databases, index=genomes)
    # Extension-less name -> genome file name, used to find the row of a report.
    genome_by_stem = {os.path.splitext(name)[0]: name for name in genomes}
    for db in databases:
        summary_file = os.path.join(summaries_dir, f'{db}.tab')
        if not os.path.isfile(summary_file):
            continue
        with open(summary_file) as inf:
            for line in inf:
                # Skip header/comment lines and blank lines (the latter would
                # otherwise break the two-value unpacking below).
                if line.startswith('#') or not line.strip():
                    continue
                report, number = line.strip().split(sep='\t')[:2]
                genome = genome_by_stem.get(os.path.splitext(report)[0], report)
                summaries_data.at[genome, db] = number

    return summaries_data

## The analysis

In [3]:
# Names of all databases listed by the abricate installation at ABRICATE_PATH.
abricate_dbs = get_abricate_databases()

In [4]:
# Run abricate once per database, writing the reports to WORKING_DIR/<database>.
for database in abricate_dbs:
    db_dir = os.path.join(WORKING_DIR, database)
    # Databases that already have a results directory are not re-run.
    # 'REL-DB' and 'RIP-DB' are always skipped.
    # NOTE(review): the reason for excluding these two is not visible here.
    if database in ('REL-DB', 'RIP-DB') or os.path.isdir(db_dir):
        continue
    run_abricate(db_dir, GENOMES_PATH)

In [13]:
# Write one abricate summary per database into WORKING_DIR/summaries,
# gather the per-genome values of all summaries into a single table
# and save that table as WORKING_DIR/short_summary.csv.
abricate_summarize(abricate_dbs, WORKING_DIR)
summaries = parse_summaries(abricate_dbs, os.path.join(WORKING_DIR, 'summaries'))
summaries.to_csv(os.path.join(WORKING_DIR, 'short_summary.csv'))
