In [320]:
import os
import re

from Bio import SeqIO
from tabulate import tabulate


import pandas as pd

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [470]:
sample_folder = '/home/pjsola/TMP/pidtest/NO_GROUP/1_or_ddbb_w'

In [471]:
def extract_files(folder):
    percentage_file = ""
    for root, _, files in os.walk(folder):
        for name in files:
            if name.endswith(".coverage_adapted_clustered_percentage"):
                percentage_file = os.path.join(root, name)
            elif name.endswith(".plasmids.complete"):
                complete_file = os.path.join(root, name)
            elif "representative" in name and name.endswith(".fasta"):
                representative_file = os.path.join(root, name)
                
    return percentage_file, complete_file, representative_file


In [472]:
percentage_file, complete_file, representative_file = extract_files(sample_folder)

In [473]:
def percentage_to_df(percentage_file):
    if not percentage_file == "":
        df = pd.read_csv(percentage_file, sep=" ", names=['id', 'percentage'])
        df['percentage'] = df['percentage'].round(2)
        return df
    else:
        return pd.DataFrame(columns=['id','percentage'])

In [474]:
percentage_df = percentage_to_df(percentage_file)
percentage_df.head()

Unnamed: 0,id,percentage


In [475]:
def len_description_to_df(representative_file):
    df = pd.DataFrame(columns=['id','length','species', 'description'])
    index = 0
    for seq_record in SeqIO.parse(representative_file, "fasta"):
        df.loc[index, 'id'] = seq_record.id
        df.loc[index, 'length'] = len(seq_record)
        description_split = seq_record.description.split(' ')
        df.loc[index, 'species'] = (' ').join(description_split[1:3])
        df.loc[index, 'description'] = (' ').join(description_split[2:])
        index = index + 1
    df['length'] = df['length'].astype(int)

    return df

In [476]:
len_description_df = len_description_to_df(representative_file)
len_description_df.head()

Unnamed: 0,id,length,species,description
0,NZ_CP013143.1,14928,Alcaligenes faecalis,"faecalis strain ZD02 plasmid pZD02, complete sequence"
1,NZ_CP021884.1,4576,Alcaligenes faecalis,faecalis strain BDB4 plasmid pZD02 sequence
2,NC_010064.1,2004,Escherichia coli,"coli plasmid pLMO226, complete sequence"
3,NZ_CP035364.1,13142,Escherichia coli,coli strain BR10-DEC plasmid unnamed4
4,NZ_CP035356.1,7815,Escherichia coli,coli strain BR64-DEC plasmid unnamed4


In [477]:
def complete_report_df(complete_file, len_description_df, percentage_df):
    
    def set_to_list(row):
        listed_set = list(row.contig_name)
        listed_set.sort()
        return listed_set
    
    #CP029217.1	176762	288994	9	id=170244
    dfc = pd.read_csv(complete_file, sep="\t", names=['id', 'start', 'end', 'contig_name', 'contig_id'])
    dfc['len_covered'] = dfc.end - dfc.start
    covered_df = dfc.groupby('id')['len_covered'].sum().reset_index()
    contigs_df = dfc.groupby('id')['contig_name'].apply(set).reset_index()#Merge all dataframes 
    #Merge all dataframes
    df = len_description_df.merge(covered_df, on='id', how='left')
    df['fraction_covered'] = round(df.len_covered / df.length, 2)
    del df['len_covered']
    df = df.merge(contigs_df, on='id', how='left')
    df = df.dropna()
    df['contig_name'] = df.apply(lambda x: set_to_list(x), axis=1)
    df = df.merge(percentage_df, on='id', how='left')
    df = df.sort_values(by=['length'], ascending=False).reset_index(drop=True)
    df = df.fillna('X')
    return df

In [478]:
summary_df = complete_report_df(complete_file, len_description_df, percentage_df)

In [479]:
summary_df

Unnamed: 0,id,length,species,description,fraction_covered,contig_name,percentage
0,CP029217.1,288994,Klebsiella pneumoniae,pneumoniae strain L201 plasmid p1-L201,1.2,"[9, 12, 43, 82, 91, 101, 113, 124, 128, 129, 130]",X
1,NZ_CP020499.1,178294,Klebsiella pneumoniae,"pneumoniae strain BWHC1 plasmid unnamed1, complete sequence",0.98,"[37, 41, 51, 64, 67, 71, 74, 75, 84, 88, 94, 98, 99, 102, 111, 129, 131, 135, 136, 145, 152]",X
2,NZ_CP039976.1,148759,Klebsiella pneumoniae,"pneumoniae strain R1761 plasmid p1761_02, complete sequence",0.82,"[24, 69, 70, 77, 85, 97, 103, 110, 112, 151]",X
3,NZ_CP029222.1,142762,Klebsiella pneumoniae,pneumoniae strain L388 plasmid p3-L388,1.01,"[25, 33, 142, 147, 148, 154]",X
4,CP025817.1,101557,Klebsiella pneumoniae,pneumoniae strain Kp81 plasmid pKp81_1,0.95,"[41, 64, 67, 71, 74, 81, 94, 99, 102, 111, 119, 122, 131, 135, 136, 139, 145, 152]",X
5,NZ_CP003998.1,81071,Klebsiella pneumoniae,"pneumoniae subsp. pneumoniae Kp13 plasmid pKP13e, complete sequence",1.05,"[24, 119]",X
6,NZ_KX154765.1,50800,Klebsiella pneumoniae,"pneumoniae strain 145 plasmid pKp145/11a, complete sequence",0.97,"[45, 76, 86, 87, 117, 123, 129, 143, 144, 153]",X
7,NZ_CP010576.1,35843,Klebsiella pneumoniae,pneumoniae strain 32192 plasmid sequence,0.73,"[51, 100, 110, 135, 145, 151]",X
8,NZ_CP041525.1,31500,Escherichia coli,coli strain 69 plasmid p6007-3,1.42,"[45, 76, 86, 123, 135, 144, 145]",X
9,NZ_CP015133.1,26450,Klebsiella pneumoniae,"pneumoniae strain Kpn555 plasmid pKPN-d6b, complete sequence",0.03,[119],X


In [480]:
def include_images(sample_folder, summary_df):
    sample = sample_folder.split("/")[-1]
    def image_finder(row, sample_folder):
        for root, _, files in os.walk(sample_folder):
            for name in files:
                if 'images' in root and row.id in name and name.endswith('.png'):
                    return 'file://' + os.path.join(root, name)
                
    summary_df['images'] = summary_df.apply(lambda x: image_finder(x, sample_folder), axis=1)
    summary_df.to_csv(sample_folder + '/' + sample +  '_final_results.tab', sep='\t', index=False)
    return summary_df

In [481]:
final_individual_dataframe = include_images(sample_folder, summary_df)
final_individual_dataframe.head()

Unnamed: 0,id,length,species,description,fraction_covered,contig_name,percentage,images
0,CP029217.1,288994,Klebsiella pneumoniae,pneumoniae strain L201 plasmid p1-L201,1.2,"[9, 12, 43, 82, 91, 101, 113, 124, 128, 129, 130]",X,file:///home/pjsola/TMP/pidtest/NO_GROUP/1_or_ddbb_w/images/1_or_ddbb_w_CP029217.1.png
1,NZ_CP020499.1,178294,Klebsiella pneumoniae,"pneumoniae strain BWHC1 plasmid unnamed1, complete sequence",0.98,"[37, 41, 51, 64, 67, 71, 74, 75, 84, 88, 94, 98, 99, 102, 111, 129, 131, 135, 136, 145, 152]",X,file:///home/pjsola/TMP/pidtest/NO_GROUP/1_or_ddbb_w/images/1_or_ddbb_w_NZ_CP020499.1.png
2,NZ_CP039976.1,148759,Klebsiella pneumoniae,"pneumoniae strain R1761 plasmid p1761_02, complete sequence",0.82,"[24, 69, 70, 77, 85, 97, 103, 110, 112, 151]",X,file:///home/pjsola/TMP/pidtest/NO_GROUP/1_or_ddbb_w/images/1_or_ddbb_w_NZ_CP039976.1.png
3,NZ_CP029222.1,142762,Klebsiella pneumoniae,pneumoniae strain L388 plasmid p3-L388,1.01,"[25, 33, 142, 147, 148, 154]",X,file:///home/pjsola/TMP/pidtest/NO_GROUP/1_or_ddbb_w/images/1_or_ddbb_w_NZ_CP029222.1.png
4,CP025817.1,101557,Klebsiella pneumoniae,pneumoniae strain Kp81 plasmid pKp81_1,0.95,"[41, 64, 67, 71, 74, 81, 94, 99, 102, 111, 119, 122, 131, 135, 136, 139, 145, 152]",X,file:///home/pjsola/TMP/pidtest/NO_GROUP/1_or_ddbb_w/images/1_or_ddbb_w_CP025817.1.png


In [482]:
def summary_to_html(sample_folder, final_individual_dataframe):
    df = final_individual_dataframe.copy()
    sample = sample_folder.split("/")[-1]
    html_filename = os.path.join(sample_folder, sample + '_final_results.html')
    
    html_template = """
    <!doctype html>

    <html lang="en">
    
    <style type="text/css">
    
    body {
        font: normal 20px Verdana, Arial, sans-serif;
        border: 1px solid black;
        border-radius: 5px;
        padding: auto;
        margin: auto;
    }
    img {
        display: block;
        max-width: 350px;
        height: auto;
    }
    
    .summary{
        display: flex;
        flex-direction: column-reverse;
    }
    
    .numeric-values {
        display: flex;
        flex-direction: row;
        justify-content: space-around;
    }
    
    .neutral {
        background-color: lightgray;
    }
    
    .likely {
        background-color: limegreen;
    }
    
    .unlikely {
        background-color: sandybrown;
    }
    
    .unprobable {
        background-color: brown;
    }
    
    th {
        font-size: 1.7em;
        background-color: skyblue;
    }

    tr td {
        font-size: 1.1em;
        text-align: center;
    }
    
    tr:nth-child(even) {background-color: snow;}
    tr:hover {background-color:azure;}

    </style>
    
    <head>
      <meta charset="utf-8">

      <title>PlasmidID Report</title>
      <meta name="description" content="https://github.com/BU-ISCIII/plasmidID">
      <meta name="author" content="pedroscampoy@gmail.com">

      <link rel="stylesheet" href="css/styles.css?v=1.0">
      <link rel="shortcut icon" type="image/png" href="https://raw.github.com/BU-ISCIII/plasmidID/master/img/plasmidID_logo.png">

    </head>

    <body>
      <div>
      TABLESUMMARY
      </div>
    </body>
    </html>
    
    
    \n"""

    
    def complete_to_rating(row):
        if row.fraction_covered >= 0.8 and row.fraction_covered <= 1.2:
            return 'likely'
        elif row.fraction_covered > 1.2 or (row.fraction_covered < 0.8 and row.fraction_covered > 0.5):
            return 'unlikely'
        else:
            return 'unprobable'
        
    def mapping_to_rating(row):
        if row.percentage == 'X':
            return 'neutral'
        elif row.percentage >= 80:
            return 'likely'
        elif row.percentage < 80 and row.percentage > 60:
            return 'unlikely'
        else:
            return 'unprobable'
        
    
    def apply_img_tag(row):
        return '<div class=summary>' + '\n' + \
    '<div class=numeric-values>' + '\n' + \
    '<div class=\"percentage ' + row.perc_rating + '\">' + 'MAPPING %<br>' + str(row.percentage) + '</div>' + '\n' + \
    '<div class=\"complete ' + row.complete_rating + '\">' + 'ALIGN FR<br>' + str(row.fraction_covered) + '</div>' + '\n' + \
    '</div>' + '\n' + \
    '<a href=' + row.images + ' target=\"_blank\">' + '\n' + \
    '<img src=' + row.images + ' alt=' + "\"" + row.id + "\"" + '>' + '\n' + \
    '</a>' + '\n' + \
    '</div>'
    
    def italic_species(row):
        return '<i>' + row.species + '</i>'
    
    df['perc_rating'] = df.apply(lambda x: mapping_to_rating(x), axis=1)
    
    df['complete_rating'] = df.apply(lambda x: complete_to_rating(x), axis=1)
    
    df['images'] = df.apply(lambda x: apply_img_tag(x), axis=1)
    
    df['species'] = df.apply(lambda x: italic_species(x), axis=1)
    
    df.drop(['percentage', 'fraction_covered', 'perc_rating', 'complete_rating'], axis = 1, inplace = True) 

    
    #del df[['percentage', 'fraction_covered']]
    table = tabulate(df, headers='keys', tablefmt='html', showindex=False)
    table = table.replace("style=\"text-align: right;\"", "")
    
    final_html = html_template.replace('TABLESUMMARY', table)
    with open(html_filename, 'w+') as f:
        f.write(final_html)
    return df
        

In [483]:
summary_to_html(sample_folder, final_individual_dataframe)

Unnamed: 0,id,length,species,description,contig_name,images
0,CP029217.1,288994,<i>Klebsiella pneumoniae</i>,pneumoniae strain L201 plasmid p1-L201,"[9, 12, 43, 82, 91, 101, 113, 124, 128, 129, 130]","<div class=summary>\n<div class=numeric-values>\n<div class=""percentage neutral"">MAPPING %<br>X</div>\n<div class=""complete likely"">ALIGN FR<br>1.2</div>\n</div>\n<a href=file:///home/pjsola/TMP/pidtest/NO_GROUP/1_or_ddbb_w/images/1_or_ddbb_w_CP029217.1.png target=""_blank"">\n<img src=file:///home/pjsola/TMP/pidtest/NO_GROUP/1_or_ddbb_w/images/1_or_ddbb_w_CP029217.1.png alt=""CP029217.1"">\n</a>\n</div>"
1,NZ_CP020499.1,178294,<i>Klebsiella pneumoniae</i>,"pneumoniae strain BWHC1 plasmid unnamed1, complete sequence","[37, 41, 51, 64, 67, 71, 74, 75, 84, 88, 94, 98, 99, 102, 111, 129, 131, 135, 136, 145, 152]","<div class=summary>\n<div class=numeric-values>\n<div class=""percentage neutral"">MAPPING %<br>X</div>\n<div class=""complete likely"">ALIGN FR<br>0.98</div>\n</div>\n<a href=file:///home/pjsola/TMP/pidtest/NO_GROUP/1_or_ddbb_w/images/1_or_ddbb_w_NZ_CP020499.1.png target=""_blank"">\n<img src=file:///home/pjsola/TMP/pidtest/NO_GROUP/1_or_ddbb_w/images/1_or_ddbb_w_NZ_CP020499.1.png alt=""NZ_CP020499.1"">\n</a>\n</div>"
2,NZ_CP039976.1,148759,<i>Klebsiella pneumoniae</i>,"pneumoniae strain R1761 plasmid p1761_02, complete sequence","[24, 69, 70, 77, 85, 97, 103, 110, 112, 151]","<div class=summary>\n<div class=numeric-values>\n<div class=""percentage neutral"">MAPPING %<br>X</div>\n<div class=""complete likely"">ALIGN FR<br>0.82</div>\n</div>\n<a href=file:///home/pjsola/TMP/pidtest/NO_GROUP/1_or_ddbb_w/images/1_or_ddbb_w_NZ_CP039976.1.png target=""_blank"">\n<img src=file:///home/pjsola/TMP/pidtest/NO_GROUP/1_or_ddbb_w/images/1_or_ddbb_w_NZ_CP039976.1.png alt=""NZ_CP039976.1"">\n</a>\n</div>"
3,NZ_CP029222.1,142762,<i>Klebsiella pneumoniae</i>,pneumoniae strain L388 plasmid p3-L388,"[25, 33, 142, 147, 148, 154]","<div class=summary>\n<div class=numeric-values>\n<div class=""percentage neutral"">MAPPING %<br>X</div>\n<div class=""complete likely"">ALIGN FR<br>1.01</div>\n</div>\n<a href=file:///home/pjsola/TMP/pidtest/NO_GROUP/1_or_ddbb_w/images/1_or_ddbb_w_NZ_CP029222.1.png target=""_blank"">\n<img src=file:///home/pjsola/TMP/pidtest/NO_GROUP/1_or_ddbb_w/images/1_or_ddbb_w_NZ_CP029222.1.png alt=""NZ_CP029222.1"">\n</a>\n</div>"
4,CP025817.1,101557,<i>Klebsiella pneumoniae</i>,pneumoniae strain Kp81 plasmid pKp81_1,"[41, 64, 67, 71, 74, 81, 94, 99, 102, 111, 119, 122, 131, 135, 136, 139, 145, 152]","<div class=summary>\n<div class=numeric-values>\n<div class=""percentage neutral"">MAPPING %<br>X</div>\n<div class=""complete likely"">ALIGN FR<br>0.95</div>\n</div>\n<a href=file:///home/pjsola/TMP/pidtest/NO_GROUP/1_or_ddbb_w/images/1_or_ddbb_w_CP025817.1.png target=""_blank"">\n<img src=file:///home/pjsola/TMP/pidtest/NO_GROUP/1_or_ddbb_w/images/1_or_ddbb_w_CP025817.1.png alt=""CP025817.1"">\n</a>\n</div>"
5,NZ_CP003998.1,81071,<i>Klebsiella pneumoniae</i>,"pneumoniae subsp. pneumoniae Kp13 plasmid pKP13e, complete sequence","[24, 119]","<div class=summary>\n<div class=numeric-values>\n<div class=""percentage neutral"">MAPPING %<br>X</div>\n<div class=""complete likely"">ALIGN FR<br>1.05</div>\n</div>\n<a href=file:///home/pjsola/TMP/pidtest/NO_GROUP/1_or_ddbb_w/images/1_or_ddbb_w_NZ_CP003998.1.png target=""_blank"">\n<img src=file:///home/pjsola/TMP/pidtest/NO_GROUP/1_or_ddbb_w/images/1_or_ddbb_w_NZ_CP003998.1.png alt=""NZ_CP003998.1"">\n</a>\n</div>"
6,NZ_KX154765.1,50800,<i>Klebsiella pneumoniae</i>,"pneumoniae strain 145 plasmid pKp145/11a, complete sequence","[45, 76, 86, 87, 117, 123, 129, 143, 144, 153]","<div class=summary>\n<div class=numeric-values>\n<div class=""percentage neutral"">MAPPING %<br>X</div>\n<div class=""complete likely"">ALIGN FR<br>0.97</div>\n</div>\n<a href=file:///home/pjsola/TMP/pidtest/NO_GROUP/1_or_ddbb_w/images/1_or_ddbb_w_NZ_KX154765.1.png target=""_blank"">\n<img src=file:///home/pjsola/TMP/pidtest/NO_GROUP/1_or_ddbb_w/images/1_or_ddbb_w_NZ_KX154765.1.png alt=""NZ_KX154765.1"">\n</a>\n</div>"
7,NZ_CP010576.1,35843,<i>Klebsiella pneumoniae</i>,pneumoniae strain 32192 plasmid sequence,"[51, 100, 110, 135, 145, 151]","<div class=summary>\n<div class=numeric-values>\n<div class=""percentage neutral"">MAPPING %<br>X</div>\n<div class=""complete unlikely"">ALIGN FR<br>0.73</div>\n</div>\n<a href=file:///home/pjsola/TMP/pidtest/NO_GROUP/1_or_ddbb_w/images/1_or_ddbb_w_NZ_CP010576.1.png target=""_blank"">\n<img src=file:///home/pjsola/TMP/pidtest/NO_GROUP/1_or_ddbb_w/images/1_or_ddbb_w_NZ_CP010576.1.png alt=""NZ_CP010576.1"">\n</a>\n</div>"
8,NZ_CP041525.1,31500,<i>Escherichia coli</i>,coli strain 69 plasmid p6007-3,"[45, 76, 86, 123, 135, 144, 145]","<div class=summary>\n<div class=numeric-values>\n<div class=""percentage neutral"">MAPPING %<br>X</div>\n<div class=""complete unlikely"">ALIGN FR<br>1.42</div>\n</div>\n<a href=file:///home/pjsola/TMP/pidtest/NO_GROUP/1_or_ddbb_w/images/1_or_ddbb_w_NZ_CP041525.1.png target=""_blank"">\n<img src=file:///home/pjsola/TMP/pidtest/NO_GROUP/1_or_ddbb_w/images/1_or_ddbb_w_NZ_CP041525.1.png alt=""NZ_CP041525.1"">\n</a>\n</div>"
9,NZ_CP015133.1,26450,<i>Klebsiella pneumoniae</i>,"pneumoniae strain Kpn555 plasmid pKPN-d6b, complete sequence",[119],"<div class=summary>\n<div class=numeric-values>\n<div class=""percentage neutral"">MAPPING %<br>X</div>\n<div class=""complete unprobable"">ALIGN FR<br>0.03</div>\n</div>\n<a href=file:///home/pjsola/TMP/pidtest/NO_GROUP/1_or_ddbb_w/images/1_or_ddbb_w_NZ_CP015133.1.png target=""_blank"">\n<img src=file:///home/pjsola/TMP/pidtest/NO_GROUP/1_or_ddbb_w/images/1_or_ddbb_w_NZ_CP015133.1.png alt=""NZ_CP015133.1"">\n</a>\n</div>"
