In [320]:
import os
import re

from Bio import SeqIO
from tabulate import tabulate


import pandas as pd

# Lift pandas' display limits (columns, rows and cell width are all set to None)
# so the frames shown below are rendered without truncation.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [276]:
# Folder holding the results of one sample; every cell below reads from or writes into it.
# NOTE(review): hard-coded absolute local path -- adjust before running on another machine.
sample_folder = '/home/pjsola/TMP/pidtest/NO_GROUP/1_or_w'

In [277]:
def extract_files(folder):
    """Locate the three result files used by this report below ``folder``.

    The directory tree is walked recursively and files are recognised by name:

    * ``*.coverage_adapted_clustered_percentage`` -> percentage file
    * ``*.plasmids.complete``                     -> complete file
    * ``*representative*.fasta``                  -> representative FASTA file

    When several files match the same pattern, the last one visited wins.

    Parameters
    ----------
    folder : str
        Root directory to search.

    Returns
    -------
    tuple of str
        ``(percentage_file, complete_file, representative_file)``. Any file
        that was not found is returned as the empty string.
    """
    # Initialise all three results so that a missing file yields "" instead of
    # an UnboundLocalError at the return statement.
    percentage_file = ""
    complete_file = ""
    representative_file = ""

    for root, _, files in os.walk(folder):
        for name in files:
            if name.endswith(".coverage_adapted_clustered_percentage"):
                percentage_file = os.path.join(root, name)
            elif name.endswith(".plasmids.complete"):
                complete_file = os.path.join(root, name)
            elif "representative" in name and name.endswith(".fasta"):
                representative_file = os.path.join(root, name)

    return percentage_file, complete_file, representative_file


In [278]:
# Resolve the paths of the three input files inside the sample folder.
percentage_file, complete_file, representative_file = extract_files(sample_folder)

In [279]:
def percentage_to_df(percentage_file):
    """Read the mapping-percentage file into a two-column frame.

    Parameters
    ----------
    percentage_file : str
        Path to a space-separated, header-less file with an identifier and a
        percentage per line, or ``""`` when no such file exists.

    Returns
    -------
    pandas.DataFrame
        Columns ``id`` and ``percentage`` (rounded to two decimals). An empty
        frame with the same columns is returned when ``percentage_file`` is
        the empty string.
    """
    # No file available: hand back an empty frame with the expected columns.
    if percentage_file == "":
        return pd.DataFrame(columns=['id','percentage'])

    table = pd.read_csv(percentage_file, sep=" ", names=['id', 'percentage'])
    rounded = table['percentage'].round(2)
    table['percentage'] = rounded
    return table

In [280]:
# Load the mapping percentages (an empty frame when no percentage file was found)
# and preview the first rows.
percentage_df = percentage_to_df(percentage_file)
percentage_df.head()

Unnamed: 0,id,percentage


In [281]:
def len_description_to_df(representative_file):
    """Summarise every record of a FASTA file as one row.

    Parameters
    ----------
    representative_file : str
        Path to the FASTA file, parsed with ``SeqIO.parse(..., "fasta")``.

    Returns
    -------
    pandas.DataFrame
        One row per record with the columns

        * ``id``          -- the record identifier
        * ``length``      -- sequence length (integer dtype)
        * ``species``     -- space-separated tokens 1 and 2 of the description line
        * ``description`` -- tokens from index 2 onwards, joined by spaces

        A file without records yields an empty frame with the same columns.
    """
    columns = ['id', 'length', 'species', 'description']

    # Collect plain records and build the frame once; growing a DataFrame cell
    # by cell through ``.loc`` reallocates on every new row.
    records = []
    for seq_record in SeqIO.parse(representative_file, "fasta"):
        description_split = seq_record.description.split(' ')
        records.append({
            'id': seq_record.id,
            'length': len(seq_record),
            'species': ' '.join(description_split[1:3]),
            # NOTE(review): this slice starts at token 2, so the description
            # repeats the second species word (e.g. "coli strain ...").
            # Kept as-is to preserve the report contents -- confirm whether
            # [3:] was intended.
            'description': ' '.join(description_split[2:]),
        })

    df = pd.DataFrame(records, columns=columns)
    # Explicit cast so the dtype is integer even when there are no records.
    df['length'] = df['length'].astype(int)

    return df

In [368]:
# Build the per-reference table (id, length, species, description) from the
# representative FASTA file and preview the first rows.
len_description_df = len_description_to_df(representative_file)
len_description_df.head()

Unnamed: 0,id,length,species,description
0,NZ_CP035364.1,13142,Escherichia coli,coli strain BR10-DEC plasmid unnamed4
1,NZ_LS998788.1,18395,Escherichia coli,"coli isolate EC-TO75 genome assembly, plasmid: 4"
2,NZ_CP041525.1,31500,Escherichia coli,coli strain 69 plasmid p6007-3
3,NZ_CP011334.1,2954,Escherichia coli,"coli O104:H4 str. C227-11 isolate 368 shch plasmid unnamed, complete sequence"
4,NZ_CP010576.1,35843,Klebsiella pneumoniae,pneumoniae strain 32192 plasmid sequence


In [369]:
def complete_report_df(complete_file, len_description_df, percentage_df):
    
    def set_to_list(row):
        listed_set = list(row.contig_name)
        listed_set.sort()
        return listed_set
    
    #CP029217.1	176762	288994	9	id=170244
    dfc = pd.read_csv(complete_file, sep="\t", names=['id', 'start', 'end', 'contig_name', 'contig_id'])
    dfc['len_covered'] = dfc.end - dfc.start
    covered_df = dfc.groupby('id')['len_covered'].sum().reset_index()
    contigs_df = dfc.groupby('id')['contig_name'].apply(set).reset_index()#Merge all dataframes 
    #Merge all dataframes
    df = len_description_df.merge(covered_df, on='id', how='left')
    df['fraction_covered'] = round(df.len_covered / df.length, 2)
    del df['len_covered']
    df = df.merge(contigs_df, on='id', how='left')
    df = df.dropna()
    df['contig_name'] = df.apply(lambda x: set_to_list(x), axis=1)
    df = df.merge(percentage_df, on='id', how='left')
    df = df.sort_values(by=['length']).reset_index(drop=True)
    df = df.fillna('X')
    return df

In [370]:
# Merge coverage, contig membership and mapping percentage into one summary table.
summary_df = complete_report_df(complete_file, len_description_df, percentage_df)

In [371]:
# Display the whole summary table.
summary_df

Unnamed: 0,id,length,species,description,fraction_covered,contig_name,percentage
0,NZ_CP042486.1,6953,Klebsiella pneumoniae,pneumoniae strain C51 plasmid pC51_005,0.76,"[97, 103, 110, 119, 151]",X
1,NZ_CP032177.1,9294,Klebsiella pneumoniae,"pneumoniae strain AR_0135 plasmid unnamed2, complete sequence",1.0,[72],X
2,NZ_CP034423.1,11225,Klebsiella pneumoniae,pneumoniae strain C1398 plasmid unnamed2,1.3,"[64, 131]",X
3,NZ_CP035364.1,13142,Escherichia coli,coli strain BR10-DEC plasmid unnamed4,2.82,"[41, 135]",X
4,NZ_LS998788.1,18395,Escherichia coli,"coli isolate EC-TO75 genome assembly, plasmid: 4",0.55,"[76, 93, 117, 144, 153]",X
5,NZ_CP036441.1,19305,Klebsiella pneumoniae,"pneumoniae strain ABFQB plasmid pKPC-5fbf, complete sequence",0.48,[72],X
6,NZ_CP015133.1,26450,Klebsiella pneumoniae,"pneumoniae strain Kpn555 plasmid pKPN-d6b, complete sequence",0.03,[119],X
7,NZ_CP041525.1,31500,Escherichia coli,coli strain 69 plasmid p6007-3,1.42,"[45, 76, 86, 123, 135, 144, 145]",X
8,NZ_CP010576.1,35843,Klebsiella pneumoniae,pneumoniae strain 32192 plasmid sequence,0.73,"[51, 100, 110, 135, 145, 151]",X
9,NZ_KX154765.1,50800,Klebsiella pneumoniae,"pneumoniae strain 145 plasmid pKp145/11a, complete sequence",0.97,"[45, 76, 86, 87, 117, 123, 129, 143, 144, 153]",X


In [372]:
def include_images(sample_folder, summary_df):
    """Attach an image link to every row and write the table to disk.

    For each row, the first ``.png`` file (in ``os.walk`` order) whose
    directory path contains ``images`` and whose file name contains the row's
    ``id`` is stored in a new ``images`` column as a ``file://`` URL. Rows
    without a matching file get a missing value.

    The table is then written, tab-separated and without the index, to
    ``<sample_folder>/<sample>_final_results.tab`` where ``<sample>`` is the
    last path component of ``sample_folder``.

    Parameters
    ----------
    sample_folder : str
        Folder of the sample; searched recursively and used as output folder.
    summary_df : pandas.DataFrame
        Table with an ``id`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``summary_df`` object, now including the ``images`` column.
    """
    # normpath drops a trailing separator, which would otherwise make the
    # sample name (and the output file prefix) empty.
    sample = os.path.basename(os.path.normpath(sample_folder))

    # Walk the tree once and keep the candidate images in walk order, instead
    # of re-walking the whole folder for every row.
    candidates = [
        (name, os.path.join(root, name))
        for root, _, files in os.walk(sample_folder)
        if 'images' in root
        for name in files
        if name.endswith('.png')
    ]

    def image_finder(plasmid_id):
        for name, path in candidates:
            if plasmid_id in name:
                return 'file://' + path
        return None

    # A plain list also works for an empty table, unlike a row-wise apply.
    summary_df['images'] = [image_finder(plasmid_id) for plasmid_id in summary_df['id']]
    summary_df.to_csv(os.path.join(sample_folder, sample + '_final_results.tab'), sep='\t', index=False)
    return summary_df

In [424]:
# Add the image links (this also writes the tab-separated results file into the
# sample folder) and inspect the resulting columns and dtypes.
final_individual_dataframe = include_images(sample_folder, summary_df)
final_individual_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                15 non-null     object 
 1   length            15 non-null     int64  
 2   species           15 non-null     object 
 3   description       15 non-null     object 
 4   fraction_covered  15 non-null     float64
 5   contig_name       15 non-null     object 
 6   percentage        15 non-null     object 
 7   images            15 non-null     object 
dtypes: float64(1), int64(1), object(6)
memory usage: 1.1+ KB


In [433]:
def summary_to_html(sample_folder, final_individual_dataframe):
    """Render the summary table as a standalone HTML report.

    The report is written to ``<sample_folder>/<sample>_final_results.html``,
    where ``<sample>`` is the last path component of ``sample_folder``.

    Per row, the ``images`` cell is replaced by an HTML snippet that shows the
    mapping percentage and the covered fraction, each with a CSS class that
    reflects its rating, followed by the linked image:

    * ``fraction_covered``: 0.8-1.2 -> ``likely``; above 1.2 or strictly
      between 0.5 and 0.8 -> ``unlikely``; otherwise ``unprobable``.
    * ``percentage``: ``'X'`` -> ``neutral``; >= 80 -> ``likely``; strictly
      between 60 and 80 -> ``unlikely``; otherwise ``unprobable``.

    Rows whose ``images`` value is not a string (no image was found) get the
    numeric part only. ``species`` is wrapped in ``<i>`` tags, and the
    ``percentage`` and ``fraction_covered`` columns are dropped from the table.

    Parameters
    ----------
    sample_folder : str
        Output folder; its last component is used as file name prefix.
    final_individual_dataframe : pandas.DataFrame
        Table with the columns ``id``, ``species``, ``percentage``,
        ``fraction_covered`` and ``images``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The transformed copy that was rendered into the report.
    """
    df = final_individual_dataframe.copy()
    # normpath drops a trailing separator, which would otherwise make the
    # sample name (and the output file prefix) empty.
    sample = os.path.basename(os.path.normpath(sample_folder))
    html_filename = os.path.join(sample_folder, sample + '_final_results.html')
    
    html_template = """
    <!doctype html>

    <html lang="en">
    
    <style type="text/css">
    
    body {
        font: normal 20px Verdana, Arial, sans-serif;
        border: 1px solid black;
        border-radius: 5px;
        padding: auto;
        margin: auto;
    }
    img {
        display: block;
        max-width: 350px;
        height: auto;
    }
    
    .summary{
        display: flex;
        flex-direction: column-reverse;
    }
    
    .numeric-values {
        display: flex;
        flex-direction: row;
        justify-content: space-around;
    }
    
    .neutral {
        background-color: lightgray;
    }
    
    .likely {
        background-color: limegreen;
    }
    
    .unlikely {
        background-color: sandybrown;
    }
    
    .unprobable {
        background-color: brown;
    }
    
    th {
        font-size: 2em;
    }

    tr td {
        font-size: 1.5em;
        text-align: center;
    }
    
    tr:nth-child(even) {background-color: gainsboro;}
    tr:hover {background-color:lightblue;}

    </style>
    
    <head>
      <meta charset="utf-8">

      <title>PlasmidID Report</title>
      <meta name="description" content="https://github.com/BU-ISCIII/plasmidID">
      <meta name="author" content="pedroscampoy@gmail.com">

      <link rel="stylesheet" href="css/styles.css?v=1.0">
      <link rel="shortcut icon" type="image/png" href="https://raw.github.com/BU-ISCIII/plasmidID/master/img/plasmidID_logo.png">

    </head>

    <body>
      <div>
      TABLESUMMARY
      </div>
    </body>
    </html>
    
    
    \n"""

    def complete_to_rating(fraction_covered):
        # Rating of the covered fraction; NaN falls through to 'unprobable'.
        if 0.8 <= fraction_covered <= 1.2:
            return 'likely'
        if fraction_covered > 1.2 or 0.5 < fraction_covered < 0.8:
            return 'unlikely'
        return 'unprobable'

    def mapping_to_rating(percentage):
        # 'X' is the placeholder used when no mapping percentage is available.
        if percentage == 'X':
            return 'neutral'
        if percentage >= 80:
            return 'likely'
        if percentage > 60:
            return 'unlikely'
        return 'unprobable'

    def apply_img_tag(plasmid_id, percentage, fraction_covered, image):
        lines = [
            '<div class=summary>',
            '<div class=numeric-values>',
            '<div class=\"percentage ' + mapping_to_rating(percentage) + '\">'
            + 'MAPPING %<br>' + str(percentage) + '</div>',
            '<div class=\"complete ' + complete_to_rating(fraction_covered) + '\">'
            + 'ALIGN FR<br>' + str(fraction_covered) + '</div>',
            '</div>',
        ]
        # A row without an image (None/NaN) cannot be concatenated into the
        # markup; emit the numeric part only instead of raising TypeError.
        if isinstance(image, str):
            lines += [
                '<a href=' + image + ' target=\"_blank\">',
                '<img src=' + image + ' alt=' + "\"" + plasmid_id + "\"" + '>',
                '</a>',
            ]
        lines.append('</div>')
        return '\n'.join(lines)

    # Column-wise iteration also works for an empty table, unlike a row-wise
    # DataFrame.apply whose result could not be assigned to a single column.
    df['images'] = [
        apply_img_tag(plasmid_id, percentage, fraction_covered, image)
        for plasmid_id, percentage, fraction_covered, image in zip(
            df['id'], df['percentage'], df['fraction_covered'], df['images']
        )
    ]

    df['species'] = ['<i>' + species + '</i>' for species in df['species']]

    df = df.drop(columns=['percentage', 'fraction_covered'])

    # NOTE(review): recent tabulate releases HTML-escape cell text for
    # tablefmt='html' and provide 'unsafehtml' for raw markup -- confirm with
    # the installed version that the <div>/<img> cells render as intended.
    table = tabulate(df, headers='keys', tablefmt='html', showindex=False)
    table = table.replace("style=\"text-align: right;\"", "")

    final_html = html_template.replace('TABLESUMMARY', table)
    # The template declares charset utf-8, so write the file with that encoding.
    with open(html_filename, 'w', encoding='utf-8') as f:
        f.write(final_html)
    return df
        

In [434]:
# Write the HTML report into the sample folder; the returned frame (with the HTML
# snippets in the images column) is shown as the cell output.
summary_to_html(sample_folder, final_individual_dataframe)

Unnamed: 0,id,length,species,description,contig_name,images
0,NZ_CP042486.1,6953,<i>Klebsiella pneumoniae</i>,pneumoniae strain C51 plasmid pC51_005,"[97, 103, 110, 119, 151]","<div class=summary>\n<div class=numeric-values>\n<div class=""percentage neutral"">MAPPING %<br>X</div>\n<div class=""complete unlikely"">ALIGN FR<br>0.76</div>\n</div>\n<a href=file:///home/pjsola/TMP/pidtest/NO_GROUP/1_or_w/images/1_or_w_NZ_CP042486.1.png target=""_blank"">\n<img src=file:///home/pjsola/TMP/pidtest/NO_GROUP/1_or_w/images/1_or_w_NZ_CP042486.1.png alt=""NZ_CP042486.1"">\n</a>\n</div>"
1,NZ_CP032177.1,9294,<i>Klebsiella pneumoniae</i>,"pneumoniae strain AR_0135 plasmid unnamed2, complete sequence",[72],"<div class=summary>\n<div class=numeric-values>\n<div class=""percentage neutral"">MAPPING %<br>X</div>\n<div class=""complete likely"">ALIGN FR<br>1.0</div>\n</div>\n<a href=file:///home/pjsola/TMP/pidtest/NO_GROUP/1_or_w/images/1_or_w_NZ_CP032177.1.png target=""_blank"">\n<img src=file:///home/pjsola/TMP/pidtest/NO_GROUP/1_or_w/images/1_or_w_NZ_CP032177.1.png alt=""NZ_CP032177.1"">\n</a>\n</div>"
2,NZ_CP034423.1,11225,<i>Klebsiella pneumoniae</i>,pneumoniae strain C1398 plasmid unnamed2,"[64, 131]","<div class=summary>\n<div class=numeric-values>\n<div class=""percentage neutral"">MAPPING %<br>X</div>\n<div class=""complete unlikely"">ALIGN FR<br>1.3</div>\n</div>\n<a href=file:///home/pjsola/TMP/pidtest/NO_GROUP/1_or_w/images/1_or_w_NZ_CP034423.1.png target=""_blank"">\n<img src=file:///home/pjsola/TMP/pidtest/NO_GROUP/1_or_w/images/1_or_w_NZ_CP034423.1.png alt=""NZ_CP034423.1"">\n</a>\n</div>"
3,NZ_CP035364.1,13142,<i>Escherichia coli</i>,coli strain BR10-DEC plasmid unnamed4,"[41, 135]","<div class=summary>\n<div class=numeric-values>\n<div class=""percentage neutral"">MAPPING %<br>X</div>\n<div class=""complete unlikely"">ALIGN FR<br>2.82</div>\n</div>\n<a href=file:///home/pjsola/TMP/pidtest/NO_GROUP/1_or_w/images/1_or_w_NZ_CP035364.1.png target=""_blank"">\n<img src=file:///home/pjsola/TMP/pidtest/NO_GROUP/1_or_w/images/1_or_w_NZ_CP035364.1.png alt=""NZ_CP035364.1"">\n</a>\n</div>"
4,NZ_LS998788.1,18395,<i>Escherichia coli</i>,"coli isolate EC-TO75 genome assembly, plasmid: 4","[76, 93, 117, 144, 153]","<div class=summary>\n<div class=numeric-values>\n<div class=""percentage neutral"">MAPPING %<br>X</div>\n<div class=""complete unlikely"">ALIGN FR<br>0.55</div>\n</div>\n<a href=file:///home/pjsola/TMP/pidtest/NO_GROUP/1_or_w/images/1_or_w_NZ_LS998788.1.png target=""_blank"">\n<img src=file:///home/pjsola/TMP/pidtest/NO_GROUP/1_or_w/images/1_or_w_NZ_LS998788.1.png alt=""NZ_LS998788.1"">\n</a>\n</div>"
5,NZ_CP036441.1,19305,<i>Klebsiella pneumoniae</i>,"pneumoniae strain ABFQB plasmid pKPC-5fbf, complete sequence",[72],"<div class=summary>\n<div class=numeric-values>\n<div class=""percentage neutral"">MAPPING %<br>X</div>\n<div class=""complete unprobable"">ALIGN FR<br>0.48</div>\n</div>\n<a href=file:///home/pjsola/TMP/pidtest/NO_GROUP/1_or_w/images/1_or_w_NZ_CP036441.1.png target=""_blank"">\n<img src=file:///home/pjsola/TMP/pidtest/NO_GROUP/1_or_w/images/1_or_w_NZ_CP036441.1.png alt=""NZ_CP036441.1"">\n</a>\n</div>"
6,NZ_CP015133.1,26450,<i>Klebsiella pneumoniae</i>,"pneumoniae strain Kpn555 plasmid pKPN-d6b, complete sequence",[119],"<div class=summary>\n<div class=numeric-values>\n<div class=""percentage neutral"">MAPPING %<br>X</div>\n<div class=""complete unprobable"">ALIGN FR<br>0.03</div>\n</div>\n<a href=file:///home/pjsola/TMP/pidtest/NO_GROUP/1_or_w/images/1_or_w_NZ_CP015133.1.png target=""_blank"">\n<img src=file:///home/pjsola/TMP/pidtest/NO_GROUP/1_or_w/images/1_or_w_NZ_CP015133.1.png alt=""NZ_CP015133.1"">\n</a>\n</div>"
7,NZ_CP041525.1,31500,<i>Escherichia coli</i>,coli strain 69 plasmid p6007-3,"[45, 76, 86, 123, 135, 144, 145]","<div class=summary>\n<div class=numeric-values>\n<div class=""percentage neutral"">MAPPING %<br>X</div>\n<div class=""complete unlikely"">ALIGN FR<br>1.42</div>\n</div>\n<a href=file:///home/pjsola/TMP/pidtest/NO_GROUP/1_or_w/images/1_or_w_NZ_CP041525.1.png target=""_blank"">\n<img src=file:///home/pjsola/TMP/pidtest/NO_GROUP/1_or_w/images/1_or_w_NZ_CP041525.1.png alt=""NZ_CP041525.1"">\n</a>\n</div>"
8,NZ_CP010576.1,35843,<i>Klebsiella pneumoniae</i>,pneumoniae strain 32192 plasmid sequence,"[51, 100, 110, 135, 145, 151]","<div class=summary>\n<div class=numeric-values>\n<div class=""percentage neutral"">MAPPING %<br>X</div>\n<div class=""complete unlikely"">ALIGN FR<br>0.73</div>\n</div>\n<a href=file:///home/pjsola/TMP/pidtest/NO_GROUP/1_or_w/images/1_or_w_NZ_CP010576.1.png target=""_blank"">\n<img src=file:///home/pjsola/TMP/pidtest/NO_GROUP/1_or_w/images/1_or_w_NZ_CP010576.1.png alt=""NZ_CP010576.1"">\n</a>\n</div>"
9,NZ_KX154765.1,50800,<i>Klebsiella pneumoniae</i>,"pneumoniae strain 145 plasmid pKp145/11a, complete sequence","[45, 76, 86, 87, 117, 123, 129, 143, 144, 153]","<div class=summary>\n<div class=numeric-values>\n<div class=""percentage neutral"">MAPPING %<br>X</div>\n<div class=""complete likely"">ALIGN FR<br>0.97</div>\n</div>\n<a href=file:///home/pjsola/TMP/pidtest/NO_GROUP/1_or_w/images/1_or_w_NZ_KX154765.1.png target=""_blank"">\n<img src=file:///home/pjsola/TMP/pidtest/NO_GROUP/1_or_w/images/1_or_w_NZ_KX154765.1.png alt=""NZ_KX154765.1"">\n</a>\n</div>"
