In [106]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from matplotlib.backends.backend_pdf import PdfPages

# Matplotlib style overrides; handed to plt.rcParams.update() further down.
rcParams = {
    'font.size': 17,
    'font.weight': 'normal',
    'font.family': 'sans-serif',
    'axes.unicode_minus': False,
    'axes.labelweight': 'normal',
}

# Path prefixes. Each one ends with '/' because the loaders build file names
# by plain string concatenation.
graphspath = 'Mount/MetaCarvel_paper/hmp_scaffolds/stool/'
# Holds the '<sample>_Coverages.csv' scaffold coverage tables.
coords_path = 'Research-Activities/Data/Scaffold_Coverage_After_Delinking/'
# Holds one '<sample>_delinked_cc_bins/' directory of '<bin>.fa' files per sample.
bin_path = 'Mount/projects/refining_bins_with_assembly_graphs/Scaffold_Coverage_binning/delinked_scaffold_bins/'
# Holds the '<sample>.txt' per-position read coverage tables.
cov_path = 'Research-Activities/Data/genomecov_d/'
# Destination of the per-bin coverage PDFs.
op_path = 'High_Contamination_Coverage_Plots/'

def Load_Read_Coverage(sample):
    """Read the per-position read coverage table of one sample.

    The file ``cov_path + sample + '.txt'`` is parsed as a header-less,
    tab-separated table with the columns ``Contig``, ``Loc`` and ``Coverage``.

    Parameters
    ----------
    sample : str
        Sample identifier; used as the file stem.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``Contig`` (index labels repeat, one row per position),
        with int32 columns ``Loc`` and ``Coverage``. ``Loc`` is one less than
        the value stored in the file (presumably a 1-based -> 0-based shift;
        confirm against how the coverage files were generated).
    """
    column_types = {'Contig': str, 'Loc': np.int32, 'Coverage': np.int32}
    coverage_file = cov_path + sample + '.txt'
    per_base = pd.read_csv(
        coverage_file,
        sep='\t',
        names=list(column_types),
        dtype=column_types,
        index_col='Contig',
        engine='c',
        low_memory=False,
    )
    per_base['Loc'] -= 1
    return per_base

def Get_Scaffolds(bin_id, sample):
    """Collect the sequence names stored in one bin's FASTA file.

    Reads ``bin_path + sample + '_delinked_cc_bins/' + bin_id + '.fa'`` and
    looks only at header lines (those starting with ``>``).

    Parameters
    ----------
    bin_id : str
        Bin identifier; used as the FASTA file stem.
    sample : str
        Sample identifier; selects the per-sample bin directory.

    Returns
    -------
    (list of str, list of str)
        ``(contigs, scaffolds)`` in file order. A header is filed under
        ``scaffolds`` when its text contains ``'cc'`` and under ``contigs``
        otherwise.
    """
    contigs, scaffolds = [], []
    fasta_path = bin_path + sample + '_delinked_cc_bins/' + bin_id + '.fa'
    # Stream the file inside a context manager: only header lines matter, so
    # there is no need to hold every sequence line in memory, and the handle
    # is released even if reading fails part-way.
    with open(fasta_path, 'r') as fasta:
        for line in fasta:
            if not line.startswith('>'):
                continue
            name = line.replace(">", "").replace("\n", "")
            if 'cc' in name:
                scaffolds.append(name)
            else:
                contigs.append(name)
    return contigs, scaffolds

def Load_Scaffold_Coverages(sample):
    """Read the scaffold coverage table of one sample.

    Parameters
    ----------
    sample : str
        Sample identifier; the file read is
        ``coords_path + sample + '_Coverages.csv'``.

    Returns
    -------
    pandas.DataFrame
        The CSV contents indexed by its ``Connected_Component`` column.
    """
    csv_file = '{}{}_Coverages.csv'.format(coords_path, sample)
    return pd.read_csv(csv_file, index_col=['Connected_Component'])

def _save_coverage_page(pdf, coverage, position_col, title):
    """Draw one coverage profile and append it as a page to an open PDF.

    ``coverage`` must be a DataFrame holding ``position_col`` and a
    ``Coverage`` column; ``position_col`` becomes the x axis.
    """
    fig, ax = plt.subplots(1, 1, figsize=(18, 6))
    try:
        coverage.set_index(position_col)[['Coverage']].plot(ax=ax, legend=False)
        ax.set_title(title)
        fig.tight_layout()
        pdf.savefig(fig)
    finally:
        # Close only this figure (not 'all') so unrelated figures that are
        # open in the kernel survive, and so a failed page does not leak one.
        plt.close(fig)


def Plot_Coverage(Sample, bins):
    """Write one multi-page coverage PDF per bin of a sample.

    For every bin, ``op_path + <bin> + '.pdf'`` receives one page per contig
    (per-position read coverage) followed by one page per scaffold (scaffold
    coverage). The bin name is printed once its PDF is complete.

    Parameters
    ----------
    Sample : str
        Sample identifier passed to the coverage loaders and to
        ``Get_Scaffolds``.
    bins : iterable of str
        Bin identifiers belonging to ``Sample``.

    Raises
    ------
    KeyError
        If a contig or scaffold named in a bin is absent from the
        corresponding coverage table.
    """
    # Both tables are loaded once per sample and reused for all of its bins.
    df_contig_cov = Load_Read_Coverage(Sample)
    df_scaffold_cov = Load_Scaffold_Coverages(Sample)

    for b in bins:
        contigs, scaffolds = Get_Scaffolds(b, Sample)
        # The context manager finalises the PDF even if a page fails to render.
        with PdfPages(op_path + b + '.pdf') as pdf:
            for c in contigs:
                # .loc[[label]] always yields a DataFrame; .loc[label] would
                # collapse a single-row match into a Series, which has no
                # set_index().
                _save_coverage_page(pdf, df_contig_cov.loc[[c]], 'Loc',
                                    'Contig:' + c)
            for s in scaffolds:
                scaffold_id = int(s.replace("cc_", ""))
                _save_coverage_page(pdf, df_scaffold_cov.loc[[scaffold_id]],
                                    'Coordinates',
                                    'Scaffold:' + str(scaffold_id))
        print(b)

In [107]:
# Concatenated CheckM quality report for the delinked scaffold bins.
filepath = ('Mount/projects/refining_bins_with_assembly_graphs/Scaffold_Coverage_binning/'
            'delinked_scaffold_bins/concat_checkm_report_scaffolds.txt')
column_names = ['Bin Id', 'Marker lineage', '# genomes', '# markers', '# marker sets', '0', '1',
                '2', '3', '4', '5+', 'Completeness', 'Contamination', 'Strain heterogeneity']
# Every column after 'Bin Id' and 'Marker lineage' is converted to a number.
numeric_columns = column_names[2:]


def _split_checkm_row(row):
    """Split one whitespace-aligned report row into its column values.

    A token that starts with '(' is glued back onto the previous token with a
    single space, so a value such as 'k__Bacteria (UID203)' stays one field.
    Leading/trailing whitespace and the line terminator are ignored, which
    keeps the first character of the row and the last column intact whether
    or not the row is padded with spaces.
    """
    fields = []
    for token in row.split():
        if token.startswith('(') and fields:
            fields[-1] += ' ' + token
        else:
            fields.append(token)
    return fields


with open(filepath, 'r') as report:
    lines = report.readlines()

# lines[0] is the header row; its names are supplied by column_names instead.
mat = []
for l in lines[1:]:
    vec = _split_checkm_row(l)
    # Skip blank lines and dashed separator rules; they carry no bin data.
    if not vec or set(l.strip()) == {'-'}:
        continue
    mat.append(vec)

df = pd.DataFrame(data=mat, columns=column_names)
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric)
# The first nine characters of a bin id are used as the sample id.
df['Sample'] = df['Bin Id'].str[0:9]
plt.rcParams.update(rcParams)

In [108]:
# Keep the bins whose 'Contamination' value exceeds 5, then group them by
# sample so each sample can be processed in one go.
df_high_cont = df.loc[df['Contamination'].gt(5)]
df_high_cont_grps = df_high_cont.groupby(by='Sample')


In [109]:
# One Plot_Coverage call per sample, passing all of that sample's
# high-contamination bins.
for sample_id, sample_bins in df_high_cont_grps:
    Plot_Coverage(sample_id, sample_bins['Bin Id'].tolist())

SRS016438_delinked_cc.24
SRS019397_delinked_cc.65
SRS019397_delinked_cc.52
SRS019397_delinked_cc.47
SRS019397_delinked_cc.46
SRS019787_delinked_cc.44
SRS023829_delinked_cc.9
SRS024075_delinked_cc.8
SRS024549_delinked_cc.20
SRS053573_delinked_cc.5
SRS064645_delinked_cc.23
SRS064645_delinked_cc.33
SRS064645_delinked_cc.17
SRS077194_delinked_cc.95
SRS077194_delinked_cc.94
SRS077194_delinked_cc.37
SRS077194_delinked_cc.124
SRS077194_delinked_cc.57
SRS098644_delinked_cc.47
SRS098644_delinked_cc.176
SRS098644_delinked_cc.72
SRS098644_delinked_cc.140
SRS098644_delinked_cc.105
SRS098644_delinked_cc.85
SRS098644_delinked_cc.113
SRS098644_delinked_cc.98
SRS098644_delinked_cc.118
SRS098644_delinked_cc.126
SRS104311_delinked_cc.68
SRS104311_delinked_cc.64
SRS104311_delinked_cc.109
SRS104311_delinked_cc.113
SRS104311_delinked_cc.77
SRS104311_delinked_cc.11
SRS104311_delinked_cc.90
SRS104311_delinked_cc.72
SRS104311_delinked_cc.66
SRS104485_delinked_cc.23
SRS143342_delinked_cc.79
SRS143342_delinked_