### CheckM results

In this notebook, quality classes are added to the CheckM output so that bin quality can be plotted in the notebook BinPlotWithQuality.ipynb.

In [1]:
import pandas as pd 
import numpy as np

#### Reading the CheckM results file with completeness and contamination values

In [2]:
# Load the tab-separated CheckM results table into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer="../data56_related_files/CheckM_results_file",
    sep="\t",
)

In [4]:
# Preview the first five rows (five is the default for head()).
df.head()

Unnamed: 0,Bin Id,Marker lineage,# genomes,# markers,# marker sets,0,1,2,3,4,5+,Completeness,Contamination,Strain heterogeneity
0,S10C10200,root (UID1),5656,56,24,56,0,0,0,0,0,0.0,0.0,0.0
1,S10C1032,o__Bacteroidales (UID2654),163,486,266,298,186,2,0,0,0,38.94,0.38,0.0
2,S10C1039,o__Lactobacillales (UID355),490,334,183,3,331,0,0,0,0,98.91,0.0,0.0
3,S10C1094,k__Bacteria (UID203),5449,104,58,92,12,0,0,0,0,14.66,0.0,0.0
4,S10C10947,root (UID1),5656,56,24,56,0,0,0,0,0,0.0,0.0,0.0


In [5]:
# Make sure both quality metrics are floats before they are compared
# against numeric thresholds further down.
df["Completeness"] = df["Completeness"].astype(float)
df["Contamination"] = df["Contamination"].astype(float)

#### Generating columns based on bin quality (later used for plotting in BinPlotWithQuality.ipynb)

In [6]:
# Largest observed value of each quality metric.
# Series.max() is vectorised and skips NaN, whereas the builtin max() iterates
# element by element and gives order-dependent results if a NaN is present.
print(df["Completeness"].max())
print(df["Contamination"].max())

100.0
265.69


In [7]:
# Show the bins whose contamination exceeds the threshold below.
# `@query` makes pandas read the Python variable directly, so a non-integer
# threshold is not silently truncated the way '%d' string formatting would.
query = 100
df.query('Contamination > @query')

Unnamed: 0,Bin Id,Marker lineage,# genomes,# markers,# marker sets,0,1,2,3,4,5+,Completeness,Contamination,Strain heterogeneity
142,S12C3084,k__Bacteria (UID203),5449,104,58,1,19,13,70,1,0,98.28,181.22,27.51
1492,S36C456,k__Bacteria (UID203),5449,102,57,3,21,23,27,20,8,97.93,206.56,17.11
2478,S53C1778,k__Bacteria (UID203),5449,104,58,2,30,21,16,17,18,98.28,265.69,26.72
2803,S8C1094,k__Bacteria (UID203),5449,104,58,1,6,40,56,1,0,99.66,176.99,28.5


In [8]:
# Quality class per bin, used to pick the colors in the scatterplot.
# 0 = highQ, 1 = mediumHigh, 2 = mediumLow, 3 = low
def conditions(s):
    """Return the quality class (0-3) of one bin.

    Parameters
    ----------
    s : row-like object indexable by the keys 'Completeness' and
        'Contamination' (e.g. a DataFrame row passed by ``df.apply(..., axis=1)``).

    Returns
    -------
    int
        0 -> Contamination < 5 and Completeness > 90
        1 -> Contamination < 10, Completeness > 50 and quality score > 50
        2 -> Contamination < 10, Completeness > 50 and quality score <= 50
        3 -> everything else

    The quality score is ``Completeness - 5 * Contamination``. All threshold
    comparisons are strict, so a value sitting exactly on a threshold falls
    into the lower class.
    """
    completeness = s['Completeness']
    contamination = s['Contamination']
    quality_score = completeness - (5 * contamination)

    # High quality is checked first; it takes precedence over the medium classes.
    if contamination < 5 and completeness > 90:
        return 0

    # Anything failing the shared medium-quality gate is low quality.
    if not (contamination < 10 and completeness > 50):
        return 3

    # Medium quality: split on the quality score.
    return 1 if quality_score > 50 else 2

In [9]:
# Classify every bin row by row; the result is stored as the 'bin_class' column.
df['bin_class'] = df.apply(func=conditions, axis='columns')

In [10]:
# Coarser three-level class per bin, used to group bins in the barplot.
# 0 = highQ, 1 = middle, 2 = low
def classConditions(s):
    """Collapse the four-level 'bin_class' of one bin into three levels.

    Parameters
    ----------
    s : row-like object indexable by the key 'bin_class'.

    Returns
    -------
    int
        0 when bin_class == 0,
        1 when 1 <= bin_class <= 2,
        2 otherwise.
    """
    quality = s['bin_class']

    if quality == 0:
        return 0
    return 1 if 1 <= quality <= 2 else 2

In [11]:
# Derive the three-level class from 'bin_class' row by row and store it as 'class_3'.
df['class_3'] = df.apply(func=classConditions, axis='columns')

In [12]:
# Spot-check the first three rows, now including 'bin_class' and 'class_3'.
df.head(n=3)

Unnamed: 0,Bin Id,Marker lineage,# genomes,# markers,# marker sets,0,1,2,3,4,5+,Completeness,Contamination,Strain heterogeneity,bin_class,class_3
0,S10C10200,root (UID1),5656,56,24,56,0,0,0,0,0,0.0,0.0,0.0,3,2
1,S10C1032,o__Bacteroidales (UID2654),163,486,266,298,186,2,0,0,0,38.94,0.38,0.0,3,2
2,S10C1039,o__Lactobacillales (UID355),490,334,183,3,331,0,0,0,0,98.91,0.0,0.0,0,0


In [13]:
# Number of non-null entries in every column, grouped by quality class.
df.groupby(by='bin_class').count()

Unnamed: 0_level_0,Bin Id,Marker lineage,# genomes,# markers,# marker sets,0,1,2,3,4,5+,Completeness,Contamination,Strain heterogeneity,class_3
bin_class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,609,609,609,609,609,609,609,609,609,609,609,609,609,609,609
1,682,682,682,682,682,682,682,682,682,682,682,682,682,682,682
2,73,73,73,73,73,73,73,73,73,73,73,73,73,73,73
3,1601,1601,1601,1601,1601,1601,1601,1601,1601,1601,1601,1601,1601,1601,1601


#### Saving df for later plotting in BinPlotWithQuality.ipynb

In [14]:
# Write the classified table to CSV; index=False leaves the row index out of the file.
df.to_csv(
    path_or_buf='../data56_related_files/Contamination_completness_df.csv',
    index=False,
)