# Analysis of the MAG search (validating with read mapping)

In [1]:
import pandas as pd
# Render floats in displayed frames with thousands separators and two decimals.
pd.set_option("display.float_format", "{:,.2f}".format)

In [2]:
# Load the search results. The file has no header row, so the column names
# are supplied here; fields may be wrapped in single quotes.
results = pd.read_csv("sra_search/results.csv",
                      sep=",",
                      quotechar="'",
                      names=["MAG", "SRA Run ID", "Containment"])

# Fix names so it's easier to query.
# regex=True is passed explicitly: pandas >= 2.0 defaults str.replace to
# literal matching and rejects a callable replacement in that mode.
# - MAG: strip any remaining surrounding single quotes.
# - SRA Run ID: keep only the file stem between the last "/" and ".sig"
#   (the dot is escaped so it only matches a literal ".").
results['MAG'] = results['MAG'].str.replace(
    r"'(?P<id>.*)'", lambda m: m.group("id"), regex=True)
results['SRA Run ID'] = results['SRA Run ID'].str.replace(
    r".*/(?P<id>.*)\.sig.*", lambda m: m.group("id"), regex=True)

# Remove the parks and TARA metagenomes containing the MAG, then keep only
# hits for TOBG_NP-110 with containment above 0.5.
is_excluded_run = results['SRA Run ID'].isin(["SRR5036820", "SRR5037207"])
is_target_hit = (results["MAG"] == "TOBG_NP-110") & (results["Containment"] > 0.5)

# Index by run ID so later cells can align other per-run tables on it; the
# MAG column is constant after filtering, so it is dropped.
results = (
    results[~is_excluded_run & is_target_hit]
    .set_index("SRA Run ID")
    .drop(columns="MAG")
)

In [3]:
# Rank the retained runs from highest to lowest search containment.
results.sort_values("Containment", ascending=False)

Unnamed: 0_level_0,Containment
SRA Run ID,Unnamed: 1_level_1
SRR5868539,0.99
SRR1509798,0.98
SRR1509792,0.97
SRR5868540,0.91
SRR1509799,0.89
SRR070081,0.85
ERR3256923,0.81
SRR070083,0.79
SRR1509793,0.79
SRR304680,0.64


In [4]:
# Comma-separated mapping summary; the first column becomes the index.
mapping = pd.read_csv("sra_search/outputs/minimap/summary.txt", index_col=0)

In [5]:
# Show only the selected mapping statistics, in this column order.
mapping.loc[:, [
    "reads mapped",
    "reads properly paired",
    'percentage of properly paired reads (%)',
    'total length',
    'bases mapped',
    'bases mapped (cigar)',
    'pairs on different chromosomes',
    'mismatches',
    'average quality',
]]

Unnamed: 0,reads mapped,reads properly paired,percentage of properly paired reads (%),total length,bases mapped,bases mapped (cigar),pairs on different chromosomes,mismatches,average quality
SRR1509798,101640,97506,95.9,23545378,23545378,22944787,107,139791,36.3
SRR1509792,63662,61686,96.9,12517065,12517065,12126825,42,102934,35.2
SRR1509799,20036,17776,88.7,4871134,4871134,4805097,39,27812,36.3
SRR1509793,22493,21832,97.1,4375045,4375045,4292519,16,31508,35.7
SRR1509794,9828,9482,96.5,2010952,2010952,1975791,7,13826,35.7
ERR3256923,32295,29474,91.3,6411363,6411363,5059230,13,110653,25.9
SRR070081,9755,0,0.0,3819722,3819722,3728057,0,29248,33.7
SRR070083,6115,0,0.0,2674302,2674302,2526597,0,19303,34.1
SRR070084,3403,0,0.0,1499935,1499935,1445223,0,10806,34.5
SRR5868539,131640,0,0.0,24765412,24765412,24578634,0,114191,40.0


In [6]:
# Comma-separated depth summary; the first column becomes the index.
depth = pd.read_csv("sra_search/outputs/minimap/depth/summary.txt", index_col=0)


In [7]:
# Display the depth summary read from sra_search/outputs/minimap/depth/summary.txt
# (columns shown: genome bp, missed, percent missed, coverage).
depth

Unnamed: 0,genome bp,missed,percent missed,coverage
SRR1509798,1238250.0,7771.0,0.63,18.53
SRR1509792,1238250.0,26145.0,2.11,9.79
SRR1509799,1238250.0,71072.0,5.74,3.88
SRR1509793,1238250.0,142974.0,11.55,3.47
SRR1509794,1238250.0,427130.0,34.49,1.6
ERR3256923,1238250.0,116961.0,9.45,4.09
SRR070081,1238250.0,109714.0,8.86,3.0
SRR070083,1238250.0,202038.0,16.32,2.04
SRR070084,1238250.0,450118.0,36.35,1.16
SRR5868539,1238250.0,1475.0,0.12,19.85


In [8]:
# Comma-separated overlap summary; the first column becomes the index.
overlap = pd.read_csv("sra_search/outputs/minimap/overlap/summary.txt", index_col=0)



In [9]:
# Display the overlap summary read from sra_search/outputs/minimap/overlap/summary.txt
# (single column shown: containment).
overlap

Unnamed: 0,containment
SRR1509798,0.98
SRR1509792,0.97
SRR1509799,0.89
SRR1509793,0.79
SRR1509794,0.56
ERR3256923,0.81
SRR070081,0.85
SRR070083,0.79
SRR070084,0.58
SRR5868539,0.99


In [10]:
# Build the per-run comparison table. Every added column is a Series from one
# of the summary frames and is aligned to the index of `results`; a run
# missing from a summary frame gets NaN in that column.
final = (
    results
    .drop(columns="Containment")
    .assign(**{
        "Containment search": results["Containment"],
        "Containment reads": overlap["containment"],
        "Missed bp": depth["missed"],
        "%bp missed": depth["percent missed"],
        "Coverage": depth["coverage"],
        "Reads mapped": mapping["reads mapped"],
    })
)

# Emit the table as markdown text, best search containment first.
# floatfmt has one entry per rendered column, with the index column first.
print(
    final
    .sort_values("Containment search", ascending=False)
    .to_markdown(floatfmt=(',.2f', ',.2f', ',.2f', ',.0f', ',.3f', ',.2f', ',.0f'))
)

| SRA Run ID   |   Containment search |   Containment reads |   Missed bp |   %bp missed |   Coverage |   Reads mapped |
|:-------------|---------------------:|--------------------:|------------:|-------------:|-----------:|---------------:|
| SRR5868539   |                 0.99 |                0.99 |       1,475 |        0.119 |      19.85 |        131,640 |
| SRR1509798   |                 0.98 |                0.98 |       7,771 |        0.628 |      18.53 |        101,640 |
| SRR1509792   |                 0.97 |                0.97 |      26,145 |        2.111 |       9.79 |         63,662 |
| SRR5868540   |                 0.91 |                0.91 |      52,686 |        4.255 |       4.27 |         24,983 |
| SRR1509799   |                 0.89 |                0.89 |      71,072 |        5.740 |       3.88 |         20,036 |
| SRR070081    |                 0.85 |                0.85 |     109,714 |        8.860 |       3.00 |          9,755 |
| ERR3256923   |                