# Analysis of the MAG search (validating with read mapping)

In [1]:
import pandas as pd
# Render floats with thousands separators and two decimals in displayed tables.
pd.set_option("display.float_format", "{:,.2f}".format)

In [2]:
# Load the search results. Column names are supplied here, so the file is
# read as having no header row.
results = pd.read_csv(
    "sra_search/results.csv",
    sep=",",
    quotechar="'",
    names=["MAG", "SRA Run ID", "Containment"],
)

# Fix names so it's easier to query.
# `regex=True` is spelled out: pandas >= 2.0 treats the pattern as a literal
# string by default and raises ValueError for a callable replacement.
results["MAG"] = results["MAG"].str.replace(
    r"'(?P<id>.*)'", lambda m: m.group("id"), regex=True
)
# Reduce the signature path to whatever sits between the last "/" and the
# ".sig" suffix (the dot is escaped so it only matches a literal ".").
results["SRA Run ID"] = results["SRA Run ID"].str.replace(
    r".*/(?P<id>.*)\.sig.*", lambda m: m.group("id"), regex=True
)

# Remove the Parks and TARA metagenomes containing the MAG.
excluded_runs = ["SRR5036820", "SRR5037207"]
results = results[~results["SRA Run ID"].isin(excluded_runs)]

# Keep only hits for the MAG of interest with containment above 0.5,
# then index by run ID and drop the now-constant MAG column.
results = results[(results["MAG"] == "TOBG_NP-110") & (results["Containment"] > 0.5)]
results = results.set_index("SRA Run ID").drop(columns="MAG")

In [3]:
# Show the retained runs, highest containment first.
results.sort_values("Containment", ascending=False)

Unnamed: 0_level_0,Containment
SRA Run ID,Unnamed: 1_level_1
SRR5868539,0.99
SRR1509798,0.98
SRR1509792,0.97
SRR5868540,0.91
SRR1509799,0.89
SRR070081,0.85
ERR3256923,0.81
SRR070083,0.79
SRR1509793,0.79
SRR304680,0.64


In [4]:
# Comma-separated mapping summary; the first column becomes the index.
mapping = pd.read_csv("sra_search/outputs/minimap/summary.txt", index_col=0)

In [5]:
# Show only the selected mapping statistics.
mapping.loc[
    :,
    [
        "reads mapped",
        "total length",
        "bases mapped",
        "bases mapped (cigar)",
        "mismatches",
        "average quality",
    ],
]

Unnamed: 0,reads mapped,total length,bases mapped,bases mapped (cigar),mismatches,average quality
SRR1509798,101441,23497214,23497214,22933834,138629,36.3
SRR1509792,63653,12508521,12508521,12136462,103237,35.2
SRR1509799,20014,4865614,4865614,4802129,27401,36.3
SRR1509793,22495,4374542,4374542,4295300,31768,35.7
SRR1509794,9827,2010200,2010200,1976937,13912,35.7
ERR3256923,32130,6378226,6378226,5050608,109241,25.9
SRR070081,9755,3819722,3819722,3728057,29248,33.7
SRR070083,6115,2674302,2674302,2526597,19303,34.1
SRR070084,3403,1499935,1499935,1445223,10806,34.5
SRR5868539,131640,24765412,24765412,24578634,114191,40.0


In [6]:
# Comma-separated depth summary; the first column becomes the index.
depth = pd.read_csv("sra_search/outputs/minimap/depth/summary.txt", index_col=0)


In [7]:
# Display the depth summary table.
depth

Unnamed: 0,total,missed,percent missed,coverage
SRR1509798,1238250.0,7771.0,0.01,18.52
SRR1509792,1238250.0,26247.0,0.02,9.8
SRR1509799,1238250.0,71116.0,0.06,3.88
SRR1509793,1238250.0,143021.0,0.12,3.47
SRR1509794,1238250.0,427213.0,0.35,1.6
ERR3256923,1238250.0,117748.0,0.1,4.08
SRR070081,1238250.0,109714.0,0.09,3.0
SRR070083,1238250.0,202037.0,0.16,2.04
SRR070084,1238250.0,450167.0,0.36,1.16
SRR5868539,1238250.0,1475.0,0.0,19.85


In [8]:
# Comma-separated overlap summary; the first column becomes the index.
overlap = pd.read_csv("sra_search/outputs/minimap/overlap/summary.txt", index_col=0)



In [9]:
# Display the overlap (containment) summary table.
overlap

Unnamed: 0,containment
SRR1509798,0.98
SRR1509792,0.97
SRR1509799,0.89
SRR1509793,0.79
SRR1509794,0.56
ERR3256923,0.81
SRR070081,0.85
SRR070083,0.79
SRR070084,0.58
SRR5868539,0.99


In [10]:
# Assemble the summary table: the search containment next to the statistics
# from the mapping, depth and overlap summaries. Each assigned Series is
# aligned on the index of `results`; runs missing from a summary get NaN.
final = results.rename(columns={"Containment": "Containment search"}).assign(
    **{
        "Containment reads": overlap["containment"],
        "Missed bp": depth["missed"],
        "%bp missed": depth["percent missed"],
        "Coverage": depth["coverage"],
        "Reads mapped": mapping["reads mapped"],
    }
)

# One float format per rendered column, index column included.
ranked = final.sort_values(by="Containment search", ascending=False)
print(ranked.to_markdown(floatfmt=(',.2f', ',.2f', ',.2f', ',.0f', ',.3f', ',.2f', ',.0f')))

| SRA Run ID   |   Containment search |   Containment reads |   Missed bp |   %bp missed |   Coverage |   Reads mapped |
|:-------------|---------------------:|--------------------:|------------:|-------------:|-----------:|---------------:|
| SRR5868539   |                 0.99 |                0.99 |       1,475 |        0.001 |      19.85 |        131,640 |
| SRR1509798   |                 0.98 |                0.98 |       7,771 |        0.006 |      18.52 |        101,441 |
| SRR1509792   |                 0.97 |                0.97 |      26,247 |        0.021 |       9.80 |         63,653 |
| SRR5868540   |                 0.91 |                0.91 |      52,686 |        0.043 |       4.27 |         24,983 |
| SRR1509799   |                 0.89 |                0.89 |      71,116 |        0.057 |       3.88 |         20,014 |
| SRR070081    |                 0.85 |                0.85 |     109,714 |        0.089 |       3.00 |          9,755 |
| ERR3256923   |                