## This notebook extracts information from the output files of anvi-display-contig-stats to calculate average number of ORFs/contig in each assembly.

#### Import all required modules

In [1]:
import pandas as pd
import os as os
import glob as glob
import numpy as np
import re as re
import seaborn as sns
import matplotlib.pyplot as plt

### Function to extract information from contig-stats output files

#### Assuming all output files for each assembly type are in separate directories, and they are the only .txt files in that directory

In [6]:
def calc_average_orf_per_contig(assembler):
    df = pd.DataFrame(columns=['Sample','Assembler','Average number of ORFs/contig', 'Average contig length (bp)'])
    for f in glob.glob("*.txt"):
        name, b = f.split('-contig')
        stats = pd.read_csv(f, sep='\t', index_col=0)
        mgn = stats.columns[0]
        contigs = stats.loc['Num Contigs'][0]
        genes = stats.loc['Num Genes (prodigal)'][0]
        length = stats.loc['Total Length'][0]
        avg_orfs = genes/contigs
        avg_len = length/contigs
        row = [name, assembler, contigs, genes, avg_orfs, avg_len]
        df.loc[len(df.index)] = row
    return(df)

#### Generate individual data frames for each assembly type

In [11]:
os.chdir("/Users/nastassia.patin/Desktop/Projects/Lasker2019/PacBio/metaFlye/contig-stats")
metaflye = calc_average_orf_per_contig('metaflye')
os.chdir("/Users/nastassia.patin/Desktop/Projects/Lasker2019/PacBio/hybridSPAdes/contig-stats")
hybrid = calc_average_orf_per_contig('hybridSPAdes')
os.chdir("/Users/nastassia.patin/Desktop/Projects/Lasker2019/PacBio/annotations-Illumina-matching/contig-stats")
illumina = calc_average_orf_per_contig('SPAdes')

In [12]:
illumina

Unnamed: 0,Metagenome,Assembly,Average number of ORFs/contig,Average contig length (bp)
0,1903c122_28m-2_bmtag_SPAdes,SPAdes,0.984408,332.364903
1,Las19c138_27m-1_bmtag_bbnorm_SPAdes,SPAdes,1.458451,1011.659063
2,1903c122_28m-1_NOAA1163b_S143_L003_bmtag_SPAdes,SPAdes,0.899634,317.264838
3,1903c126_45m-1_bmtag_SPAdes,SPAdes,1.535612,1153.538122
4,1903c129_26m-2_bmtag_SPAdes,SPAdes,1.420113,977.973971
5,Las19c135_5m-1_bmtag_SPAdes,SPAdes,0.909116,275.992971
6,1903c124_15m-1_bmtag_SPAdes,SPAdes,0.918486,282.687693
7,1903c144_13m-2_NOAA1163b_S163_L003_bmtag_SPAdes,SPAdes,0.861271,254.718455
8,1903c119_11m-2_bmtag_SPAdes,SPAdes,0.936064,282.32794
9,1903c111_10m-1_bmtag_SPAdes,SPAdes,0.936772,290.23648


In [13]:
df_all = pd.concat([metaflye, hybrid, illumina])

In [14]:
df_all

Unnamed: 0,Metagenome,Assembly,Average number of ORFs/contig,Average contig length (bp)
0,Las19c107_10m-3metaFlye,metaflye,13.2,11592.6
1,1903c126_45m-3metaFlye,metaflye,16.797246,16952.427859
2,Las19c138_27m-3metaFlye,metaflye,12.882415,12613.625937
3,1903c129_26m-3metaFlye,metaflye,5.555527,4412.188338
4,1903c111_10m-3metaFlye,metaflye,13.780936,10535.591973
5,1903c122_28m-3metaFlye,metaflye,13.55303,10421.636364
6,1903c118_23m-3metaFlye,metaflye,20.65,15946.75
7,1903c123_10m-3metaFlye,metaflye,11.530405,10310.131757
8,1903c119_11m-3metaFlye,metaflye,4.073039,3574.73275
9,1903c127_7m-3metaFlye,metaflye,11.804196,7666.902098
