In [50]:
import os
import pandas as pd
import numpy as np
import re
from statistics import mean

# Show every column and row, untruncated, when frames are displayed in the notebook.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# None means "no width limit"; the former -1 sentinel is deprecated since pandas 1.0.
pd.set_option('display.max_colwidth', None)

In [2]:
# Root folder of the analysis; the functions below read its Variants/, Stats/ and Consensus/ subfolders.
# NOTE(review): absolute, user-specific path - adjust before running on another machine.
out_dir = '/home/laura/ANALYSIS/VARIANT_CALLING/COVID'

In [3]:
def extract_snp_count(output_dir,sample):
    """Count high-frequency and intermediate-frequency SNPs for one sample.

    Reads <output_dir>/Variants/ivar_raw/<sample>.tsv, drops rows duplicated on
    (POS, REF, ALT) and keeps only rows with PASS true, ALT_DP >= 20 and an ALT
    that does not start with '+' or '-'.

    Returns:
        (hq, htz) where hq counts rows with ALT_FREQ >= 0.8 and htz counts
        rows with 0.2 <= ALT_FREQ < 0.8; None (after printing a message) when
        the file does not exist.
    """
    tsv_path = os.path.join(output_dir, 'Variants', 'ivar_raw', sample + ".tsv")

    # Guard clause: nothing to count without the variant table.
    if not os.path.exists(tsv_path):
        print("FILE " + tsv_path + " NOT FOUND" )
        return None

    variants = pd.read_csv(tsv_path, sep="\t").drop_duplicates(
        subset=['POS', 'REF', 'ALT'], keep="first")

    # Filters shared by both categories.
    passed_with_depth = (variants.PASS == True) & (variants.ALT_DP >= 20)
    is_indel = variants.ALT.str.startswith('+') | variants.ALT.str.startswith('-')

    hq_mask = passed_with_depth & (variants.ALT_FREQ >= 0.8) & ~is_indel
    htz_mask = (passed_with_depth &
                (variants.ALT_FREQ < 0.8) &
                (variants.ALT_FREQ >= 0.2) &
                ~is_indel)

    hq_positions = variants["POS"][hq_mask].tolist()
    htz_positions = variants["POS"][htz_mask].tolist()
    return len(hq_positions), len(htz_positions)

In [7]:
def extract_mapped_reads(output_dir,sample):
    """Read mapped / properly-paired read statistics for one sample.

    Parses <output_dir>/Stats/Bamstats/<sample>.bamstats line by line. A line
    containing both 'mapped' and '%' provides the mapped read count (first
    space-separated token) and percentage (text between the last '(' and the
    following '%'); a line containing 'properly paired' provides the paired
    count and percentage the same way. If several lines match, the last wins.

    Returns:
        (reads_mapped, mapped_percentage, properly_paired, paired_percentage)
        as (int, float, int, float), or None (after printing a message) when
        the file is missing or the expected values cannot be parsed.
    """
    stats_folder = os.path.join(output_dir, 'Stats')
    bamstats_folder = os.path.join(stats_folder, 'Bamstats')
    filename = os.path.join(bamstats_folder, sample + ".bamstats")

    if not os.path.exists(filename):
        print("FILE " + filename + " NOT FOUND" )
        return None

    # Start from None so a file lacking the expected lines is detected below
    # instead of failing on an unbound local.
    reads_mapped = mapped_percentage = None
    properly_paired = paired_percentage = None

    with open(filename, 'r') as f:
        for line in f:
            if 'mapped' in line and '%' in line:
                reads_mapped = line.split(" ")[0]
                mapped_percentage = line.split("(")[-1].split("%")[0]
            elif 'properly paired' in line:
                properly_paired = line.split(" ")[0]
                paired_percentage = line.split("(")[-1].split("%")[0]

    try:
        return (int(reads_mapped), float(mapped_percentage),
                int(properly_paired), float(paired_percentage))
    except (TypeError, ValueError):
        # TypeError: a value was never found (still None);
        # ValueError: the text found is not numeric (e.g. a percentage of 'N/A').
        print("FILE " + filename + " HAS NO PARSABLE MAPPING STATS")
        return None

In [64]:
def extract_n_consensus(output_dir,sample):
    """Summarise the runs of 'N' in a sample's consensus FASTA.

    Reads <output_dir>/Consensus/<sample>.fa. The first line, stripped of '>',
    must equal `sample`; all remaining lines are concatenated into a single
    sequence before counting.

    Returns:
        (n_groups, n_individual, n_leading, n_tailing, sum_length, mean_length)
        where n_groups is the number of runs of consecutive 'N', n_individual
        the number of runs of length 1, n_leading / n_tailing are 1 when the
        sequence starts / ends with 'N' (else 0), sum_length the total number
        of 'N' and mean_length the mean run length (0 when there is no 'N').
        None (after printing a message) when the file is missing or its header
        does not match `sample`.
    """
    consensus_folder = os.path.join(output_dir, 'Consensus')
    filename = os.path.join(consensus_folder, sample + ".fa")

    if not os.path.exists(filename):
        print("FILE " + filename + " NOT FOUND" )
        return None

    with open(filename, 'r') as f:
        # splitlines() also copes with '\r\n' endings and with an empty file.
        content_list = f.read().splitlines()

    sample_fq = content_list[0].strip(">").strip() if content_list else ""
    if sample_fq != sample:
        # Previously this case fell through and returned None without any message.
        print("FILE " + filename + " HEADER DOES NOT MATCH SAMPLE " + sample)
        return None

    # In case fasta is in several lines (not by default)
    sequence = ("").join(content_list[1:]).strip()

    length_N = [len(run) for run in re.findall(r'N+', sequence)]
    individual_N = [x for x in length_N if x == 1]
    leading_N = re.findall(r'^N+', sequence)
    tailing_N = re.findall(r'N+$', sequence)
    sum_length_N = sum(length_N)
    # statistics.mean raises on an empty list, i.e. on a consensus without any N.
    mean_length_N = mean(length_N) if length_N else 0

    return (len(length_N), len(individual_N), len(leading_N), len(tailing_N), sum_length_N, mean_length_N)

In [65]:
# Spot-check the consensus N summary on a single sample.
extract_n_consensus(out_dir, '20155670-8')

(3, 1, 1, 0, 22)

In [66]:
# Spot-check the SNP counts on the same sample.
extract_snp_count(out_dir, '20155670-8')

(9, 6)

In [67]:
# Spot-check the mapping statistics on the same sample.
extract_mapped_reads(out_dir, '20155670-8')

(758339, 100.0, 756665, 99.91)

In [68]:
def obtain_overal_stats(output_dir, group):
    """Extend the coverage summary with per-sample stats and write it to disk.

    Walks <output_dir>/Stats looking for files whose name ends with
    'coverage.summary.tab'. The table is read as tab-separated text and, for
    each row, the value in '#SAMPLE' is passed to extract_snp_count,
    extract_mapped_reads and extract_n_consensus; their results are stored in
    new columns. The result is written, tab-separated and without the index,
    to <output_dir>/Stats/<group>.overal.stats.tab.

    When several summary files are found, the last one in os.walk order is
    used (the same file whose table was written before). When none is found a
    message is printed and nothing is written.

    Returns:
        None.
    """
    stat_folder = os.path.join(output_dir, 'Stats')
    overal_stat_file = os.path.join(stat_folder, group + ".overal.stats.tab")

    summary_files = [os.path.join(root, name)
                     for root, _, files in os.walk(stat_folder)
                     for name in files
                     if name.endswith('coverage.summary.tab')]

    if not summary_files:
        # Previously this case crashed on an unbound `df` at write time.
        print("NO coverage.summary.tab FILE FOUND IN " + stat_folder)
        return None

    def per_sample(extractor, n_values):
        """Adapt an extractor to a row-wise apply that always yields n_values items."""
        def wrapped(row):
            values = extractor(output_dir, row['#SAMPLE'])
            # Extractors return None when a sample's file is unusable; pad so the
            # expanded result keeps one column per target name even if every
            # sample is missing.
            return values if values is not None else (None,) * n_values
        return wrapped

    # Only the last match is processed: earlier matches were overwritten anyway.
    df = pd.read_csv(summary_files[-1], sep="\t")
    df[['HQ_SNP', 'HTZ_SNP']] = df.apply(
        per_sample(extract_snp_count, 2), axis=1, result_type="expand")
    df[['mapped_reads', 'perc_mapped', 'paired_mapped', 'perc_paired']] = df.apply(
        per_sample(extract_mapped_reads, 4), axis=1, result_type="expand")
    df[['N_groups', 'N_individual', 'N_leading', 'N_tailing', 'N_sum_len', 'N_mean_len']] = df.apply(
        per_sample(extract_n_consensus, 6), axis=1, result_type="expand")

    df.to_csv(overal_stat_file, sep="\t", index=False)


In [69]:
# Build the combined table for the 'covid' group; it is written to Stats/covid.overal.stats.tab under out_dir.
obtain_overal_stats(out_dir, 'covid')

FILE /home/laura/ANALYSIS/VARIANT_CALLING/COVID/Consensus/20060178.fa NOT FOUND
FILE /home/laura/ANALYSIS/VARIANT_CALLING/COVID/Consensus/20060161.fa NOT FOUND


Unnamed: 0,#SAMPLE,MEAN_COV,UNMMAPED_PROP,COV1-10X,COV10-20X,COV>20X,COV>50X,COV>100X,COV>500X,COV>1000X,HQ_SNP,HTZ_SNP,mapped_reads,perc_mapped,paired_mapped,perc_paired,N_groups,N_individual,N_leading,N_tailing,N_mean_len
0,20091911,1733.04,0.16,0.03,0.03,99.78,99.13,98.96,94.79,81.39,7,3,694648.0,100.0,693326.0,99.97,2.0,0.0,1.0,0.0,32.5
1,20147374,989.38,0.16,0.48,1.92,97.43,97.25,97.09,85.31,51.5,14,0,340360.0,100.0,339951.0,99.96,8.0,0.0,1.0,1.0,91.875
2,20091284,1380.68,0.15,0.04,0.05,99.76,98.99,98.94,93.36,76.7,6,0,484511.0,100.0,483622.0,99.97,4.0,1.0,1.0,0.0,17.5
3,20089945,918.52,0.04,0.13,0.01,99.82,99.75,96.54,66.6,35.42,6,0,334422.0,100.0,331491.0,99.97,2.0,0.0,1.0,0.0,27.5
4,20069464,1241.71,0.15,0.6,0.17,99.09,98.7,98.14,90.24,66.98,8,0,491265.0,100.0,490642.0,99.96,3.0,0.0,1.0,1.0,91.0
5,20091675,524.06,1.18,0.51,0.05,98.26,96.92,93.63,54.76,3.1,3,0,154121.0,100.0,154055.0,99.99,4.0,0.0,1.0,1.0,130.25
6,20326611,811.87,1.2,1.8,0.43,96.57,92.94,88.23,61.89,33.08,16,0,322052.0,100.0,321233.0,99.91,11.0,2.0,1.0,1.0,89.0
7,20137496,732.67,0.16,0.69,0.28,98.87,98.12,97.08,64.99,25.67,10,1,250795.0,100.0,250342.0,99.99,7.0,1.0,1.0,0.0,45.285714
8,20072726,1155.57,0.08,0.22,0.08,99.62,99.02,98.96,88.57,62.25,6,0,451993.0,100.0,451494.0,99.97,4.0,0.0,1.0,1.0,26.0
9,20353283,1080.04,0.27,0.2,0.52,99.0,98.97,98.73,82.44,58.17,17,0,430500.0,100.0,430213.0,99.98,3.0,0.0,1.0,1.0,99.0
