In [50]:
import os
import pandas as pd
import numpy as np
import re
from statistics import mean

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', -1)

In [2]:
out_dir = '/home/laura/ANALYSIS/VARIANT_CALLING/COVID'

In [3]:
def extract_snp_count(output_dir,sample):
    variants_folder = os.path.join(output_dir, 'Variants')
    raw_var_folder = os.path.join(variants_folder, 'ivar_raw')
    filename = os.path.join(raw_var_folder, sample + ".tsv")

    if os.path.exists(filename):
        df = pd.read_csv(filename, sep="\t")
        df = df.drop_duplicates(subset=['POS', 'REF', 'ALT'], keep="first")
        high_quality_snps = df["POS"][(df.PASS == True) &
                    (df.ALT_DP >= 20) &
                    (df.ALT_FREQ >= 0.8) &
                    ~(df.ALT.str.startswith('+') | df.ALT.str.startswith('-'))].tolist()
        htz_snps = df["POS"][(df.PASS == True) &
                    (df.ALT_DP >= 20) &
                    (df.ALT_FREQ < 0.8) &
                    (df.ALT_FREQ >= 0.2) &
                    ~(df.ALT.str.startswith('+') | df.ALT.str.startswith('-'))].tolist()
        return len(high_quality_snps), len(htz_snps)
    else:
        print("FILE " + filename + " NOT FOUND" )
        return None

In [7]:
def extract_mapped_reads(output_dir,sample):
    stats_folder = os.path.join(output_dir, 'Stats')
    bamstats_folder = os.path.join(stats_folder, 'Bamstats')
    filename = os.path.join(bamstats_folder, sample + ".bamstats")

    if os.path.exists(filename):
        with open (filename, 'r') as f:
            for line in f:
                if 'mapped' in line and '%' in line:
                    reads_mapped = line.split(" ")[0]
                    mappep_percentage = line.split("(")[-1].split("%")[0]
                elif 'properly paired' in line:
                    properly_paired = line.split(" ")[0]
                    paired_percentage = line.split("(")[-1].split("%")[0]
        return int(reads_mapped), float(mappep_percentage), int(properly_paired), float(paired_percentage)
        print("FILE " + filename + " NOT FOUND" )
        return None

In [70]:
def extract_n_consensus(output_dir,sample):
    consensus_folder = os.path.join(output_dir, 'Consensus')
    filename = os.path.join(consensus_folder, sample + ".fa")

    if os.path.exists(filename):
        with open (filename, 'r') as f:
            content = f.read()
            content_list = content.split('\n')
            sample_fq = content_list[0].strip(">")
            if sample_fq == sample:
                #In case fasta is in several lines(not by default)
                sequence = ("").join(content_list[1:]).strip()
                all_N = re.findall(r'N+', sequence)
                leading_N = re.findall(r'^N+', sequence)
                tailing_N = re.findall(r'N+$', sequence)
                length_N = [len(x) for x in all_N]
                individual_N = [x for x in length_N if x == 1]
                mean_length_N = mean(length_N)
                sum_length_N = sum(length_N)
                return(len(all_N), len(individual_N), len(leading_N), len(tailing_N), sum_length_N, mean_length_N)
    else:
        print("FILE " + filename + " NOT FOUND" )
        return None

In [71]:
extract_n_consensus(out_dir, '20155670-8')

(3, 1, 1, 0, 66, 22)

In [66]:
extract_snp_count(out_dir, '20155670-8')

(9, 6)

In [67]:
extract_mapped_reads(out_dir, '20155670-8')

(758339, 100.0, 756665, 99.91)

In [68]:
def obtain_overal_stats(output_dir, group):
    stat_folder = os.path.join(output_dir, 'Stats')
    overal_stat_file = os.path.join(stat_folder, group + ".overal.stats.tab")
    for root, _, files in os.walk(stat_folder):
        for name in files:
            if name.endswith('coverage.summary.tab'):
                filename = os.path.join(root, name)
                df = pd.read_csv(filename, sep="\t")
                df[['HQ_SNP', 'HTZ_SNP']] = df.apply(lambda x: extract_snp_count(output_dir, x['#SAMPLE']), axis=1, result_type="expand")
                df[['mapped_reads', 'perc_mapped', 'paired_mapped', 'perc_paired']] = df.apply(lambda x: extract_mapped_reads(output_dir, x['#SAMPLE']), axis=1, result_type="expand")
                df[['N_groups', 'N_individual', 'N_leading', 'N_tailing', 'N_sum_len', 'N_mean_len']] = df.apply(lambda x: extract_n_consensus(output_dir, x['#SAMPLE']), axis=1, result_type="expand")
    df.to_csv(overal_stat_file, sep="\t", index=False)


In [72]:
obtain_overal_stats(out_dir, 'covid')

FILE /home/laura/ANALYSIS/VARIANT_CALLING/COVID/Consensus/20060178.fa NOT FOUND
FILE /home/laura/ANALYSIS/VARIANT_CALLING/COVID/Consensus/20060161.fa NOT FOUND


ValueError: Columns must be same length as key