In [22]:
import os
import re
import pandas as pd

pd.set_option('display.max_columns', None)

In [154]:
# Folders holding the per-sample variant tables.
# NOTE(review): both are absolute, machine-specific paths; folder2 is not used by any visible cell.
folder = "/home/laura/ANALYSIS/SNPTB/190624_Anotation_test/Table"
folder2 = "/home/laura/ANALYSIS/SNPTB/190617_all_coinfection/Table"

In [149]:
def extrach_variants_summary(vcf_table, distance=15, quality=10):
    """Summarise one variant table as a single tab-separated line.

    The table is read as a tab-separated file with a header row. Statistics
    are computed twice: on the whole table and on the table left after
    discarding every position that hits at least one rejection rule.

    Parameters
    ----------
    vcf_table : str
        Path to the table. The sample name is the file name (text after the
        last "/") up to its first ".".
    distance : int
        Rows whose snp_left_distance or snp_right_distance is <= this value
        are rejected.
    quality : int
        Rows whose QD is <= this value are rejected.

    Returns
    -------
    str
        Tab-joined fields, in order: sample, SNP count, INDEL count,
        SNPs with gt0 == 1, SNPs with gt0 == 0, median SNP AF, mean SNP AF,
        then the same five SNP statistics on the filtered table, and finally
        the mean AF of the filtered SNPs with gt0 == 0. Frequencies are
        formatted with two decimals.
    """
    file_name = vcf_table.split("/")[-1]
    sample = file_name.split(".")[0]

    variants = pd.read_csv(vcf_table, sep="\t", header=0)

    def snp_stats(table):
        # SNP count, gt0 == 1 count, gt0 == 0 count, median AF, mean AF.
        is_snp = table.TYPE == "SNP"
        snp_af = table.AF[is_snp]
        return [
            int(is_snp.sum()),
            int((is_snp & (table.gt0 == 1)).sum()),
            int((is_snp & (table.gt0 == 0)).sum()),
            "%.2f" % snp_af.median(),
            "%.2f" % snp_af.mean(),
        ]

    # A row is rejected as soon as any single rule applies to it.
    rejection_rules = [
        variants.snp_left_distance.le(distance),
        variants.snp_right_distance.le(distance),
        variants.window_10.ge(2),
        variants.AF.le(0.0),
        variants.len_AD.gt(2),
        variants.TYPE.ne("SNP"),
        variants.QD.le(quality),
        variants.highly_hetz.eq(True),
        variants.poorly_covered.eq(True),
        variants.non_genotyped.eq(True),
        variants.is_polymorphic.eq(True),
    ]
    rejected = rejection_rules[0]
    for rule in rejection_rules[1:]:
        rejected = rejected | rule

    # Filtering goes through POS values, so every row sharing a rejected
    # position is dropped, not only the row that triggered the rule.
    rejected_positions = variants.loc[rejected, "POS"].tolist()
    kept = variants[~variants.POS.isin(rejected_positions)]

    kept_htz_snp_af = kept.AF[(kept.gt0 == 0) & (kept.TYPE == "SNP")]

    fields = [sample]
    raw_stats = snp_stats(variants)
    # The INDEL count sits right after the raw SNP count in the output line.
    fields.append(raw_stats[0])
    fields.append(int((variants.TYPE == "INDEL").sum()))
    fields.extend(raw_stats[1:])
    fields.extend(snp_stats(kept))
    fields.append("%.2f" % kept_htz_snp_af.mean())

    return "\t".join(str(field) for field in fields)

In [158]:
def vcf_stats(folder_table, distance=15, quality=10, htz_threshold=100):
    """Write a per-sample variant summary table and report likely mixed samples.

    Walks `folder_table` recursively, summarises every file whose name ends
    with "raw.tab" using `extrach_variants_summary`, and writes one line per
    file to "vcf_stat.tab" inside `folder_table` (overwriting it).

    Parameters
    ----------
    folder_table : str
        Folder to scan; also where "vcf_stat.tab" is written.
    distance : int
        Forwarded to `extrach_variants_summary` (SNP proximity cut-off).
    quality : int
        Forwarded to `extrach_variants_summary` (QD cut-off).
    htz_threshold : int
        A sample is reported as mixed when its "#FHETZ_SNP" value is strictly
        greater than this number.

    Returns
    -------
    list of str
        Names of the samples reported as mixed, in the order they were found.
    """
    header = ["SAMPLE",
              "#SNP",
              "#INDELS",
              "#HOMOZ_SNP",
              "#HETZ_SNP",
              "MEDIAN_AF_SNP",
              "MEAN_AF_SNP",
              "#FSNP",
              "#FHOMOZ_SNP",
              "#FHETZ_SNP",
              "FMEDIAN_AF_SNP",
              "FMEAN_AF_SNP",
              "FMEAN_AF_SNP_HTZ"]
    # Look the column up by name instead of relying on a bare index.
    htz_filtered_idx = header.index("#FHETZ_SNP")

    out_file = os.path.join(folder_table, "vcf_stat.tab")
    mixed_samples = []

    with open(out_file, 'w') as fout:
        fout.write("\t".join(header))
        fout.write("\n")
        for root, _, files in os.walk(folder_table):
            for name in files:
                if not name.endswith("raw.tab"):
                    continue
                filename = os.path.join(root, name)
                # Forward the thresholds: they were previously accepted by this
                # function but never passed on, so the defaults were always used.
                line = extrach_variants_summary(filename,
                                                distance=distance,
                                                quality=quality)
                fields = line.split("\t")
                if int(fields[htz_filtered_idx]) > htz_threshold:
                    mixed_samples.append(fields[0])
                fout.write(line)
                fout.write("\n")
    return mixed_samples

In [160]:
# Writes vcf_stat.tab inside `folder` and displays the returned list of mixed samples.
vcf_stats(folder)

['7mixed', '6mixed']

In [8]:
# Single table used by the exploratory cells.
# NOTE(review): absolute, machine-specific path.
file = "/home/laura/ANALYSIS/SNPTB/190624_Anotation_test/Table/HIS-25198169.combined.hf.raw.tab"


In [97]:
# Summarise the single test table; the result is one tab-separated line.
extrach_variants_summary(file)

HIS-25198169	1616	428	1124	488	1.00


In [88]:
# Positions (POS values) to look up in the test table.
posiciones = [1977, 9304]

In [89]:
# Show the rows whose POS is one of the positions of interest.
# NOTE(review): testdf is only assigned in a later cell (pd.read_csv(file, ...)),
# so this cell fails on a fresh top-to-bottom run.
testdf[testdf.POS.isin(posiciones)]

Unnamed: 0,#CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,sample,AC,af,AN,BaseQRankSum,DP,ExcessHet,FS,MQ,MQRankSum,QD,ReadPosRankSum,SOR,GT,AD,GQ,PL,PGT,PID,PS,len_AD,REF_AD,ALT_AD,gt0,gt1,TYPE,dp,aF,AF,is_polymorphic,highly_hetz,non_genotyped,poorly_covered,snp_left_distance,snp_right_distance,indel_right_distance,indel_left_distance,window_10,window_20,window_30
0,MTB_anc,1977,.,G,A,2420.24,PASS,AC=2;AF=1.00;AN=2;BaseQRankSum=0.548;DP=28;Exc...,GT:AD:DP:GQ:PL,"1/1:0,28:28:82:948,82,0",2.0,1.0,2.0,0.548,28.0,0.1902,0.0,60.0,0.0,34.09,0.686,1.981,1/1,28,82.0,948820,,,,2,0.0,28.0,1,1,SNP,28.0,0.0,1.0,False,False,False,False,1977.0,555.0,,,1,1,1
4,MTB_anc,9304,.,A,G,4905.24,PASS,AC=2;AF=1.00;AN=2;DP=53;ExcessHet=0.1902;FS=0....,GT:AD:DP:GQ:PL,"1/1:0,53:53:99:2138,159,0",2.0,1.0,2.0,,53.0,0.1902,0.0,60.0,,28.73,,0.902,1/1,53,99.0,21381590,,,,2,0.0,53.0,1,1,SNP,53.0,0.0,1.0,False,False,False,False,161.0,4156.0,,,1,1,1


In [9]:
# Load the single test table (tab-separated, first row used as header).
testdf = pd.read_csv(file, sep="\t", header=0)

In [12]:
# (rows, columns) of the loaded table.
testdf.shape

(2044, 49)

In [27]:
# Number of rows whose TYPE is "SNP", taken from the length of the filtered index.
len(testdf[testdf.TYPE == "SNP"].index)

1616

In [26]:
# Number of rows whose TYPE is "SNP", taken from the filtered frame's row count.
testdf[testdf.TYPE == "SNP"].shape[0]

1616

In [76]:
# Median of AF over all rows (no TYPE filter), formatted with two decimals.
"%.2f" % (testdf.AF.median())

'1.00'

In [83]:
# Median of AF restricted to rows whose TYPE is "SNP".
testdf.AF[testdf.TYPE == "SNP"].median()

1.0

In [49]:
#testdf[(testdf.TYPE == "SNP") & (testdf.gt0 == 0)]

In [None]:
# Scratch expression listing the POS values that hit at least one rejection rule.
# NOTE(review): this cell was never executed, and df_vcf, distance and QD are not
# defined in any visible cell. It also references Window_10 and Is_repeat, whereas
# the table displayed in this notebook has a lowercase window_10 column and no
# Is_repeat column -- confirm the column names before running.
df_vcf['POS'][((df_vcf.snp_left_distance <= distance)|
                                (df_vcf.snp_right_distance <= distance)|
                                (df_vcf.Window_10 >= 2)|
                                (df_vcf.Is_repeat == True) |
                                (df_vcf.AF <= 0.0) |
                                (df_vcf.len_AD > 2) |
                                (df_vcf.QD <= QD))].tolist()

# Handle mixed samples

In [161]:
# Root folder of the analysis and the samples to set aside as mixed.
# NOTE(review): list_mixed is hard-coded; it holds the same two samples shown in the
# vcf_stats(folder) output. out_folder is an absolute, machine-specific path.
out_folder = "/home/laura/ANALYSIS/SNPTB/190624_Anotation_test"
list_mixed = ["6mixed", "7mixed"]

In [165]:
def check_create_dir(path):
    """Create the directory `path` unless something already exists there.

    Missing parent directories are created as well. If `path` already exists
    (as a directory or as any other kind of file) nothing is done and no
    error is raised, as before.

    Parameters
    ----------
    path : str
        Directory to create.
    """
    if not os.path.exists(path):
        # exist_ok=True tolerates another process creating the directory
        # between the existence check and this call.
        os.makedirs(path, exist_ok=True)

In [176]:
def remove_low_covered_mixed(output_dir, sample_list, type_remove):
    """Set aside the samples in `sample_list` so they drop out of the analysis.

    DESTRUCTIVE: for every directory under `output_dir` whose path ends with
    one of the intermediate-result folder names (GVCF_recal, Coverage, VCF,
    VCF_recal, Bam, GVCF), files whose name starts with one of the sample
    names are deleted. In directories whose path ends with the name of
    `output_dir` itself, the matching "fastq.gz" files are moved into
    `output_dir/type_remove`. Finally, if `output_dir/sample_list.txt`
    exists, `edit_sample_list` is called on it with `sample_list`.

    Parameters
    ----------
    output_dir : str
        Root folder of the analysis.
    sample_list : list of str
        Sample names, matched as file-name prefixes. Because this is a
        prefix match, "S1" also matches files of a sample named "S10".
    type_remove : str
        Name of the sub-folder that receives the fastq files
        (e.g. "Uncovered" or "Mixed").
    """
    output_dir = os.path.abspath(output_dir)
    group = os.path.basename(output_dir)
    uncovered_dir = os.path.join(output_dir, type_remove)  # Uncovered or Mixed
    check_create_dir(uncovered_dir)

    sample_list_file = os.path.join(output_dir, "sample_list.txt")

    # str.startswith accepts a tuple, so each file is tested once against all
    # samples. The previous nested loop acted once per matching sample and
    # raised FileNotFoundError when a file matched two entries.
    sample_prefixes = tuple(sample_list)

    # Every previously created folder except Table (kept for mixed samples)
    # and Species (kept for both uncovered and mixed samples).
    removable_dirs = ('GVCF_recal', 'Coverage', 'VCF', 'VCF_recal', 'Bam', 'GVCF')

    for root, _, files in os.walk(output_dir):
        if root.endswith(removable_dirs):
            for name in files:
                if name.startswith(sample_prefixes):
                    filename = os.path.join(root, name)
                    print(filename)
                    os.remove(filename)

        # Place the set-aside samples in a specific folder so they can be
        # analysed with different parameters.
        if root.endswith(group):
            for name in files:
                if name.startswith(sample_prefixes) and name.endswith("fastq.gz"):
                    filename = os.path.join(root, name)
                    dest_uncovered_path = os.path.join(uncovered_dir, name)
                    print(filename, dest_uncovered_path)
                    os.rename(filename, dest_uncovered_path)

    if os.path.isfile(sample_list_file):
        edit_sample_list(sample_list_file, sample_list)

In [178]:
# Destructive: deletes the mixed samples' files from the intermediate result folders
# and moves their fastq.gz files into <out_folder>/Mixed.
remove_low_covered_mixed(out_folder, list_mixed, "Mixed")