In [94]:
import os
import pandas as pd
import numpy as np
import re

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', -1)

In [95]:
vcf = "/home/laura/ANALYSIS/autosnippy_colonies/Core/core.vcf"
cov_folder = "/home/laura/ANALYSIS/autosnippy_colonies/Stats/Coverage"

In [96]:
def import_VCF4_core_to_compare(vcf_file, sep='\t'):
    header_lines = 0
    
    with open(vcf_file) as f:
        first_line = f.readline().strip()
        next_line = f.readline().strip()
        while next_line.startswith("##"):
            header_lines = header_lines + 1
            # logger.info(next_line)
            next_line = f.readline()

    if first_line.endswith('VCFv4.2'):

        # Use first line as header
        df = pd.read_csv(vcf_file, sep=sep, skiprows=[
                                header_lines], header=header_lines)

        df.POS = df.POS.apply(str)
        #df['POS'] = df['POS'].astype("string")
        #df['POS'] = pandas.Series(df['POS'], dtype="string")
        df['Position'] = df["#CHROM"] + "|" + df["REF"] + "|" + df["POS"] + "|" + df["ALT"]
        df = df.drop(['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT'], axis=1)
        df = df[['Position'] + [ col for col in df.columns if col not in ['Position']]]
        df["N"] = df.apply(lambda x: sum([i != 0 for i in x[1:]]), axis=1)
        df = df[['Position', 'N'] + [ col for col in df.columns if col not in ['Position', 'N']]]
        
        def extract_sample_name(row):
            count_list = [i != 0 for i in row[2:]]
            samples = np.array(df.columns[2:])
            # samples[np.array(count_list)] filter array with True False array
            return ((',').join(samples[np.array(count_list)]))
    
        df['Samples'] = df.apply(extract_sample_name, axis=1)
        df = df[['Position', 'N', 'Samples'] + [ col for col in df.columns if col not in ['Position', 'N', 'Samples']]]
        
    else:
        logger.info("This vcf file is not v4.2")
        sys.exit(1)

    return df

In [97]:
df = import_VCF4_core_to_compare(vcf)

In [98]:
df.head()

Unnamed: 0,Position,N,Samples,10082989-0-COL2,10082989-0-COL3,10082989-0-COL7,10105494-0-COL1,10105494-0-COL2,AL10105494COL0
0,MTB_anc|T|1701|C,6,"10082989-0-COL2,10082989-0-COL3,10082989-0-COL7,10105494-0-COL1,10105494-0-COL2,AL10105494COL0",1,1,1,1,1,1
1,MTB_anc|C|2532|T,6,"10082989-0-COL2,10082989-0-COL3,10082989-0-COL7,10105494-0-COL1,10105494-0-COL2,AL10105494COL0",1,1,1,1,1,1
2,MTB_anc|G|8040|A,6,"10082989-0-COL2,10082989-0-COL3,10082989-0-COL7,10105494-0-COL1,10105494-0-COL2,AL10105494COL0",1,1,1,1,1,1
3,MTB_anc|C|9143|T,6,"10082989-0-COL2,10082989-0-COL3,10082989-0-COL7,10105494-0-COL1,10105494-0-COL2,AL10105494COL0",1,1,1,1,1,1
4,MTB_anc|G|13460|A,6,"10082989-0-COL2,10082989-0-COL3,10082989-0-COL7,10105494-0-COL1,10105494-0-COL2,AL10105494COL0",1,1,1,1,1,1


In [99]:
def add_window_distance(vcf_df, window_size=10):
    """
    Add a column indicating the maximum number of SNPs in a windows of 10 or supplied distance
    """
    list_pos = vcf_df.POS.to_list()  # all positions
    set_pos = set(list_pos)  # to set for later comparing
    # max to iter over positions (independent from reference)
    max_pos = max(vcf_df.POS.to_list())

    all_list = list(range(1, max_pos + 1))  # create a list to slide one by one

    df_header = "window_" + str(window_size)

    vcf_df[df_header] = 1  # Create all 1 by default

    # Slide over windows
    for i in range(0, max_pos, 1):
        # This splits the list in windows of determined length
        window_pos = all_list[i:i+window_size]
        set_window_pos = set(window_pos)
        # How many known positions are in every window for later clasification
        num_conglomerate = set_pos & set_window_pos

        if len(num_conglomerate) > 1:
            for i in num_conglomerate:
                # Retrieve index with the known position
                index = vcf_df.index[vcf_df["POS"] == i][0]
                if vcf_df.loc[index, df_header] < len(num_conglomerate):
                    vcf_df.loc[index, df_header] = len(num_conglomerate)

In [100]:
def extract_close_snps(df, snps_in_10=1):
    # Calculate close SNPS/INDELS and remove those with 2 or more mutations in 10bp
    df['POS'] = df.apply(lambda x: x.Position.split('|')[2], axis=1)
    df['POS'] = df['POS'].astype(int)
    df = df.sort_values("POS")
    add_window_distance(df)
    return df_w[df_w.window_10 > snps_in_10].POS.tolist()

In [111]:
close_positions = extract_close_snps(df)

In [104]:
def coverage_to_df(input_file,min_coverage=10):
    sample_name = input_file.split("/")[-1].split(".")[0]
    min_cov_df = pd.DataFrame()
    coverage_list = []
    with open(input_file, 'r') as f:
            content = f.read()
            content_list = content.split('\n')
            while '' in content_list : content_list.remove('')
    coverage_list = [int(x.split("\t")[2]) for x in content_list]
    min_cov_df[sample_name] = coverage_list
    min_cov_df = min_cov_df[min_cov_df < min_coverage].dropna(how='all')
    
    return min_cov_df

def identify_uncovered(cov_folder, min_coverage=10, nocall_fr=0.5):
    cov_folder = os.path.abspath(cov_folder)
    len_files = set()
    #Create Position column and asign value
    cov_df = pd.DataFrame()
    
    for root, _, files in os.walk(cov_folder):
        if root == cov_folder:
            for name in files:
                if name.endswith(".cov"):
                    filename = os.path.join(root, name)
                    #import to dataframe if they have the same positios(same reference)
                    low_coverage_df = coverage_to_df(filename)
                    cov_df = cov_df.merge(low_coverage_df, how='outer', left_index=True, right_index=True)          
                    
                                   
    #Determine low covered positions in dataframe
    #Filter positions with values lower than min_cov, dro rows with all false and extract the indet to iterate
    df_any_uncovered = cov_df[cov_df < min_coverage].dropna(how='all')#.index.tolist()
    df_any_uncovered['N_uncovered'] = df_any_uncovered.count(axis=1)
    df_any_uncovered['Position'] = df_any_uncovered.index + 1
    
    n_samples = len(df_any_uncovered.columns) - 2
    
    df_half_uncovered_list = df_any_uncovered['Position'][df_any_uncovered.N_uncovered / n_samples >= nocall_fr].tolist()
    
    return df_half_uncovered_list

In [103]:
cov_df = coverage_to_df("/home/laura/ANALYSIS/autosnippy_colonies/Stats/Coverage/AL10105494COL0.cov")

In [115]:
cov_df.head()

Unnamed: 0,AL10105494COL0
17857,9.0
17858,9.0
17859,9.0
17860,9.0
17861,9.0


In [106]:
uncovered = identify_uncovered(cov_folder)

In [107]:
uncovered

[17858, 17859, 17860, 17861, 17862]

In [110]:
df.shape

(1023, 10)

In [112]:
to_remove = close_positions + uncovered

In [113]:
len(to_remove)

172968

In [123]:
def remove_position_from_compare(df, position_list):
    df['POS'] = df.apply(lambda x: x.Position.split('|')[2], axis=1)
    df['POS'] = df['POS'].astype(int)
    df = df.sort_values("POS")
    df = df[~df['POS'].isin(position_list)]
    df = df.drop(['POS'], axis=1)
    return df

In [124]:
rmdf = remove_position_from_compare(df, to_remove)

In [125]:
rmdf.shape

(875, 9)

In [129]:
"helllo {}".format((",").join([str(x) for x in close_positions]))

'helllo 27463,27469,206481,206484,233358,233364,467504,467508,467516,672330,672332,947429,947430,1213024,1213025,1341040,1341044,1390527,1390528,1490905,1490911,1634580,1634581,1634586,1634589,1634592,1634609,1634610,1636733,1636742,1789675,1789678,1792777,1792778,2282376,2282377,2338990,2338994,2339719,2339726,2626189,2626191,2637327,2637330,2637475,2637476,2637481,2637483,2637520,2637523,2970017,2970018,2970019,2977169,2977173,2977176,2977177,2977179,2977180,3006896,3006898,3119737,3119738,3119740,3119741,3122943,3122949,3122951,3133054,3133055,3247865,3247874,3379708,3379712,3379718,3379726,3379730,3379732,3379735,3379736,3379742,3379751,3379757,3379763,3418328,3418330,3501666,3501668,3737830,3737836,3737839,3738689,3738690,3738692,3738694,3739236,3739238,3750177,3750178,3750185,3750205,3750209,3750210,3750417,3750421,3750805,3750808,3750812,3930526,3930532,4264218,4264219'

In [128]:
close_positions

[27463,
 27469,
 206481,
 206484,
 233358,
 233364,
 467504,
 467508,
 467516,
 672330,
 672332,
 947429,
 947430,
 1213024,
 1213025,
 1341040,
 1341044,
 1390527,
 1390528,
 1490905,
 1490911,
 1634580,
 1634581,
 1634586,
 1634589,
 1634592,
 1634609,
 1634610,
 1636733,
 1636742,
 1789675,
 1789678,
 1792777,
 1792778,
 2282376,
 2282377,
 2338990,
 2338994,
 2339719,
 2339726,
 2626189,
 2626191,
 2637327,
 2637330,
 2637475,
 2637476,
 2637481,
 2637483,
 2637520,
 2637523,
 2970017,
 2970018,
 2970019,
 2977169,
 2977173,
 2977176,
 2977177,
 2977179,
 2977180,
 3006896,
 3006898,
 3119737,
 3119738,
 3119740,
 3119741,
 3122943,
 3122949,
 3122951,
 3133054,
 3133055,
 3247865,
 3247874,
 3379708,
 3379712,
 3379718,
 3379726,
 3379730,
 3379732,
 3379735,
 3379736,
 3379742,
 3379751,
 3379757,
 3379763,
 3418328,
 3418330,
 3501666,
 3501668,
 3737830,
 3737836,
 3737839,
 3738689,
 3738690,
 3738692,
 3738694,
 3739236,
 3739238,
 3750177,
 3750178,
 3750185,
 3750205,
 3750