In [1]:
import os
import re

import numpy as np
import pandas as pd
# Show every column when a DataFrame is displayed instead of truncating wide frames.
pd.set_option('display.max_columns', None)

In [2]:
def add_bed_info(bed_df, position):
    """
    Return the description of the first BED interval containing a position.

    Parameters
    ----------
    bed_df : table exposing ``start``, ``end`` and ``description`` columns.
    position : coordinate to look up; both interval bounds are inclusive.

    Returns
    -------
    The ``description`` value of the first row (in table order) for which
    ``start <= position <= end``, or ``False`` when no row contains the
    position.

    credits: https://stackoverflow.com/questions/6053974/python-efficiently-check-if-integer-is-within-many-ranges
    """
    # Build the interval-membership mask once and reuse it for both the
    # "is there any hit?" test and the lookup, instead of scanning the
    # table twice on every call.
    containing = (bed_df.start <= position) & (bed_df.end >= position)
    if not containing.any():
        return False
    return bed_df.description[containing].values[0]

In [3]:
def annotate_bed_s(vcf_annot, *bed_files):
    """
    Add one annotation column to ``vcf_annot`` for each BED file given.

    The column header is taken from the file path: the last "/"-separated
    component, cut at its first ".". Each BED file is loaded with
    ``bed_to_df`` and every value of the 'POS' column is looked up in it
    with ``add_bed_info``. ``vcf_annot`` is modified in place and nothing
    is returned.
    """
    print("ANNOTATING BED(S): ", bed_files)

    # Derive every column header up front, before any file is loaded.
    column_names = []
    for bed_path in bed_files:
        file_name = bed_path.rsplit("/", 1)[-1]
        column_names.append(file_name.partition(".")[0])

    for column_name, bed_path in zip(column_names, bed_files):
        bed_table = bed_to_df(bed_path)

        def describe_position(position):
            # Look the position up in the BED table of the current file.
            return add_bed_info(bed_table, position)

        vcf_annot[column_name] = vcf_annot['POS'].apply(describe_position)

In [4]:
# Location of the high-confidence resistance table (judging by its file name).
# NOTE(review): absolute path on one specific machine; presumably meant to be
# parsed with file_to_dict -- confirm, and consider making it configurable.
file_F = "/home/laura/DEVELOP/SNPTB/annotation/resistance/dict_position_resistance_high_conf.txt"

In [5]:
def file_to_dict(file_format):
    """
    Parse a ``key:value1,value2,...`` text file into a dictionary.

    Parameters
    ----------
    file_format : path of the text file to read.

    Returns
    -------
    dict mapping the integer before the first ":" of each line to the list
    obtained by splitting the second ":"-separated field on ",".

    Blank or whitespace-only lines (for example a trailing empty line at the
    end of the file) are skipped rather than crashing on ``int('')``.
    Lines without a ":" or with a non-integer key still raise
    (IndexError / ValueError) so malformed input is not silently dropped.
    """
    formatted_dict = {}
    with open(file_format, 'r') as f:
        for line in f:
            if not line.strip():
                continue
            # Split once and reuse both fields.
            fields = line.split(":")
            formatted_dict[int(fields[0])] = fields[1].strip().split(",")
    return formatted_dict

In [None]:
def add_resistance_snp(vcf_df, dict_high_confidence=None, dict_resistance_position=None):
    """
    Flag known resistance SNPs in a VCF table and summarise them.

    Parameters
    ----------
    vcf_df : DataFrame with 'POS' and 'ALT' columns. A 'Resistance' column
        is added (or overwritten) in place.
    dict_high_confidence : dict keyed by integer position. For each value,
        element 0 is compared with 'yes' and elements 1.. are the ALT
        alleles considered. When None, the module-level ``dict_high_conf``
        is used.
    dict_resistance_position : dict keyed by integer position. For each
        value, element 0 is the resistance name and elements 1.. are the
        ALT alleles that confer it. When None, the module-level
        ``dict_res_v1`` is used.

    Returns
    -------
    A comma-joined string of the hits, each hit contributing
    ``name, position, alt[, "*"], "\\t"``, or None when no resistance SNP
    was found. A trailing "*" marks a hit that is not high confidence.

    Notes
    -----
    * The default dictionaries are resolved when the function is called,
      not when it is defined, so defining the function no longer requires
      those globals to exist already.
    * The "\\t" separator is appended only after an actual hit. Previously
      it was appended for every position present in the resistance dict,
      so four or more non-matching positions were reported as resistance.
    * The 'Resistance' column is built as a list and assigned once, which
      avoids writing strings cell by cell into a float (NaN) column.
    """
    if dict_high_confidence is None:
        dict_high_confidence = dict_high_conf
    if dict_resistance_position is None:
        dict_resistance_position = dict_res_v1

    list_resistance = []
    resistance_column = []

    for raw_position, raw_alt in zip(vcf_df['POS'], vcf_df['ALT']):
        position = int(raw_position)
        alt_nucleotide = str(raw_alt)
        label = np.nan  # stays NaN unless this row is a resistance SNP

        if (position in dict_resistance_position
                and alt_nucleotide in dict_resistance_position[position][1:]):
            resistance = dict_resistance_position[position][0]  # resistance name
            list_resistance.append(resistance)
            list_resistance.append(str(position))
            list_resistance.append(alt_nucleotide)

            # High confidence needs: known position, listed allele, 'yes' flag.
            is_high_confidence = (
                position in dict_high_confidence
                and alt_nucleotide in dict_high_confidence[position][1:]
                and dict_high_confidence[position][0] == 'yes'
            )
            if is_high_confidence:
                label = resistance
            else:
                list_resistance.append("*")
                label = resistance + "*"

            # Separator between hits; only emitted for real hits.
            list_resistance.append("\t")

        resistance_column.append(label)

    vcf_df['Resistance'] = resistance_column

    if list_resistance:
        print("This strain has resistance positions:\n:" + ",".join(list_resistance))
        return ",".join(list_resistance)

    print("No resistance were found\n")
    return None