In [1]:
%load_ext autoreload
%autoreload

import pandas as pd
import numpy as np
import os
import sys
import re
import matplotlib.pyplot as plt
import datetime
import seaborn as sns
from tabulate import tabulate

#%matplotlib inline
from mpl_toolkits.mplot3d import Axes3D
#plt.rcParams['figure.figsize'] = (16, 9)
#plt.style.use('ggplot')


from matplotlib_venn import venn2
from matplotlib_venn import venn3
#from lineage import get_lineage_coll, get_lineage_snp
#from resistance import get_resistance_snp
#from coinfection_functions import import_VCF42_to_pandas, filter_repeats, add_snp_distance, \
#                            scatter_vcf_pos, distplot_sns, add_window_distance

In [2]:
#dict_vcf = {}
#for vcf_file, df_name in zip(vcf_files, sample_list):
#    dict_vcf[df_name] = pd.read_csv(vcf_file, sep="\t", header=0)

In [3]:
def calculate_ALT_AD(row):
    if row.len_AD > 2:
        split_AD = row.AD.split(",")[1:]
        max_AD = max(split_AD)
        #max_index = split_AD.index(max(split_AD))
        return max_AD
    else:
        ALT_AD = row.AD.split(",")[1]
        return ALT_AD

In [4]:
def import_annot_to_pandas(vcf_file, sep='\t'):
    header_lines = 0
    with open(vcf_file) as f:
        first_line = f.readline().strip()
        next_line = f.readline().strip()
        while next_line.startswith("##"):
            header_lines = header_lines + 1
            #print(next_line)
            next_line = f.readline()
    
    if first_line.endswith('VCFv4.2'):
        
        #Use first line as header
        dataframe = pd.read_csv(vcf_file, sep=sep, skiprows=[header_lines], header=header_lines)
        sample = dataframe.columns[-1]
        dataframe.rename(columns={sample:'sample'}, inplace=True)
        
        ann_head = ["Allele","Annotation","Annotation_Impact","Gene_Name",
                    "Gene_ID","Feature_Type","Feature_ID","Transcript_BioType",
                    "Rank","HGVS.c","HGVS.p","cDNA.pos / cDNA.length",
                    "CDS.pos / CDS.length","AA.pos / AA.length","Distance",
                    "ERRORS / WARNINGS / INFO"]
        
        for index, data_row in dataframe.iterrows():
            info_fields = re.findall(r';*([a-zA-Z]{1,20})=', data_row.INFO)
            info_values = re.findall(r'-?\d+\.?\d*e?[+-]?\d{0,2}', data_row.INFO)
            ann_values_re = re.search(r'ANN=(.*)\|(.*)', data_row.INFO)
            all_ann_values = ann_values_re.group(1)
            ann_values = all_ann_values.split(",")[0].split("|")[:16]
            
            format_fields = data_row['FORMAT'].split(":")
            format_values = data_row['sample'].split(":")
                                    
            for ifield, ivalue in zip(info_fields,info_values):
                dataframe.loc[index,ifield] = ivalue
                
            for ffield, fvalue in zip(format_fields,format_values):
                dataframe.loc[index,ffield] = fvalue
            
            dataframe.loc[index,'ANN'] = all_ann_values
            
            for ann_field,ann_value in zip(ann_head, ann_values):
                dataframe.loc[index,ann_field] = ann_value
            
        dataframe.rename(columns={'AF':'af'}, inplace=True)
        
        dataframe['len_AD'] = dataframe['AD'].str.split(",").str.len()
        dataframe['REF_AD'] = dataframe['AD'].str.split(",").str[0]
        #dataframe['ALT_AD'] = dataframe['AD'].str.split(",").str[1]
        dataframe['ALT_AD'] = dataframe.apply(calculate_ALT_AD, axis=1)
        dataframe[['gt0','gt1']] = dataframe['GT'].str.split(r'[/|\|]', expand=True)
        
        dataframe['HGVS.c'] = dataframe['HGVS.c'].str.split(".").str[-1]
        dataframe['HGVS.p'] = dataframe['HGVS.p'].str.split(".").str[-1]
        dataframe['Gene length'] = dataframe['CDS.pos / CDS.length'].str.split("/").str[-1]
        dataframe['AA length'] = dataframe['AA.pos / AA.length'].str.split("/").str[-1]
                
        to_float = ['QUAL', 'AC', 'af', 'AN', 'BaseQRankSum', 'DP', 'ExcessHet', 'FS',
       'MLEAC', 'MLEAF', 'MQ', 'MQRankSum', 'QD', 'ReadPosRankSum', 'SOR','GQ','ALT_AD', 'REF_AD']
        
        to_int = ['POS', 'len_AD', 'gt0', 'gt1']
        
        to_str = ['#CHROM','REF','ALT', 'FILTER']
        
        for column in dataframe.columns:
            if column in to_float:
                dataframe[column] = dataframe[column].astype(float)
                
        for column in dataframe.columns:
            if column in to_int:
                dataframe[column] = dataframe[column].astype(int)
                
        for column in dataframe.columns:
            if column in to_str:
                dataframe[column] = dataframe[column].astype(str)
                
        dataframe['dp'] = (dataframe['REF_AD'] + dataframe['ALT_AD'])
        dataframe['aF'] = dataframe['REF_AD']/dataframe['dp']
        dataframe['AF'] = dataframe['ALT_AD']/dataframe['dp']
        

                
    else:
        print("This vcf file is not v4.2")
        sys.exit(1)
           
    return dataframe

In [5]:
pd.set_option('display.max_columns', None)

In [6]:
sample_list = []
vcf_files = []
directory = "/home/laura/ANALYSIS/SNPTB/190624_Anotation_test/Annotation__"

for file in os.listdir(directory):
    if file.endswith(".annot"):
        name = file.split(".")[0]
        sample_list.append(name)
        file_path = os.path.join(directory, file)
        vcf_files.append(file_path)
        
sample_list.sort()
vcf_files.sort()

print(sample_list)
print(vcf_files)

['SM2', 'test_SM3_anc_ud0_interval']
['/home/laura/ANALYSIS/SNPTB/190624_Anotation_test/Annotation__/SM2.annot', '/home/laura/ANALYSIS/SNPTB/190624_Anotation_test/Annotation__/test_SM3_anc_ud0_interval.annot']


In [7]:
"bra" in "ebrart"

True

# Import vcf as dataframe

In [8]:
test_info = "AC=2;AF=1.00;AN=2;DP=134;ExcessHet=3.0103;FS=0.000;MQ=60.00;QD=34.97;SOR=1.076;ANN=C|missense_variant|MODERATE|recF|Rv0003|transcript|CCP42725.1|protein_coding|1/1|c.734T>C|p.Ile245Thr|734/1158|734/1158|245/385||WARNING_TRANSCRIPT_NO_START_CODON,C|upstream_gene_variant|MODIFIER|Rv0004|Rv0004|transcript|CCP42726.1|protein_coding||c.-421T>C|||||421|,C|upstream_gene_variant|MODIFIER|gyrB|Rv0005|transcript|CCP42727.1|protein_coding||c.-1227T>C|||||1227|WARNING_TRANSCRIPT_NO_START_CODON,C|upstream_gene_variant|MODIFIER|gyrA|Rv0006|transcript|CCP42728.1|protein_coding||c.-3289T>C|||||3289|,C|downstream_gene_variant|MODIFIER|dnaA|Rv0001|transcript|CCP42723.1|protein_coding||c.*2489T>C|||||2489|,C|downstream_gene_variant|MODIFIER|dnaN|Rv0002|transcript|CCP42724.1|protein_coding||c.*753T>C|||||753|"

In [9]:
ann_values_re = re.search(r'ANN=(.*)\|', test_info)

In [10]:
ANN_ = "Allele | Annotation | Annotation_Impact | Gene_Name | Gene_ID | Feature_Type | Feature_ID | Transcript_BioType | Rank | HGVS.c | HGVS.p | cDNA.pos / cDNA.length | CDS.pos / CDS.length | AA.pos / AA.length | Distance | ERRORS / WARNINGS / INFO"

In [11]:
ANN_header = ANN_.replace(" | ","\",\"")

In [12]:
ANN_header



In [13]:
ANN_head = ["Allele","Annotation","Annotation_Impact","Gene_Name","Gene_ID","Feature_Type","Feature_ID","Transcript_BioType","Rank","HGVS.c","HGVS.p","cDNA.pos / cDNA.length","CDS.pos / CDS.length","AA.pos / AA.length","Distance","ERRORS / WARNINGS / INFO"]

In [14]:
len(ANN_head)

16

In [15]:
ANN_values = ann_values_re.group(1).split("|")

In [16]:
ANN_values_16 = ANN_values[:16]

In [17]:
for head, value in zip(ANN_head, ANN_values_16):
    print(head, ":\t", value)

Allele :	 C
Annotation :	 missense_variant
Annotation_Impact :	 MODERATE
Gene_Name :	 recF
Gene_ID :	 Rv0003
Feature_Type :	 transcript
Feature_ID :	 CCP42725.1
Transcript_BioType :	 protein_coding
Rank :	 1/1
HGVS.c :	 c.734T>C
HGVS.p :	 p.Ile245Thr
cDNA.pos / cDNA.length :	 734/1158
CDS.pos / CDS.length :	 734/1158
AA.pos / AA.length :	 245/385
Distance :	 


In [18]:
ann_values_re.group(1)



In [19]:
dict_vcf = {}
for vcf_file, df_name in zip(vcf_files, sample_list):
    dict_vcf[df_name] = import_annot_to_pandas(vcf_file)

In [20]:
dict_vcf['test_SM3_anc_ud0_interval'].columns

Index(['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT',
       'sample', 'AC', 'af', 'AN', 'BaseQRankSum', 'DP', 'ExcessHet', 'FS',
       'InbreedingCoeff', 'MQ', 'MQRankSum', 'QD', 'ReadPosRankSum', 'SOR',
       'ANN', 'GT', 'AD', 'GQ', 'PL', 'Allele', 'Annotation',
       'Annotation_Impact', 'Gene_Name', 'Gene_ID', 'Feature_Type',
       'Feature_ID', 'Transcript_BioType', 'Rank', 'HGVS.c', 'HGVS.p',
       'cDNA.pos / cDNA.length', 'CDS.pos / CDS.length', 'AA.pos / AA.length',
       'REF_AD', 'ALT_AD', 'gt0', 'gt1', 'Gene length', 'AA length', 'dp',
       'aF', 'AF'],
      dtype='object')

In [21]:
test3 = dict_vcf['test_SM3_anc_ud0_interval']
test2 = dict_vcf['SM2']

In [22]:
def add_lineage_snp(vcf_df):
    dict_lineage_position = {
        '615938' : ['A', '1'],
        '4404247' : ['A', '1.1'],
        '3021283' : ['A', '1.1.1'],
        '3216553' : ['A', '1.1.1.1'],
        '2622402' : ['A', '1.1.2'],
        '1491275' : ['A', '1.1.3'],
        '3479545' : ['A', '1.2.1'],
        '3470377' : ['T', '1.2.2'],
        '497491' : ['A', '2'],
        '1881090' : ['T', '2.1'],
        '2505085' : ['A', '2.2'],
        '797736' : ['T', '2.2.1'],
        '4248115' : ['T', '2.2.1.1'],
        '3836274' : ['A', '2.2.1.2'],
        '346693' : ['T', '2.2.2'],
        '3273107' : ['A', '3'],
        '1084911' : ['A', '3.1.1'],
        '3722702' : ['C', '3.1.2'],
        '1237818' : ['G', '3.1.2.1'],
        '2874344' : ['A', '3.1.2.2'],
        '931123' : ['C', '4**'],
        '62657' : ['A', '4.1'],
        '514245' : ['T', '4.1.1'],
        '1850119' : ['T', '4.1.1.1'],
        '541048' : ['G', '4.1.1.2'],
        '4229087' : ['T', '4.1.1.3'],
        '891756' : ['G', '4.1.2'],
        '107794' : ['T', '4.1.2.1'],
        '2411730' : ['C', '4.2'],
        '783601' : ['C', '4.2.1'],
        '1487796' : ['A', '4.2.2'],
        '1455780' : ['C', '4.2.2.1'],
        '764995' : ['G', '4.3'],
        '615614' : ['A', '4.3.1'],
        '4316114' : ['A', '4.3.2'],
        '3388166' : ['G', '4.3.2.1'],
        '403364' : ['A', '4.3.3'],
        '3977226' : ['A', '4.3.4'],
        '4398141' : ['A', '4.3.4.1'],
        '1132368' : ['T', '4.3.4.2'],
        '1502120' : ['A', '4.3.4.2.1'],
        '4307886' : ['A', '4.4'],
        '4151558' : ['A', '4.4.1'],
        '355181' : ['A', '4.4.1.1'],
        '2694560' : ['C', '4.4.1.2'],
        '4246508' : ['A', '4.4.2'],
        '1719757' : ['T', '4.5'],
        '3466426' : ['A', '4.6'],
        '4260268' : ['C', '4.6.1'],
        '874787' : ['A', '4.6.1.1'],
        '1501468' : ['C', '4.6.1.2'],
        '4125058' : ['C', '4.6.2'],
        '3570528' : ['G', '4.6.2.1'],
        '2875883' : ['T', '4.6.2.2'],
        '4249732' : ['G', '4.7'],
        '3836739' : ['A', '4.8'],
        '1759252' : ['T', '4.9**'],
        '1799921' : ['A', '5'],
        '1816587' : ['G', '6'],
        '1137518' : ['A', '7'],
        '2831482' : ['G', 'BOV'],
        '1882180' : ['T', 'BOV_AFRI']
                }
    list_lineage = []
    
    for index, _ in vcf_df.iterrows():
        position = str(vcf_df.loc[index,'POS'])
        if position in dict_lineage_position.keys():
            if str(vcf_df.loc[index,'ALT']) == dict_lineage_position[str(position)][0]:
                lineage = dict_lineage_position[str(position)][1]
                vcf_df.loc[index,'Lineage'] = lineage
                list_lineage.append(lineage)
                
    if len(list_lineage) > 0:
        list_lineage.sort(reverse=True)
        asterix = ""
        for sublineage_n in range(len(list_lineage)):
            if sublineage_n < (len(list_lineage) - 1):
                if list_lineage[sublineage_n].startswith(list_lineage[sublineage_n + 1]):
                    asterix = asterix + "*"
        final_lineage = list_lineage[0] + " " + asterix
        print("This strain has lineage position(s):\n: " + " ".join([list_lineage[0],asterix]))
        return final_lineage
    else:
        print("No lineage were found\n")
        

In [23]:
add_lineage_snp(test3)


This strain has lineage position(s):
: 4.1.2.1 **


'4.1.2.1 **'

In [24]:
add_lineage_snp(test2)

This strain has lineage position(s):
: 4.3.4.2 **


'4.3.4.2 **'

In [25]:
test2[test2.Lineage.notnull()]

Unnamed: 0,#CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,sample,AC,af,AN,BaseQRankSum,DP,ExcessHet,FS,InbreedingCoeff,MQ,MQRankSum,QD,ReadPosRankSum,SOR,ANN,GT,AD,GQ,PL,Allele,Annotation,Annotation_Impact,Gene_Name,Gene_ID,Feature_Type,Feature_ID,Transcript_BioType,Rank,HGVS.c,HGVS.p,cDNA.pos / cDNA.length,CDS.pos / CDS.length,AA.pos / AA.length,Distance,PGT,PID,PS,ERRORS / WARNINGS / INFO,len_AD,REF_AD,ALT_AD,gt0,gt1,Gene length,AA length,dp,aF,AF,Lineage
157,Chromosome,764995,.,C,G,265366.45,PASS,AC=2;AF=1.00;AN=2;BaseQRankSum=0.854;DP=167;Ex...,GT:AD:DP:GQ:PL,"1/1:0,167:167:99:5843,479,0",2.0,1.0,2.0,0.854,167.0,-0.0,0.0,0.8895,60.0,0.0,33.13,0.499,0.731,G|synonymous_variant|LOW|rpoC|Rv0668|transcrip...,1/1,167,99.0,58434790,G,synonymous_variant,LOW,rpoC,Rv0668,transcript,CCP43411.1,protein_coding,1/1,1626C>G,Ala542Ala,1626/3951,1626/3951,542/1316,,,,,,2,0.0,167.0,1,1,3951,1316,167.0,0.0,1.0,4.3
230,Chromosome,1132368,.,C,T,38611.16,PASS,AC=2;AF=1.00;AN=2;BaseQRankSum=-1.796e+00;DP=1...,GT:AD:DP:GQ:PL,"1/1:0,180:180:99:6013,540,0",2.0,1.0,2.0,-1.796,180.0,0.0,3.204,0.7372,60.0,0.0,24.69,0.423,0.457,T|synonymous_variant|LOW|pks16|Rv1013|transcri...,1/1,180,99.0,60135400,T,synonymous_variant,LOW,pks16,Rv1013,transcript,CCP43763.1,protein_coding,1/1,744C>T,Thr248Thr,744/1635,744/1635,248/544,,,,,,2,0.0,180.0,1,1,1635,544,180.0,0.0,1.0,4.3.4.2
740,Chromosome,3977226,.,G,A,31072.05,PASS,AC=2;AF=1.00;AN=2;BaseQRankSum=0.139;DP=110;Ex...,GT:AD:DP:GQ:PL,"1/1:0,110:110:99:3657,329,0",2.0,1.0,2.0,0.139,110.0,0.0,0.642,0.7555,60.0,0.0,24.37,0.057,0.808,A|synonymous_variant|LOW|Rv3538|Rv3538|transcr...,1/1,110,99.0,36573290,A,synonymous_variant,LOW,Rv3538,Rv3538,transcript,CCP46360.1,protein_coding,1/1,165G>A,Leu55Leu,165/861,165/861,55/286,,,,,,2,0.0,110.0,1,1,861,286,110.0,0.0,1.0,4.3.4


In [26]:
def add_resistance_snp(vcf_df):
    dict_resistance_position = {6575: ['T', 'fluoroquinolones (FQ)'],
                                6620: ['C', 'fluoroquinolones (FQ)', 'A'],
                                6621: ['C', 'fluoroquinolones (FQ)'],
                                6734: ['G', 'fluoroquinolones (FQ)'],
                                6735: ['C', 'fluoroquinolones (FQ)'],
                                6736: ['G', 'fluoroquinolones (FQ)'],
                                6737: ['C', 'fluoroquinolones (FQ)'],
                                6738: ['A', 'fluoroquinolones (FQ)'],
                                6741: ['T', 'fluoroquinolones (FQ)'],
                                6742: ['T', 'fluoroquinolones (FQ)'],
                                6749: ['A', 'fluoroquinolones (FQ)'],
                                6750: ['T', 'fluoroquinolones (FQ)'],
                                7563: ['T', 'fluoroquinolones (FQ)'],
                                7564: ['C', 'fluoroquinolones (FQ)'],
                                7566: ['A', 'fluoroquinolones (FQ)'],
                                7570: ['T', 'fluoroquinolones (FQ)'],
                                7572: ['C', 'fluoroquinolones (FQ)'],
                                7581: ['C', 'fluoroquinolones (FQ)', 'A', 'T'],
                                7582: ['G', 'fluoroquinolones (FQ)', 'C', 'T'],
                                575729: ['T', 'ethionamide (ETH)'],
                                576164: ['T', 'ethionamide (ETH)'],
                                576242: ['T', 'ethionamide (ETH)'],
                                576338: ['T', 'ethionamide (ETH)'],
                                576414: ['A', 'ethionamide (ETH)'],
                                576429: ['C', 'ethionamide (ETH)'],
                                760314: ['T', 'rifampicin (RMP)'],
                                761004: ['G', 'rifampicin (RMP)'],
                                761093: ['C', 'rifampicin (RMP)'],
                                761095: ['C', 'rifampicin (RMP)', 'G'],
                                761098: ['T', 'rifampicin (RMP)', 'C'],
                                761100: ['A', 'rifampicin (RMP)'],
                                761101: ['C', 'rifampicin (RMP)', 'T'],
                                761108: ['T', 'rifampicin (RMP)'],
                                761109: ['T', 'rifampicin (RMP)'],
                                761110: ['G', 'rifampicin (RMP)', 'T'],
                                761111: ['G', 'rifampicin (RMP)'],
                                761120: ['C', 'rifampicin (RMP)', 'G'],
                                761128: ['T', 'rifampicin (RMP)', 'G'],
                                761139: ['A', 'rifampicin (RMP)', 'G', 'T'],
                                761140: ['C', 'rifampicin (RMP)', 'G', 'T'],
                                761141: ['A', 'rifampicin (RMP)'],
                                761154: ['G', 'rifampicin (RMP)'],
                                761155: ['G', 'rifampicin (RMP)', 'T'],
                                761161: ['C', 'rifampicin (RMP)'],
                                761277: ['T', 'rifampicin (RMP)'],
                                781687: ['G', 'streptomycin (SM)'],
                                781821: ['C', 'streptomycin (SM)'],
                                781822: ['G', 'streptomycin (SM)'],
                                801268: ['C', 'linezolid (LZD)'],
                                1472337: ['T', 'streptomycin (SM)'],
                                1472358: ['T', 'streptomycin (SM)'],
                                1472359: ['C', 'streptomycin (SM)'],
                                1472362: ['T', 'streptomycin (SM)'],
                                1472750: ['A', 'streptomycin (SM)'],
                                1472751: ['G', 'streptomycin (SM)'],
                                1472752: ['T', 'streptomycin (SM)'],
                                1473246: ['G', 'amikacin (AMK) kanamycin (KAN) capreomycin (CPR)'],
                                1473247: ['T', 'amikacin (AMK) kanamycin (KAN) capreomycin (CPR)'],
                                1473329: ['T', 'amikacin (AMK) kanamycin (KAN) capreomycin (CPR)'],
                                1475956: ['T', 'linezolid (LZD)'],
                                1476471: ['T', 'linezolid (LZD)'],
                                1673423: ['T', 'isoniazid (INH)'],
                                1673424: ['G', 'isoniazid (INH)'],
                                1673425: ['T', 'isoniazid (INH)'],
                                1673432: ['A', 'isoniazid (INH)', 'C'],
                                1674481: ['G', 'isoniazid (INH) ethionamide (ETH)'],
                                1674782: ['C', 'isoniazid (INH) ethionamide (ETH)'],
                                1833909: ['C', 'pyrazinamide (PZA)'],
                                1834325: ['A', 'pyrazinamide (PZA)'],
                                1834855: ['C', 'pyrazinamide (PZA)'],
                                1917946: ['T', 'capreomycin (CPR)'],
                                1917979: ['T', 'capreomycin (CPR)'],
                                1917991: ['T', 'capreomycin (CPR)'],
                                1918003: ['T', 'capreomycin (CPR)'],
                                1918139: ['A', 'capreomycin (CPR)'],
                                1918144: ['G', 'capreomycin (CPR)'],
                                1918211: ['A', 'capreomycin (CPR)'],
                                1918292: ['C', 'capreomycin (CPR)'],
                                1918322: ['A', 'capreomycin (CPR)'],
                                1918388: ['C', 'capreomycin (CPR)'],
                                1918487: ['T', 'capreomycin (CPR)'],
                                1918489: ['A', 'capreomycin (CPR)', 'T'],
                                1918494: ['G', 'capreomycin (CPR)'],
                                1918651: ['A', 'capreomycin (CPR)'],
                                2102240: ['T', 'isoniazid (INH) ethionamide (ETH)'],
                                2102715: ['C', 'isoniazid (INH) ethionamide (ETH)'],
                                2155167: ['C', 'isoniazid (INH)', 'T'],
                                2155168: ['G', 'isoniazid (INH)', 'T', 'A'],
                                2155169: ['C', 'isoniazid (INH)'],
                                2155206: ['C', 'isoniazid (INH)'],
                                2155212: ['G', 'isoniazid (INH)'],
                                2155214: ['C', 'isoniazid (INH)'],
                                2155222: ['A', 'isoniazid (INH)'],
                                2155289: ['G', 'isoniazid (INH)'],
                                2155699: ['C', 'isoniazid (INH)'],
                                2288683: ['G', 'pyrazinamide (PZA)'],
                                2288697: ['G', 'pyrazinamide (PZA)'],
                                2288703: ['G', 'pyrazinamide (PZA)', 'C'],
                                2288704: ['A', 'pyrazinamide (PZA)'],
                                2288718: ['G', 'pyrazinamide (PZA)'],
                                2288719: ['C', 'pyrazinamide (PZA)'],
                                2288727: ['G', 'pyrazinamide (PZA)'],
                                2288730: ['A', 'pyrazinamide (PZA)'],
                                2288740: ['G', 'pyrazinamide (PZA)'],
                                2288752: ['G', 'pyrazinamide (PZA)'],
                                2288754: ['G', 'pyrazinamide (PZA)'],
                                2288757: ['T', 'pyrazinamide (PZA)'],
                                2288761: ['G', 'pyrazinamide (PZA)'],
                                2288764: ['G', 'pyrazinamide (PZA)'],
                                2288766: ['C', 'pyrazinamide (PZA)'],
                                2288772: ['G', 'pyrazinamide (PZA)', 'C'],
                                2288778: ['C', 'pyrazinamide (PZA)'],
                                2288779: ['T', 'pyrazinamide (PZA)'],
                                2288782: ['C', 'pyrazinamide (PZA)'],
                                2288805: ['T', 'pyrazinamide (PZA)', 'A'],
                                2288806: ['G', 'pyrazinamide (PZA)', 'T'],
                                2288817: ['T', 'pyrazinamide (PZA)', 'A'],
                                2288818: ['C', 'pyrazinamide (PZA)'],
                                2288820: ['G', 'pyrazinamide (PZA)'],
                                2288821: ['A', 'pyrazinamide (PZA)'],
                                2288823: ['G', 'pyrazinamide (PZA)'],
                                2288826: ['C', 'pyrazinamide (PZA)'],
                                2288827: ['G', 'pyrazinamide (PZA)'],
                                2288828: ['C', 'pyrazinamide (PZA)'],
                                2288830: ['G', 'pyrazinamide (PZA)'],
                                2288832: ['C', 'pyrazinamide (PZA)', 'G'],
                                2288833: ['C', 'pyrazinamide (PZA)'],
                                2288836: ['T', 'pyrazinamide (PZA)', 'A'],
                                2288838: ['T', 'pyrazinamide (PZA)'],
                                2288839: ['G', 'pyrazinamide (PZA)'],
                                2288841: ['A', 'pyrazinamide (PZA)'],
                                2288844: ['G', 'pyrazinamide (PZA)'],
                                2288847: ['G', 'pyrazinamide (PZA)', 'T'],
                                2288848: ['A', 'pyrazinamide (PZA)', 'T'],
                                2288853: ['G', 'pyrazinamide (PZA)', 'C'],
                                2288857: ['A', 'pyrazinamide (PZA)'],
                                2288859: ['C', 'pyrazinamide (PZA)'],
                                2288868: ['C', 'pyrazinamide (PZA)'],
                                2288869: ['A', 'pyrazinamide (PZA)'],
                                2288874: ['G', 'pyrazinamide (PZA)'],
                                2288880: ['G', 'pyrazinamide (PZA)'],
                                2288883: ['C', 'pyrazinamide (PZA)', 'T', 'G'],
                                2288885: ['T', 'pyrazinamide (PZA)'],
                                2288886: ['G', 'pyrazinamide (PZA)', 'T'],
                                2288887: ['G', 'pyrazinamide (PZA)', 'C'],
                                2288895: ['C', 'pyrazinamide (PZA)', 'G'],
                                2288902: ['G', 'pyrazinamide (PZA)'],
                                2288920: ['T', 'pyrazinamide (PZA)', 'G'],
                                2288928: ['T', 'pyrazinamide (PZA)'],
                                2288930: ['T', 'pyrazinamide (PZA)'],
                                2288931: ['A', 'pyrazinamide (PZA)'],
                                2288933: ['C', 'pyrazinamide (PZA)'],
                                2288934: ['C', 'pyrazinamide (PZA)', 'G'],
                                2288935: ['C', 'pyrazinamide (PZA)'],
                                2288938: ['T', 'pyrazinamide (PZA)'],
                                2288944: ['C', 'pyrazinamide (PZA)', 'G'],
                                2288945: ['T', 'pyrazinamide (PZA)'],
                                2288952: ['G', 'pyrazinamide (PZA)', 'T'],
                                2288953: ['T', 'pyrazinamide (PZA)'],
                                2288954: ['T', 'pyrazinamide (PZA)'],
                                2288955: ['C', 'pyrazinamide (PZA)', 'G'],
                                2288956: ['G', 'pyrazinamide (PZA)', 'C'],
                                2288957: ['C', 'pyrazinamide (PZA)'],
                                2288960: ['T', 'pyrazinamide (PZA)', 'C'],
                                2288961: ['C', 'pyrazinamide (PZA)', 'G'],
                                2288962: ['G', 'pyrazinamide (PZA)'],
                                2288971: ['A', 'pyrazinamide (PZA)'],
                                2288982: ['A', 'pyrazinamide (PZA)'],
                                2288988: ['C', 'pyrazinamide (PZA)', 'G'],
                                2288997: ['C', 'pyrazinamide (PZA)'],
                                2288998: ['C', 'pyrazinamide (PZA)'],
                                2289000: ['C', 'pyrazinamide (PZA)', 'G'],
                                2289001: ['C', 'pyrazinamide (PZA)'],
                                2289016: ['G', 'pyrazinamide (PZA)'],
                                2289028: ['G', 'pyrazinamide (PZA)'],
                                2289029: ['T', 'pyrazinamide (PZA)'],
                                2289030: ['C', 'pyrazinamide (PZA)'],
                                2289031: ['A', 'pyrazinamide (PZA)'],
                                2289036: ['A', 'pyrazinamide (PZA)'],
                                2289038: ['G', 'pyrazinamide (PZA)', 'A'],
                                2289039: ['G', 'pyrazinamide (PZA)', 'T'],
                                2289040: ['G', 'pyrazinamide (PZA)', 'C'],
                                2289043: ['G', 'pyrazinamide (PZA)'],
                                2289050: ['C', 'pyrazinamide (PZA)'],
                                2289052: ['C', 'pyrazinamide (PZA)'],
                                2289054: ['C', 'pyrazinamide (PZA)'],
                                2289057: ['A', 'pyrazinamide (PZA)'],
                                2289061: ['G', 'pyrazinamide (PZA)'],
                                2289068: ['T', 'pyrazinamide (PZA)', 'C'],
                                2289070: ['G', 'pyrazinamide (PZA)'],
                                2289071: ['C', 'pyrazinamide (PZA)'],
                                2289072: ['C', 'pyrazinamide (PZA)', 'G'],
                                2289073: ['C', 'pyrazinamide (PZA)', 'A'],
                                2289081: ['C', 'pyrazinamide (PZA)', 'T', 'A'],
                                2289082: ['A', 'pyrazinamide (PZA)'],
                                2289089: ['T', 'pyrazinamide (PZA)'],
                                2289090: ['C', 'pyrazinamide (PZA)', 'G'],
                                2289091: ['A', 'pyrazinamide (PZA)'],
                                2289096: ['G', 'pyrazinamide (PZA)', 'C'],
                                2289097: ['T', 'pyrazinamide (PZA)'],
                                2289100: ['C', 'pyrazinamide (PZA)', 'A'],
                                2289103: ['C', 'pyrazinamide (PZA)', 'G'],
                                2289108: ['C', 'pyrazinamide (PZA)'],
                                2289111: ['C', 'pyrazinamide (PZA)'],
                                2289133: ['A', 'pyrazinamide (PZA)'],
                                2289138: ['G', 'pyrazinamide (PZA)'],
                                2289140: ['C', 'pyrazinamide (PZA)'],
                                2289150: ['C', 'pyrazinamide (PZA)'],
                                2289159: ['T', 'pyrazinamide (PZA)'],
                                2289162: ['G', 'pyrazinamide (PZA)'],
                                2289171: ['T', 'pyrazinamide (PZA)'],
                                2289180: ['C', 'pyrazinamide (PZA)'],
                                2289186: ['G', 'pyrazinamide (PZA)'],
                                2289193: ['T', 'pyrazinamide (PZA)'],
                                2289200: ['T', 'pyrazinamide (PZA)'],
                                2289201: ['T', 'pyrazinamide (PZA)'],
                                2289202: ['G', 'pyrazinamide (PZA)'],
                                2289203: ['C', 'pyrazinamide (PZA)'],
                                2289204: ['G', 'pyrazinamide (PZA)'],
                                2289206: ['C', 'pyrazinamide (PZA)'],
                                2289207: ['G', 'pyrazinamide (PZA)'],
                                2289208: ['T', 'pyrazinamide (PZA)'],
                                2289213: ['C', 'pyrazinamide (PZA)', 'G'],
                                2289214: ['T', 'pyrazinamide (PZA)'],
                                2289216: ['G', 'pyrazinamide (PZA)', 'C'],
                                2289218: ['T', 'pyrazinamide (PZA)'],
                                2289219: ['G', 'pyrazinamide (PZA)', 'C'],
                                2289220: ['T', 'pyrazinamide (PZA)'],
                                2289222: ['T', 'pyrazinamide (PZA)', 'C'],
                                2289223: ['A', 'pyrazinamide (PZA)'],
                                2289225: ['G', 'pyrazinamide (PZA)'],
                                2289231: ['G', 'pyrazinamide (PZA)', 'C'],
                                2289234: ['T', 'pyrazinamide (PZA)'],
                                2289235: ['G', 'pyrazinamide (PZA)'],
                                2289239: ['A', 'pyrazinamide (PZA)', 'T'],
                                2289240: ['T', 'pyrazinamide (PZA)', 'G'],
                                2289248: ['G', 'pyrazinamide (PZA)', 'C'],
                                2289252: ['C', 'pyrazinamide (PZA)', 'G', 'A'],
                                2715342: ['T', 'kanamycin (KAN)'],
                                2715346: ['A', 'kanamycin (KAN)'],
                                2726136: ['T', 'isoniazid (INH)'],
                                2726145: ['A', 'isoniazid (INH)'],
                                3073808: ['C', 'para-aminosalicylic acid (PAS)'],
                                4241078: ['G', 'ethambutol (EMB)'],
                                4243221: ['T', 'ethambutol (EMB)'],
                                4243225: ['A', 'ethambutol (EMB)'],
                                4243242: ['A', 'ethambutol (EMB)'],
                                4243245: ['A', 'ethambutol (EMB)'],
                                4243833: ['A', 'ethambutol (EMB)'],
                                4244193: ['A', 'ethambutol (EMB)'],
                                4244281: ['A', 'ethambutol (EMB)'],
                                4244617: ['T', 'ethambutol (EMB)'],
                                4245730: ['C', 'ethambutol (EMB)'],
                                4246734: ['G', 'ethambutol (EMB)'],
                                4247402: ['G', 'ethambutol (EMB)'],
                                4247429: ['G', 'ethambutol (EMB)', 'C'],
                                4247430: ['C', 'ethambutol (EMB)'],
                                4247431: ['A', 'ethambutol (EMB)', 'C', 'T'],
                                4247469: ['C', 'ethambutol (EMB)'],
                                4247495: ['T', 'ethambutol (EMB)'],
                                4247496: ['G', 'ethambutol (EMB)'],
                                4247507: ['C', 'ethambutol (EMB)'],
                                4247513: ['C', 'ethambutol (EMB)'],
                                4247573: ['A', 'ethambutol (EMB)'],
                                4247717: ['G', 'ethambutol (EMB)'],
                                4247723: ['T', 'ethambutol (EMB)'],
                                4247729: ['A', 'ethambutol (EMB)', 'T'],
                                4247730: ['C', 'ethambutol (EMB)', 'A'],
                                4247863: ['G', 'ethambutol (EMB)'],
                                4247873: ['A', 'ethambutol (EMB)'],
                                4248002: ['A', 'ethambutol (EMB)'],
                                4248003: ['G', 'ethambutol (EMB)'],
                                4248747: ['A', 'ethambutol (EMB)'],
                                4249518: ['G', 'ethambutol (EMB)'],
                                4326087: ['T', 'ethionamide (ETH)'],
                                4326236: ['T', 'ethionamide (ETH)'],
                                4326300: ['C', 'ethionamide (ETH)'],
                                4326320: ['T', 'ethionamide (ETH)'],
                                4326333: ['G', 'ethionamide (ETH)'],
                                4326449: ['T', 'ethionamide (ETH)'],
                                4326461: ['C', 'ethionamide (ETH)'],
                                4326738: ['A', 'ethionamide (ETH)'],
                                4326807: ['T', 'ethionamide (ETH)'],
                                4326917: ['T', 'ethionamide (ETH)'],
                                4327224: ['C', 'ethionamide (ETH)'],
                                4327301: ['G', 'ethionamide (ETH)'],
                                4327307: ['G', 'ethionamide (ETH)'],
                                4327322: ['A', 'ethionamide (ETH)'],
                                4327346: ['T', 'ethionamide (ETH)'],
                                4327347: ['A', 'ethionamide (ETH)'],
                                4407604: ['T', 'streptomycin (SM)'],
                                4407790: ['A', 'streptomycin (SM)'],
                                4407824: ['A', 'streptomycin (SM)'],
                                4407931: ['G', 'streptomycin (SM)'],
                                4407940: ['G', 'streptomycin (SM)'],
                                4407992: ['T', 'streptomycin (SM)'],
                                4408009: ['C', 'streptomycin (SM)'],
                                4408102: ['G', 'streptomycin (SM)'],
                                6576: ['A', 'fluoroquinolones_(FQ)'],
                                6579: ['T', 'fluoroquinolones_(FQ)', 'A'],
                                6768: ['C', 'fluoroquinolones_(FQ)'],
                                412339: ['G', 'ethambutol_(EMB)'],
                                413498: ['G', 'ethambutol_(EMB)'],
                                413807: ['T', 'ethambutol_(EMB)'],
                                761106: ['C', 'rifampicin_(RMP)'],
                                761116: ['G', 'rifampicin_(RMP)'],
                                761127: ['CA', 'rifampicin_(RMP)'],
                                1416212: ['C', 'ethambutol_(EMB)'],
                                1417019: ['T', 'ethambutol_(EMB)'],
                                1673393: ['C', 'isoniazid_(INH)'],
                                1673406: ['T', 'isoniazid_(INH)'],
                                1673431: ['A', 'isoniazid_(INH)'],
                                1918125: ['A', 'streptomycin_(SM)_amikacin_(AMK)_kanamycin_(KAN)_capreomycin_(CPR)'],
                                1918135: ['C', 'streptomycin_(SM)_amikacin_(AMK)_kanamycin_(KAN)_capreomycin_(CPR)', 'C'],
                                1918136: ['A', 'streptomycin_(SM)_amikacin_(AMK)_kanamycin_(KAN)_capreomycin_(CPR)'],
                                1918202: ['G', 'streptomycin_(SM)_amikacin_(AMK)_kanamycin_(KAN)_capreomycin_(CPR)'],
                                1918213: ['C', 'streptomycin_(SM)_amikacin_(AMK)_kanamycin_(KAN)_capreomycin_(CPR)'],
                                1918219: ['T', 'streptomycin_(SM)_amikacin_(AMK)_kanamycin_(KAN)_capreomycin_(CPR)', 'A'],
                                1918220: ['A', 'streptomycin_(SM)_amikacin_(AMK)_kanamycin_(KAN)_capreomycin_(CPR)'],
                                1918418: ['G', 'streptomycin_(SM)_amikacin_(AMK)_kanamycin_(KAN)_capreomycin_(CPR)'],
                                1918478: ['G', 'streptomycin_(SM)_amikacin_(AMK)_kanamycin_(KAN)_capreomycin_(CPR)', 'C'],
                                1918661: ['C', 'streptomycin_(SM)_amikacin_(AMK)_kanamycin_(KAN)_capreomycin_(CPR)'],
                                2154445: ['T', 'isoniazid_(INH)'],
                                2154682: ['T', 'isoniazid_(INH)'],
                                2154700: ['C', 'isoniazid_(INH)'],
                                2155211: ['A', 'isoniazid_(INH)'],
                                2155234: ['G', 'isoniazid_(INH)'],
                                2155241: ['T', 'isoniazid_(INH)'],
                                2155306: ['T', 'isoniazid_(INH)'],
                                2155688: ['T', 'isoniazid_(INH)'],
                                2288686: ['A', 'pyrazinamide_(PZA)'],
                                2288717: ['T', 'pyrazinamide_(PZA)'],
                                2288767: ['C', 'pyrazinamide_(PZA)'],
                                2288790: ['G', 'pyrazinamide_(PZA)'],
                                2288815: ['T', 'pyrazinamide_(PZA)'],
                                2288860: ['A', 'pyrazinamide_(PZA)'],
                                2289099: ['C', 'pyrazinamide_(PZA)'],
                                2726100: ['T', 'isoniazid_(INH)'],
                                2726112: ['T', 'isoniazid_(INH)'],
                                3645524: ['T', 'ethambutol_(EMB)'],
                                3646959: ['T', 'ethambutol_(EMB)'],
                                3647041: ['G', 'ethambutol_(EMB)'],
                                4245147: ['T', 'ethambutol_(EMB)'],
                                4245969: ['T', 'ethambutol_(EMB)'],
                                4407904: ['A', 'streptomycin_(SM)_amikacin_(AMK)_kanamycin_(KAN)_capreomycin_(CPR)']
                                }
    dict_high_confidence = {6620: 'A', 6621: 'C', 6734: 'G', 6735: 'C', 6736: 'G', 6737: 'C', 6738: 'A', 6741: 'T', 6742: 'T',
                         6750: 'T', 7563: 'T', 7564: 'C', 7570: 'T', 7572: 'C', 7581: 'T', 7582: 'C', 760314: 'T', 761101: 'T', 
                         761109: 'T', 761110: 'T', 761139: 'T', 761140: 'G', 761155: 'T', 761161: 'C', 761277: 'T', 781687: 'G', 
                         781822: 'G', 1473246: 'G', 1473247: 'T', 1473329: 'T', 1673425: 'T', 1673432: 'C', 1674481: 'G', 2155168: 'A', 
                         2155169: 'C', 2155214: 'C', 2155289: 'G', 2288683: 'G', 2288697: 'G', 2288703: 'C', 2288718: 'G', 2288719: 'C', 
                         2288740: 'G', 2288752: 'G', 2288754: 'G', 2288761: 'G', 2288764: 'G', 2288772: 'C', 2288778: 'C', 2288779: 'T', 
                         2288805: 'A', 2288806: 'G', 2288817: 'A', 2288818: 'C', 2288823: 'G', 2288826: 'C', 2288827: 'G', 2288828: 'C', 
                         2288830: 'G', 2288832: 'G', 2288833: 'C', 2288838: 'T', 2288839: 'G', 2288841: 'A', 2288847: 'T', 2288848: 'A', 
                         2288853: 'C', 2288857: 'A', 2288868: 'C', 2288869: 'A', 2288874: 'G', 2288880: 'G', 2288883: 'G', 2288886: 'T', 
                         2288887: 'C', 2288895: 'G', 2288920: 'G', 2288928: 'T', 2288930: 'T', 2288931: 'A', 2288933: 'C', 2288934: 'C', 
                         2288935: 'C', 2288938: 'T', 2288944: 'G', 2288945: 'T', 2288952: 'T', 2288954: 'T', 2288955: 'G', 2288956: 'G', 
                         2288957: 'C', 2288960: 'C', 2288961: 'G', 2288962: 'G', 2288971: 'A', 2288988: 'C', 2288997: 'C', 2288998: 'C', 
                         2289000: 'G', 2289001: 'C', 2289028: 'G', 2289029: 'T', 2289030: 'C', 2289038: 'A', 2289039: 'T', 2289043: 'G', 
                         2289050: 'C', 2289057: 'A', 2289068: 'C', 2289070: 'G', 2289071: 'C', 2289072: 'G', 2289081: 'T', 2289082: 'A', 
                         2289089: 'T', 2289090: 'G', 2289091: 'A', 2289097: 'T', 2289100: 'A', 2289103: 'G', 2289111: 'C', 2289133: 'A', 
                         2289138: 'G', 2289140: 'C', 2289150: 'C', 2289162: 'G', 2289171: 'T', 2289180: 'C', 2289186: 'G', 2289193: 'T', 
                         2289200: 'T', 2289202: 'G', 2289203: 'C', 2289206: 'C', 2289207: 'G', 2289214: 'T', 2289218: 'T', 2289219: 'G', 
                         2289220: 'T', 2289222: 'C', 2289223: 'A', 2289225: 'G', 2289231: 'G', 2289234: 'T', 2289239: 'T', 2289240: 'G', 
                         2289248: 'C', 2289252: 'A', 4247429: 'C', 4247430: 'C', 4247431: 'T', 4247729: 'T', 4247730: 'A', 4248003: 'G'}

    list_resistance = []
    
    for index, _ in vcf_df.iterrows():
        position = int(vcf_df.loc[index,'POS'])
        alt_nucleotide = str(vcf_df.loc[index,'ALT'])
        nucleotides = []
        
        
        if position in dict_resistance_position.keys():
            #Check position in resistance dict
            #Create a list with all possible nucleotydes in each position
            if len(dict_resistance_position[position]) == 2:
                nucleotides.append(dict_resistance_position[position][0])
            elif len(dict_resistance_position[position]) > 2:
                nucleotides.append(dict_resistance_position[position][0])
                last_nucleotides = dict_resistance_position[position][2:]
                #print(last_nucleotides)
                for nucleotide in last_nucleotides: #Append extra nucleotides
                    nucleotides.append(nucleotide)
            if alt_nucleotide in nucleotides:
                snp_resist = alt_nucleotide #ALT
                resistance = dict_resistance_position[int(position)][1] #Resist name
                list_resistance.append(str(position)) #POS
                list_resistance.append(snp_resist)
                #Evaluate High confidence
                if (int(position) in dict_high_confidence.keys()) and (dict_high_confidence[int(position)] in nucleotides):
                    list_resistance.append("*")
                    
                    vcf_df.loc[index,'Resistance'] = resistance + "*"
                else:
                    vcf_df.loc[index,'Resistance'] = resistance
                    
            list_resistance.append("\t")
    #list_resistance.append(resistance + "\n")
    
    if len(list_resistance) > 0:
        print("This strain has resistance positions:\n:" + ",".join(list_resistance))
        return ",".join(list_resistance)
    else:
        print("No resistance were found\n")


In [27]:
add_resistance_snp(test2)

This strain has resistance positions:
:761155,T,*,	,2155168,G,*,	,	,4247431,A,*,	


'761155,T,*,\t,2155168,G,*,\t,\t,4247431,A,*,\t'

In [31]:
test2[test2['HGVS.p'].isnull()]

Unnamed: 0,#CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,sample,AC,af,AN,BaseQRankSum,DP,ExcessHet,FS,InbreedingCoeff,MQ,MQRankSum,QD,ReadPosRankSum,SOR,ANN,GT,AD,GQ,PL,Allele,Annotation,Annotation_Impact,Gene_Name,Gene_ID,Feature_Type,Feature_ID,Transcript_BioType,Rank,HGVS.c,HGVS.p,cDNA.pos / cDNA.length,CDS.pos / CDS.length,AA.pos / AA.length,Distance,PGT,PID,PS,ERRORS / WARNINGS / INFO,len_AD,REF_AD,ALT_AD,gt0,gt1,Gene length,AA length,dp,aF,AF,Lineage,Resistance


In [37]:
test2['HGVS.p']

0           Leu161Leu
1           Val301Leu
2           Ile614Ile
3            Asp33Asp
4            Asp55Asn
5           Lys105Lys
6            Ile68Ile
7           Ser385Arg
8           Ser455Ser
9             Arg3Arg
10                   
11           His78His
12     Ter110Glyext*?
13          Ile268Thr
14          Ser748Ser
15           His24His
16          Met347Ile
17           Cys24Cys
18          Val128Val
19          Ala167Ala
20          Arg190Arg
21          Val250Val
22          Ala244Ala
23          Pro191Leu
24          Val550Val
25          Lys151Lys
26           Gly60Gly
27                   
28           Asp62Gly
29          Gly382Glu
            ...      
830         Thr206Thr
831         Gly196Gly
832         Thr286Ala
833         Glu215Glu
834         Tyr291Tyr
835         Arg118Arg
836          Ile16Val
837          Val52Val
838          Val45Val
839          Thr49Thr
840        Ser1140Ser
841        Asp1002Asp
842         Ter303Ter
843         Cys183Arg
844       

In [36]:
test2['HGVS.p'].replace('', 'None')

0           Leu161Leu
1           Val301Leu
2           Ile614Ile
3            Asp33Asp
4            Asp55Asn
5           Lys105Lys
6            Ile68Ile
7           Ser385Arg
8           Ser455Ser
9             Arg3Arg
10               None
11           His78His
12     Ter110Glyext*?
13          Ile268Thr
14          Ser748Ser
15           His24His
16          Met347Ile
17           Cys24Cys
18          Val128Val
19          Ala167Ala
20          Arg190Arg
21          Val250Val
22          Ala244Ala
23          Pro191Leu
24          Val550Val
25          Lys151Lys
26           Gly60Gly
27               None
28           Asp62Gly
29          Gly382Glu
            ...      
830         Thr206Thr
831         Gly196Gly
832         Thr286Ala
833         Glu215Glu
834         Tyr291Tyr
835         Arg118Arg
836          Ile16Val
837          Val52Val
838          Val45Val
839          Thr49Thr
840        Ser1140Ser
841        Asp1002Asp
842         Ter303Ter
843         Cys183Arg
844       

In [28]:
test2[test2.Resistance.notnull()]

Unnamed: 0,#CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,sample,AC,af,AN,BaseQRankSum,DP,ExcessHet,FS,InbreedingCoeff,MQ,MQRankSum,QD,ReadPosRankSum,SOR,ANN,GT,AD,GQ,PL,Allele,Annotation,Annotation_Impact,Gene_Name,Gene_ID,Feature_Type,Feature_ID,Transcript_BioType,Rank,HGVS.c,HGVS.p,cDNA.pos / cDNA.length,CDS.pos / CDS.length,AA.pos / AA.length,Distance,PGT,PID,PS,ERRORS / WARNINGS / INFO,len_AD,REF_AD,ALT_AD,gt0,gt1,Gene length,AA length,dp,aF,AF,Lineage,Resistance
154,Chromosome,761155,.,C,T,36292.32,PASS,AC=2;AF=1.00;AN=2;BaseQRankSum=-2.779e+00;DP=1...,GT:AD:DP:GQ:PL,"1/1:0,123:123:99:3823,362,0",2.0,1.0,2.0,-2.779,123.0,0.0,0.598,0.6596,60.0,0.0,22.67,0.595,0.77,T|missense_variant|MODERATE|rpoB|Rv0667|transc...,1/1,123,99.0,38233620,T,missense_variant,MODERATE,rpoB,Rv0667,transcript,CCP43410.1,protein_coding,1/1,1349C>T,Ser450Leu,1349/3519,1349/3519,450/1172,,,,,,2,0.0,123.0,1,1,3519,1172,123.0,0.0,1.0,,rifampicin (RMP)*
426,Chromosome,2155168,.,C,G,60251.58,PASS,AC=2;AF=1.00;AN=2;BaseQRankSum=0.028;DP=119;Ex...,GT:AD:DP:GQ:PL,"1/1:0,119:119:99:4017,356,0",2.0,1.0,2.0,0.028,119.0,0.0,0.0,0.73,60.0,0.0,25.24,0.415,0.71,G|missense_variant|MODERATE|katG|Rv1908c|trans...,1/1,119,99.0,40173560,G,missense_variant,MODERATE,katG,Rv1908c,transcript,CCP44675.1,protein_coding,1/1,944G>C,Ser315Thr,944/2223,944/2223,315/740,,,,,,2,0.0,119.0,1,1,2223,740,119.0,0.0,1.0,,isoniazid (INH)*
800,Chromosome,4247431,.,G,A,24681.6,PASS,AC=2;AF=1.00;AN=2;BaseQRankSum=-2.323e+00;DP=1...,GT:AD:DP:GQ:PL,"1/1:0,103:103:99:3592,310,0",2.0,1.0,2.0,-2.323,103.0,0.0,0.0,0.6071,60.0,0.0,20.71,0.792,0.709,A|missense_variant|MODERATE|embB|Rv3795|transc...,1/1,103,99.0,35923100,A,missense_variant,MODERATE,embB,Rv3795,transcript,CCP46624.1,protein_coding,1/1,918G>A,Met306Ile,918/3297,918/3297,306/1098,,,,,,2,0.0,103.0,1,1,3297,1098,103.0,0.0,1.0,,ethambutol (EMB)*


In [34]:
        
"""
for index, data_row in vcf_df.iterrows():
        position = str(vcf_df.loc[index,'POS'])
        if position in dict_lineage_position.keys():
            if str(vcf_df.loc[index,'ALT']) == dict_lineage_position[str(position)][0]:
                lineage = dict_lineage_position[str(position)][1]
                vcf_df.loc[index,'Lineage'] = lineage
                list_lineage.append(lineage)
                
    if len(list_lineage) > 0:
        list_lineage.sort(reverse=True)
        asterix = ""
        for sublineage_n in range(len(list_lineage)):
            if sublineage_n < (len(list_lineage) - 1):
                if list_lineage[sublineage_n].startswith(list_lineage[sublineage_n + 1]):
                    asterix = asterix + "*"
        final_lineage = lin[0] + " " + asterix
        print("This strain has lineage position(s):\n: " + " ".join([lin[0],asterix]))
        return final_lineage
    else:
        print("No lineage were found\n")
"""

'\nfor index, data_row in vcf_df.iterrows():\nposition = str(vcf_df.loc[index,\'POS\'])\nif position in dict_lineage_position.keys():\n    if str(vcf_df.loc[index,\'ALT\']) == dict_lineage_position[str(position)][0]:\n        lineage = dict_lineage_position[str(position)][1]\n        vcf_df.loc[index,\'Lineage\'] = lineage\n        list_lineage.append(lineage)\n        \n    if len(list_lineage) > 0:\nlist_lineage.sort(reverse=True)\nasterix = ""\nfor sublineage_n in range(len(list_lineage)):\n    if sublineage_n < (len(list_lineage) - 1):\n        if list_lineage[sublineage_n].startswith(list_lineage[sublineage_n + 1]):\n            asterix = asterix + "*"\nfinal_lineage = lin[0] + " " + asterix\nprint("This strain has lineage position(s):\n: " + " ".join([lin[0],asterix]))\nreturn final_lineage\n    else:\nprint("No lineage were found\n")\n'

In [35]:
lin.sort(reverse=True)

NameError: name 'lin' is not defined

In [36]:
lin

NameError: name 'lin' is not defined

In [37]:
asterix = ""
for sublineage_n in range(len(lin)):
    if sublineage_n < (len(lin) - 1):
        if lin[sublineage_n].startswith(lin[sublineage_n + 1]):
            asterix = asterix + "*"
print(lin[0],asterix)

NameError: name 'lin' is not defined

In [43]:
test2.head()

Unnamed: 0,#CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,sample,AC,af,AN,BaseQRankSum,DP,ExcessHet,FS,InbreedingCoeff,MQ,MQRankSum,QD,ReadPosRankSum,SOR,ANN,GT,AD,GQ,PL,Allele,Annotation,Annotation_Impact,Gene_Name,Gene_ID,Feature_Type,Feature_ID,Transcript_BioType,Rank,HGVS.c,HGVS.p,cDNA.pos / cDNA.length,CDS.pos / CDS.length,AA.pos / AA.length,Distance,PGT,PID,PS,ERRORS / WARNINGS / INFO,len_AD,REF_AD,ALT_AD,gt0,gt1,dp,aF,AF,Lineage,Resistance
0,Chromosome,2532,.,C,T,176817.18,PASS,AC=2;AF=1.00;AN=2;BaseQRankSum=1.67;DP=73;Exce...,GT:AD:DP:GQ:PL,"1/1:0,73:73:99:2286,217,0",2.0,1.0,2.0,1.67,73.0,0.0061,0.0,0.7937,60.0,0.0,33.22,1.26,0.028,T|synonymous_variant|LOW|dnaN|Rv0002|transcrip...,1/1,73,99.0,22862170,T,synonymous_variant,LOW,dnaN,Rv0002,transcript,CCP42724.1,protein_coding,1/1,c.481C>T,p.Leu161Leu,481/1209,481/1209,161/402,,,,,,2,0.0,73.0,1,1,73.0,0.0,1.0,,
1,Chromosome,6140,.,G,T,26113.26,PASS,AC=2;AF=1.00;AN=2;BaseQRankSum=0.136;DP=177;Ex...,GT:AD:DP:GQ:PL,"1/1:0,177:177:99:6052,529,0",2.0,1.0,2.0,0.136,177.0,0.0,4.397,0.7016,60.0,0.0,25.96,0.468,0.667,T|missense_variant|MODERATE|gyrB|Rv0005|transc...,1/1,177,99.0,60525290,T,missense_variant,MODERATE,gyrB,Rv0005,transcript,CCP42727.1,protein_coding,1/1,c.901G>T,p.Val301Leu,901/2028,901/2028,301/675,,,,,,2,0.0,177.0,1,1,177.0,0.0,1.0,,
2,Chromosome,9143,.,C,T,506321.38,PASS,AC=2;AF=1.00;AN=2;BaseQRankSum=1.78;DP=198;Exc...,GT:AD:DP:GQ:PL,"1/1:0,198:198:99:6843,593,0",2.0,1.0,2.0,1.78,198.0,3.0103,0.0,-0.0028,60.0,0.0,33.71,0.467,0.764,T|synonymous_variant|LOW|gyrA|Rv0006|transcrip...,1/1,198,99.0,68435930,T,synonymous_variant,LOW,gyrA,Rv0006,transcript,CCP42728.1,protein_coding,1/1,c.1842C>T,p.Ile614Ile,1842/2517,1842/2517,614/838,,,,,,2,0.0,198.0,1,1,198.0,0.0,1.0,,
3,Chromosome,13460,.,G,A,449407.38,PASS,AC=2;AF=1.00;AN=2;BaseQRankSum=0.586;DP=145;Ex...,GT:AD:DP:GQ:PL,"1/1:0,145:145:99:4771,433,0",2.0,1.0,2.0,0.586,145.0,3.0103,0.0,-0.0028,60.0,0.0,32.96,1.49,0.237,A|synonymous_variant|LOW|Rv0010c|Rv0010c|trans...,1/1,145,99.0,47714330,A,synonymous_variant,LOW,Rv0010c,Rv0010c,transcript,CCP42732.1,protein_coding,1/1,c.99C>T,p.Asp33Asp,99/426,99/426,33/141,,,,,,2,0.0,145.0,1,1,145.0,0.0,1.0,,
4,Chromosome,14251,.,G,A,167619.45,PASS,AC=2;AF=1.00;AN=2;BaseQRankSum=-2.185e+00;DP=1...,GT:AD:DP:GQ:PL,"1/1:0,131:131:99:4402,391,0",2.0,1.0,2.0,-2.185,131.0,-0.0,0.0,0.8977,60.0,0.0,30.27,-0.137,0.834,A|missense_variant|MODERATE|Rv0012|Rv0012|tran...,1/1,131,99.0,44023910,A,missense_variant,MODERATE,Rv0012,Rv0012,transcript,CCP42734.1,protein_coding,1/1,c.163G>A,p.Asp55Asn,163/789,163/789,55/262,,,,,,2,0.0,131.0,1,1,131.0,0.0,1.0,,


In [44]:
product_file = "/home/laura/DEVELOP/SNPTB/annotation/genes/dict_locus_product.txt"

In [45]:
essential_file = "/home/laura/DEVELOP/SNPTB/annotation/genes/dict_locus_essential.txt"

In [49]:
dict_product = {}
with open(product_file, 'r') as f:
    for line in f:
        dict_product[line.split(":")[0]] = (":").join(line.split(":")[1:]).strip()

In [51]:
dict_product

{'Rv0001': 'chromosomal replication initiator protein DnaA',
 'Rv0002': 'DNA polymerase III subunit beta',
 'Rv0003': 'DNA replication/repair protein RecF',
 'Rv0004': 'hypothetical protein',
 'Rv0005': 'DNA gyrase subunit B',
 'Rv0006': 'DNA gyrase subunit A',
 'Rv0007': 'membrane protein',
 'Rvnt01': 'tRNA-Ile',
 'Rvnt02': 'tRNA-Ala',
 'Rv0008c': 'cell wall synthesis protein CwsA',
 'Rv0009': 'iron-regulated peptidyl-prolyl cis-trans isomerase PpiA',
 'Rv0010c': 'membrane protein',
 'Rv0011c': 'cell division protein CrgA',
 'Rv0012': 'membrane protein',
 'Rv0013': 'anthranilate synthase component II',
 'Rv0014c': 'serine/threonine-protein kinase PknB',
 'Rv0015c': 'serine/threonine-protein kinase PknA',
 'Rv0016c': 'penicillin-binding protein PbpA',
 'Rv0017c': 'cell division protein RodA',
 'Rv0018c': 'phosphoserine/threonine phosphatase PstP',
 'Rv0019c': 'FHA domain-containing protein FhaB',
 'Rv0020c': 'FHA domain-containing protein FhaA',
 'Rvnt03': 'tRNA-Leu',
 'Rv0021c': 'hypo

In [52]:
def add_essential_cateory(row):
    dict_essential = {}
    with open(product_file, 'r') as f:
        for line in f:
            dict_essential[line.split(":")[0]] = line.split(":")[1].strip()
    if row.Gene_ID in dict_essential.keys():
        if dict_essential[row.Gene_ID] == "essential":
            return "essential"
        else:
            return "nonessential"
        

In [53]:
def add_product_cateory(row):
    dict_product = {}
    with open(product_file, 'r') as f:
        for line in f:
            dict_product[line.split(":")[0]] = (":").join(line.split(":")[1:]).strip()
    if row.Gene_ID in dict_product.keys():
        return dict_product[row.Gene_ID]
        

In [54]:
test2['Is_essential'] = test2.apply(add_essential_cateory, axis=1)

In [55]:
test2['Product'] = test2.apply(add_product_cateory, axis=1)

In [56]:
test2['Is_essential'].value_counts()

nonessential    734
Name: Is_essential, dtype: int64

In [57]:
test2.head()

Unnamed: 0,#CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,sample,AC,af,AN,BaseQRankSum,DP,ExcessHet,FS,InbreedingCoeff,MQ,MQRankSum,QD,ReadPosRankSum,SOR,ANN,GT,AD,GQ,PL,Allele,Annotation,Annotation_Impact,Gene_Name,Gene_ID,Feature_Type,Feature_ID,Transcript_BioType,Rank,HGVS.c,HGVS.p,cDNA.pos / cDNA.length,CDS.pos / CDS.length,AA.pos / AA.length,Distance,PGT,PID,PS,ERRORS / WARNINGS / INFO,len_AD,REF_AD,ALT_AD,gt0,gt1,dp,aF,AF,Lineage,Resistance,Is_essential,Product
0,Chromosome,2532,.,C,T,176817.18,PASS,AC=2;AF=1.00;AN=2;BaseQRankSum=1.67;DP=73;Exce...,GT:AD:DP:GQ:PL,"1/1:0,73:73:99:2286,217,0",2.0,1.0,2.0,1.67,73.0,0.0061,0.0,0.7937,60.0,0.0,33.22,1.26,0.028,T|synonymous_variant|LOW|dnaN|Rv0002|transcrip...,1/1,73,99.0,22862170,T,synonymous_variant,LOW,dnaN,Rv0002,transcript,CCP42724.1,protein_coding,1/1,c.481C>T,p.Leu161Leu,481/1209,481/1209,161/402,,,,,,2,0.0,73.0,1,1,73.0,0.0,1.0,,,nonessential,DNA polymerase III subunit beta
1,Chromosome,6140,.,G,T,26113.26,PASS,AC=2;AF=1.00;AN=2;BaseQRankSum=0.136;DP=177;Ex...,GT:AD:DP:GQ:PL,"1/1:0,177:177:99:6052,529,0",2.0,1.0,2.0,0.136,177.0,0.0,4.397,0.7016,60.0,0.0,25.96,0.468,0.667,T|missense_variant|MODERATE|gyrB|Rv0005|transc...,1/1,177,99.0,60525290,T,missense_variant,MODERATE,gyrB,Rv0005,transcript,CCP42727.1,protein_coding,1/1,c.901G>T,p.Val301Leu,901/2028,901/2028,301/675,,,,,,2,0.0,177.0,1,1,177.0,0.0,1.0,,,nonessential,DNA gyrase subunit B
2,Chromosome,9143,.,C,T,506321.38,PASS,AC=2;AF=1.00;AN=2;BaseQRankSum=1.78;DP=198;Exc...,GT:AD:DP:GQ:PL,"1/1:0,198:198:99:6843,593,0",2.0,1.0,2.0,1.78,198.0,3.0103,0.0,-0.0028,60.0,0.0,33.71,0.467,0.764,T|synonymous_variant|LOW|gyrA|Rv0006|transcrip...,1/1,198,99.0,68435930,T,synonymous_variant,LOW,gyrA,Rv0006,transcript,CCP42728.1,protein_coding,1/1,c.1842C>T,p.Ile614Ile,1842/2517,1842/2517,614/838,,,,,,2,0.0,198.0,1,1,198.0,0.0,1.0,,,nonessential,DNA gyrase subunit A
3,Chromosome,13460,.,G,A,449407.38,PASS,AC=2;AF=1.00;AN=2;BaseQRankSum=0.586;DP=145;Ex...,GT:AD:DP:GQ:PL,"1/1:0,145:145:99:4771,433,0",2.0,1.0,2.0,0.586,145.0,3.0103,0.0,-0.0028,60.0,0.0,32.96,1.49,0.237,A|synonymous_variant|LOW|Rv0010c|Rv0010c|trans...,1/1,145,99.0,47714330,A,synonymous_variant,LOW,Rv0010c,Rv0010c,transcript,CCP42732.1,protein_coding,1/1,c.99C>T,p.Asp33Asp,99/426,99/426,33/141,,,,,,2,0.0,145.0,1,1,145.0,0.0,1.0,,,nonessential,membrane protein
4,Chromosome,14251,.,G,A,167619.45,PASS,AC=2;AF=1.00;AN=2;BaseQRankSum=-2.185e+00;DP=1...,GT:AD:DP:GQ:PL,"1/1:0,131:131:99:4402,391,0",2.0,1.0,2.0,-2.185,131.0,-0.0,0.0,0.8977,60.0,0.0,30.27,-0.137,0.834,A|missense_variant|MODERATE|Rv0012|Rv0012|tran...,1/1,131,99.0,44023910,A,missense_variant,MODERATE,Rv0012,Rv0012,transcript,CCP42734.1,protein_coding,1/1,c.163G>A,p.Asp55Asn,163/789,163/789,55/262,,,,,,2,0.0,131.0,1,1,131.0,0.0,1.0,,,nonessential,membrane protein


In [66]:
table_res ="/home/laura/DEVELOP/SNPTB/annotation/resistance/MTB_Resistance_Mediating.txt"

In [67]:
annot_file = "/home/laura/ANALYSIS/SNPTB/190624_Anotation_test/Annotation/SOMOZACOL2.snp.hf.pass.final.annot.tsv"

In [75]:
annot_file3 = "/home/laura/ANALYSIS/SNPTB/190624_Anotation_test/Annotation/SOMOZACOL3.snp.hf.pass.final.annot.tsv"

In [132]:
df_res = pd.read_csv(table_res, sep="\t", header=0)
df_res['High Confidence SNP'].fillna("no", inplace=True)
df_annot = pd.read_csv(annot_file, sep="\t", header=0)
df_annot3 = pd.read_csv(annot_file3, sep="\t", header=0)

In [133]:
df_res.head()

Unnamed: 0,Variant position genome start,Variant position genome stop,Var. type,Number,WT base,Var. base,Region,Gene ID,Gene Name,Gene start,Gene stop,Gene length,Dir.,WT AA,Codon nr.,Codon nr. E. coli,Var. AA,AA change,Codon change,Variant position gene start,Variant position gene stop,Antibiotic,Reference PMID,High Confidence SNP
0,6575,6575,SNP,1,c,t,coding,Rv0005,gyrB,5240,7267,2028,+,Arg,446,-,Cys,Arg446Cys,cgt/tgt,1336,1336,fluoroquinolones (FQ),19470506,no
1,6620,6620,SNP,1,g,c,coding,Rv0005,gyrB,5240,7267,2028,+,Asp,461,-,His,Asp461His,gac/cac,1381,1381,fluoroquinolones (FQ),24055765,yes
2,6620,6620,SNP,1,g,a,coding,Rv0005,gyrB,5240,7267,2028,+,Asp,461,-,Asn,Asp461Asn,gac/aac,1381,1381,fluoroquinolones (FQ),19721073,yes
3,6621,6621,SNP,1,a,c,coding,Rv0005,gyrB,5240,7267,2028,+,Asp,461,-,Ala,Asp461Ala,gac/gcc,1382,1382,fluoroquinolones (FQ),24055765,yes
4,6734,6734,SNP,1,a,g,coding,Rv0005,gyrB,5240,7267,2028,+,Asn,499,-,Asp,Asn499Asp,aac/gac,1495,1495,fluoroquinolones (FQ),17412727,yes


In [146]:
def get_reverse(nucleotyde):
    nucleotyde = str(nucleotyde)
    nucleotyde_rev = {'A' : 'T',
                     'T' : 'A',
                     'C' : 'G',
                     'G': 'C'}
    return nucleotyde_rev[nucleotyde]

In [297]:
def create_report(tab_annot, df_res, species="Mycobacterium tuberculosis"):
    output = os.path.dirname(tab_annot)
    sample = os.path.basename(tab_annot).split(".")[0]
    print("<html>"=
    print("<head>")
  <title>website title</title>
 </head>
 <body>
  content of website ...
 </body>
</html>
    print("Sample name: " + sample + "\n")
    
    print("Species: " + species + "\n")
    
    df_annot = pd.read_csv(tab_annot, sep="\t", header=0)
    
    list_resistance = df_annot['Resistance'][df_annot.Resistance.notnull()].tolist()
    list_lineage = df_annot['Lineage'][df_annot.Lineage.notnull()].tolist()
    
    #Output Lineage info
    if len(list_lineage) > 0:
        list_lineage.sort(reverse=True)
        asterix = ""
        for sublineage_n in range(len(list_lineage)):
            if sublineage_n < (len(list_lineage) - 1):
                if list_lineage[sublineage_n].startswith(list_lineage[sublineage_n + 1]):
                    asterix = asterix + "*"
        final_lineage = list_lineage[0] + " " + asterix
        line = "This strain has lineage position(s): " + final_lineage + "\n"
        print(line)
    else:
        line = "No lineage were found\n"
        print(line)
    
    #Output Resistance info
    if len(list_resistance) > 0:
        print("This strain has " + str(len(list_resistance)) + " resistance position(s):\n")
        
        final_res_table = pd.DataFrame(columns= df_res.columns.tolist())

        for index, data_row in df_annot[df_annot.Resistance.notnull()].iterrows():
            position = str(df_annot.loc[index,'POS'])
            if df_annot.loc[index,'Gene_ID'].endswith("c"):
                alt_nucleotide = get_reverse(df_annot.loc[index,'ALT']).lower()
            else:
                alt_nucleotide = df_annot.loc[index,'ALT'].lower()
        #resistance_name = df_annot.loc[index,'Resistance'].strip("*")
        #print(position, alt_nucleotide, resistance_name)
        
            row = df_res[(df_res['Var. base'] == alt_nucleotide) & (df_res['Variant position genome stop'] == position)]
            index = row.index[0]
            final_res_table = final_res_table.append(df_res.iloc[index], ignore_index=True)
            
        
        final_res_table.reset_index(drop=True, inplace=True)
        final_res_table_F = final_res_table[['Variant position genome stop', 'Var. base',
                                            'Region', 'Gene ID', 'Gene Name','Gene start',
                                             'Gene stop','Gene length'
                                             
                                            ]]
        #df.rename(columns={'oldName1': 'newName1', 'oldName2': 'newName2'}, inplace=True)
        final_res_table_F.columns = ['Position', 'Alt. base',
                                            'Region', 'Gene ID', 'Gene Name','Gene start',
                                             'Gene stop','Gene length'
                                        
                                            ]
        print(tabulate(final_res_table_F, headers='keys', tablefmt="html", showindex=False))
    else:
        print("No Resistance were found\n")

In [298]:
create_report(annot_file, df_res)

Sample name: SOMOZACOL2

Species: Mycobacterium tuberculosis

This strain has lineage position(s): 4.3.4.2 **

This strain has 3 resistance position(s):

<table>
<thead>
<tr><th style="text-align: right;">  Position</th><th>Alt. base  </th><th>Region  </th><th>Gene ID  </th><th>Gene Name  </th><th style="text-align: right;">  Gene start</th><th style="text-align: right;">  Gene stop</th><th style="text-align: right;">  Gene length</th></tr>
</thead>
<tbody>
<tr><td style="text-align: right;">    761155</td><td>t          </td><td>coding  </td><td>Rv0667   </td><td>rpoB       </td><td style="text-align: right;">      759807</td><td style="text-align: right;">     763325</td><td style="text-align: right;">         3519</td></tr>
<tr><td style="text-align: right;">   2155168</td><td>c          </td><td>coding  </td><td>Rv1908c  </td><td>katG       </td><td style="text-align: right;">     2156111</td><td style="text-align: right;">    2153889</td><td style="text-align: right;">         222

Sample name: SOMOZACOL2

Species: Mycobacterium tuberculosis

This strain has lineage position(s): 4.3.4.2 **

This strain has 3 resistance position(s):

|   Position | Alt. base   | Region   | Gene ID   | Gene Name   |   Gene start |   Gene stop |   Gene length | Dir.   | AA change   | Codon change   | Antibiotic       | High Confidence   |
|------------|-------------|----------|-----------|-------------|--------------|-------------|---------------|--------|-------------|----------------|------------------|-------------------|
|     761155 | t           | coding   | Rv0667    | rpoB        |       759807 |      763325 |          3519 | +      | Ser450Leu   | tcg/ttg        | rifampicin (RMP) | yes               |
|    2155168 | c           | coding   | Rv1908c   | katG        |      2156111 |     2153889 |          2223 | -      | Ser315Thr   | agc/acc        | isoniazid (INH)  | yes               |
|    4247431 | a           | coding   | Rv3795    | embB        |      4246514 |     4249810 |          3297 | +      | Met306Ile   | atg/ata        | ethambutol (EMB) | yes               |

In [None]:
- "plain"
- "simple"
- "github"
- "grid"
- "fancy_grid"
- "pipe"
- "orgtbl"
- "jira"
- "presto"
- "psql"
- "rst"
- "mediawiki"
- "moinmoin"
- "youtrack"
- "html"
- "latex"
- "latex_raw"
- "latex_booktabs"
- "textile"

In [299]:
css = """

<style type="text/css">

body {
    font: normal 20px Verdana, Arial, sans-serif;
}

table {
    text-align: center;
    border-color: #000;
    border-spacing: 0px;
    border-style: solid;
    border-width: 1px;
    cell-spacing: 0px;
  }

th, td {
  border-bottom: 1px solid #ddd;
}

th {
  background-color: rgb(76, 175, 170);
}

tr:hover {background-color:#7c7b7b;}
tr:nth-child(even) {background-color: #cecccc;}

</style>

"""

In [300]:
print(css)



<style type="text/css">

body {
    font: normal 20px Verdana, Arial, sans-serif;
}

table {
    text-align: center;
    border-color: #000;
    border-spacing: 0px;
    border-style: solid;
    border-width: 1px;
    cell-spacing: 0px;
  }

th, td {
  border-bottom: 1px solid #ddd;
}

th {
  background-color: rgb(76, 175, 170);
}

tr:hover {background-color:#7c7b7b;}
tr:nth-child(even) {background-color: #cecccc;}

</style>




In [263]:
df_res.columns.tolist()

['Variant position genome start',
 'Variant position genome stop',
 'Var. type',
 'Number',
 'WT base',
 'Var. base',
 'Region',
 'Gene ID',
 'Gene Name',
 'Gene start',
 'Gene stop',
 'Gene length',
 'Dir.',
 'WT AA',
 'Codon nr.',
 'Codon nr. E. coli',
 'Var. AA',
 'AA change',
 'Codon change',
 'Variant position gene start',
 'Variant position gene stop',
 'Antibiotic',
 'Reference PMID',
 'High Confidence SNP']

In [241]:
df_annot3['Resistance'][df_annot3.Resistance.notnull()].tolist()

[]

In [87]:
df_annot['Resistance'][df_annot.Resistance.notnull()].tolist().

['rifampicin (RMP)*', 'isoniazid (INH)*', 'ethambutol (EMB)*']

In [89]:
print('rifampicin (RMP)*'.strip("*"))

rifampicin (RMP)


In [82]:
df_annot3['Lineage'][df_annot3.Lineage.notnull()].tolist()

['4.1', '4.1.2.1', '4.1.2']

In [123]:
df_res.head()

Unnamed: 0,Variant position genome start,Variant position genome stop,Var. type,Number,WT base,Var. base,Region,Gene ID,Gene Name,Gene start,Gene stop,Gene length,Dir.,WT AA,Codon nr.,Codon nr. E. coli,Var. AA,AA change,Codon change,Variant position gene start,Variant position gene stop,Antibiotic,Reference PMID,High Confidence SNP
0,6575,6575,SNP,1,c,t,coding,Rv0005,gyrB,5240,7267,2028,+,Arg,446,-,Cys,Arg446Cys,cgt/tgt,1336,1336,fluoroquinolones (FQ),19470506,
1,6620,6620,SNP,1,g,c,coding,Rv0005,gyrB,5240,7267,2028,+,Asp,461,-,His,Asp461His,gac/cac,1381,1381,fluoroquinolones (FQ),24055765,yes
2,6620,6620,SNP,1,g,a,coding,Rv0005,gyrB,5240,7267,2028,+,Asp,461,-,Asn,Asp461Asn,gac/aac,1381,1381,fluoroquinolones (FQ),19721073,yes
3,6621,6621,SNP,1,a,c,coding,Rv0005,gyrB,5240,7267,2028,+,Asp,461,-,Ala,Asp461Ala,gac/gcc,1382,1382,fluoroquinolones (FQ),24055765,yes
4,6734,6734,SNP,1,a,g,coding,Rv0005,gyrB,5240,7267,2028,+,Asn,499,-,Asp,Asn499Asp,aac/gac,1495,1495,fluoroquinolones (FQ),17412727,yes


In [78]:
df_annot3.head()

Unnamed: 0,#CHROM,POS,ID,REF,ALT,Annotation,Annotation_Impact,Gene_Name,Gene_ID,Feature_Type,Feature_ID,Transcript_BioType,Rank,HGVS.c,HGVS.p,cDNA.pos / cDNA.length,CDS.pos / CDS.length,AA.pos / AA.length,Is_essential,Product,Lineage,Resistance
0,Chromosome,2532,.,C,T,synonymous_variant,LOW,dnaN,Rv0002,transcript,CCP42724.1,protein_coding,1/1,c.481C>T,p.Leu161Leu,481/1209,481/1209,161/402,nonessential,DNA polymerase III subunit beta,,
1,Chromosome,9143,.,C,T,synonymous_variant,LOW,gyrA,Rv0006,transcript,CCP42728.1,protein_coding,1/1,c.1842C>T,p.Ile614Ile,1842/2517,1842/2517,614/838,nonessential,DNA gyrase subunit A,,
2,Chromosome,11370,.,C,T,intergenic_region,MODIFIER,alaT-Rv0008c,EBG00000313365-Rv0008c,intergenic_region,EBG00000313365-Rv0008c,,,n.11370C>T,,,,,,,,
3,Chromosome,13460,.,G,A,synonymous_variant,LOW,Rv0010c,Rv0010c,transcript,CCP42732.1,protein_coding,1/1,c.99C>T,p.Asp33Asp,99/426,99/426,33/141,nonessential,membrane protein,,
4,Chromosome,14401,.,G,A,synonymous_variant,LOW,Rv0012,Rv0012,transcript,CCP42734.1,protein_coding,1/1,c.313G>A,p.Lys105Lys,313/789,313/789,105/262,nonessential,membrane protein,,


In [131]:
df_res[(df_res['Var. base'] == "t") & (df_res['Variant position genome stop'] == '6575')]

Unnamed: 0,Variant position genome start,Variant position genome stop,Var. type,Number,WT base,Var. base,Region,Gene ID,Gene Name,Gene start,Gene stop,Gene length,Dir.,WT AA,Codon nr.,Codon nr. E. coli,Var. AA,AA change,Codon change,Variant position gene start,Variant position gene stop,Antibiotic,Reference PMID,High Confidence SNP
0,6575,6575,SNP,1,c,t,coding,Rv0005,gyrB,5240,7267,2028,+,Arg,446,-,Cys,Arg446Cys,cgt/tgt,1336,1336,fluoroquinolones (FQ),19470506,


In [198]:
final_res_table = pd.DataFrame(columns= df_res.columns.tolist())

for index, data_row in df_annot[df_annot.Resistance.notnull()].iterrows():
        position = str(df_annot.loc[index,'POS'])
        if df_annot.loc[index,'Gene_ID'].endswith("c"):
            alt_nucleotide = get_reverse(df_annot.loc[index,'ALT']).lower()
        else:
            alt_nucleotide = df_annot.loc[index,'ALT'].lower()
        #resistance_name = df_annot.loc[index,'Resistance'].strip("*")
        #print(position, alt_nucleotide, resistance_name)
        
        row = df_res[(df_res['Var. base'] == alt_nucleotide) & (df_res['Variant position genome stop'] == position)]
        index = row.index[0]
        final_res_table = final_res_table.append(df_res.iloc[index], ignore_index = True)
        
print(tabulate(final_res_table, headers='keys', tablefmt='psql'))

+----+---------------------------------+--------------------------------+-------------+----------+-----------+-------------+----------+-----------+-------------+--------------+-------------+---------------+--------+---------+-------------+---------------------+-----------+-------------+----------------+-------------------------------+------------------------------+------------------+------------------+-----------------------+
|    |   Variant position genome start |   Variant position genome stop | Var. type   |   Number | WT base   | Var. base   | Region   | Gene ID   | Gene Name   |   Gene start |   Gene stop |   Gene length | Dir.   | WT AA   |   Codon nr. | Codon nr. E. coli   | Var. AA   | AA change   | Codon change   |   Variant position gene start |   Variant position gene stop | Antibiotic       |   Reference PMID | High Confidence SNP   |
|----+---------------------------------+--------------------------------+-------------+----------+-----------+-------------+----------+-----

In [None]:
Var position WT Var Region Gene ID Name Gene start Gene stop Gene length Dir. AA change Codon change Antibiotic High Conf

In [184]:
df_res.iloc[2].values

array([6620, '6620', 'SNP', 1, 'g', 'a', 'coding', 'Rv0005', 'gyrB', 5240,
       7267, 2028, '+', 'Asp', '461', '-', 'Asn', 'Asp461Asn', 'gac/aac',
       1381, 1381, 'fluoroquinolones (FQ)', 19721073, 'yes'], dtype=object)

In [196]:
final_res_table

Unnamed: 0,Variant position genome start,Variant position genome stop,Var. type,Number,WT base,Var. base,Region,Gene ID,Gene Name,Gene start,Gene stop,Gene length,Dir.,WT AA,Codon nr.,Codon nr. E. coli,Var. AA,AA change,Codon change,Variant position gene start,Variant position gene stop,Antibiotic,Reference PMID,High Confidence SNP
0,761155,761155,SNP,1,c,t,coding,Rv0667,rpoB,759807,763325,3519,+,Ser,450,531,Leu,Ser450Leu,tcg/ttg,1349,1349,rifampicin (RMP),21300839,yes
1,2155168,2155168,SNP,1,g,c,coding,Rv1908c,katG,2156111,2153889,2223,-,Ser,315,-,Thr,Ser315Thr,agc/acc,944,944,isoniazid (INH),8878604,yes
2,4247431,4247431,SNP,1,g,a,coding,Rv3795,embB,4246514,4249810,3297,+,Met,306,-,Ile,Met306Ile,atg/ata,918,918,ethambutol (EMB),21300839,yes


In [139]:
df_annot[df_annot.Resistance.notnull()]

Unnamed: 0,#CHROM,POS,ID,REF,ALT,Annotation,Annotation_Impact,Gene_Name,Gene_ID,Feature_Type,Feature_ID,Transcript_BioType,Rank,HGVS.c,HGVS.p,cDNA.pos / cDNA.length,CDS.pos / CDS.length,AA.pos / AA.length,Is_essential,Product,Lineage,Resistance
155,Chromosome,761155,.,C,T,missense_variant,MODERATE,rpoB,Rv0667,transcript,CCP43410.1,protein_coding,1/1,c.1349C>T,p.Ser450Leu,1349/3519,1349/3519,450/1172,essential,DNA-directed RNA polymerase subunit beta,,rifampicin (RMP)*
422,Chromosome,2155168,.,C,G,missense_variant,MODERATE,katG,Rv1908c,transcript,CCP44675.1,protein_coding,1/1,c.944G>C,p.Ser315Thr,944/2223,944/2223,315/740,nonessential,catalase-peroxidase,,isoniazid (INH)*
802,Chromosome,4247431,.,G,A,missense_variant,MODERATE,embB,Rv3795,transcript,CCP46624.1,protein_coding,1/1,c.918G>A,p.Met306Ile,918/3297,918/3297,306/1098,essential,arabinosyltransferase B,,ethambutol (EMB)*


In [144]:
df_res[(df_res['Variant position genome stop'] == '2155168')]

Unnamed: 0,Variant position genome start,Variant position genome stop,Var. type,Number,WT base,Var. base,Region,Gene ID,Gene Name,Gene start,Gene stop,Gene length,Dir.,WT AA,Codon nr.,Codon nr. E. coli,Var. AA,AA change,Codon change,Variant position gene start,Variant position gene stop,Antibiotic,Reference PMID,High Confidence SNP
106,2155168,2155168,SNP,1,g,c,coding,Rv1908c,katG,2156111,2153889,2223,-,Ser,315,-,Thr,Ser315Thr,agc/acc,944,944,isoniazid (INH),8878604,yes
107,2155168,2155168,SNP,1,g,a,coding,Rv1908c,katG,2156111,2153889,2223,-,Ser,315,-,Asn,Ser315Asn,agc/aac,944,944,isoniazid (INH),9210694,yes
108,2155168,2155168,SNP,1,g,t,coding,Rv1908c,katG,2156111,2153889,2223,-,Ser,315,-,Ile,Ser315Ile,agc/atc,944,944,isoniazid (INH),9210694,yes


In [149]:
print("\x1B[3mMycobacterium tuberculosis\x1B[23m")

[3mMycobacterium tuberculosis[23m
