In [69]:
import os
import re
import sys

import pandas as pd

In [6]:
# Show every column when a DataFrame is displayed instead of truncating wide frames
pd.set_option('display.max_columns', None)

In [7]:
# Input VCF passed to the functions below (relative path, resolved against the working directory)
file = "190808_Panama_agosto_19.cohort.combined.hf.vcf"

In [8]:
def import_VCF42_cohort_pandas(vcf_file, sep='\t'):
    """
    Read a VCF 4.2 cohort/joint-called file into a pandas DataFrame.

    The first line and every consecutive "##" line after it are skipped; the
    next line (the "#CHROM ..." column line of a VCF) becomes the header.

    Parameters
    ----------
    vcf_file : str
        Path to the VCF file.
    sep : str, optional
        Field delimiter handed to ``pandas.read_csv`` (tab by default).

    Returns
    -------
    pandas.DataFrame
        One row per line found after the header line.

    Raises
    ------
    SystemExit
        With status 1 (after printing a message) when the first line of the
        file does not end with "VCFv4.2".
    """
    with open(vcf_file) as f:
        first_line = f.readline().strip()

        # Reject unsupported versions before doing any further work
        if not first_line.endswith('VCFv4.2'):
            print("This vcf file is not v4.2")
            sys.exit(1)

        # The first line is always skipped; add every "##" line that follows it
        lines_to_skip = 1
        for line in f:
            if not line.startswith("##"):
                break
            lines_to_skip += 1

    # Skip all meta-information lines so the column line is row 0 of what remains
    return pd.read_csv(vcf_file, sep=sep, skiprows=lines_to_skip, header=0)

In [66]:
def identify_heterozygous(vcf_file, nocall_fr=0.2):
    """
    List the positions where a high fraction of samples is heterozygous.

    A sample counts as heterozygous when its column *starts with* the
    reference allele followed by a non-reference allele, phased or unphased
    (e.g. "0/1", "0|1", "0/3"). Only the leading genotype field is inspected,
    so a later sub-field that happens to contain "0|1" is not counted.

    NOTE(review): genotypes such as "1|0" or "1/2" are not counted, exactly
    as in the previous implementation - confirm whether that is intended.

    Parameters
    ----------
    vcf_file : str
        Path to a VCF 4.2 file readable by ``import_VCF42_cohort_pandas``.
    nocall_fr : float, optional
        Threshold on the heterozygous fraction. A position is reported when
        (heterozygous samples / all sample columns) is strictly greater
        than this value. Default 0.2.

    Returns
    -------
    list
        Values of the 'POS' column for the reported rows, in file order.
    """
    df = import_VCF42_cohort_pandas(vcf_file)

    # Used with .match(), i.e. anchored at the start of the sample column
    het_genotype = re.compile(r'0[|/][1-9]')

    positions = df['POS'].tolist()
    # Per-sample columns start at column index 9; plain tuples are cheaper than iterrows()
    sample_rows = df.iloc[:, 9:].itertuples(index=False, name=None)

    highly_hetz_positions = []
    for position, sample_fields in zip(positions, sample_rows):
        if not sample_fields:
            # No sample columns: nothing to evaluate for this row
            continue
        # Non-string cells (e.g. NaN for a missing value) never count as heterozygous
        het_count = sum(
            1 for field in sample_fields
            if isinstance(field, str) and het_genotype.match(field)
        )
        # The denominator is every sample column, including no-calls
        if het_count / len(sample_fields) > nocall_fr:
            highly_hetz_positions.append(position)

    return highly_hetz_positions
    

In [68]:
# Positions reported with the default threshold (nocall_fr=0.2); the full list is displayed
identify_heterozygous(file)

[150898,
 333637,
 333640,
 333641,
 334724,
 334727,
 334769,
 334771,
 338810,
 338876,
 338903,
 338960,
 338963,
 339088,
 339095,
 424145,
 455742,
 580772,
 699615,
 699621,
 699624,
 806188,
 806194,
 806197,
 806199,
 806202,
 806205,
 806207,
 806209,
 806215,
 806222,
 815647,
 839215,
 839216,
 839224,
 839225,
 839515,
 839516,
 839519,
 839520,
 839534,
 839548,
 839552,
 839553,
 839557,
 839558,
 839560,
 839563,
 839566,
 839572,
 839581,
 839584,
 839586,
 1007423,
 1095981,
 1095990,
 1096205,
 1096235,
 1096239,
 1189688,
 1189689,
 1189697,
 1189727,
 1189729,
 1189736,
 1189738,
 1189745,
 1190017,
 1191450,
 1191453,
 1191497,
 1191741,
 1191817,
 1262964,
 1262969,
 1313355,
 1313361,
 1313362,
 1313363,
 1313370,
 1313386,
 1313389,
 1340578,
 1341023,
 1341029,
 1410471,
 1416222,
 1416232,
 1416234,
 1480099,
 1480102,
 1480104,
 1480105,
 1480109,
 1480111,
 1480113,
 1480116,
 1480123,
 1480128,
 1480131,
 1480132,
 1480135,
 1480138,
 1480144,
 1480146,
 14

In [53]:
# Hand-written sample columns (phased, unphased and 0/3 genotypes) for trying out the regex below
test = ['0|1:40,0:40:99:0,102,1355', '0/3:33,0:33:99:0,99,1031', '0/0:52,0:52:99:0,120,1800', '0/1:45,0:45:99:0,107,1665', '0/0:57,0:57:99:0,113,1751', '0/0:35,0:35:93:0,93,1395', '0/0:43,0:43:99:0,110,1450', '0/0:50,0:50:99:0,113,1694', '0/0:54,0:54:99:0,117,1800']

In [55]:
# Try the heterozygous pattern on the toy list: report whether any entry
# matches and, when one does, show the per-entry True/False flags.
if not any(re.search(r'0[|\/][1-9]', entry) is not None for entry in test):
    print("NO")
else:
    print("YES")
    test_l = [re.search(r'0[|\/][1-9]', entry) is not None for entry in test]
    print(test_l)

YES
[True, True, False, True, False, False, False, False, False]


In [52]:
# Narrower pattern: only "0/1" or "0|1", searched anywhere in each string
is_test = [bool(re.search(r'0[|\/]1', x)) for x in test] #True False array

In [36]:
# NOTE(review): the output stored below does not correspond to the current `test` list
# (its 3rd/4th flags are swapped); it appears to come from an earlier execution - re-run to refresh.
is_test

[True, False, True, False, False, False, False, False, False]

In [185]:
# Load the cohort VCF into a DataFrame for interactive inspection
df_test = import_VCF42_cohort_pandas(file)

In [186]:
# NOTE(review): repare_nongenotyped is not defined in the visible part of this notebook;
# unless it is defined or imported elsewhere, this cell fails on a fresh kernel.
df = repare_nongenotyped(file)

PA117-18C 21819 ./.:1,0:1:.:0,0,0
PA-004 132804 ./.:10,0:10:.:.:.:0,0,0
T-18-008 335642 ./.:1,0:1:.:0,0,0
PA-212 336325 ./.:1,0:1:.:.:.:0,0,0
P120 672330 ./.:3,0:3:.:.:.:0,0,0
P120 672332 ./.:3,0:3:.:.:.:0,0,0
PA197-18C 673564 ./.:1,0:1:.:0,0,0
T-18-008 673564 ./.:5,0:5:.:0,0,0
PA-004 839216 ./.:3,0:3:.:.:.:0,0,0
PA184-18C 839581 ./.:35,0:35:.:.:.:0,0,0
PA202-18C 839581 ./.:33,0:33:.:.:.:0,0,0
P107 839584 ./.:22,0:22:.:.:.:0,0,0
PA184-18C 839584 ./.:33,0:33:.:.:.:0,0,0
P107 839586 ./.:22,0:22:.:.:.:0,0,0
PA184-18C 839586 ./.:33,0:33:.:.:.:0,0,0
PA184-18C 1000794 ./.:59,0:59:.:.:.:0,0,0
T18-040 1093850 ./.:2,0:2:.:0,0,0
PA197-18C 1093954 ./.:1,0:1:.:.:.:0,0,0
T-18-008 1094204 ./.:1,0:1:.:0,0,0
T18-040 1094204 ./.:1,0:1:.:0,0,0
P46 1095644 ./.:13,0:13:.:.:.:0,0,0
P46 1095669 ./.:17,0:17:.:.:.:0,0,0
P46 1095672 ./.:16,0:16:.:.:.:0,0,0
PA117-18C 1095990 ./.:2,0:2:.:.:.:0,0,0
PA149-18C 1095990 ./.:4,0:4:.:.:.:0,0,0
T-18-008 1096398 ./.:1,0:1:.:.:.:0,0,0
T18-003 1189567 ./.:7,0:7:.:.:.:0,0,0

In [67]:
# Sample columns of one row with many "./." (no-call) genotypes; note this rebinds the earlier `test` name
test = ['./.:17,0:17:.:.:.:0,0,0', '0/0:4,0:4:12:.:.:0,12,140', '0/0:10,0:10:30:.:.:0,30,405', '0/0:16,0:16:48:.:.:0,48,570', '0/0:44,0:44:0:.:.:0,0,15', '0/0:3,0:3:9:.:.:0,9,82', '0|1:2,4:6:99:0|1:150890_G_A:162,0,385:150890', './.:26,0:26:.:.:.:0,0,0', '0/0:19,0:19:57:.:.:0,57,663', './.:42,0:42:.:.:.:0,0,0', './.:33,0:33:.:.:.:0,0,0', './.:15,0:15:.:.:.:0,0,0', './.:11,0:11:.:.:.:0,0,0', './.:34,0:34:.:.:.:0,0,0', './.:18,0:18:.:.:.:0,0,0', '0|1:1,13:14:99:0|1:150890_G_A:495,0,127:150890', './.:31,0:31:.:.:.:0,0,0', './.:36,0:36:.:.:.:0,0,0', './.:17,0:17:.:.:.:0,0,0', './.:20,0:20:.:.:.:0,0,0', './.:16,0:16:.:.:.:0,0,0', '0/0:5,0:5:0:.:.:0,0,101', '0/0:9,0:9:27:.:.:0,27,342', './.:16,0:16:.:.:.:0,0,0', './.:19,0:19:.:.:.:0,0,0', '0/0:10,0:10:0:.:.:0,0,316']

In [68]:
# Single boolean: does at least one entry start with a no-call genotype "./."?
test2 = any(x.startswith("./.") for x in test)

In [70]:
# Per-entry no-call flags; rebinds `test2`, replacing the single boolean assigned above
test2 = [x.startswith("./.") for x in test]

In [73]:
# Number of no-call entries (True counts as 1)
sum(test2)

15

In [74]:
# Total number of entries, i.e. the denominator for a no-call fraction
len(test2)

26

In [18]:
# Names of the columns from index 9 onward (the per-sample columns)
df.columns[9:].tolist()

['1104',
 '1209',
 'P107',
 'P118',
 'P120',
 'P46',
 'P97',
 'PA-004',
 'PA-204',
 'PA-212',
 'PA117-18C',
 'PA119-18C',
 'PA136-18C',
 'PA149-18C',
 'PA155-18C',
 'PA157-18C',
 'PA164-18C',
 'PA176-18C',
 'PA184-18C',
 'PA197-18C',
 'PA202-18C',
 'T-18-008',
 'T18-003',
 'T18-027',
 'T18-040',
 'T18-055']

In [None]:
# NOTE(review): unexecuted scratch cell - `d` and `v` are not defined in the visible notebook.
# Also, str.contains treats the joined string as a regex by default, so the values would need escaping.
d.apply(lambda x: x.str.contains('|'.join(v)))

In [None]:
# NOTE(review): unexecuted scratch cell - `d` and `vals` are not defined in the visible notebook.
# Element-wise check whether each cell starts with any of the prefixes in `vals`.
d.applymap(lambda x: any([x.startswith(v) for v in vals]))


In [79]:
def list_to_bed(input_list, output_dir, output_file_name, reference="CHROM"):
    """
    Write every position of a list as one line of a BED file.

    Each line holds three tab-separated fields: the reference name, then the
    position twice (start and end carry the same value). The file is written
    to <output_dir>/<output_file_name>.bed and overwritten if it exists.
    """
    bed_path = os.path.join(os.path.abspath(output_dir), output_file_name + ".bed")

    with open(bed_path, 'w+') as bed_handle:
        bed_handle.writelines(
            "\t".join((reference, str(pos), str(pos))) + "\n"
            for pos in input_list
        )
        
    

In [80]:
# Positions reported by identify_heterozygous with its default threshold
list_het = identify_heterozygous(file)

In [81]:
# Writes ./is_heterozygous.bed; the reference column keeps its default value "CHROM"
list_to_bed(list_het, ".", "is_heterozygous")

In [78]:
%%bash

# Print the working directory (where relative paths such as "." resolve)
pwd

/home/laura/DEVELOP/SNPTB/test
