## Concatenate variant and non-variant sites

#### Preliminary step: call genotypes at all sites, including non-variant sites, with GATK (GenotypeGVCFs --include-non-variant-sites)
#### Then select the non-variant sites with sufficient depth (DP) using get_unvariant_sites.py
#### Then reorder the samples with bcftools view -S and remove the LowQual records

In [None]:
### On cluster:

#gatk --java-options -Xmx70G GenotypeGVCFs -L chr1 -R reference.fasta -V gendb://db_chr1 -O chr1_include-non-variant-sites.vcf.gz --include-non-variant-sites

#./get_unvariant_sites.py -v chr1_include-non-variant-sites.vcf


In [None]:
### After cluster:

#cat header.txt chr1_non_variant_sites.vcf > chr1_non_variant_sites_header.vcf

#bcftools view -S samples_ordered.txt chr1_non_variant_sites_header.vcf > chr1_non_variant_sites_ordered.vcf 

#grep -v "LowQual" chr1_non_variant_sites_ordered.vcf > chr1_non_variant_sites_ordered_no_low_qual.vcf


#### 1. Masking genotypes with DP < 5 or DP > 100

In [1]:
# Path of the invariant-site VCF to mask: the file produced by the
# "After cluster" shell steps (samples reordered, LowQual lines removed).
vcf = "chr1_non_variant_sites_ordered_no_low_qual.vcf"

In [2]:
# Rewrite the invariant-site VCF with a GT:DP FORMAT column, masking every
# genotype whose depth falls outside [MIN_DP, MAX_DP] as "./.".
#
# Input : the file named by `vcf`.
# Output: str(vcf) + "_masked.vcf" (header lines are copied unchanged).
MIN_DP = 5    # genotypes with DP below this value are masked
MAX_DP = 100  # genotypes with DP above this value are masked
FIRST_SAMPLE_COLUMN = 9  # VCF columns 0-8 are CHROM..FORMAT
DEFAULT_DP_INDEX = 1     # GT:DP:RGQ layout assumed by the original notebook


def _mask_sample(sample, dp_index, min_dp=MIN_DP, max_dp=MAX_DP):
    """Return the ``GT:DP`` text to write for one per-sample field.

    ``sample`` is the raw colon-separated sample field and ``dp_index`` the
    position of DP inside it. A missing depth (``.`` or an absent sub-field)
    yields ``./.:.``; a depth outside ``[min_dp, max_dp]`` yields ``./.:DP``.
    Raises ``ValueError`` if the depth is neither ``.`` nor an integer.
    """
    subfields = sample.split(":")
    genotype = subfields[0]
    depth = subfields[dp_index] if dp_index < len(subfields) else "."
    if depth == ".":
        # The original cell wrote nothing here, which shifted every following
        # sample one column to the left. Emit a masked placeholder instead so
        # the row keeps one entry per sample.
        return "./.:."
    if min_dp <= int(depth) <= max_dp:
        return genotype + ":" + depth
    return "./.:" + depth


with open(vcf, "r") as file, open(str(vcf) + "_masked.vcf", "w") as r_file:
    for line in file:
        if line.startswith("#"):
            r_file.write(line)
            continue
        if not line.strip():
            continue  # tolerate blank lines instead of failing on e[1]
        # Strip the line ending first so the last sample field is clean.
        e = line.rstrip("\r\n").split("\t")
        # Locate DP from the record's own FORMAT column; fall back to the
        # fixed position the notebook assumed when DP is not listed.
        format_keys = e[8].split(":")
        dp_index = format_keys.index("DP") if "DP" in format_keys else DEFAULT_DP_INDEX
        # Every sample column is processed, not only a fixed 126-column range.
        masked = [_mask_sample(sample, dp_index) for sample in e[FIRST_SAMPLE_COLUMN:]]
        r_file.write("\t".join(e[:8] + ["GT:DP"] + masked) + "\n")

#### 2. Concatenate variant and non variant sites

##### 2.1 Get positions of non-variants, with masked genotypes:

In [3]:
# Index every masked invariant-site record by its POS column (kept as text),
# then report how many records were loaded and show one of them.
dico_non_variants = {}
masked_path = str(vcf) + "_masked.vcf"
with open(masked_path, "r") as masked_handle:
    for record in masked_handle:
        columns = record.split("\t")
        if str(columns[0][0]) == "#":
            continue  # header line: nothing to index
        dico_non_variants[str(columns[1])] = record
print(len(dico_non_variants))
print(dico_non_variants["51037"])

1015965
chr7	51037	.	G	.	.	.	DP=844;AN=252	GT:DP	0/0:5	0/0:5	0/0:6	0/0:6	./.:4	0/0:5	0/0:10	./.:4	0/0:5	0/0:7	0/0:11	0/0:8	0/0:11	0/0:6	./.:4	0/0:8	0/0:6	0/0:8	0/0:14	0/0:8	0/0:11	0/0:8	0/0:5	0/0:5	0/0:5	./.:4	./.:4	0/0:6	0/0:9	./.:4	0/0:6	0/0:7	0/0:5	0/0:9	0/0:9	0/0:10	./.:1	./.:4	0/0:9	0/0:5	0/0:7	0/0:12	0/0:10	0/0:6	0/0:8	0/0:7	0/0:10	0/0:7	0/0:5	0/0:10	0/0:11	0/0:8	./.:4	0/0:10	0/0:10	0/0:9	0/0:10	./.:1	0/0:8	0/0:6	./.:3	0/0:7	0/0:6	./.:0	./.:3	0/0:7	./.:2	0/0:9	0/0:7	0/0:9	0/0:7	0/0:6	./.:4	0/0:8	0/0:5	./.:2	./.:3	0/0:5	0/0:6	0/0:7	./.:2	0/0:5	0/0:15	0/0:9	./.:4	0/0:5	0/0:6	0/0:8	0/0:5	./.:4	0/0:8	0/0:6	./.:4	0/0:5	0/0:8	0/0:8	0/0:8	0/0:5	0/0:10	./.:4	0/0:7	0/0:5	0/0:6	0/0:8	0/0:9	./.:4	./.:3	0/0:6	0/0:5	0/0:11	0/0:5	./.:2	0/0:6	0/0:6	0/0:6	0/0:5	0/0:6	./.:2	./.:4	0/0:9	0/0:6	0/0:11	0/0:17	0/0:12	0/0:13	0/0:9



##### 2.2 Reformat vcf with variants to have GT:DP as for invariant sites:

In [4]:
# Path of the VCF holding the variant sites; the following cells reformat it
# to GT:DP and merge it with the masked invariant sites.
vcf_variants = "chr1_variant_sites_cerao.vcf"

In [5]:
# Rewrite the variant VCF so that every sample column is reduced to GT:DP,
# matching the layout of the masked invariant-site file.
#
# Input : the file named by `vcf_variants`.
# Output: vcf_variants + ".reformat.vcf" (header lines are copied unchanged).
with open(vcf_variants + ".reformat.vcf", "w") as fr:
    with open(vcf_variants, "r") as file:
        for line in file:
            if line.startswith("#"):
                # Header lines already end with a newline. The original cell
                # appended a second one, leaving a blank line after each
                # header line that the next cell could not parse.
                fr.write(line)
                continue
            if not line.strip():
                continue  # tolerate blank lines instead of failing on e[1]
            # Strip the line ending first so the last sample field is clean.
            e = line.rstrip("\r\n").split("\t")
            # Locate DP from the record's own FORMAT column; fall back to the
            # third position (GT:AD:DP:GQ...) the notebook assumed.
            format_keys = e[8].split(":")
            dp_index = format_keys.index("DP") if "DP" in format_keys else 2
            samples = []
            # Every sample column is processed, not only a fixed range.
            for sample in e[9:]:
                subfields = sample.split(":")
                # A sample field may carry fewer sub-fields than FORMAT lists;
                # report its depth as missing rather than raising IndexError.
                depth = subfields[dp_index] if dp_index < len(subfields) else "."
                samples.append(subfields[0] + ":" + depth)
            fr.write("\t".join(e[:8] + ["GT:DP"] + samples) + "\n")

##### 2.3 Get positions of variants:

In [6]:
# Build a POS -> record lookup for the reformatted variant sites, then print
# the number of records and one example record.
dico_variants = {}
reformatted_path = vcf_variants + ".reformat.vcf"
with open(reformatted_path, "r") as variant_handle:
    for record in variant_handle:
        columns = record.split("\t")
        is_header = str(columns[0][0]) == "#"
        if not is_header:
            dico_variants[str(columns[1])] = record
print(len(dico_variants))
print(dico_variants["129947"])

10090
chr7	129947	.	T	G	1484.59	PASS	.	GT:DP	0/1:5	0/0:8	0/0:10	0/0:9	./.:4	0/0:11	0/0:6	./.:1	0/0:5	0/0:6	./.:4	0/0:11	0/0:5	0/0:9	0/0:8	0/0:8	0/0:9	0/0:5	0/0:9	0/0:15	./.:4	0/0:13	./.:2	0/0:7	0/0:5	0/0:9	0/0:10	1/1:8	0/0:14	0/0:9	0/0:5	0/0:9	0/0:11	0/0:11	0/0:5	0/0:11	./.:3	0/1:8	0/0:6	0/0:8	0/0:11	0/1:10	0/0:5	0/0:6	0/0:9	./.:1	0/0:11	0/0:11	0/0:9	0/0:10	0/0:9	0/0:5	./.:1	0/0:10	0/0:7	0/0:6	0/0:9	0/0:17	0/0:5	0/0:7	0/0:11	0/0:9	0/0:10	0/0:5	./.:4	0/0:10	0/0:6	./.:2	0/1:11	0/0:6	0/0:8	./.:3	0/0:7	0/1:11	0/0:5	./.:4	0/0:6	0/0:14	0/0:5	0/0:6	0/0:8	0/0:6	0/0:9	0/0:9	0/0:7	./.:3	0/0:5	0/0:12	0/0:8	./.:3	0/0:7	0/0:12	./.:2	0/0:9	0/0:8	0/0:5	0/0:9	1/1:7	0/0:5	./.:3	0/0:8	0/0:5	0/0:8	0/0:6	0/1:6	0/0:8	0/0:8	0/0:8	0/0:11	./.:1	0/0:15	0/0:5	0/0:9	./.:4	0/0:5	0/0:8	0/0:9	0/0:10	0/0:13	0/0:7	0/0:6	0/0:17	0/1:12	0/0:18	0/0:9	0/0:8



##### 2.4 Write file with all positions: variant and non-variant sites

In [7]:
# Merge the invariant and variant records into one position-sorted file.
#
# Reads the two POS -> record dictionaries built in the earlier cells and
# writes "chr1_all_sites_variant_unvariant.vcf" without a header.
with open("chr1_all_sites_variant_unvariant.vcf", "w") as R_file:
    # Visit only positions that actually have a record, in numeric order.
    # The original scanned every integer up to a hard-coded 330,000,000,
    # which was slow and silently dropped any position at or above that bound.
    all_positions = set(dico_non_variants) | set(dico_variants)
    for position in sorted(all_positions, key=int):
        # NOTE(review): a position present in both dictionaries is written
        # twice (invariant record first), exactly as the original loop did;
        # confirm that downstream tools accept duplicated positions.
        if position in dico_non_variants:
            R_file.write(dico_non_variants[position])
        if position in dico_variants:
            R_file.write(dico_variants[position])

##### 2.5 Pixy to get diversity

In [None]:
#cat header-cerao.txt chr1_all_sites_variant_unvariant.vcf > chr1_all_sites_variant_unvariant_header.vcf
#conda activate pixy
#bgzip chr1_all_sites_variant_unvariant_header.vcf
#tabix chr1_all_sites_variant_unvariant_header.vcf.gz
#pixy --stats pi --vcf chr1_all_sites_variant_unvariant_header.vcf.gz --populations samples_pop_pixy.txt --window_size 10000 --output_folder output_pixy --output_prefix pixy_o --n_cores 15