In [20]:
import os
import re
import numpy as np
import glob
import pandas as pd
import io

In [21]:
os.getcwd()

'/home/mkient/gitrepos/my_test'

In [22]:
#os.listdir()

# Building pipelines 

In [4]:
def fastp_trimm(dir_data, dir_out, q=15, l=200):
    '''
    Function to make reads quality filtering
    '''
    # outputs directories 
    os.makedirs(f'fastp_ouputs/{dir_out}1', exist_ok=True)
    os.makedirs(f'fastp_ouputs/{dir_out}2', exist_ok=True)
    os.makedirs('residual',exist_ok=True)
    
    #imputs files list
    pools = [re.split('_',i)[0] for i in os.listdir(dir_data)]

    for pool in pools:
        #in and output files names
        in1 = f'{dir_data}/{pool}_R1_001.fastq.gz'
        in2 = f'{dir_data}/{pool}_R2_001.fastq.gz'
        out1 = f'fastp_ouputs/{dir_out}1/{pool}_R1_trimmed.fastq.gz'
        out2 = f'fastp_ouputs/{dir_out}2/{pool}_R2_trimmed.fastq.gz'
        out_html,out_log = f'residual/{pool}.html', f'residual/{pool}.log'
        #run fastp for trimming
        ! fastp --in1 $in1 --in2 $in2 --out1 $out1 --out2 $out2 -q $q -l $l -h $out_html &> $out_log
    print('done !')

In [5]:
def align_fastq(dir_f, dir_r, ref_dir):
    '''
    function to align fastq data
    '''
    # reference
    ref = f'{ref_dir}/*.fasta'
    #list of the dir_data
    pools = [re.split('_',i)[0] for i in os.listdir(f'{dir_f}')]
    #make dir for output results
    os.makedirs('align_results/sam_files', exist_ok=True)
    os.makedirs('align_results/bam_view', exist_ok=True)
    os.makedirs('align_results/bam_sorted', exist_ok=True)
    #align
    for pool in pools:
        print(f'starting alignment of sample {pool}')
        forward = f'{dir_f}/{pool}_*'
        reverse = f'{dir_r}/{pool}_*'
        arg=f'{pool}'
        output1 = f'align_results/sam_files/{pool}_mapped.sam'
        output2 = f'align_results/bam_view/{pool}_mapped_view.bam'
        output3 = f'align_results/bam_sorted/{pool}_mapped_sorted.bam'
        ! bwa mem -M -t 4 $ref $forward $reverse | samtools view -b | samtools sort -T $arg > $output3
        #! samtools view $output1 -b -o $output2
        #! samtools sort $output2 -o $output3

    print('done')

print('done !')

done !


In [6]:
def lofreq_call (bam_path, ref_path, output_path):
    """
    for variants calling using lofreq program
    """
    os.makedirs(f'{output_path}', exist_ok=True)
    ref = f'{ref_path}'
    ##
    print('Variant calling in progress !')
    for path in glob.glob(f'{bam_path}'):
        vi_path = re.split('/|.bam',path)[-2]
        out_path = f'{output_path}/{vi_path}.vcf'
        path_i=f'{path}'
        ##call variants
        !lofreq call -f $ref -o $out_path $path_i
    print('done !')

In [24]:
def vcf_to_table(vcf_path, sample_id=None):
    """
    To convert vcf file to dataFrames
    """
    #Open vcf file and extract variants
    with open(f'{vcf_path}', 'r') as file:
        lines_vcf = [line for line in file.readlines() if not line.startswith('##')]

    # Creat vcf tables
    vcf_table1 = pd.read_table(io.StringIO(''.join(lines_vcf)), sep='\t').rename(columns={'#CHROM': 'CHROM'})

    #change data in the INFO column
    lines_tab = [lin for lin in vcf_table1.INFO]
    text1, text_list = '', []
    for i in range(0,8,2):
        text1 += re.split(';|=',lines_tab[0])[i]+'\t'
    text_list.append(text1[:-1]+'\n')
    for line in lines_tab:
        text2=''
        for i in range(1,9,2):
            text2 += re.split(';|=',line)[i]+'\t'
        text_list.append(text2[:-1]+'\n')
    info_df = pd.read_table(io.StringIO(''.join(text_list)),sep='\t')

    #concat both table to one
    df_tab = pd.concat([vcf_table1,info_df],axis=1)
    if sample_id:
        sample = [f'{sample_id}' for pos in df_tab.POS]
        df_tab.insert(0,'samples',sample)
    return df_tab

# Running the pipelines 

In [8]:
## trimming and QC
fastp_trimm(dir_data='raw_data/', dir_out='R')

done !


In [11]:
## Mapping to ref genome 
ref = 'reference/*.fasta'
#! bwa index $ref

In [12]:
%%capture 
align_fastq(dir_f='fastp_ouputs/R1/', dir_r='fastp_ouputs/R2/',
           ref_dir='reference/')

In [13]:
## checking mapping stats - in one bam files 
! samtools flagstats 'align_results/bam_sorted/AC1_mapped_sorted.bam'

83676 + 0 in total (QC-passed reads + QC-failed reads)
83670 + 0 primary
6 + 0 secondary
0 + 0 supplementary
0 + 0 duplicates
0 + 0 primary duplicates
83656 + 0 mapped (99.98% : N/A)
83650 + 0 primary mapped (99.98% : N/A)
83670 + 0 paired in sequencing
41835 + 0 read1
41835 + 0 read2
83208 + 0 properly paired (99.45% : N/A)
83644 + 0 with itself and mate mapped
6 + 0 singletons (0.01% : N/A)
0 + 0 with mate mapped to a different chr
0 + 0 with mate mapped to a different chr (mapQ>=5)


In [14]:
## copy files
## indexing bam files 
os.makedirs('align_results/bam_index', exist_ok=True)
! cp align_results/bam_sorted/* align_results/bam_index/
os.listdir('align_results/bam_index')
## samtools index
pools = [re.split('_', i)[0] for i in os.listdir('align_results/bam_index')]
for pool in pools:
    index_files = f'align_results/bam_index/{pool}_mapped_sorted.bam'
    ! samtools index $index_files
print('done !')

In [17]:
## variants calling using lofreq 
os.makedirs('variants/lofreq', exist_ok=True)
! ls -l 'variants/lofreq'

total 0


In [82]:
#for path in glob.glob('align_results/bam_sorted/*_mapped_sorted.bam'):
    #print(path)

In [18]:
lofreq_call(bam_path='align_results/bam_sorted/*_mapped_sorted.bam',ref_path='reference/Anopheles_gambiaePEST4.fasta',
           output_path='variants/lofreq/')

Variant calling in progress !
Number of substitution tests performed: 873
Number of indel tests performed: 0
Number of substitution tests performed: 873
Number of indel tests performed: 0
Number of substitution tests performed: 873
Number of indel tests performed: 0
Number of substitution tests performed: 873
Number of indel tests performed: 0
Number of substitution tests performed: 873
Number of indel tests performed: 0
Number of substitution tests performed: 873
Number of indel tests performed: 0
Number of substitution tests performed: 873
Number of indel tests performed: 0
Number of substitution tests performed: 873
Number of indel tests performed: 0
Number of substitution tests performed: 873
Number of indel tests performed: 0
Number of substitution tests performed: 873
Number of indel tests performed: 0
Number of substitution tests performed: 873
Number of indel tests performed: 0
Number of substitution tests performed: 873
Number of indel tests performed: 0
done !


In [25]:
## convert vcf files to dataframes
df_list = []
for link in glob.glob('variants/lofreq/*vcf'):
    sample_id=re.split('/|_',link)[2]
    vcf_table = vcf_to_table(f'{link}', sample_id=sample_id)
    df_list.append(vcf_table)
vcf_table_samples = pd.concat(df_list)
vcf_table_samples.samples.unique()

array(['P14', 'P8', 'P23', 'P12', 'P21', 'P1', 'P19', 'P15', 'P17', 'P10',
       'AC1', 'P6'], dtype=object)

In [26]:
vcf_table_samples

Unnamed: 0,samples,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,DP,AF,SB,DP4
0,P14,AgamP4_2R,48714444,.,G,T,139,PASS,"DP=46503;AF=0.001118;SB=0;DP4=46445,0,52,0",46503,0.001118,0,464450520
1,P14,AgamP4_2R,48714445,.,C,T,95,PASS,"DP=46503;AF=0.000946;SB=0;DP4=46453,0,44,0",46503,0.000946,0,464530440
2,P14,AgamP4_2R,48714446,.,G,T,50,PASS,"DP=46503;AF=0.001355;SB=0;DP4=46379,0,63,0",46503,0.001355,0,463790630
3,P14,AgamP4_2R,48714453,.,G,A,124,PASS,"DP=46503;AF=0.001441;SB=0;DP4=46397,0,67,0",46503,0.001441,0,463970670
4,P14,AgamP4_2R,48714458,.,A,T,182,PASS,"DP=46503;AF=0.003548;SB=0;DP4=46162,0,165,0",46503,0.003548,0,4616201650
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10,P6,AgamP4_2R,48714692,.,C,A,61,PASS,"DP=45133;AF=0.001684;SB=0;DP4=0,44936,0,76",45133,0.001684,0,044936076
11,P6,AgamP4_2R,48714692,.,C,G,151,PASS,"DP=45133;AF=0.002238;SB=0;DP4=0,44936,0,101",45133,0.002238,0,0449360101
12,P6,AgamP4_2R,48714697,.,C,A,115,PASS,"DP=45133;AF=0.002016;SB=0;DP4=0,44988,0,91",45133,0.002016,0,044988091
13,P6,AgamP4_2R,48714700,.,C,A,219,PASS,"DP=45133;AF=0.001795;SB=0;DP4=0,45014,0,81",45133,0.001795,0,045014081


## Genomic data filtering 

### function for raw data quality filtering

In [5]:
def fastp_trimm(dir_data, dir_out, q=15, l=200):
    '''
    Function to make reads quality filtering
    '''
    # outputs directories 
    os.makedirs(f'fastp_ouputs/{dir_out}1', exist_ok=True)
    os.makedirs(f'fastp_ouputs/{dir_out}2', exist_ok=True)
    os.makedirs('residual',exist_ok=True)
    
    #imputs files list
    pools = [re.split('_',i)[0] for i in os.listdir(dir_data)]

    for pool in pools:
        #in and output files names
        in1 = f'{dir_data}/{pool}_R1_001.fastq.gz'
        in2 = f'{dir_data}/{pool}_R2_001.fastq.gz'
        out1 = f'fastp_ouputs/{dir_out}1/{pool}_R1_trimmed.fastq.gz'
        out2 = f'fastp_ouputs/{dir_out}2/{pool}_R2_trimmed.fastq.gz'
        out_html,out_log = f'residual/{pool}.html', f'residual/{pool}.log'
        #run fastp for trimming
        ! fastp --in1 $in1 --in2 $in2 --out1 $out1 --out2 $out2 -q $q -l $l -h $out_html &> $out_log
    print('done !')

In [55]:
fastp_trimm(dir_data='raw_data/', dir_out='R')

done !


In [6]:
list2 = [i.split('_')[0] for i in os.listdir('raw_data/')]
#list2

In [7]:
#! less 'fastp_ouputs/dir_out1/AC1_R1_trimmed.fastq.gz' %%capture

In [8]:
#! ls -ls fastp_ouputs/

In [9]:
os.listdir('fastp_ouputs/')

['R1', 'out1', 'dir_out2', 'out2', 'R2', 'dir_out1']

## data mapping to the reference genome

In [10]:
ref = 'reference/*.fasta'
#! bwa index $ref

In [60]:
#! bwa mem -M -t 4 $ref  'raw_data/AC1_R1_001.fastq.gz' 'raw_data/AC1_R2_001.fastq.gz' > 'align_results/AC1_mapped.sam'  #| samtools view -b | samtools sort -T AC > 'align_results/AC1_mapped.bam' 

In [61]:
#! bwa mem -M -t 4 $ref  'raw_data/AC1_R1_001.fastq.gz' 'raw_data/AC1_R2_001.fastq.gz' | samtools view -b | samtools sort -T AC > 'align_results/AC1_mapped.bam' 

In [62]:
#! bwa mem -M -t 4 $ref  'raw_data/P1_R1_001.fastq.gz' 'raw_data/P1_R2_001.fastq.gz' | samtools view -b | samtools sort -T AC > 'align_results/P1_mapped.bam' 

In [11]:
def align_fastq(dir_f, dir_r, ref_dir):
    '''
    function to align fastq data
    '''
    # reference
    ref = f'{ref_dir}/*.fasta'
    #list of the dir_data
    pools = [re.split('_',i)[0] for i in os.listdir(f'{dir_f}')]
    #make dir for output results
    os.makedirs('align_results/sam_files', exist_ok=True)
    os.makedirs('align_results/bam_view', exist_ok=True)
    os.makedirs('align_results/bam_sorted', exist_ok=True)
    #align
    for pool in pools:
        print(f'starting alignment of sample {pool}')
        forward = f'{dir_f}/{pool}_*'
        reverse = f'{dir_r}/{pool}_*'
        arg=f'{pool}'
        output1 = f'align_results/sam_files/{pool}_mapped.sam'
        output2 = f'align_results/bam_view/{pool}_mapped_view.bam'
        output3 = f'align_results/bam_sorted/{pool}_mapped_sorted.bam'
        ! bwa mem -M -t 4 $ref $forward $reverse | samtools view -b | samtools sort -T $arg > $output3
        #! samtools view $output1 -b -o $output2
        #! samtools sort $output2 -o $output3

    print('done')

print('done !')

done !


In [68]:
%%capture 
align_fastq(dir_f='fastp_ouputs/R1/', dir_r='fastp_ouputs/R2/',
           ref_dir='reference/')

In [12]:
! samtools flagstats 'align_results/bam_sorted/AC1_mapped_sorted.bam'

83676 + 0 in total (QC-passed reads + QC-failed reads)
83670 + 0 primary
6 + 0 secondary
0 + 0 supplementary
0 + 0 duplicates
0 + 0 primary duplicates
83656 + 0 mapped (99.98% : N/A)
83650 + 0 primary mapped (99.98% : N/A)
83670 + 0 paired in sequencing
41835 + 0 read1
41835 + 0 read2
83208 + 0 properly paired (99.45% : N/A)
83644 + 0 with itself and mate mapped
6 + 0 singletons (0.01% : N/A)
0 + 0 with mate mapped to a different chr
0 + 0 with mate mapped to a different chr (mapQ>=5)


## samtools index for igv 

In [70]:
## copy files
os.makedirs('align_results/bam_index', exist_ok=True)
! cp align_results/bam_sorted/* align_results/bam_index/
os.listdir('align_results/bam_index')
## samtools index
pools = [re.split('_', i)[0] for i in os.listdir('align_results/bam_index')]
for pool in pools:
    index_files = f'align_results/bam_index/{pool}_mapped_sorted.bam'
    ! samtools index $index_files

In [13]:
os.listdir('align_results/bam_index/')

['P6_mapped_sorted.bam.bai',
 'P6_mapped_sorted.bam',
 'P10_mapped_sorted.bam',
 'P17_mapped_sorted.bam',
 'P14_mapped_sorted.bam',
 'P21_mapped_sorted.bam',
 'P17_mapped_sorted.bam.bai',
 'P19_mapped_sorted.bam.bai',
 'P8_mapped_sorted.bam.bai',
 'AC1_mapped_sorted.bam',
 'P1_mapped_sorted.bam',
 'P14_mapped_sorted.bam.bai',
 'P1_mapped_sorted.bam.bai',
 'P21_mapped_sorted.bam.bai',
 'P12_mapped_sorted.bam',
 'P10_mapped_sorted.bam.bai',
 'P15_mapped_sorted.bam',
 'AC1_mapped_sorted.bam.bai',
 'P15_mapped_sorted.bam.bai',
 'P23_mapped_sorted.bam.bai',
 'P12_mapped_sorted.bam.bai',
 'P23_mapped_sorted.bam',
 'P8_mapped_sorted.bam',
 'P19_mapped_sorted.bam']

## variants calling 

In [14]:
os.makedirs('variants/lofreq', exist_ok=True)
! ls -l 'variants/lofreq'

total 4
-rw-rw-r-- 1 mkient mkient 2882 Jul 10 23:15 P1_vars.vcf


In [15]:
! ls -la 'align_results/bam_sorted/'

total 80944
drwxrwxr-x 2 mkient mkient    4096 Jul  6 23:51 .
drwxrwxr-x 6 mkient mkient    4096 Jul 10 22:28 ..
-rw-rw-r-- 1 mkient mkient 6528987 Jul 10 23:04 AC1_mapped_sorted.bam
-rw-rw-r-- 1 mkient mkient 6807350 Jul 10 23:04 P10_mapped_sorted.bam
-rw-rw-r-- 1 mkient mkient 6924844 Jul 10 23:04 P12_mapped_sorted.bam
-rw-rw-r-- 1 mkient mkient 7089954 Jul 10 23:04 P14_mapped_sorted.bam
-rw-rw-r-- 1 mkient mkient 7561100 Jul 10 23:04 P15_mapped_sorted.bam
-rw-rw-r-- 1 mkient mkient 6791533 Jul 10 23:04 P17_mapped_sorted.bam
-rw-rw-r-- 1 mkient mkient 4631398 Jul 10 23:04 P19_mapped_sorted.bam
-rw-rw-r-- 1 mkient mkient 8218061 Jul 10 23:04 P1_mapped_sorted.bam
-rw-rw-r-- 1 mkient mkient 6318585 Jul 10 23:04 P21_mapped_sorted.bam
-rw-rw-r-- 1 mkient mkient 6461800 Jul 10 23:04 P23_mapped_sorted.bam
-rw-rw-r-- 1 mkient mkient 7401808 Jul 10 23:04 P6_mapped_sorted.bam
-rw-rw-r-- 1 mkient mkient 8124849 Jul 10 23:04 P8_mapped_sorted.bam


In [48]:
#os.makedirs('ost/ist/')

In [58]:
def lofreq_call (bam_path, ref_path, output_path):
    """
    for variants calling using lofreq program
    """
    os.makedirs(f'{output_path}', exist_ok=True)
    ref = f'{ref_path}'
    ##
    print('Variant calling in progress !')
    for path in glob.glob(f'{bam_path}'):
        vi_path = re.split('/|.bam',path)[-2]
        out_path = f'{output_path}/{vi_path}.vcf'
        path_i=f'{path}'
        ##call variants
        !lofreq call -f $ref -o $out_path $path_i
    print('done !')

In [59]:
lofreq_call(bam_path='align_results/bam_sorted/*_mapped_sorted.bam',ref_path='reference/Anopheles_gambiaePEST4.fasta',
           output_path='variants/lofreq/')

Variant calling in progress !
Number of substitution tests performed: 873
Number of indel tests performed: 0
Number of substitution tests performed: 873
Number of indel tests performed: 0
Number of substitution tests performed: 873
Number of indel tests performed: 0
Number of substitution tests performed: 873
Number of indel tests performed: 0
Number of substitution tests performed: 873
Number of indel tests performed: 0
Number of substitution tests performed: 873
Number of indel tests performed: 0
Number of substitution tests performed: 873
Number of indel tests performed: 0
Number of substitution tests performed: 873
Number of indel tests performed: 0
Number of substitution tests performed: 873
Number of indel tests performed: 0
Number of substitution tests performed: 873
Number of indel tests performed: 0
Number of substitution tests performed: 873
Number of indel tests performed: 0
Number of substitution tests performed: 873
Number of indel tests performed: 0
done !


In [51]:
for i in glob.glob('align_results/bam_sorted/*_mapped_sorted.bam'):
    print(str(i))

align_results/bam_sorted/P6_mapped_sorted.bam
align_results/bam_sorted/P10_mapped_sorted.bam
align_results/bam_sorted/P17_mapped_sorted.bam
align_results/bam_sorted/P14_mapped_sorted.bam
align_results/bam_sorted/P21_mapped_sorted.bam
align_results/bam_sorted/AC1_mapped_sorted.bam
align_results/bam_sorted/P1_mapped_sorted.bam
align_results/bam_sorted/P12_mapped_sorted.bam
align_results/bam_sorted/P15_mapped_sorted.bam
align_results/bam_sorted/P23_mapped_sorted.bam
align_results/bam_sorted/P8_mapped_sorted.bam
align_results/bam_sorted/P19_mapped_sorted.bam


In [57]:
vi = 'align_results/bam_sorted/P19_mapped_sorted.bam'
re.split('/|.bam',vi)[-2]

'P19_mapped_sorted'

In [75]:
! lofreq call -f 'reference/Anopheles_gambiaePEST4.fasta' -o 'variants/lofreq/P1_vars.vcf' 'align_results/bam_sorted/P1_mapped_sorted.bam'

Number of substitution tests performed: 873
Number of indel tests performed: 0


In [16]:
! ls -l 'variants/lofreq'

total 4
-rw-rw-r-- 1 mkient mkient 2882 Jul 10 23:15 P1_vars.vcf


In [17]:
! cat 'variants/lofreq/P1_vars.vcf'

##fileformat=VCFv4.0
##fileDate=20240710
##source=lofreq call -f reference/Anopheles_gambiaePEST4.fasta -o variants/lofreq/P1_vars.vcf align_results/bam_sorted/P1_mapped_sorted.bam 
##reference=reference/Anopheles_gambiaePEST4.fasta
##INFO=<ID=DP,Number=1,Type=Integer,Description="Raw Depth">
##INFO=<ID=AF,Number=1,Type=Float,Description="Allele Frequency">
##INFO=<ID=SB,Number=1,Type=Integer,Description="Phred-scaled strand bias at this position">
##INFO=<ID=DP4,Number=4,Type=Integer,Description="Counts for ref-forward bases, ref-reverse, alt-forward and alt-reverse bases">
##INFO=<ID=INDEL,Number=0,Type=Flag,Description="Indicates that the variant is an INDEL.">
##INFO=<ID=CONSVAR,Number=0,Type=Flag,Description="Indicates that the variant is a consensus variant (as opposed to a low frequency variant).">
##INFO=<ID=HRUN,Number=1,Type=Integer,Description="Homopolymer length to the right of report indel position">
##FILTER=<ID=min_dp_10,Description="Minimum Coverage 10">
##FILTER=<ID=sb

In [108]:
#pd.read_csv('variants/lofreq/P1_vars.vcf', sep='\t', comment='#')

In [66]:
def vcf_to_table(vcf_path, sample_id=None):
    """
    To convert vcf file to dataFrames
    """
    #Open vcf file and extract variants
    with open(f'{vcf_path}', 'r') as file:
        lines_vcf = [line for line in file.readlines() if not line.startswith('##')]

    # Creat vcf tables
    vcf_table1 = pd.read_table(io.StringIO(''.join(lines)), sep='\t').rename(columns={'#CHROM': 'CHROM'})

    #change data in the INFO column
    lines_tab = [lin for lin in vcf_table1.INFO]
    text1, text_list = '', []
    for i in range(0,8,2):
        text1 += re.split(';|=',lines_tab[0])[i]+'\t'
    text_list.append(text1[:-1]+'\n')
    for line in lines_tab:
        text2=''
        for i in range(1,9,2):
            text2 += re.split(';|=',line)[i]+'\t'
        text_list.append(text2[:-1]+'\n')
    info_df = pd.read_table(io.StringIO(''.join(text_list)),sep='\t')

    #concat both table to one
    df_tab = pd.concat([vcf_table1,info_df],axis=1)
    if sample_id:
        sample = [f'{sample_id}' for pos in df_tab.POS]
        df_tab.insert(0,'samples',sample)
    return df_tab

In [68]:
vcf_table_P1 = vcf_to_table('variants/lofreq/P1_vars.vcf')
vcf_table_P1.query('SB<30')

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,DP,AF,SB,DP4
0,AgamP4_2R,48714441,.,G,T,68,PASS,"DP=52478;AF=0.000705;SB=0;DP4=52424,0,37,0",52478,0.000705,0,524240370
1,AgamP4_2R,48714444,.,G,T,137,PASS,"DP=52478;AF=0.000972;SB=0;DP4=52423,0,51,0",52478,0.000972,0,524230510
2,AgamP4_2R,48714445,.,C,T,152,PASS,"DP=52478;AF=0.001181;SB=0;DP4=52410,0,62,0",52478,0.001181,0,524100620
3,AgamP4_2R,48714446,.,G,A,67,PASS,"DP=52478;AF=0.001410;SB=0;DP4=52317,0,74,0",52478,0.00141,0,523170740
4,AgamP4_2R,48714446,.,G,T,61,PASS,"DP=52478;AF=0.001372;SB=0;DP4=52317,0,72,0",52478,0.001372,0,523170720
5,AgamP4_2R,48714451,.,C,T,85,PASS,"DP=52478;AF=0.000838;SB=0;DP4=52427,0,44,0",52478,0.000838,0,524270440
6,AgamP4_2R,48714453,.,G,A,191,PASS,"DP=52478;AF=0.001601;SB=0;DP4=52350,0,84,0",52478,0.001601,0,523500840
7,AgamP4_2R,48714458,.,A,T,254,PASS,"DP=52478;AF=0.003868;SB=0;DP4=52118,0,203,0",52478,0.003868,0,5211802030
8,AgamP4_2R,48714486,.,C,A,49314,PASS,"DP=104706;AF=0.408448;SB=4;DP4=30919,30892,215...",104706,0.408448,4,30919308922152121246
9,AgamP4_2R,48714527,.,A,G,2941,PASS,"DP=104882;AF=0.008982;SB=24;DP4=52028,51621,42...",104882,0.008982,24,5202851621427515


In [69]:
glob.glob('variants/lofreq/*vcf')

['variants/lofreq/P14_mapped_sorted.vcf',
 'variants/lofreq/P8_mapped_sorted.vcf',
 'variants/lofreq/P23_mapped_sorted.vcf',
 'variants/lofreq/P12_mapped_sorted.vcf',
 'variants/lofreq/P21_mapped_sorted.vcf',
 'variants/lofreq/P1_mapped_sorted.vcf',
 'variants/lofreq/P19_mapped_sorted.vcf',
 'variants/lofreq/P15_mapped_sorted.vcf',
 'variants/lofreq/P17_mapped_sorted.vcf',
 'variants/lofreq/P10_mapped_sorted.vcf',
 'variants/lofreq/AC1_mapped_sorted.vcf',
 'variants/lofreq/P1_vars.vcf',
 'variants/lofreq/P6_mapped_sorted.vcf']

In [27]:
df_list = []
for link in glob.glob('variants/lofreq/*vcf'):
    sample_id=re.split('/|_',link)[2]
    vcf_table = vcf_to_table(f'{link}', sample_id=sample_id)
    df_list.append(vcf_table)
vcf_table_samples = pd.concat(df_list)
vcf_table_samples

Unnamed: 0,samples,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,DP,AF,SB,DP4
0,P14,AgamP4_2R,48714444,.,G,T,139,PASS,"DP=46503;AF=0.001118;SB=0;DP4=46445,0,52,0",46503,0.001118,0,464450520
1,P14,AgamP4_2R,48714445,.,C,T,95,PASS,"DP=46503;AF=0.000946;SB=0;DP4=46453,0,44,0",46503,0.000946,0,464530440
2,P14,AgamP4_2R,48714446,.,G,T,50,PASS,"DP=46503;AF=0.001355;SB=0;DP4=46379,0,63,0",46503,0.001355,0,463790630
3,P14,AgamP4_2R,48714453,.,G,A,124,PASS,"DP=46503;AF=0.001441;SB=0;DP4=46397,0,67,0",46503,0.001441,0,463970670
4,P14,AgamP4_2R,48714458,.,A,T,182,PASS,"DP=46503;AF=0.003548;SB=0;DP4=46162,0,165,0",46503,0.003548,0,4616201650
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10,P6,AgamP4_2R,48714692,.,C,A,61,PASS,"DP=45133;AF=0.001684;SB=0;DP4=0,44936,0,76",45133,0.001684,0,044936076
11,P6,AgamP4_2R,48714692,.,C,G,151,PASS,"DP=45133;AF=0.002238;SB=0;DP4=0,44936,0,101",45133,0.002238,0,0449360101
12,P6,AgamP4_2R,48714697,.,C,A,115,PASS,"DP=45133;AF=0.002016;SB=0;DP4=0,44988,0,91",45133,0.002016,0,044988091
13,P6,AgamP4_2R,48714700,.,C,A,219,PASS,"DP=45133;AF=0.001795;SB=0;DP4=0,45014,0,81",45133,0.001795,0,045014081


In [75]:
re.split('/|_','variants/lofreq/P14_mapped_sorted.vcf')[2]

'P14'

In [77]:
pd.concat(df_list)

Unnamed: 0,samples,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,DP,AF,SB,DP4
0,P14,AgamP4_2R,48714441,.,G,T,68,PASS,"DP=52478;AF=0.000705;SB=0;DP4=52424,0,37,0",52478,0.000705,0,524240370
1,P14,AgamP4_2R,48714444,.,G,T,137,PASS,"DP=52478;AF=0.000972;SB=0;DP4=52423,0,51,0",52478,0.000972,0,524230510
2,P14,AgamP4_2R,48714445,.,C,T,152,PASS,"DP=52478;AF=0.001181;SB=0;DP4=52410,0,62,0",52478,0.001181,0,524100620
3,P14,AgamP4_2R,48714446,.,G,A,67,PASS,"DP=52478;AF=0.001410;SB=0;DP4=52317,0,74,0",52478,0.001410,0,523170740
4,P14,AgamP4_2R,48714446,.,G,T,61,PASS,"DP=52478;AF=0.001372;SB=0;DP4=52317,0,72,0",52478,0.001372,0,523170720
...,...,...,...,...,...,...,...,...,...,...,...,...,...
15,P6,AgamP4_2R,48714685,.,T,G,72,PASS,"DP=52478;AF=0.001181;SB=0;DP4=0,52397,0,62",52478,0.001181,0,052397062
16,P6,AgamP4_2R,48714692,.,C,A,290,PASS,"DP=52478;AF=0.002306;SB=0;DP4=0,52338,0,121",52478,0.002306,0,0523380121
17,P6,AgamP4_2R,48714697,.,C,A,73,PASS,"DP=52478;AF=0.001563;SB=0;DP4=0,52321,0,82",52478,0.001563,0,052321082
18,P6,AgamP4_2R,48714700,.,C,A,138,PASS,"DP=52478;AF=0.001296;SB=0;DP4=0,52373,0,68",52478,0.001296,0,052373068


In [18]:
with open('variants/lofreq/P1_vars.vcf','r') as file:
    lines = [line for line in file.readlines() if not line.startswith('##')]
lines

['#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n',
 'AgamP4_2R\t48714441\t.\tG\tT\t68\tPASS\tDP=52478;AF=0.000705;SB=0;DP4=52424,0,37,0\n',
 'AgamP4_2R\t48714444\t.\tG\tT\t137\tPASS\tDP=52478;AF=0.000972;SB=0;DP4=52423,0,51,0\n',
 'AgamP4_2R\t48714445\t.\tC\tT\t152\tPASS\tDP=52478;AF=0.001181;SB=0;DP4=52410,0,62,0\n',
 'AgamP4_2R\t48714446\t.\tG\tA\t67\tPASS\tDP=52478;AF=0.001410;SB=0;DP4=52317,0,74,0\n',
 'AgamP4_2R\t48714446\t.\tG\tT\t61\tPASS\tDP=52478;AF=0.001372;SB=0;DP4=52317,0,72,0\n',
 'AgamP4_2R\t48714451\t.\tC\tT\t85\tPASS\tDP=52478;AF=0.000838;SB=0;DP4=52427,0,44,0\n',
 'AgamP4_2R\t48714453\t.\tG\tA\t191\tPASS\tDP=52478;AF=0.001601;SB=0;DP4=52350,0,84,0\n',
 'AgamP4_2R\t48714458\t.\tA\tT\t254\tPASS\tDP=52478;AF=0.003868;SB=0;DP4=52118,0,203,0\n',
 'AgamP4_2R\t48714486\t.\tC\tA\t49314\tPASS\tDP=104706;AF=0.408448;SB=4;DP4=30919,30892,21521,21246\n',
 'AgamP4_2R\t48714527\t.\tA\tG\t2941\tPASS\tDP=104882;AF=0.008982;SB=24;DP4=52028,51621,427,515\n',
 'AgamP4_2R\t48714532\t.\tC

In [133]:
#lines = [line for line in var1 if not line.startswith('##')]
#lines     

In [19]:
vcf_table1 = pd.read_table(io.StringIO(''.join(lines)), sep='\t').rename(columns={'#CHROM': 'CHROM'})
vcf_table1

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO
0,AgamP4_2R,48714441,.,G,T,68,PASS,"DP=52478;AF=0.000705;SB=0;DP4=52424,0,37,0"
1,AgamP4_2R,48714444,.,G,T,137,PASS,"DP=52478;AF=0.000972;SB=0;DP4=52423,0,51,0"
2,AgamP4_2R,48714445,.,C,T,152,PASS,"DP=52478;AF=0.001181;SB=0;DP4=52410,0,62,0"
3,AgamP4_2R,48714446,.,G,A,67,PASS,"DP=52478;AF=0.001410;SB=0;DP4=52317,0,74,0"
4,AgamP4_2R,48714446,.,G,T,61,PASS,"DP=52478;AF=0.001372;SB=0;DP4=52317,0,72,0"
5,AgamP4_2R,48714451,.,C,T,85,PASS,"DP=52478;AF=0.000838;SB=0;DP4=52427,0,44,0"
6,AgamP4_2R,48714453,.,G,A,191,PASS,"DP=52478;AF=0.001601;SB=0;DP4=52350,0,84,0"
7,AgamP4_2R,48714458,.,A,T,254,PASS,"DP=52478;AF=0.003868;SB=0;DP4=52118,0,203,0"
8,AgamP4_2R,48714486,.,C,A,49314,PASS,"DP=104706;AF=0.408448;SB=4;DP4=30919,30892,215..."
9,AgamP4_2R,48714527,.,A,G,2941,PASS,"DP=104882;AF=0.008982;SB=24;DP4=52028,51621,42..."


In [141]:
lines_tab = [lin for lin in vcf_table1.INFO]
lines_tab

['DP=52478;AF=0.000705;SB=0;DP4=52424,0,37,0',
 'DP=52478;AF=0.000972;SB=0;DP4=52423,0,51,0',
 'DP=52478;AF=0.001181;SB=0;DP4=52410,0,62,0',
 'DP=52478;AF=0.001410;SB=0;DP4=52317,0,74,0',
 'DP=52478;AF=0.001372;SB=0;DP4=52317,0,72,0',
 'DP=52478;AF=0.000838;SB=0;DP4=52427,0,44,0',
 'DP=52478;AF=0.001601;SB=0;DP4=52350,0,84,0',
 'DP=52478;AF=0.003868;SB=0;DP4=52118,0,203,0',
 'DP=104706;AF=0.408448;SB=4;DP4=30919,30892,21521,21246',
 'DP=104882;AF=0.008982;SB=24;DP4=52028,51621,427,515',
 'DP=104898;AF=0.017007;SB=4;DP4=51537,51347,875,909',
 'DP=104918;AF=0.010170;SB=84;DP4=52003,51742,438,629',
 'DP=104929;AF=0.001925;SB=99;DP4=52275,52386,146,56',
 'DP=104929;AF=0.002020;SB=24;DP4=52336,52279,85,127',
 'DP=104929;AF=0.001935;SB=0;DP4=52293,52315,102,101',
 'DP=52478;AF=0.001181;SB=0;DP4=0,52397,0,62',
 'DP=52478;AF=0.002306;SB=0;DP4=0,52338,0,121',
 'DP=52478;AF=0.001563;SB=0;DP4=0,52321,0,82',
 'DP=52478;AF=0.001296;SB=0;DP4=0,52373,0,68',
 'DP=52478;AF=0.001601;SB=0;DP4=0,52364,0,8

In [144]:
res1 = re.split(';|=',lines_tab[0])
res1

['DP', '52478', 'AF', '0.000705', 'SB', '0', 'DP4', '52424,0,37,0']

In [35]:
text1, text_list = '', []
lines_tab = [lin for lin in vcf_table1.INFO]
for i in range(0,8,2):
    text1 += re.split(';|=',lines_tab[0])[i]+'\t'
text_list.append(text1[:-1]+'\n')
for line in lines_tab:
    text2=''
    for i in range(1,9,2):
        text2 += re.split(';|=',line)[i]+'\t'
    text_list.append(text2[:-1]+'\n')

In [38]:
text_list

['DP\tAF\tSB\tDP4\n',
 '52478\t0.000705\t0\t52424,0,37,0\n',
 '52478\t0.000972\t0\t52423,0,51,0\n',
 '52478\t0.001181\t0\t52410,0,62,0\n',
 '52478\t0.001410\t0\t52317,0,74,0\n',
 '52478\t0.001372\t0\t52317,0,72,0\n',
 '52478\t0.000838\t0\t52427,0,44,0\n',
 '52478\t0.001601\t0\t52350,0,84,0\n',
 '52478\t0.003868\t0\t52118,0,203,0\n',
 '104706\t0.408448\t4\t30919,30892,21521,21246\n',
 '104882\t0.008982\t24\t52028,51621,427,515\n',
 '104898\t0.017007\t4\t51537,51347,875,909\n',
 '104918\t0.010170\t84\t52003,51742,438,629\n',
 '104929\t0.001925\t99\t52275,52386,146,56\n',
 '104929\t0.002020\t24\t52336,52279,85,127\n',
 '104929\t0.001935\t0\t52293,52315,102,101\n',
 '52478\t0.001181\t0\t0,52397,0,62\n',
 '52478\t0.002306\t0\t0,52338,0,121\n',
 '52478\t0.001563\t0\t0,52321,0,82\n',
 '52478\t0.001296\t0\t0,52373,0,68\n',
 '52478\t0.001601\t0\t0,52364,0,84\n']

In [32]:
text_list[0][:-1]#+'\n'

'DP\tAF\tSB\tDP4'

In [37]:
info_df = pd.read_table(io.StringIO(''.join(text_list)),sep='\t')
info_df

Unnamed: 0,DP,AF,SB,DP4
0,52478,0.000705,0,524240370
1,52478,0.000972,0,524230510
2,52478,0.001181,0,524100620
3,52478,0.00141,0,523170740
4,52478,0.001372,0,523170720
5,52478,0.000838,0,524270440
6,52478,0.001601,0,523500840
7,52478,0.003868,0,5211802030
8,104706,0.408448,4,30919308922152121246
9,104882,0.008982,24,5202851621427515


In [39]:
pd.concat([vcf_table1,info_df],axis=1)

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,DP,AF,SB,DP4
0,AgamP4_2R,48714441,.,G,T,68,PASS,"DP=52478;AF=0.000705;SB=0;DP4=52424,0,37,0",52478,0.000705,0,524240370
1,AgamP4_2R,48714444,.,G,T,137,PASS,"DP=52478;AF=0.000972;SB=0;DP4=52423,0,51,0",52478,0.000972,0,524230510
2,AgamP4_2R,48714445,.,C,T,152,PASS,"DP=52478;AF=0.001181;SB=0;DP4=52410,0,62,0",52478,0.001181,0,524100620
3,AgamP4_2R,48714446,.,G,A,67,PASS,"DP=52478;AF=0.001410;SB=0;DP4=52317,0,74,0",52478,0.00141,0,523170740
4,AgamP4_2R,48714446,.,G,T,61,PASS,"DP=52478;AF=0.001372;SB=0;DP4=52317,0,72,0",52478,0.001372,0,523170720
5,AgamP4_2R,48714451,.,C,T,85,PASS,"DP=52478;AF=0.000838;SB=0;DP4=52427,0,44,0",52478,0.000838,0,524270440
6,AgamP4_2R,48714453,.,G,A,191,PASS,"DP=52478;AF=0.001601;SB=0;DP4=52350,0,84,0",52478,0.001601,0,523500840
7,AgamP4_2R,48714458,.,A,T,254,PASS,"DP=52478;AF=0.003868;SB=0;DP4=52118,0,203,0",52478,0.003868,0,5211802030
8,AgamP4_2R,48714486,.,C,A,49314,PASS,"DP=104706;AF=0.408448;SB=4;DP4=30919,30892,215...",104706,0.408448,4,30919308922152121246
9,AgamP4_2R,48714527,.,A,G,2941,PASS,"DP=104882;AF=0.008982;SB=24;DP4=52028,51621,42...",104882,0.008982,24,5202851621427515


In [163]:
res1[0]

'DP'

In [120]:
res = re.split('\t|\n',lines[1])
res[:-1]

['AgamP4_2R',
 '48714441',
 '.',
 'G',
 'T',
 '68',
 'PASS',
 'DP=52478;AF=0.000705;SB=0;DP4=52424,0,37,0']

In [129]:
re.split(';|=', res[-2])

['DP', '52478', 'AF', '0.000705', 'SB', '0', 'DP4', '52424,0,37,0']