In [2]:
#run in sch_man_nwinvasion-jupyter environment

import os
#import vcf
from tqdm import tqdm
import shutil
import allel
# import math
# import yaml
# import pandas as pd
# #import matplotlib.pyplot as plt
# import numpy as np
# from collections import defaultdict
# from scipy import stats
# import itertools
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

# Make the project root the working directory; every path used below is relative to it.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
os.chdir("/master/nplatt/sch_man_nwinvasion")

# Full

In [15]:
#read in vcf (the result is indexed by field name below, e.g. "samples", "calldata/GT")
phased_callset=allel.read_vcf('results/phasing/auto_beagle.vcf')

#sample names; used later to key the per-sample haplotypes
samples = phased_callset["samples"]

#get genotypes; indexed below as gt[variant][sample][haplotype]
gt=allel.GenotypeArray(phased_callset['calldata/GT'])

In [16]:
# Chromosome name of every variant.
chroms = phased_callset["variants/CHROM"]

# Reference allele of every variant.
ref_alleles = phased_callset["variants/REF"]

# Only the first ALT allele of each variant is kept.
alt_alleles = [alts[0] for alts in phased_callset["variants/ALT"]]

# alleles[i] == [ref, alt], so a genotype call (0 or 1) can be used directly as an index.
alleles = [[ref, alt] for ref, alt in zip(ref_alleles, alt_alleles)]

In [17]:
# for chrom in range(1,8):
#     chrom="SM_V7_{}".format(chrom)
    
# Build two haplotype strings per sample from the genotype calls.
# Result: haplotypes[sample] == [hap1, hap2], one character per variant, in VCF order.
#
# Characters are collected in lists and joined once at the end. The previous
# `haplotypes[sample][k] += nuc` re-copied the growing string at every variant,
# and `gt[i][j][k]` created a wrapper object for every single call.
gt_values = gt.values  # plain ndarray behind the GenotypeArray

hap_chars = {sample: ([], []) for sample in samples}

for i in tqdm(range(len(gt_values))):
    site_alleles = alleles[i]
    # .tolist() converts the whole row of calls to Python ints in one step
    for sample, (h1, h2) in zip(samples, gt_values[i].tolist()):
        # NOTE(review): a missing call (-1) would index the last allele (ALT),
        # exactly as before — assumes the phased VCF has no missing calls; confirm.
        chars_1, chars_2 = hap_chars[sample]
        chars_1.append(site_alleles[h1])
        chars_2.append(site_alleles[h2])

haplotypes = {
    sample: ["".join(chars_1), "".join(chars_2)]
    for sample, (chars_1, chars_2) in hap_chars.items()
}

100%|██████████| 475081/475081 [2:21:12<00:00, 56.07it/s]  


In [18]:
# Drop the Kenyan sample and the "_rep" replicate samples; names that are not
# present in the dict are silently ignored.
excluded_samples = ("ERR119614", "Sm.BR_PdV.1409_rep", "Sm.BR_PdV.1475_rep", "Sm.BR_PdV.2406_rep")
for excluded in excluded_samples:
    haplotypes.pop(excluded, None)

In [20]:
# Write both haplotypes of every sample to a single PHYLIP alignment.

# sample_key maps "<sample>_h1" / "<sample>_h2" to consecutive integers; the
# integer (as a string) is used as the sequence id in the file and is mapped
# back to the real name after the invariant-site filtering step.
sample_key = {}
for idx, sample in enumerate(haplotypes):
    sample_key["{}_h1".format(sample)] = 2 * idx
    sample_key["{}_h2".format(sample)] = 2 * idx + 1

# All first haplotypes are written before all second haplotypes.
hap_1s = [
    SeqRecord(Seq(haps[0]), id=str(sample_key["{}_h1".format(sample)]))
    for sample, haps in haplotypes.items()
]
hap_2s = [
    SeqRecord(Seq(haps[1]), id=str(sample_key["{}_h2".format(sample)]))
    for sample, haps in haplotypes.items()
]

# Last expression of the cell: the number of records written is displayed.
SeqIO.write(hap_1s + hap_2s, "results/phylonet/full.phy", "phylip")

302

In [None]:
#remove invariant sites
# NOTE(review): presumably ascbias.py reads the alignment given with -p and writes
# the variable-sites-only alignment to the -o path — confirm against the script's usage.
! python ~/sch_man_nwinvasion/bin/raxml_ascbias/ascbias.py -p results/phylonet/full.phy -o results/phylonet/variant.phy

In [22]:
#read in variable sites and update names.
# full.phy was written with integer ids; invert sample_key to map each id back
# to its "<sample>_h1" / "<sample>_h2" name.
rev_sample_key = {v: k for k, v in sample_key.items()}

var_haps={}
with open("results/phylonet/variant.phy", 'r') as f:
    next(f)  # skip the first (header) line
    for entry in f:
        # each remaining line is "<integer id>\t<sequence>"
        tmp_id, seq = entry.rstrip().split("\t")
        sample_id = rev_sample_key[int(tmp_id)]
        var_haps[sample_id] = seq

os.makedirs("results/phylonet/alignments", exist_ok=True)

num_samples=len(var_haps.keys())
n = 1000  # sites per alignment chunk

# Write one FASTA file per chunk, opened once in 'w' mode. The previous version
# appended ('a') one record at a time, so re-running the cell duplicated every
# sequence in every file.
# NOTE: files left over from an earlier run with more chunks are not removed.
longest = max((len(seq) for seq in var_haps.values()), default=0)

for i, start in enumerate(range(0, longest, n)):
    with open("results/phylonet/alignments/split_seq_{}.fas".format(i), 'w') as f:
        for sample, seq in var_haps.items():
            # a sequence shorter than `start` has no chunk i
            if start < len(seq):
                f.write(">{}\n{}\n".format(sample, seq[start:start + n]))

In [None]:
#fix names: replace every "." with "-" in the chunk files, in place
# NOTE(review): the substitution is applied to all lines, not only the ">" header
# lines — assumes the sequence lines never contain "."; confirm.
! sed -i 's/\./-/g' results/phylonet/alignments/split_seq*.fas 

In [None]:
%%bash

# Submit one raxml-ng job per alignment chunk through qsub.

# -p creates both levels and does not error when they already exist, so the
# cell can be re-run.
mkdir -p results/phylonet/raxml-ng/qlogs

CONDA="conda activate sch_man_nwinvasion-phylonet;"
QSUB="qsub -V -cwd -S /bin/bash -q all.q -j y -pe smp 6 "

# Iterate over the glob directly instead of parsing `ls` output.
for FAS in results/phylonet/alignments/split_seq_*.fas; do

    # An unmatched glob is left as the literal pattern; skip it.
    [ -e "$FAS" ] || continue

    SAMPLE=$(basename "$FAS" .fas)

    RAXML_CMD="raxml-ng \
        --all \
        --msa $FAS \
        --msa-format fasta \
        --outgroup ERR310938_h1,ERR310938_h2 \
        --threads 6 \
        --bs-trees 100 \
        --model GTR+G+ASC_LEWIS \
        --prefix results/phylonet/raxml-ng/$SAMPLE"

    # The job script is passed to qsub on stdin. $QSUB is intentionally
    # unquoted so that it splits into the command and its options.
    echo "$CONDA $RAXML_CMD" | $QSUB -N "$SAMPLE" -o "results/phylonet/raxml-ng/qlogs/$SAMPLE.log"
done


In [None]:
#now prep gene trees for phylonet
# NOTE(review): the raxml-ng cell of this section writes to results/phylonet/raxml-ng/,
# not gene_trees/ — confirm that this is the intended input glob.
cat results/phylonet/gene_trees/*support >results/phylonet/gene_trees.nwk

#collapse nodes matching 'i & b<=10' (i.e. bootstrap support <= 10, not < 10)
nw_ed  results/phylonet/gene_trees.nwk 'i & b<=10' o > results/phylonet/gene_trees_BS10.nwk

#root on margrebowiei
nw_reroot results/phylonet/gene_trees_BS10.nwk ERX284221 >results/phylonet/gene_trees_BS10_marg_root.nwk

#rename and remove "."s in name
# NOTE(review): short_IDs.nw / id2longname.map look like placeholder file names and
# the output is not redirected to a file — confirm the actual command that was run.
nw_rename short_IDs.nw id2longname.map

#may need to open file in figtree to remove bootstrap values


#then added the following text to the bottom to run phylonet
# BEGIN PHYLONET;

# InferNetwork_MPL (all) 4 -pl 48 -a <margrebowiei:ERX284221;cameroon:ERR103050;uganda:ERR119615,ERR997461;guadeloupe:ERR539847,ERR539848;puerto_rico:ERR046038;brazil:Sm-BR_PdV-1278-1,Sm-BR_PdV-2039-1,Sm-BR_PdV-2076-1,Sm-BR_PdV-2265-1,Sm-BR_PdV-2334-1,Sm-BR_PdV-2406-1,Sm-BR_PdV-2456-1,Sm-BR_PdV-2481-1,Sm-BR_PdV-2538-1,Sm-BR_PdV-2556-1;niger:Sm-NE_Di158-1,Sm-NE_Di186-1,Sm-NE_Di238-1,Sm-NE_Di297-1,Sm-NE_Di297-2,Sm-NE_Di68-2,Sm-NE_Na376-2,Sm-NE_Na381-1,Sm-NE_Na39-1,Sm-NE_Na40-1;senegal:ERR103049,Sm-SN_Nd115-1,Sm-SN_Nd22-1,Sm-SN_Nd24-1,Sm-SN_Nd34-1,Sm-SN_Nd43-1,Sm-SN_Nd47-1,Sm-SN_Nd50-1,Sm-SN_Nd54-1,Sm-SN_Nd90-1;tanzania:Sm-TZ_009-4-2,Sm-TZ_009-8-2,Sm-TZ_055-1-3,Sm-TZ_055-10-1,Sm-TZ_055-8-1,Sm-TZ_074N-1-2,Sm-TZ_074N-7-2,Sm-TZ_074N-8-1,Sm-TZ_086-1-1,Sm-TZ_134-1-1;rodhaini:ERR310938,Sro_female_1-1_CCATCCTC,Sro_female_1-2_CCGACAAC,Sro_female_2-1_CCTAATCC,Sro_female_2-2_CCTCTATC,Sro_male_1-1_ATCATTCC,Sro_male_1-2_ATTGGCTC,Sro_male_2-1_CAAGGAGC,Sro_male_2-2_CACCTTAC> mpl. ;

# END;

In [None]:
#NEXUS

BEGIN TREES;

Tree gt0=


END;

BEGIN PHYLONET;

InferNetwork_MPL geneTreeList 4 -a <> -pl 96 mpl. ;

END;

<uganda:ERR119615,ERR997461; tanzania:Sm.TZ_009.1.1,Sm.TZ_009.10.1,Sm.TZ_009.2.2,Sm.TZ_009.3.1,Sm.TZ_009.4.2,Sm.TZ_009.5.2,Sm.TZ_009.6.1,Sm.TZ_009.7.1,Sm.TZ_009.8.2,Sm.TZ_009.9.1,Sm.TZ_055.1.3,Sm.TZ_055.10.1,Sm.TZ_055.2.1,Sm.TZ_055.3.2,Sm.TZ_055.5.1,Sm.TZ_055.6.1,Sm.TZ_055.7.1,Sm.TZ_055.8.1,Sm.TZ_074N.1.2,Sm.TZ_074N.10.2,Sm.TZ_074N.2.2,Sm.TZ_074N.3.2,Sm.TZ_074N.4.1,Sm.TZ_074N.6.3,Sm.TZ_074N.7.2,Sm.TZ_074N.8.1,Sm.TZ_074N.9.1,Sm.TZ_077.2.1,Sm.TZ_077.3.1,Sm.TZ_077.4.2,Sm.TZ_077.5.1,Sm.TZ_077.6.1,Sm.TZ_077.7.3,Sm.TZ_077.8.1,Sm.TZ_077.9.2,Sm.TZ_086.1.1,Sm.TZ_086.2.3,Sm.TZ_086.3.1,Sm.TZ_086.4.2,Sm.TZ_086.5.1,Sm.TZ_086.6.1,Sm.TZ_086.7.1,Sm.TZ_086.8.1,Sm.TZ_134.1.1,Sm.TZ_134.2.2,Sm.TZ_134.4.1,Sm.TZ_134.5.1,Sm.TZ_134.6.1,Sm.TZ_141.1.1,Sm.TZ_141.3.1,Sm.TZ_141.4.2,Sm.TZ_141.5.3,Sm.TZ_141.6.1,Sm.TZ_141.7.1,Sm.TZ_141.8.1; senegal:ERR103049,Sm.SN_Nd103.1,Sm.SN_Nd109.1,Sm.SN_Nd114.1,Sm.SN_Nd115.1,Sm.SN_Nd18.1,Sm.SN_Nd22.1,Sm.SN_Nd24.1,Sm.SN_Nd25.1,Sm.SN_Nd34.1,Sm.SN_Nd43.1,Sm.SN_Nd47.1,Sm.SN_Nd5.1,Sm.SN_Nd5.2,Sm.SN_Nd50.1,Sm.SN_Nd54.1,Sm.SN_Nd56.1,Sm.SN_Nd77.1,Sm.SN_Nd79.1,Sm.SN_Nd9.1,Sm.SN_Nd90.1,Sm.SN_Te26.1,Sm.SN_Te3.1,Sm.SN_Te49.1,Sm.SN_Te55.1,Sm.SN_Te68.1; rodhaini:ERR310938,Sro_female_1.1_CCATCCTC,Sro_female_1.2_CCGACAAC,Sro_female_2.1_CCTAATCC,Sro_female_2.2_CCTCTATC,Sro_male_1.1_ATCATTCC,Sro_male_1.2_ATTGGCTC,Sro_male_2.1_CAAGGAGC,Sro_male_2.2_CACCTTAC; niger:Sm.NE_Di158.1,Sm.NE_Di186.1,Sm.NE_Di238.1,Sm.NE_Di297.1,Sm.NE_Di297.2,Sm.NE_Di68.2,Sm.NE_Na376.2,Sm.NE_Na381.1,Sm.NE_Na39.1,Sm.NE_Na40.1; margrebowiei:ERX284221; cameroon:ERR103050; 
brazil:Sm.BR_PdV.0447.1,Sm.BR_PdV.1039.1,Sm.BR_PdV.1079.1,Sm.BR_PdV.1094.1,Sm.BR_PdV.1103.1,Sm.BR_PdV.1127.1,Sm.BR_PdV.1278.1,Sm.BR_PdV.1340.1,Sm.BR_PdV.1340.2,Sm.BR_PdV.1371.1,Sm.BR_PdV.1404.1,Sm.BR_PdV.1409.1,Sm.BR_PdV.1418.1,Sm.BR_PdV.1475.1,Sm.BR_PdV.1489.1,Sm.BR_PdV.2039.1,Sm.BR_PdV.2039.2,Sm.BR_PdV.2072.1,Sm.BR_PdV.2074.1,Sm.BR_PdV.2076.1,Sm.BR_PdV.2133.1,Sm.BR_PdV.2147.1,Sm.BR_PdV.2189.1,Sm.BR_PdV.2196.2,Sm.BR_PdV.2225.1,Sm.BR_PdV.2227.1,Sm.BR_PdV.2265.1,Sm.BR_PdV.2290.1,Sm.BR_PdV.2300.1,Sm.BR_PdV.2334.1,Sm.BR_PdV.2368.1,Sm.BR_PdV.2406.1,Sm.BR_PdV.2422.1,Sm.BR_PdV.2450.1,Sm.BR_PdV.2456.1,Sm.BR_PdV.2481.1,Sm.BR_PdV.2489.1,Sm.BR_PdV.2508.1,Sm.BR_PdV.2516.2,Sm.BR_PdV.2530.1,Sm.BR_PdV.2538.1,Sm.BR_PdV.2546.1,Sm.BR_PdV.2556.1,Sm.BR_PdV.2577.1,Sm.BR_PdV.4293.2; guadeloupe:ERR539847,ERR539848; puerto_rico:ERR046038>

# reduced biallelic

In [3]:
# Reduced sample set: sample name -> population label.
# Only the keys are used in this cell (they are written to the keep list below).
samples = { "Sm.BR_PdV.1278.1":        "brazil", 
            "Sm.BR_PdV.2039.1":        "brazil",
            "Sm.BR_PdV.2076.1":        "brazil",
            "Sm.BR_PdV.2265.1":        "brazil",
            "Sm.BR_PdV.2334.1":        "brazil",
            "Sm.BR_PdV.2406.1":        "brazil",
            "Sm.BR_PdV.2456.1":        "brazil",
            "Sm.BR_PdV.2481.1":        "brazil",
            "Sm.BR_PdV.2538.1":        "brazil",
            "Sm.BR_PdV.2556.1":        "brazil",
            "ERR103050":               "cameroon",
            "ERR046038":               "puerto_rico",
            "ERR539847":               "guadeloupe",
            "ERR539848":               "guadeloupe",
            "ERX284221":               "margrebowiei",
            "Sm.NE_Di158.1":           "niger",
            "Sm.NE_Di186.1":           "niger",
            "Sm.NE_Di238.1":           "niger",
            "Sm.NE_Di297.1":           "niger",
            "Sm.NE_Di297.2":           "niger",
            "Sm.NE_Di68.2":            "niger",
            "Sm.NE_Na376.2":           "niger",
            "Sm.NE_Na381.1":           "niger",
            "Sm.NE_Na39.1":            "niger",
            "Sm.NE_Na40.1":            "niger",
            "ERR310938":               "rodhaini",
            "Sro_female_1.1_CCATCCTC": "rodhaini",
            "Sro_female_1.2_CCGACAAC": "rodhaini",
            "Sro_female_2.1_CCTAATCC": "rodhaini",
            "Sro_female_2.2_CCTCTATC": "rodhaini",
            "Sro_male_1.1_ATCATTCC":   "rodhaini",
            "Sro_male_1.2_ATTGGCTC":   "rodhaini",
            "Sro_male_2.1_CAAGGAGC":   "rodhaini",
            "Sro_male_2.2_CACCTTAC":   "rodhaini",
            "ERR103049":               "senegal",
            "Sm.SN_Nd115.1":           "senegal",
            "Sm.SN_Nd22.1":            "senegal",
            "Sm.SN_Nd24.1":            "senegal",
            "Sm.SN_Nd34.1":            "senegal",
            "Sm.SN_Nd43.1":            "senegal",
            "Sm.SN_Nd47.1":            "senegal",
            "Sm.SN_Nd50.1":            "senegal",
            "Sm.SN_Nd54.1":            "senegal",
            "Sm.SN_Nd90.1":            "senegal",
            "Sm.TZ_009.4.2":           "tanzania",
            "Sm.TZ_009.8.2":           "tanzania",
            "Sm.TZ_055.1.3":           "tanzania",
            "Sm.TZ_055.10.1":          "tanzania",
            "Sm.TZ_055.8.1":           "tanzania",
            "Sm.TZ_074N.1.2":          "tanzania",
            "Sm.TZ_074N.7.2":          "tanzania",
            "Sm.TZ_074N.8.1":          "tanzania",
            "Sm.TZ_086.1.1":           "tanzania",
            "Sm.TZ_134.1.1":           "tanzania",
            "ERR119615":               "uganda",
            "ERR997461":               "uganda" }

 
# One sample name per line; this file is passed to vcftools --keep.
with open("results/phylonet/random_samples.list", 'w') as f:
    f.writelines("{}\n".format(name) for name in samples)

In [135]:
%%bash

# All intermediate files of this pipeline live in the same directory.
OUT=results/phylonet

# 1) keep only sites passing the --maf 0.05 filter (all samples still present)
vcftools \
    --vcf results/phasing/auto_beagle.vcf \
    --maf 0.05 \
    --recode \
    --recode-INFO-all \
    --stdout \
    >"$OUT/auto_beagle_maf05.vcf"
#After filtering, kept 98715 out of a possible 475081 Sites

# 2) create vcf with only the samples in the keep list
vcftools \
    --vcf "$OUT/auto_beagle_maf05.vcf" \
    --keep "$OUT/random_samples.list" \
    --recode \
    --recode-INFO-all \
    --stdout \
    >"$OUT/random_auto_beagle_maf05.vcf"

# 3) LD pruning; the list read in step 4 is <out prefix>.prune.out
plink \
    --vcf "$OUT/random_auto_beagle_maf05.vcf" \
    --double-id \
    --allow-extra-chr \
    --indep-pairwise 25 5 0.20 \
    --out "$OUT/random_auto_beagle_maf05"
#Pruning complete.  89154 of 98715 variants removed.

# 4) drop the variants listed in the prune.out file
vcftools \
    --vcf "$OUT/random_auto_beagle_maf05.vcf" \
    --exclude "$OUT/random_auto_beagle_maf05.prune.out" \
    --recode \
    --recode-INFO-all \
    --stdout \
    >"$OUT/random_auto_beagle_maf05_ld.vcf"
#After filtering, kept 55 out of 55 Individuals
#Outputting VCF file...
#After filtering, kept 9561 out of a possible 98715 Sites



VCFtools - 0.1.17
(C) Adam Auton and Anthony Marcketta 2009

Parameters as interpreted:
	--vcf results/phasing/auto_beagle.vcf
	--keep results/phylonet/random_samples/random_samples.list
	--recode-INFO-all
	--recode
	--stdout

Keeping individuals in 'keep' list
After filtering, kept 55 out of 155 Individuals
Outputting VCF file...
After filtering, kept 475081 out of a possible 475081 Sites
Run Time = 27.00 seconds


In [None]:
#create phylip
# NOTE(review): -b presumably makes vcf2phylip also write a binary NEXUS matrix (the
# PhyloNet input used further down is named *.min4.bin.PHYLONET.nexus) — confirm.
! bin/vcf2phylip/vcf2phylip.py -i results/phylonet/random_auto_beagle_maf05_ld.vcf -b

In [None]:
### fix header to say:
#NEXUS

BEGIN DATA;
        DIMENSIONS NTAX=55 NCHAR=9561;
        FORMAT DATATYPE=DNA SYMBOLS="012" MISSING=? GAP=- ;
MATRIX

<...>

BEGIN PHYLONET;

    MLE_BiMarkers -pseudo -pl 96 -op -mnr 10 -mno 20 -mf 50 -diploid -mr 4 -sd 987654321 -tm <margrebowiei:ERR310938; rodhaini:Sro_female_1.1_CCATCCTC,Sro_female_1.2_CCGACAAC,Sro_female_2.1_CCTAATCC,Sro_female_2.2_CCTCTATC,Sro_male_1.1_ATCATTCC,Sro_male_1.2_ATTGGCTC,Sro_male_2.1_CAAGGAGC,Sro_male_2.2_CACCTTAC; puerto_rico:ERR046038; guadeloupe:ERR539847,ERR539848; brazil:Sm.BR_PdV.1278.1,Sm.BR_PdV.2039.1,Sm.BR_PdV.2076.1,Sm.BR_PdV.2265.1,Sm.BR_PdV.2334.1,Sm.BR_PdV.2406.1,Sm.BR_PdV.2456.1,Sm.BR_PdV.2481.1,Sm.BR_PdV.2538.1,Sm.BR_PdV.2556.1; cameroon:ERR103050; niger:Sm.NE_Di158.1,Sm.NE_Di186.1,Sm.NE_Di238.1,Sm.NE_Di297.1,Sm.NE_Di297.2,Sm.NE_Di68.2,Sm.NE_Na376.2,Sm.NE_Na381.1,Sm.NE_Na39.1,Sm.NE_Na40.1; senegal:ERR103049,Sm.SN_Nd115.1,Sm.SN_Nd22.1,Sm.SN_Nd24.1,Sm.SN_Nd34.1,Sm.SN_Nd43.1,Sm.SN_Nd47.1,Sm.SN_Nd50.1,Sm.SN_Nd54.1,Sm.SN_Nd90.1; uganda:ERR119615,ERR997461; tanzania:Sm.TZ_009.4.2,Sm.TZ_009.8.2,Sm.TZ_055.1.3,Sm.TZ_055.10.1,Sm.TZ_055.8.1,Sm.TZ_074N.1.2,Sm.TZ_074N.7.2,Sm.TZ_074N.8.1,Sm.TZ_086.1.1,Sm.TZ_134.1.1> mle_output;

END;



In [None]:
%%bash

# Stop if the directory is missing instead of running PhyloNet from the wrong
# place: the nexus file name below is relative to results/phylonet.
cd results/phylonet || exit 1

java -jar ~/sch_man_nwinvasion/bin/PhyloNet_3.8.2.jar random_auto_beagle_maf05_ld.min4.bin.PHYLONET.nexus

In [141]:
os.getcwd()

'/master/nplatt/sch_man_nwinvasion'

In [142]:
# Load the alignment that is split into 1K-SNP chunks in the next cell:
# seqs maps sequence id -> sequence string.

seqs = {}
with open("results/phylonet/random_samples/random_auto_beagle_variant.phy", 'r') as phy:
    next(phy)  # the first line is skipped
    for line in phy:
        # every remaining line must be exactly "<id>\t<sequence>"
        name, bases = line.rstrip().split("\t")
        seqs[name] = bases
        

In [147]:
# Split every sequence into windows of n_snps sites and write one FASTA file
# per window (random_split_seq_<window index>.fas).
os.makedirs("results/phylonet/random_alignments", exist_ok=True)

num_samples=len(seqs.keys())
n_snps = 1_000

# Each file is opened once in 'w' mode. The previous version appended ('a') one
# record at a time, so re-running the cell duplicated every sequence in every file.
# NOTE: files left over from an earlier run with more windows are not removed.
longest = max((len(seq) for seq in seqs.values()), default=0)

for i, start in enumerate(range(0, longest, n_snps)):
    with open("results/phylonet/random_alignments/random_split_seq_{}.fas".format(i), 'w') as f:
        for sample, seq in seqs.items():
            # a sequence shorter than `start` has no window i
            if start < len(seq):
                f.write(">{}\n{}\n".format(sample, seq[start:start + n_snps]))
            

In [None]:
%%bash

# Submit one raxml-ng job per alignment chunk through qsub.

CONDA="conda activate sch_man_nwinvasion-phylonet;"
QSUB="qsub -V -cwd -S /bin/bash -q all.q -j y -pe smp 8 "

# -p creates both levels and does not error when they already exist, so the
# cell can be re-run.
mkdir -p results/phylonet/random_raxml-ng/qlogs

# Iterate over the glob directly instead of parsing `ls` output.
for FAS in results/phylonet/random_alignments/random_split_seq_*.fas; do

    # An unmatched glob is left as the literal pattern; skip it.
    [ -e "$FAS" ] || continue

    SAMPLE=$(basename "$FAS" .fas)

    RAXML_CMD="raxml-ng \
        --all \
        --msa $FAS \
        --msa-format fasta \
        --outgroup ERR310938 \
        --threads 8 \
        --bs-trees 100 \
        --model GTGTR4+G+ASC_LEWIS \
        --prefix results/phylonet/random_raxml-ng/$SAMPLE"

    # The job script is passed to qsub on stdin. $QSUB is intentionally
    # unquoted so that it splits into the command and its options.
    # The job name is the chunk name prefixed with "r".
    echo "$CONDA $RAXML_CMD" | $QSUB -N "r$SAMPLE" -o "results/phylonet/random_raxml-ng/qlogs/$SAMPLE.log"
done


In [None]:
#now prep gene trees for phylonet
#collapse nodes matching 'i & b<=10' (i.e. bootstrap support <= 10)
# NOTE(review): "trees" / "treesBS10.trees" look like placeholder file names — confirm.
nw_ed  trees 'i & b<=10' o > treesBS10.trees
# NOTE(review): the bare sample ID below is not a command; it looks like a leftover
# note (ERX284221 is labelled margrebowiei in the samples dict) — confirm.
ERX284221

In [None]:
#NEXUS

BEGIN TREES;

Tree gt0=


END;

BEGIN PHYLONET;

    MLE_BiMarkers -pseudo -pl 96 -op -mnr 10 -mno 20 -mf 50 -diploid -mr 4 -sd 987654321 -tm <margrebowiei:ERR310938; rodhaini:Sro_female_1.1_CCATCCTC,Sro_female_1.2_CCGACAAC,Sro_female_2.1_CCTAATCC,Sro_female_2.2_CCTCTATC,Sro_male_1.1_ATCATTCC,Sro_male_1.2_ATTGGCTC,Sro_male_2.1_CAAGGAGC,Sro_male_2.2_CACCTTAC; puerto_rico:ERR046038; guadeloupe:ERR539847,ERR539848; brazil:Sm.BR_PdV.1278.1,Sm.BR_PdV.2039.1,Sm.BR_PdV.2076.1,Sm.BR_PdV.2265.1,Sm.BR_PdV.2334.1,Sm.BR_PdV.2406.1,Sm.BR_PdV.2456.1,Sm.BR_PdV.2481.1,Sm.BR_PdV.2538.1,Sm.BR_PdV.2556.1; cameroon:ERR103050; niger:Sm.NE_Di158.1,Sm.NE_Di186.1,Sm.NE_Di238.1,Sm.NE_Di297.1,Sm.NE_Di297.2,Sm.NE_Di68.2,Sm.NE_Na376.2,Sm.NE_Na381.1,Sm.NE_Na39.1,Sm.NE_Na40.1; senegal:ERR103049,Sm.SN_Nd115.1,Sm.SN_Nd22.1,Sm.SN_Nd24.1,Sm.SN_Nd34.1,Sm.SN_Nd43.1,Sm.SN_Nd47.1,Sm.SN_Nd50.1,Sm.SN_Nd54.1,Sm.SN_Nd90.1; uganda:ERR119615,ERR997461; tanzania:Sm.TZ_009.4.2,Sm.TZ_009.8.2,Sm.TZ_055.1.3,Sm.TZ_055.10.1,Sm.TZ_055.8.1,Sm.TZ_074N.1.2,Sm.TZ_074N.7.2,Sm.TZ_074N.8.1,Sm.TZ_086.1.1,Sm.TZ_134.1.1> mle_output;

END;



# reduced iupac

In [3]:
# Reduced sample set: sample name -> population label.
# Only the keys are used in this cell (they are written to the keep list below).
samples = { "Sm.BR_PdV.1278.1":        "brazil", 
            "Sm.BR_PdV.2039.1":        "brazil",
            "Sm.BR_PdV.2076.1":        "brazil",
            "Sm.BR_PdV.2265.1":        "brazil",
            "Sm.BR_PdV.2334.1":        "brazil",
            "Sm.BR_PdV.2406.1":        "brazil",
            "Sm.BR_PdV.2456.1":        "brazil",
            "Sm.BR_PdV.2481.1":        "brazil",
            "Sm.BR_PdV.2538.1":        "brazil",
            "Sm.BR_PdV.2556.1":        "brazil",
            "ERR103050":               "cameroon",
            "ERR046038":               "puerto_rico",
            "ERR539847":               "guadeloupe",
            "ERR539848":               "guadeloupe",
            "ERX284221":               "margrebowiei",
            "Sm.NE_Di158.1":           "niger",
            "Sm.NE_Di186.1":           "niger",
            "Sm.NE_Di238.1":           "niger",
            "Sm.NE_Di297.1":           "niger",
            "Sm.NE_Di297.2":           "niger",
            "Sm.NE_Di68.2":            "niger",
            "Sm.NE_Na376.2":           "niger",
            "Sm.NE_Na381.1":           "niger",
            "Sm.NE_Na39.1":            "niger",
            "Sm.NE_Na40.1":            "niger",
            "ERR310938":               "rodhaini",
            "Sro_female_1.1_CCATCCTC": "rodhaini",
            "Sro_female_1.2_CCGACAAC": "rodhaini",
            "Sro_female_2.1_CCTAATCC": "rodhaini",
            "Sro_female_2.2_CCTCTATC": "rodhaini",
            "Sro_male_1.1_ATCATTCC":   "rodhaini",
            "Sro_male_1.2_ATTGGCTC":   "rodhaini",
            "Sro_male_2.1_CAAGGAGC":   "rodhaini",
            "Sro_male_2.2_CACCTTAC":   "rodhaini",
            "ERR103049":               "senegal",
            "Sm.SN_Nd115.1":           "senegal",
            "Sm.SN_Nd22.1":            "senegal",
            "Sm.SN_Nd24.1":            "senegal",
            "Sm.SN_Nd34.1":            "senegal",
            "Sm.SN_Nd43.1":            "senegal",
            "Sm.SN_Nd47.1":            "senegal",
            "Sm.SN_Nd50.1":            "senegal",
            "Sm.SN_Nd54.1":            "senegal",
            "Sm.SN_Nd90.1":            "senegal",
            "Sm.TZ_009.4.2":           "tanzania",
            "Sm.TZ_009.8.2":           "tanzania",
            "Sm.TZ_055.1.3":           "tanzania",
            "Sm.TZ_055.10.1":          "tanzania",
            "Sm.TZ_055.8.1":           "tanzania",
            "Sm.TZ_074N.1.2":          "tanzania",
            "Sm.TZ_074N.7.2":          "tanzania",
            "Sm.TZ_074N.8.1":          "tanzania",
            "Sm.TZ_086.1.1":           "tanzania",
            "Sm.TZ_134.1.1":           "tanzania",
            "ERR119615":               "uganda",
            "ERR997461":               "uganda" }

 
# One sample name per line; this file is passed to vcftools --keep.
with open("results/phylonet/random_samples.list", 'w') as f:
    f.writelines("{}\n".format(name) for name in samples)

In [None]:
%%bash

# The output directory is otherwise only created by the later raxml-ng cell;
# create it here so the redirect below cannot fail on a fresh run.
mkdir -p results/phylonet/gene_trees

#create vcf with only these samples
vcftools \
    --vcf results/variant_filtration/smv7_ex_autosomes.vcf \
    --keep results/phylonet/random_samples.list \
    --recode \
    --recode-INFO-all \
    --stdout \
    >results/phylonet/gene_trees/random_auto.vcf
#After filtering, kept 56 out of 156 Individuals
#Outputting VCF file...
#After filtering, kept 475081 out of a possible 475081 Sites



In [None]:
#create phylip
# NOTE(review): the next command reads random_auto.min4.phy, so vcf2phylip presumably
# writes that file next to the input VCF — confirm.
! bin/vcf2phylip/vcf2phylip.py -i results/phylonet/gene_trees/random_auto.vcf 

#remove invariant
# NOTE(review): presumably -p is the input alignment and -o the variable-sites-only
# output — confirm against the script's usage.
! python ~/sch_man_nwinvasion/bin/raxml_ascbias/ascbias.py -p results/phylonet/gene_trees/random_auto.min4.phy  -o results/phylonet/gene_trees/random_auto_variant.phy 

In [7]:
# Load the alignment that is split into 1K-SNP chunks in the next cell:
# seqs maps sequence id -> sequence string.

seqs = {}
with open("results/phylonet/gene_trees/random_auto_variant.phy", 'r') as phy:
    next(phy)  # the first line is skipped
    for line in phy:
        # every remaining line must be exactly "<id>\t<sequence>"
        name, bases = line.rstrip().split("\t")
        seqs[name] = bases
        

In [10]:
# Split every sequence into windows of n_snps sites and write one FASTA file
# per window (random_split_seq_<window index>.fas); "." in a sample name is
# written as "_" in the FASTA header.
num_samples=len(seqs.keys())
n_snps = 1_000

# Make sure the output directory exists before writing into it.
os.makedirs("results/phylonet/gene_trees", exist_ok=True)

# Each file is opened once in 'w' mode. The previous version appended ('a') one
# record at a time, so re-running the cell duplicated every sequence in every file.
# NOTE: files left over from an earlier run with more windows are not removed.
longest = max((len(seq) for seq in seqs.values()), default=0)

for i, start in enumerate(range(0, longest, n_snps)):
    with open("results/phylonet/gene_trees/random_split_seq_{}.fas".format(i), 'w') as f:
        for sample, seq in seqs.items():
            # a sequence shorter than `start` has no window i
            if start < len(seq):
                f.write(">{}\n{}\n".format(sample.replace(".", "_"), seq[start:start + n_snps]))
            

In [None]:
%%bash

# Submit one raxml-ng job per alignment chunk through qsub.

CONDA="conda activate sch_man_nwinvasion-phylonet;"
QSUB="qsub -V -cwd -S /bin/bash -q all.q -j y -pe smp 4 "

# gene_trees/ already holds the input alignments by this point; -p creates what
# is missing and does not error on what already exists.
mkdir -p results/phylonet/gene_trees/qlogs

# Iterate over the glob directly instead of parsing `ls` output.
for FAS in results/phylonet/gene_trees/random_split_seq_*.fas; do

    # An unmatched glob is left as the literal pattern; skip it.
    [ -e "$FAS" ] || continue

    SAMPLE=$(basename "$FAS" .fas)

    RAXML_CMD="raxml-ng \
        --all \
        --msa $FAS \
        --msa-format fasta \
        --outgroup ERX284221 \
        --threads 4 \
        --bs-trees 100 \
        --model GTGTR4+G+ASC_LEWIS \
        --prefix results/phylonet/gene_trees/$SAMPLE"

    # The job script is passed to qsub on stdin. $QSUB is intentionally
    # unquoted so that it splits into the command and its options.
    echo "$CONDA $RAXML_CMD" | $QSUB -N "$SAMPLE" -o "results/phylonet/gene_trees/qlogs/$SAMPLE.log"
done


In [None]:
#now prep gene trees for phylonet
#concatenate every raxml-ng *support tree file into a single newick file
cat results/phylonet/gene_trees/*support >results/phylonet/gene_trees.nwk

#collapse nodes matching 'i & b<=10' (i.e. bootstrap support <= 10, not < 10)
nw_ed  results/phylonet/gene_trees.nwk 'i & b<=10' o > results/phylonet/gene_trees_BS10.nwk

#root on margrebowiei
nw_reroot results/phylonet/gene_trees_BS10.nwk ERX284221 >results/phylonet/gene_trees_BS10_marg_root.nwk

#rename and remove "."s in name
# NOTE(review): short_IDs.nw / id2longname.map look like placeholder file names and
# the output is not redirected to a file — confirm the actual command that was run.
nw_rename short_IDs.nw id2longname.map

#may need to open file in figtree to remove bootstrap values


#then added the following text to the bottom to run phylonet
# BEGIN PHYLONET;

# InferNetwork_MPL (all) 4 -pl 48 -a <margrebowiei:ERX284221;cameroon:ERR103050;uganda:ERR119615,ERR997461;guadeloupe:ERR539847,ERR539848;puerto_rico:ERR046038;brazil:Sm-BR_PdV-1278-1,Sm-BR_PdV-2039-1,Sm-BR_PdV-2076-1,Sm-BR_PdV-2265-1,Sm-BR_PdV-2334-1,Sm-BR_PdV-2406-1,Sm-BR_PdV-2456-1,Sm-BR_PdV-2481-1,Sm-BR_PdV-2538-1,Sm-BR_PdV-2556-1;niger:Sm-NE_Di158-1,Sm-NE_Di186-1,Sm-NE_Di238-1,Sm-NE_Di297-1,Sm-NE_Di297-2,Sm-NE_Di68-2,Sm-NE_Na376-2,Sm-NE_Na381-1,Sm-NE_Na39-1,Sm-NE_Na40-1;senegal:ERR103049,Sm-SN_Nd115-1,Sm-SN_Nd22-1,Sm-SN_Nd24-1,Sm-SN_Nd34-1,Sm-SN_Nd43-1,Sm-SN_Nd47-1,Sm-SN_Nd50-1,Sm-SN_Nd54-1,Sm-SN_Nd90-1;tanzania:Sm-TZ_009-4-2,Sm-TZ_009-8-2,Sm-TZ_055-1-3,Sm-TZ_055-10-1,Sm-TZ_055-8-1,Sm-TZ_074N-1-2,Sm-TZ_074N-7-2,Sm-TZ_074N-8-1,Sm-TZ_086-1-1,Sm-TZ_134-1-1;rodhaini:ERR310938,Sro_female_1-1_CCATCCTC,Sro_female_1-2_CCGACAAC,Sro_female_2-1_CCTAATCC,Sro_female_2-2_CCTCTATC,Sro_male_1-1_ATCATTCC,Sro_male_1-2_ATTGGCTC,Sro_male_2-1_CAAGGAGC,Sro_male_2-2_CACCTTAC> mpl. ;

# END;

!java -jar ~/sch_man_nwinvasion/bin/PhyloNet_3.8.2.jar PHYLONET_gene_trees_BS10_marg_root_renamed.nex

# all haplotypes

In [3]:
# Reduced sample set: sample name -> population label.
# Only the keys are used in this cell (they are written to the keep list below).
samples = { "Sm.BR_PdV.1278.1":        "brazil", 
            "Sm.BR_PdV.2039.1":        "brazil",
            "Sm.BR_PdV.2076.1":        "brazil",
            "Sm.BR_PdV.2265.1":        "brazil",
            "Sm.BR_PdV.2334.1":        "brazil",
            "Sm.BR_PdV.2406.1":        "brazil",
            "Sm.BR_PdV.2456.1":        "brazil",
            "Sm.BR_PdV.2481.1":        "brazil",
            "Sm.BR_PdV.2538.1":        "brazil",
            "Sm.BR_PdV.2556.1":        "brazil",
            "ERR103050":               "cameroon",
            "ERR046038":               "puerto_rico",
            "ERR539847":               "guadeloupe",
            "ERR539848":               "guadeloupe",
            "ERX284221":               "margrebowiei",
            "Sm.NE_Di158.1":           "niger",
            "Sm.NE_Di186.1":           "niger",
            "Sm.NE_Di238.1":           "niger",
            "Sm.NE_Di297.1":           "niger",
            "Sm.NE_Di297.2":           "niger",
            "Sm.NE_Di68.2":            "niger",
            "Sm.NE_Na376.2":           "niger",
            "Sm.NE_Na381.1":           "niger",
            "Sm.NE_Na39.1":            "niger",
            "Sm.NE_Na40.1":            "niger",
            "ERR310938":               "rodhaini",
            "Sro_female_1.1_CCATCCTC": "rodhaini",
            "Sro_female_1.2_CCGACAAC": "rodhaini",
            "Sro_female_2.1_CCTAATCC": "rodhaini",
            "Sro_female_2.2_CCTCTATC": "rodhaini",
            "Sro_male_1.1_ATCATTCC":   "rodhaini",
            "Sro_male_1.2_ATTGGCTC":   "rodhaini",
            "Sro_male_2.1_CAAGGAGC":   "rodhaini",
            "Sro_male_2.2_CACCTTAC":   "rodhaini",
            "ERR103049":               "senegal",
            "Sm.SN_Nd115.1":           "senegal",
            "Sm.SN_Nd22.1":            "senegal",
            "Sm.SN_Nd24.1":            "senegal",
            "Sm.SN_Nd34.1":            "senegal",
            "Sm.SN_Nd43.1":            "senegal",
            "Sm.SN_Nd47.1":            "senegal",
            "Sm.SN_Nd50.1":            "senegal",
            "Sm.SN_Nd54.1":            "senegal",
            "Sm.SN_Nd90.1":            "senegal",
            "Sm.TZ_009.4.2":           "tanzania",
            "Sm.TZ_009.8.2":           "tanzania",
            "Sm.TZ_055.1.3":           "tanzania",
            "Sm.TZ_055.10.1":          "tanzania",
            "Sm.TZ_055.8.1":           "tanzania",
            "Sm.TZ_074N.1.2":          "tanzania",
            "Sm.TZ_074N.7.2":          "tanzania",
            "Sm.TZ_074N.8.1":          "tanzania",
            "Sm.TZ_086.1.1":           "tanzania",
            "Sm.TZ_134.1.1":           "tanzania",
            "ERR119615":               "uganda",
            "ERR997461":               "uganda" }

 
# One sample name per line; this file is passed to vcftools --keep.
with open("results/phylonet/random_samples.list", 'w') as f:
    f.writelines("{}\n".format(name) for name in samples)

In [None]:
%%bash

# The output directory is otherwise only created by the later raxml-ng cell;
# create it here so the redirect below cannot fail on a fresh run.
mkdir -p results/phylonet/gene_trees

#create vcf with only these samples
vcftools \
    --vcf results/variant_filtration/smv7_ex_autosomes.vcf \
    --keep results/phylonet/random_samples.list \
    --recode \
    --recode-INFO-all \
    --stdout \
    >results/phylonet/gene_trees/random_auto.vcf
#After filtering, kept 56 out of 156 Individuals
#Outputting VCF file...
#After filtering, kept 475081 out of a possible 475081 Sites



In [None]:
#create phylip
# NOTE(review): the next command reads random_auto.min4.phy, so vcf2phylip presumably
# writes that file next to the input VCF — confirm.
! bin/vcf2phylip/vcf2phylip.py -i results/phylonet/gene_trees/random_auto.vcf 

#remove invariant
# NOTE(review): presumably -p is the input alignment and -o the variable-sites-only
# output — confirm against the script's usage.
! python ~/sch_man_nwinvasion/bin/raxml_ascbias/ascbias.py -p results/phylonet/gene_trees/random_auto.min4.phy  -o results/phylonet/gene_trees/random_auto_variant.phy 

In [7]:
# Load the alignment that is split into 1K-SNP chunks in the next cell:
# seqs maps sequence id -> sequence string.

seqs = {}
with open("results/phylonet/gene_trees/random_auto_variant.phy", 'r') as phy:
    next(phy)  # the first line is skipped
    for line in phy:
        # every remaining line must be exactly "<id>\t<sequence>"
        name, bases = line.rstrip().split("\t")
        seqs[name] = bases
        

In [10]:
# Split every sequence into windows of n_snps sites and write one FASTA file
# per window (random_split_seq_<window index>.fas); "." in a sample name is
# written as "_" in the FASTA header.
num_samples=len(seqs.keys())
n_snps = 1_000

# Make sure the output directory exists before writing into it.
os.makedirs("results/phylonet/gene_trees", exist_ok=True)

# Each file is opened once in 'w' mode. The previous version appended ('a') one
# record at a time, so re-running the cell duplicated every sequence in every file.
# NOTE: files left over from an earlier run with more windows are not removed.
longest = max((len(seq) for seq in seqs.values()), default=0)

for i, start in enumerate(range(0, longest, n_snps)):
    with open("results/phylonet/gene_trees/random_split_seq_{}.fas".format(i), 'w') as f:
        for sample, seq in seqs.items():
            # a sequence shorter than `start` has no window i
            if start < len(seq):
                f.write(">{}\n{}\n".format(sample.replace(".", "_"), seq[start:start + n_snps]))
            

In [None]:
%%bash

# Submit one raxml-ng job per alignment chunk through qsub.

CONDA="conda activate sch_man_nwinvasion-phylonet;"
QSUB="qsub -V -cwd -S /bin/bash -q all.q -j y -pe smp 4 "

# gene_trees/ already holds the input alignments by this point; -p creates what
# is missing and does not error on what already exists.
mkdir -p results/phylonet/gene_trees/qlogs

# Iterate over the glob directly instead of parsing `ls` output.
for FAS in results/phylonet/gene_trees/random_split_seq_*.fas; do

    # An unmatched glob is left as the literal pattern; skip it.
    [ -e "$FAS" ] || continue

    SAMPLE=$(basename "$FAS" .fas)

    RAXML_CMD="raxml-ng \
        --all \
        --msa $FAS \
        --msa-format fasta \
        --outgroup ERX284221 \
        --threads 4 \
        --bs-trees 100 \
        --model GTGTR4+G+ASC_LEWIS \
        --prefix results/phylonet/gene_trees/$SAMPLE"

    # The job script is passed to qsub on stdin. $QSUB is intentionally
    # unquoted so that it splits into the command and its options.
    echo "$CONDA $RAXML_CMD" | $QSUB -N "$SAMPLE" -o "results/phylonet/gene_trees/qlogs/$SAMPLE.log"
done


In [None]:
#now prep gene trees for phylonet
#concatenate every raxml-ng *support tree file into a single newick file
cat results/phylonet/gene_trees/*support >results/phylonet/gene_trees.nwk

#collapse nodes matching 'i & b<=10' (i.e. bootstrap support <= 10, not < 10)
nw_ed  results/phylonet/gene_trees.nwk 'i & b<=10' o > results/phylonet/gene_trees_BS10.nwk

#root on margrebowiei
nw_reroot results/phylonet/gene_trees_BS10.nwk ERX284221 >results/phylonet/gene_trees_BS10_marg_root.nwk

#rename and remove "."s in name
# NOTE(review): short_IDs.nw / id2longname.map look like placeholder file names and
# the output is not redirected to a file — confirm the actual command that was run.
nw_rename short_IDs.nw id2longname.map

#may need to open file in figtree to remove bootstrap values


#then added the following text to the bottom to run phylonet
# BEGIN PHYLONET;

# InferNetwork_MPL (all) 4 -pl 48 -a <margrebowiei:ERX284221;cameroon:ERR103050;uganda:ERR119615,ERR997461;guadeloupe:ERR539847,ERR539848;puerto_rico:ERR046038;brazil:Sm-BR_PdV-1278-1,Sm-BR_PdV-2039-1,Sm-BR_PdV-2076-1,Sm-BR_PdV-2265-1,Sm-BR_PdV-2334-1,Sm-BR_PdV-2406-1,Sm-BR_PdV-2456-1,Sm-BR_PdV-2481-1,Sm-BR_PdV-2538-1,Sm-BR_PdV-2556-1;niger:Sm-NE_Di158-1,Sm-NE_Di186-1,Sm-NE_Di238-1,Sm-NE_Di297-1,Sm-NE_Di297-2,Sm-NE_Di68-2,Sm-NE_Na376-2,Sm-NE_Na381-1,Sm-NE_Na39-1,Sm-NE_Na40-1;senegal:ERR103049,Sm-SN_Nd115-1,Sm-SN_Nd22-1,Sm-SN_Nd24-1,Sm-SN_Nd34-1,Sm-SN_Nd43-1,Sm-SN_Nd47-1,Sm-SN_Nd50-1,Sm-SN_Nd54-1,Sm-SN_Nd90-1;tanzania:Sm-TZ_009-4-2,Sm-TZ_009-8-2,Sm-TZ_055-1-3,Sm-TZ_055-10-1,Sm-TZ_055-8-1,Sm-TZ_074N-1-2,Sm-TZ_074N-7-2,Sm-TZ_074N-8-1,Sm-TZ_086-1-1,Sm-TZ_134-1-1;rodhaini:ERR310938,Sro_female_1-1_CCATCCTC,Sro_female_1-2_CCGACAAC,Sro_female_2-1_CCTAATCC,Sro_female_2-2_CCTCTATC,Sro_male_1-1_ATCATTCC,Sro_male_1-2_ATTGGCTC,Sro_male_2-1_CAAGGAGC,Sro_male_2-2_CACCTTAC> mpl. ;

# END;

!java -jar ~/sch_man_nwinvasion/bin/PhyloNet_3.8.2.jar PHYLONET_gene_trees_BS10_marg_root_renamed.nex