## 0. Setup

In [None]:
import os
import numpy as np
import pandas as pd

## 1. Identify best contig in Parhyale genome using blast_to_gff.py

You can get blast_to_gff.py from [here](https://github.com/alvaralmstedt/py_scripts). <br>
You can get blast_to_gff_wrapper.sh from [here](https://github.com/alvaralmstedt/shell_scripts).

1. Set the transcript FASTA file you want to BLAST (blastn) against the Parhyale genome.
2. Load the output into IGV and navigate to the region of interest using the 'IGV_address' field in the table that the cell below prints.
3. Identify a window around the gene including all 3' and 5' regions surrounding the gene.
4. Log this window into the [phaw_5.0 Genomic Addresses document](https://docs.google.com/spreadsheets/d/1rFzF0x8vpltd60TerYOCvBwXUBAFvdwSScArHvy_zD0/edit?usp=sharing).
5. Use the IGV coordinates for this window in the next step.

In [None]:
blasttype = 'nucl'
gene_name = 'CRY2'
transcript_name = 'Par-haw_CRY2.fasta'

if transcript_name != '' and blasttype == 'nucl':
    transcript_loc = '~/Labwork/Bioinformatics/Transcripts/'
    transcript = transcript_loc + transcript_name
elif transcript_name != '' and blasttype == 'prot':
    transcript_loc = '~/Labwork/Bioinformatics/Proteins/'
    transcript = transcript_loc + transcript_name
elif blasttype == 'prot':
    transcript_name = gene_name + '_protein.fasta'
    transcript_loc = '~/Labwork/Bioinformatics/Proteins/'
    transcript = transcript_loc + transcript_name
elif blasttype == 'nucl':
    transcript_name = gene_name + '.fasta'
    transcript_loc = '~/Labwork/Bioinformatics/Transcripts/'
    transcript = transcript_loc + transcript_name

#transcript = '~/Labwork/Bioinformatics/ContigsandScaffolds/Par-haw_Sp69-p2-old.fasta'

genome_blastdb = '~/Labwork/Bioinformatics/GenomeSequences/Phaw_5.0_Annotation/genome/phaw_5.0.blastdb'
script_loc = '~/Labwork/Bioinformatics/Scripts/shell_scripts-master/blast_to_gff_wrapper.sh'

if blasttype == 'prot':
    output_file = transcript.replace('Proteins', 'BLASTResults').replace('.fasta', '_vs_phaw5.0.blastn')
    !$script_loc -q $transcript -d $genome_blastdb -p tblastn -t 4 -l -o $output_file
elif blasttype == 'nucl':
    output_file = transcript.replace('Transcripts', 'BLASTResults').replace('.fasta', '_vs_phaw5.0.blastn')
    !$script_loc -q $transcript -d $genome_blastdb -p blastn -t 4 -l -o $output_file

gff_file = output_file + '.gff'

output = pd.read_csv(gff_file, sep = '\t', header = None, skiprows = 1)
output['IGV_address'] = output[0] + ':' + output[3].astype(str) + '-' + output[4].astype(str)
output

# Extract FASTA from Parhyale genome

Running the cell below allows you to extract FASTA sequence files from your genome of choice based on genome coordinates from IGV.

Be sure to set the variables for the run in the section before the hashes.

In [None]:
#Input the IGV address of the region you want to extract
#The script tolerates commas and removes them
igv_address = 'phaw_50.283815c:29,301,115-29,375,053'
nickname = 'Phaw-BmaI'

#set whether or not you want to extract the fasta file for the whole region
get_region = True

#set the location of the fasta file you want to extract from
fasta_file = '~/Labwork/Bioinformatics/GenomeSequences/Phaw_5.0_Annotation/genome/phaw_5.0.fa'
#set the destination directory for your output fasta files
output_destination = '~/Labwork/Bioinformatics/ContigsandScaffolds/'

#set whether or not you want to extract fasta files of the peaks of that region
get_peaks = False
peak_expansion = 500

#set the location of the peak region file you want to compare to
peaks_file = '~/Labwork/Bioinformatics/Omni-ATAC-Seq/OmniATAC_bothruns_peaks/OmniATAC_bothruns_q005_allpeaks.igv_new.bed'

####################################################
####################################################

#Decompose the IGV address into component parts
contig = igv_address.split(':')[0]
start = igv_address.split(':')[1].split('-')[0].replace(',', '')
end = igv_address.split(':')[1].split('-')[1].replace(',', '')
print('genomic region of interest is:', contig, start, end, '\n')

#Run this block to get the fasta file from your region of interest
if get_region == True:
    region = contig + '\t' + start + '\t' + end
    region_name = contig + '_' + start + '_' + end 
    with open('region.bed', 'w+') as f:
        f.write(region)
    region_fasta = output_destination + nickname + '_' + region_name + '.fasta'
    region_bed = 'region.bed'
    print('getting fasta file of coordinate', igv_address, 'and saving to:\n', region_fasta, '\n')
    !/usr/local/bin/bedtools getfasta -fi $fasta_file -bed $region_bed -fo $region_fasta

#Run this block to get the fasta file of peaks overlapping your region of interest
if get_peaks == True:
    region = contig + '\t' + start + '\t' + end
    region_name = contig + '_' + start + '_' + end + '_peaks'
    with open('region.bed', 'w+') as f:
        f.write(region)
    region_bed = 'region.bed'
    region_peaks_bed = 'region_peaks.bed'
    !bedtools intersect -wa -a $peaks_file -b $region_bed > $region_peaks_bed
    peaks = pd.read_csv(region_peaks_bed, sep = '\t', header = None)
    peaks[1] = peaks[1] - peak_expansion
    peaks[2] = peaks[2] + peak_expansion
    peaks.to_csv(region_peaks_bed, sep = '\t', header = None, index = None)
    region_peaks_fasta = output_destination + nickname + '_' + region_name + '.fasta'
    print('getting fasta file of expanded peaks within coordinate', igv_address, 'and saving to:\n', region_peaks_fasta, '\n')
    !bedtools getfasta -fi $fasta_file -bed $region_peaks_bed -fo $region_peaks_fasta

# Identify best contig in Hyalella genome using blast_to_gff.py
1. Set the transcript (or protein) FASTA file you want to BLAST against the Hyalella genome (blastn for transcripts, tblastn for proteins).
2. Load the output into IGV and navigate to the region of interest using the 'IGV_address' field in the table that the cell below prints.
3. Double-check the address using the online [i5k BLAST webapp](https://i5k.nal.usda.gov/webapp/blast/). If the i5k result differs from the BLAST result produced by this script, go with the i5k address.

In [None]:
blasttype = 'prot'
gene_name = 'dll-e'
transcript_name = 'Par-haw_' + gene_name + '.fasta'
protein_name = 'Par-haw_' + gene_name + '_protein.fasta'

transcript_loc = '~/Labwork/Bioinformatics/Transcripts/'
protein_loc = '~/Labwork/Bioinformatics/Proteins/'
script_loc = '~/Labwork/Bioinformatics/Scripts/shell_scripts-master/blast_to_gff_wrapper.sh'
transcript = ''
genome_blastdb = '~/Labwork/Bioinformatics/GenomeSequences/Hya_azt/Hazt_2.0.1_genomic.blastdb'

if blasttype == 'nucl':
    transcript = transcript_loc + transcript_name
    output_file = transcript.replace('Transcripts', 'BLASTResults').replace('.fasta', '_vs_Hazt2.0.1.blastn')
    !$script_loc -q $transcript -d $genome_blastdb -p blastn -t 4 -l -o $output_file
elif blasttype == 'prot':
    transcript = protein_loc + protein_name
    output_file = transcript.replace('Proteins', 'BLASTResults').replace('.fasta', '_vs_Hazt2.0.1.blastn')
    !$script_loc -q $transcript -d $genome_blastdb -p tblastn -t 4 -l -o $output_file

gff_file = output_file + '.gff'

output = pd.read_csv(gff_file, sep = '\t', header = None, skiprows = 1)
output['IGV_address'] = output[0] + ':' + output[3].astype(str) + '-' + output[4].astype(str)
display(output.head(20))

# Merge Hyalella BLAST .gffs together

In [None]:
# Collect every per-gene Hyalella BLAST GFF found in the results folder
directory = '/users/dennis/Labwork/Bioinformatics/BLASTResults/'
filenames = [entry for entry in os.listdir(directory) if 'vs_Hazt2.0.1.blastn.gff' in entry]

# Concatenate them into one combined GFF (the file is overwritten on every run)
with open(directory + 'Hazt_BLAST_results.gff', 'w+') as merged:
    for name in filenames:
        # Copy the whole input file across
        with open(directory + name) as part:
            contents = part.read()
        merged.write(contents)
        # A newline after each file keeps the next file's first record on its own line
        merged.write("\n")

# Extract FASTA from Hyalella genome

Running the cell below allows you to extract FASTA sequence files from your genome of choice based on genome coordinates from IGV.

Be sure to set the variables for the run in the section before the hashes.

You can also validate your BLAST result using the [online BLAST for Hyalella at i5k](https://i5k.nal.usda.gov/webapp/blast/)

In [None]:
#Input the IGV address of the region you want to extract
igv_address = 'JQDR03001188.1:1-207,259'
nickname = 'dll-cluster'

#set whether or not you want to extract the fasta file for the whole region
get_region = True

#set the location of the fasta file you want to extract from
fasta_file = '~/Labwork/Bioinformatics/GenomeSequences/Hya_azt/Hazt_2.0.1_genomic.fna'
#set the destination directory for your output fasta files
output_destination = '~/Labwork/Bioinformatics/ContigsandScaffolds/'

####################################################
####################################################

#Decompose the IGV address into component parts
contig = igv_address.split(':')[0]
start = igv_address.split(':')[1].split('-')[0].replace(',', '')
end = igv_address.split(':')[1].split('-')[1].replace(',', '')
print('genomic region of interest is:', contig, start, end, '\n')

#Run this block to get the fasta file from your region of interest
if get_region == True:
    region = contig + '\t' + start + '\t' + end
    region_name = contig + '_' + start + '_' + end 
    with open('region.bed', 'w+') as f:
        f.write(region)
    region_fasta = output_destination + nickname + '_' +  region_name + '.fasta'
    region_bed = 'region.bed'
    print('getting fasta file of coordinate', igv_address, 'and saving to:\n', region_fasta, '\n')
    !bedtools getfasta -fi $fasta_file -bed $region_bed -fo $region_fasta

# Convert VISTA to .gff
1. Run [VISTA](http://genome.lbl.gov/cgi-bin/VistaInput?num_seqs=2)
2. Set the correct file names (`gene_name`, `folder`) in the cell below so they match your VISTA output file.

In [None]:
# Convert one VISTA alignment text file into a GFF-style table and save it next to the input.
gene_name = 'dll-cluster'

# Sequence labels that are searched for in the VISTA file.
Phaw_name = 'Par-haw_' + gene_name
Hazt_name = 'Hya-azt_' + gene_name

# Input file name follows the pattern VISTA_<Phaw_name>_vs_<Hazt_name>.txt inside `folder`.
filename = 'VISTA_' + Phaw_name + '_vs_' + Hazt_name + '.txt'
folder = '/Users/dennis/Labwork/Bioinformatics/VISTAAlignments/'

vista_to_gff = pd.DataFrame()

with open(folder + filename) as v:
    lines = v.readlines()
    # For every line mentioning Phaw_name: drop the leading label and keep the first
    # space-delimited token.
    # NOTE(review): str.lstrip removes any leading characters found in the given string
    # (a character set), not a literal prefix. This works only while the text after the
    # label starts with a character outside that set - confirm against the VISTA format.
    Phaw_contigs = [line.lstrip('>' + Phaw_name).lstrip(' ').split(' ')[0] for line in lines if Phaw_name in line]
    # Contig name = first two '_'-separated pieces of the text before ':'.
    contig = [item.split(':')[0].split('_')[0] + '_' + item.split(':')[0].split('_')[1] for item in Phaw_contigs]
    # Third '_'-separated piece of the first entry, used below as the coordinate offset
    # (presumably the genomic start of the extracted region - TODO confirm).
    first = int(Phaw_contigs[0].split(':')[0].split('_')[2])
    # The 'a-b' range after ':' is shifted by `first`.
    start = [int(item.split(':')[1].split('-')[0]) + first for item in Phaw_contigs]
    stop = [int(item.split(':')[1].split('-')[1]) + first for item in Phaw_contigs]
    # Same token extraction for lines mentioning Hazt_name (same lstrip caveat as above).
    Hazt_contigs = [line.lstrip('>' + Hazt_name + ' ').split(' ')[0] for line in lines if Hazt_name in line]
    # Second space-delimited token of each Phaw line with '(', ')' and newline stripped;
    # it is stored in the 'strand' column.
    Orient = [line.lstrip('>' + Phaw_name + ' ').split(' ')[1].strip('()\n') for line in lines if Phaw_name in line]
    # Statistics lines are recognised by containing a '%' sign.
    Stats = [line for line in lines if '%' in line]
    # Text before the first comma, with leading '=' / spaces and all inner spaces removed, plus ';'.
    Lens = [stat.split(',')[0].lstrip('= ').replace(' ', '') + ';' for stat in Stats]
    # Text after the first comma with the leading characters of 'identity = ' and a trailing '%' stripped.
    # NOTE(review): rstrip('%') removes nothing if the piece ends with a newline - confirm the input format.
    Pid = [stat.split(',')[1].lstrip('identity = ').rstrip('%') for stat in Stats]
    # Fill all nine GFF columns in one assignment; the string entries ('VISTA', '0') are
    # repeated on every row.
    # Assumes the Phaw, Hazt and '%' lines occur in equal numbers - TODO confirm.
    vista_to_gff['contig'], vista_to_gff['source'], vista_to_gff['feature'], vista_to_gff['start'], vista_to_gff['end'], vista_to_gff['score'], vista_to_gff['strand'], vista_to_gff['frame'], vista_to_gff['attribute'] = [contig, 'VISTA', Hazt_contigs, start, stop, Pid, Orient, '0', Lens]

# Write the table without header or index, tab-separated, as <input name>.gff in the same folder.
output = filename.replace('.txt', '.gff')
# Fix the column order and remove exact duplicate rows before saving.
vista_to_gff = vista_to_gff[['contig', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute']].drop_duplicates()
vista_to_gff.to_csv(folder + output, header = None, sep = '\t', index = None)
vista_to_gff

# Collect all VISTA .gff files and merge into a single .gff file for IGV

This script merges all VISTA alignment .gff files in the specified directory into a single .gff file. <br>
This lets you update the .gff file in IGV without having to re-load each file manually. <br>
After running this script, you may need to restart IGV for the changes to be reflected.

In [None]:
# Collect every GFF file in the VISTA alignments folder
directory = '/users/dennis/Labwork/Bioinformatics/VISTAAlignments/'
filenames = [entry for entry in os.listdir(directory) if '.gff' in entry]

# Concatenate them into VISTA_peaks.gff (the file is overwritten on every run)
with open(directory + 'VISTA_peaks.gff', 'w+') as merged:
    for name in filenames:
        # Only files other than the merged output itself are copied
        if name != 'VISTA_peaks.gff':
            with open(directory + name) as part:
                contents = part.read()
            merged.write(contents)
            # A newline after each file keeps the next file's first record on its own line
            merged.write("\n")

# Merge all PWMs in gene_list into a single PWM file for FIMO

In [None]:
# Folder holding the individual .meme PWM files and MEME_header.txt
pwm_directory = '/users/dennis/Labwork/Bioinformatics/PWMs/'

# Name of the combined MEME file and the genes whose PWMs go into it
output_name = 'Dro-mel_Sp1_regulators.meme'
gene_list = ['tll', 'Hb', 'dl']

def common_member(a, b):
    """Return True if iterables a and b share at least one element, else False."""
    return bool(set(a).intersection(b))

# Scratch file that receives only the collected motif records (no MEME header).
temp_name = 'temp.meme'
files_added = []
motif_lines = []

for pwm_name in sorted(os.listdir(pwm_directory)):
    # Never feed this cell's own outputs back in on a re-run.
    if pwm_name in (temp_name, output_name):
        continue
    if 'meme' not in pwm_name.split('.'):
        continue
    # Drop the '.meme' extension as a whole suffix. str.rstrip('.meme') would strip any
    # trailing '.', 'm' or 'e' characters and mangle gene names such as 'eve'.
    stem = pwm_name[:-len('.meme')] if pwm_name.endswith('.meme') else pwm_name
    # A file is used when one of its '_'-separated name parts is in gene_list.
    if not common_member(gene_list, stem.split('_')):
        continue

    files_added.append(pwm_name)
    # in_motif is reset per file so an unterminated motif cannot leak into the next file.
    in_motif = False
    with open(pwm_directory + pwm_name) as pwm_file:
        for line in pwm_file:
            if 'MOTIF' in line:
                # A motif record starts here; everything up to its URL line is kept.
                in_motif = True
                motif_lines.append(line)
            elif 'URL' in line:
                # The URL line closes the record; a blank line separates records.
                if in_motif:
                    motif_lines.append(line)
                    motif_lines.append('\n')
                in_motif = False
            elif in_motif:
                motif_lines.append(line)
    # A record without a URL line still gets its separating blank line.
    if in_motif:
        motif_lines.append('\n')

if not files_added:
    # Nothing matched: leave existing files alone rather than combining stale data.
    print('No .meme files in', pwm_directory, 'matched gene_list', gene_list, '- nothing written')
else:
    motif_text = ''.join(motif_lines)

    # All records are collected in memory and written once. Re-opening temp.meme in
    # 'w+' mode for every MOTIF of the first file would discard all but its last motif.
    with open(pwm_directory + temp_name, 'w') as fp:
        fp.write(motif_text)

    with open(pwm_directory + 'MEME_header.txt') as fp:
        header_text = fp.read()

    # Combined file = header, a newline, then every collected motif record.
    with open(pwm_directory + output_name, 'w') as fp:
        fp.write(header_text + "\n" + motif_text)

    print('Combined:')
    print(files_added)
    print('into one MEME file:', output_name)

# Convert FIMO output into a .bed file 
If using a text-input PWM, set a value for custom_alias that will be the gene name used.

In [None]:
# FIMO GFF file to convert, and an optional replacement for the alias column
# (leave custom_alias as '' to keep the alias found in the file).
input_file = '~/Labwork/Bioinformatics/FIMO/FIMO_Dro-mel_Pho_vs_Phaw-BXC_peaks.gff'
custom_alias = ''

# skiprows = 1 drops the first line of the file (presumably a header/comment line - confirm).
fimo_output = pd.read_csv(input_file, sep = '\t', skiprows = 1, header = None)
fimo_output_name = input_file.replace('.gff', '_converted.bed')

# Column 0 has the form 'contig:start-end'. Split it once and reuse the pieces.
seq_parts = fimo_output[0].str.split(":", n = 1, expand = True)
fimo_output['contig'] = seq_parts[0]
fimo_output['paststart'] = seq_parts[1].str.split('-', n = 1, expand = True)[0].astype(int)
# Shift the match coordinates (columns 3 and 4) by the start taken from column 0.
fimo_output['start'] = fimo_output[3].astype(int) + fimo_output['paststart']
fimo_output['end'] = fimo_output[4].astype(int) + fimo_output['paststart']

# Column 8 is a ';'-separated attribute string. Its first three fields are read as
# Name=..., Alias=... and ID=...; everything after them is kept in 'other'.
attributes = fimo_output[8].str.split(';', n = 3, expand = True)
fimo_output['alias'] = attributes[1].str.replace('Alias=', '')
if custom_alias != '':
    fimo_output['alias'] = custom_alias
# Remove the key as an anchored prefix. str.lstrip('Name=') strips any leading
# 'N', 'a', 'm', 'e' or '=' characters and would also eat the start of the value.
fimo_output['name'] = attributes[0].str.replace('^Name=', '', regex = True)
fimo_output['ID'] = attributes[2].str.replace('^ID=', '', regex = True)
fimo_output['other'] = attributes[3].str.rstrip(';')
# The score column is multiplied by 10 before export (reason not visible here - TODO confirm).
fimo_output[5] = fimo_output[5] * 10

# Export: contig, start, end, alias, score (column 5), column 6; no header, no index.
fimo_output_export = fimo_output[['contig', 'start', 'end', 'alias', 5, 6]]
fimo_output_export.to_csv(fimo_output_name, sep = '\t', index = None, header = None)
fimo_output_export

# Collect all FIMO .bed files and merge into a single .bed file for IGV

In [None]:
# Collect every converted FIMO BED file in the FIMO folder
directory = '/users/dennis/Labwork/Bioinformatics/FIMO/'
filenames = [entry for entry in os.listdir(directory) if '_converted.bed' in entry]

# Concatenate them into FIMO_sites.bed (the file is overwritten on every run)
with open(directory + 'FIMO_sites.bed', 'w+') as merged:
    for name in filenames:
        # Copy the whole input file across
        with open(directory + name) as part:
            contents = part.read()
        merged.write(contents)
        # A newline after each file keeps the next file's first record on its own line
        merged.write("\n")

# Convert RepeatMasker output to a .bed file

In [None]:
# Base name of the RepeatMasker result to convert (rm_alias is not used in this cell)
input_rm = 'Par-haw_BXC'
rm_alias = ''

# Input is <rm_loc><input_rm>_repeatmasker.txt; the BED file is written beside it
rm_loc = '~/Labwork/Bioinformatics/RepeatMasker/'
rm_file_loc = rm_loc + input_rm + '_repeatmasker.txt'
rm_bed_name = rm_file_loc.replace('.txt', '.bed')

rm_output = pd.read_csv(rm_file_loc, sep = '\t', header = None)

# Column 5 has the form 'contig:start-end'; the start is added to columns 6 and 7
query_parts = rm_output[5].str.split(":", n = 1, expand = True)
window_start = query_parts[1].str.split('-', n = 1, expand = True)[0].astype('int')

rm_output['contig'] = query_parts[0]
rm_output['start'] = window_start + rm_output[6]
rm_output['end'] = window_start + rm_output[7]
# Feature label is column 11 and column 10 joined by '_'
rm_output['feature'] = rm_output[11] + '_' + rm_output[10]

# Export contig, start, end, feature, column 1 and column 9 without header or index
rm_bed = rm_output[['contig', 'start', 'end', 'feature', 1, 9]]
rm_bed.to_csv(rm_bed_name, sep = '\t', index = None, header = None)
display(rm_bed)

# Collect all RepeatMasker .bed files and merge into a single .bed file for IGV

In [None]:
# Creating a list of filenames
directory = '/users/dennis/Labwork/Bioinformatics/RepeatMasker/'
filenames = [file for file in os.listdir(directory) if '_repeatmasker.bed' in file]
rm_collated = directory + 'Repeatmasker_sites.bed'
rm_sorted = rm_collated.replace('.bed', '.sorted.bed')

# Open file3 in write mode 
with open(rm_collated, 'w+') as outfile: 
  
    # Iterate through list 
    for name in filenames: 
  
        # Open each file in read mode 
        with open(directory + name) as infile: 
  
            # read the data from file1 and 
            # file2 and write it in file3 
            outfile.write(infile.read()) 
  
        # Add '\n' to enter data of file2 
        # from next line 
        outfile.write("\n")

!bedtools sort -i $rm_collated > $rm_sorted