# Honours fasta processing pipeline

In [1]:
import pandas as pd
import numpy as np
import os
from datetime import datetime
date_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
from onekp_func import onekp
from phyto_func import phyto
from append_file import append_file
from log_func import log
from species_fix import sp_fix
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio import SeqIO

In [None]:
# Manually run unique_id.py; this is done separately so that the file is not overwritten

Generate datasets, merge and standardise from Phytozome and OneKp

In [None]:
# Runs the phyto and onekp functions doing the initial processing 
# These functions take in a file list of different blast searches to be combined. 
# The file must be in the format of fasta \t details \t protein \n

phyto('phyto_filelist')
onekp('onekp_filelist')

In [None]:
# Joins the output fasta files of both functions by appending them to new 'master' files.
# Each key is passed to append_file as the first argument and its list as the second.

master = dict(
    master=['phytozome_new_all', 'onekp_new_all'],
    master_blinded=['phytozome_blinded_all', 'onekp_blinded_all'],
)

for k, v in master.items():
    append_file(k, v)

In [None]:
# Merges the remaining Phytozome and OneKP index files; each dataframe gets the columns
# it lacks so that the two can be concatenated

ph_index = pd.read_csv('phytozome_master.txt', sep='\t')
o_index = pd.read_csv('onekp_master.txt', sep='\t')
ph_details = pd.read_csv('phytozome_details_mod_all_sp_corrected.txt', sep='\t')
ph_details['Unid'] = ph_details['Unid'].apply(str)
sp = pd.read_csv('onekp_species_list.csv')

# Adds the species to the onekp details dataframe
o_index = pd.merge(o_index, sp, how='inner', on='Onekp_index_id')
log('Species and taxonomic info from onekp_species_list.csv was added to Onekp_index_id dataframe')

# Adds the species to the phytozome details dataframe.
# .copy() makes ph_species an independent dataframe, so the column assignment below
# does not write into a slice of ph_details (pandas SettingWithCopyWarning)
ph_species = ph_details[['Unid', 'Species']].copy()
ph_species['Unid'] = ph_species['Unid'].apply(pd.to_numeric)
ph_index = pd.merge(ph_index, ph_species, how='inner', on='Unid')
log('Species names from phytozome_details_mod_all_sp_corrected.txt was added to phytozome_master dataframe')

# Creates new columns in each dataframe so that they match
for column in ('Onekp_index_id', 'Scaffold', 'Tax_1', 'Tax_2'):
    ph_index[column] = np.nan
ph_index['PACid'] = ph_index['PACid'].apply(str)

o_index['PACid'] = np.nan

master_index = pd.concat([ph_index, o_index])

master_index.to_csv('master_index.txt', sep='\t', index=False)

log(f'Onekp_index_id and phytozome_master dataframes were concatednated with a total of {len(master_index)} sequences')

In [None]:
# Builds the working ('w') dataframe by leaving out the sequences that are too short

# Reference length for each SAL_variant label and the coefficient each length is multiplied by
# to give the minimum accepted sequence length
coef = 0.7
SAL1 = 353
SAL2 = 347
SAL3 = 357
SAL4 = 345
AHL = 373
Unid = 397

log('Sequences filtered by length (A. thaliana version minus signal peptide) as follows: \n' +
    f'                        SAl1 = {SAL1}\n' +
    f'                        SAl2 = {SAL2}\n' +
    f'                        SAL3 = {SAL3}\n' +
    f'                        SAL4 = {SAL4}\n' +
    f'                        AHL  = {AHL}\n' +
    f'                        Unid = {Unid}\n' +
    f'                        multiplied by a coefficient of {coef}')

orig = pd.read_csv('master_index.txt', sep='\t')

# (SAL_variant label, reference length) pairs, in the order the subsets are concatenated
ref_lengths = [
    ('SAL1', SAL1),
    ('SAL2', SAL2),
    ('SAL3', SAL3),
    ('SAL4', SAL4),
    ('AHL', AHL),
    ('Unid', Unid),
]
seq_len = orig['Sequence'].str.len()

# Kept sequences: at least as long as the reference length of their variant times coef
w_index = pd.concat([
    orig[(orig['SAL_variant'] == variant) & (seq_len >= (ref * coef))]
    for variant, ref in ref_lengths
])

log(f'{len(w_index)} sequences were >= to the above parameters (working set)')

# Dropped sequences: only built to report a count, for validation purposes
d_index = pd.concat([
    orig[(orig['SAL_variant'] == variant) & (seq_len < (ref * coef))]
    for variant, ref in ref_lengths
])

log(f'{len(d_index)}  sequences were < the above parameters (discard set)')

In [None]:
# Lists the unique ids of the w_index dataframe from above, each converted to a string
w_unid = [str(unid_value) for unid_value in w_index['Unid']]

log(f'{len(w_unid)} unique ids were listed from the working set')

In [None]:
# Splits the two master fasta files into a working ('w') and a discard ('d') file each,
# depending on whether the unique id of a record is in w_unid (from the cell above)


def _read_fasta_records(path):
    """Return {header line (newline removed, '>' kept): sequence} for the fasta file at path.

    Lines of a wrapped sequence are joined; blank lines and any text before the first
    header are ignored. A header seen twice keeps the sequence of its last occurrence.
    """
    records = {}
    header = None
    with open(path, 'r') as handle:
        for line in handle:
            line = line.rstrip('\n')
            if line.startswith('>'):
                header = line
                records[header] = ''
            elif header is not None and line:
                records[header] += line
    return records


def _write_fasta(path, records, prefix=''):
    """Write {header: sequence} to path as two-line fasta records and return how many were written."""
    with open(path, 'w') as handle:
        for header, sequence in records.items():
            handle.write(prefix + header + '\n')
            handle.write(sequence + '\n')
    return len(records)


# A set makes every membership test constant time instead of a scan of the whole list
w_unid_set = set(w_unid)

# master_blinded.txt: the key is the header without its leading '>'
temp_dict = {header.lstrip('>'): seq for header, seq in _read_fasta_records('master_blinded.txt').items()}

# Creates two dictionaries based off whether the unique id was found in the list generated in the cell above
w_bdict = {k: v for k, v in temp_dict.items() if k in w_unid_set}
d_bdict = {k: v for k, v in temp_dict.items() if k not in w_unid_set}

# Writes two new fasta files, one with the unique ids found in the list, one without
countw = _write_fasta('w_blinded.txt', w_bdict, prefix='>')
countd = _write_fasta('d_blinded.txt', d_bdict, prefix='>')

log(f'w_blinded.txt was created with {countw} sequences from the unique id listed from the working set, derived from master_blinded.txt')
log(f'd_blinded.txt was created with {countd} sequences absent from the unique id listed from the working set, derived from master_blinded.txt')

# master.txt: the key is the complete header; the text before the first whitespace
# (leading '>' removed) is compared with the unique ids
temp_dict = _read_fasta_records('master.txt')

w_dict = {k: v for k, v in temp_dict.items() if k.split()[0].lstrip('>') in w_unid_set}
d_dict = {k: v for k, v in temp_dict.items() if k.split()[0].lstrip('>') not in w_unid_set}

countw = _write_fasta('w.txt', w_dict)
countd = _write_fasta('d.txt', d_dict)

log(f'w.txt was created with {countw} sequences from the unique id listed from the working set, derived from master.txt')
log(f'd.txt was created with {countd} sequences absent from the unique id listed from the working set, derived from master.txt')

# Generate cluster list from SSN results

The following unique IDs were manually assigned to the sequences detailed below, and the master_index.txt file was manually updated and saved as master_index_w_outg_controls.txt

17155: AtUnid, 27030: AtAHL, 36896: AtSAL1, 38488: AtSAL2, 44133: AtSAL2, 36454: AtSAL4, 52125: mammalian, 42852: mammalian, 87894: mammalian, 27803: mammalian, 23604: mammalian, 49932: mammalian, 53531: mammalian

# 200826 Restart from w_blinded  
I'm going to skip CDHit and trim using t_coffee, then resubmit to EFI for Cytoscape visualisation 


Ran:  
t_coffee -other_pg seq_reformat -in w_blinded.fasta -action +trim _seq_%%90_O10 >200826_blinded_trimed.fasta   
This is approx. 20k sequences and turns out is too large to run on flashheart  
Submitted to CDHit at 0.95 with default values which reduced it to ~2800 sequences  
This was then run on t_coffee with:  
* t_coffee -other_pg seq_reformat -in 200826_cdhit_95.fasta -action +trim _seq_n1500 -output fasta_seq >200826_chit_95_trimmed.fasta  
</br>  
AtSAL and ordered sequences were added  
* fasta file was submitted to EFI-EST 'Fasta (option C)', E-value = 1, fragments were left in  
EFI-EST Job ID: 48003  
Computation Type: FASTA (Option C), no FASTA header reading  
Job Name: 200826_chit_95_trimed.fasta  
Uploaded Fasta File: 200826_chit_95_trimed.fasta  
E-Value: 1  
Fraction: 1  
</br>  
  
EST settings were non-selective (download all); I'll manually remove sequences of likely wrong lengths  
Analysis Job ID: 58552  
Minimum Length: 0  
Maximum Length: 50000  
Filter Type: E-Value  
Filter Value: 3  
Network Name: 200826_chit_95_trimed_fasta  


In [None]:
# Ordered and At sequences were removed first

ssn_dir = 'Output/200827_ssn/hax'

ahl = pd.read_csv(f'{ssn_dir}/200827_ahl.csv', sep=',')
sal = pd.read_csv(f'{ssn_dir}/200827_SAL.csv', sep=',')
other = pd.read_csv(f'{ssn_dir}/200827_other.csv', sep=',')
unid = pd.read_csv(f'{ssn_dir}/200827_unid.csv', sep=',')

In [None]:
# Label -> dataframe pairs; each pair supplies the first two arguments of the 'clusters' function
d = dict(ahl=ahl, sal=sal, other=other, unid=unid)

# Calls 'clusters' once for every label/dataframe pair of the dictionary
for k, v in d.items():
    clusters(k, v, 'Output/master_blinded.txt')

w_ahl_cluster.txt, w_sal_cluster.txt and w_other_cluster.txt were renamed to 200827_ahl.fasta, 200827_sal.fasta and 200827_other.fasta respectively, with the At and ordered sequences added back in, and put in Alignments/200827

# 200830 Restart from w_blinded.fasta

200830 - took w_blinded.fasta and ran through cdhit at 90, then ran  
t_coffee -reg -seq 200830_cdhit_90.fasta -nseq 100 -tree mbed -method mafftginsi_msa -outfile 200830_aln -outtree 200830_aln.mbed -thread=9  
(-thread didn't do anything)
Went through and removed suspected signal peptides, cut off messy N-terminal sequences and removed any short sequences; sequences that were short at the N-terminus and did not start with M were deleted.  
</br>
EFI-EST Job ID: 48130  
Computation Type: FASTA (Option C), no FASTA header reading  
Job Name: 200830_aln_for_ssn.fasta  
Uploaded Fasta File: 200830_aln_for_ssn.fasta  
E-Value: 5  
Fraction: 1  
</br>
Analysis Job ID: 58681  
Minimum Length: 0  
Maximum Length: 50000  
Filter Type: E-Value  
Filter Value: 10  
Network Name: 200830_aln_ssn_fasta  


In [None]:
#Uses BioPython sequence cleaner script, modified for my use. Removes sequences with non-amino acid residues 

import sys
from Bio import SeqIO

def sequence_cleaner(fasta_file, por_n=0):
    """
    Writes a copy of a fasta file without duplicate sequences and without sequences
    that contain too many 'X' characters.

    fasta_file: path of the fasta file to read (parsed with Bio.SeqIO)
    por_n: highest percentage of 'X' characters a sequence may contain and still be kept

    The kept sequences are upper-cased and written to a file named "clear_" + the input
    file name, placed in the directory of the input file. When several records share a
    sequence, only the id of the first one is kept. Records that are empty or exceed
    por_n are dropped; the ids of all dropped records are printed. Returns nothing.
    """
    # Maps every kept sequence to the id of the first record that carried it
    sequences = {}
    dropped_list = []

    for seq_record in SeqIO.parse(fasta_file, "fasta"):
        sequence = str(seq_record.seq).upper()

        # An empty record cannot be assessed (the percentage would divide by zero), so it is dropped
        if not sequence:
            dropped_list.append(seq_record.id)
            continue

        if sequence.count("X") / len(sequence) * 100 <= por_n:
            sequences.setdefault(sequence, seq_record.id)
        else:
            # Every dropped id is recorded, also when two dropped records share a sequence
            dropped_list.append(seq_record.id)

    # The prefix goes on the file name only, so a path with a directory part stays valid
    directory, file_name = os.path.split(fasta_file)
    with open(os.path.join(directory, "clear_" + file_name), "w") as output_file:
        for sequence, seq_id in sequences.items():
            output_file.write(">" + seq_id + "\n" + sequence + "\n")

    print("These sequences were dropped")
    print(dropped_list)

In [None]:
# Ran 200831

sequence_cleaner('200830_aln_for_ssn.fasta')

# Sequence cleaner is run at this stage because duplicate sequences may not be caught until signal peptides are removed

Run on 200831, returned:  
These sequences were dropped  
['69375', '60666', '34646', '40609']

In [None]:
def clusters(k, v, master):
    """
    Writes the unique ids of one SSN cluster and a fasta file of the sequences of that cluster.

    k: label of the cluster; used in both output file names and added to every fasta header
    v: dataframe of the Cytoscape table of the cluster; its 'Description' column holds the
       unique ids, several ids in one cell being separated by '|'
    master: path of the master fasta file; every header must start with the unique id

    Creates "{k}_cluster.txt" (one unique id per line, sorted, overwritten on every call) and
    "w_{k}_cluster.txt" (fasta records with the header ">{unique id} {k}"; opened in append
    mode, so calling the function again adds to the existing file). Both results are passed to log().
    Raises ValueError when a header of the master file does not start with a number.
    """
    import re

    temp_set = set()
    for description in v['Description'].tolist():
        temp_set.update(int(n) for n in description.split("|"))

    # Sorted so that the id file is identical between runs (a set has no fixed order)
    with open(f'{k}_cluster.txt', 'w') as temp:
        for unid in sorted(temp_set):
            temp.write(str(unid) + '\n')
    count = len(temp_set)

    # The id is the run of digits right after '>', whatever its length
    leading_id = re.compile(r'>\s*(\d+)')

    # Converts the master fasta file into a dictionary of unique id -> sequence.
    # Wrapped sequence lines are joined and blank lines are ignored.
    temp_dict = {}
    current = None
    with open(master, 'r') as blinded:
        for line in blinded:
            line = line.rstrip('\n')
            if line.startswith('>'):
                match = leading_id.match(line)
                if match is None:
                    raise ValueError(f'fasta header without a numeric unique id in {master}: {line!r}')
                current = int(match.group(1))
                temp_dict[current] = ''
            elif current is not None and line:
                temp_dict[current] += line

    # Keeps the records whose unique id was found in the SSN cluster table
    w_bdict = {x: y for x, y in temp_dict.items() if x in temp_set}

    # Appends the kept records to the cluster fasta file
    with open(f'w_{k}_cluster.txt', 'a') as w_bofile:
        for x, y in w_bdict.items():
            w_bofile.write('>' + str(x) + ' ' + k + '\n')
            w_bofile.write(y + '\n')
    countw = len(w_bdict)

    log(f'w_{k}_cluster.txt was created/appended with {countw} sequences from the {k} cluster table, derived from {master}')

    log(f'{k}_cluster.txt was created with {count} unique IDs')

In [None]:
# Reads each Cytoscape sequence table with pandas; every dataframe is named after its colour

ssn_200830 = 'Output/200830_ssn'

red = pd.read_csv(f'{ssn_200830}/200830_red.csv', sep=',')
blue = pd.read_csv(f'{ssn_200830}/200830_blue.csv', sep=',')
cyan = pd.read_csv(f'{ssn_200830}/200830_cyan.csv', sep=',')
pink = pd.read_csv(f'{ssn_200830}/200830_pink.csv', sep=',')
tan = pd.read_csv(f'{ssn_200830}/200830_tan.csv', sep=',')
green = pd.read_csv(f'{ssn_200830}/200830_green.csv', sep=',')
forest = pd.read_csv(f'{ssn_200830}/200830_forest.csv', sep=',')


In [None]:
# Ran 200831

# Colour -> dataframe pairs; each pair supplies the first two arguments of the 'clusters' function
d = dict(blue=blue, cyan=cyan, pink=pink, tan=tan, red=red, green=green, forest=forest)

# Calls 'clusters' once for every colour/dataframe pair of the dictionary
for k, v in d.items():
    clusters(k, v, 'Output/master_blinded.txt')

Renamed w_{colour}_cluster.txt files to {colour}.fasta and aligned using:  
  


In [None]:
def named_cluster(date,
                  colours=('blue', 'cyan', 'pink', 'tan', 'red', 'green', 'forest'),
                  cluster_dir='Output/200830_ssn',
                  index_file='master_index.txt'):
    """
    Generates one fasta file per cluster, with headers in the format >Unid Species Tax_1.

    date: string placed in front of every output file name ("{date}_{colour}.fasta")
    colours: names of the clusters; "{colour}_cluster.txt" (one unique id per line) is read for each
    cluster_dir: directory that holds the "{colour}_cluster.txt" files
    index_file: tab-separated index file with the columns Unid, Sequence, Species and Tax_1;
                the sequences are taken from this file

    Records are keyed by sequence, so index rows that share a sequence give a single record
    (the header of the last such row is used). Every output file is overwritten, and the
    number of records written is passed to log().
    """
    # Reads the index file once for all clusters
    index = pd.read_csv(index_file, sep="\t")

    for colour in colours:
        cluster_file = f'{cluster_dir}/{colour}_cluster.txt'

        # Unique ids of the cluster as int (not string); blank lines are skipped
        with open(cluster_file, 'r') as ofile:
            seq_list = [int(line) for line in ofile if line.strip()]

        # Rows of the index whose 'Unid' is in seq_list, reduced to the columns of the header
        builder = index.loc[index['Unid'].isin(seq_list), ["Unid", "Sequence", "Species", "Tax_1"]]

        # Sequence -> (Unid, Species, Tax_1)
        records = {}
        for unid, sequence, species, tax_1 in builder.itertuples(index=False, name=None):
            records[sequence] = (unid, species, tax_1)

        # Writes the fasta file with the header >Unid Species Tax_1
        with open(f'{date}_{colour}.fasta', 'w') as alnfile:
            for sequence, (unid, species, tax_1) in records.items():
                alnfile.write('>' + str(unid) + ' ' + str(species) + ' ' + str(tax_1) + '\n')
                alnfile.write(sequence + '\n')
        countw = len(records)

        log(f'{date}_{colour}.fasta was created with {countw} sequences from the {cluster_file} file, sequences derived from {index_file}')

In [None]:
named_cluster("200901") # Ran 200901, will align as above. 

t_coffee drops anything in a fasta header that is after a space. *anger *
Below fixes that

I'm going to try to align all the sequences in 200830_cdhit_90.fasta and do the clustering manually 

In [None]:
def named_aln(file_name):
    """
    Takes an aligned fasta file (path as a string) whose headers are unique ids and writes
    a copy with Species and Tax_1 info from master_index.txt added to every header,
    in the format >Unid|Species|Tax_1.

    file_name: path of the aligned fasta file; the output is written to
               "{file_name}_named.fasta"

    Only records whose unique id is present in master_index.txt are written. Records are
    keyed by aligned sequence, so records with an identical aligned sequence give a single
    output record (the header of the last one is used). Returns nothing.
    Raises ValueError when a header is not a whole number.
    """
    # Reads the master index file
    index = pd.read_csv("master_index.txt", sep="\t")

    # Unid -> aligned sequence; wrapped sequence lines are joined, blank lines are ignored
    aln_dict = {}
    current = None
    with open(file_name, 'r') as alnfile:
        for line in alnfile:
            line = line.rstrip('\n')
            if line.startswith('>'):
                current = int(line.lstrip('>'))
                aln_dict[current] = ''
            elif current is not None and line:
                aln_dict[current] += line

    # Dataframe of the alignment with two columns, "Unid" and "Sequence"
    aln = pd.DataFrame({
        'Unid': pd.Series(list(aln_dict.keys()), dtype='int64'),
        'Sequence': pd.Series(list(aln_dict.values()), dtype='object'),
    })

    # Columns of the index needed to build the new fasta header
    sp = index[["Unid", "Species", "Tax_1"]]

    # Merges the aligned sequences with the header info on the Unid
    builder = pd.merge(aln, sp, how='inner', on='Unid')

    # Aligned sequence -> (Unid, Species, Tax_1)
    records = {}
    for unid, sequence, species, tax_1 in builder[['Unid', 'Sequence', 'Species', 'Tax_1']].itertuples(index=False, name=None):
        records[sequence] = (unid, species, tax_1)

    # Writes the new fasta file (not the input handle) with the header >Unid|Species|Tax_1
    with open(f'{file_name}_named.fasta', 'w') as alnfile2:
        for sequence, (unid, species, tax_1) in records.items():
            alnfile2.write('>' + str(unid) + '|' + str(species) + '|' + str(tax_1) + '\n')
            alnfile2.write(sequence + '\n')


Added in the control sequences to sequence cleaner output, clear_200830_cdhit_90.fasta (renamed to 200901_all.fasta), and aligned using t_coffee regression

In [None]:
named_aln("200901_all_aln_w.fasta")

Manually curating the above, then will continue manually clustering. Note in methods: identification of isoforms, and truncated sequences likely from endonucleases 

# 200909 Final branch

Using the phylogeny from 200830 to go forward.   
Removing outliers (long branch/poor align) / duplicates manually. Generated the list of unique ids of the ordered genes to keep, below:  
(need to keep the sequences of genes we already have in the phylogeny)

In [None]:
# Unique ids of the ordered genes to keep
kp_dup = [52894, 65326, 90543, 73058, 23687, 61093, 76228, 88670, 65848, 73944, 22979, 52629, 71462, 45128, 74987]

In [None]:
# First manually remove outlier sequences from the profile alignment (48803, 68977, 71379, 94733, 59845, 43533, 93150, 72690) and duplicate, 38676

def raw_clust(cluster, profile):
    """
    Finds the sequences present in both a cluster fasta file and a profile alignment, and
    writes them as a new fasta file with the unmodified sequences from 'master_index.txt'.

    cluster: name of the cluster fasta file without the '.fasta' extension
    profile: name of the alignment fasta file without the '.fasta' extension

    The headers of both files are compared as text (after the leading '>'). The result is
    written to "raw_{cluster}.txt" with headers of the form ">Unid", and the number of
    sequences is passed to log().
    """
    def _headers(path):
        # Set of the header texts of a fasta file, without '>' and surrounding whitespace
        with open(path, 'r') as handle:
            return {line.lstrip('>').strip() for line in handle if line.startswith('>')}

    # Intersection of the headers of both files
    subset = _headers(f'{cluster}.fasta') & _headers(f'{profile}.fasta')

    index = pd.read_csv("master_index.txt", sep="\t")

    # The headers are text, so 'Unid' is compared as text too; comparing a numeric
    # column with strings matches nothing on current pandas versions
    subset_df = index[index['Unid'].astype(str).isin(subset)]

    # Converts subset_df into a dictionary where the key is the unid and the value is the sequence
    fdict = dict(zip(subset_df.Unid, subset_df.Sequence))

    # Writes a new fasta file with the unique ids found in the dictionary fdict
    with open(f'raw_{cluster}.txt', 'w') as ofile:
        for x, y in fdict.items():
            ofile.write('>' + str(x) + '\n')
            ofile.write(y + '\n')

    log(f'raw_{cluster}.txt was created with {len(fdict)} sequences from the {cluster}.fasta alingment - part of the 200830 branch, derived from master_index.txt')
        

In [None]:
raw_clust("green", "200905_u_t1_mod")

Run the above for red, blue, green  
aln with regression  
t_coffee -reg -seq {file}.fasta -nseq 100 -tree mbed -method mafftginsi_msa -outfile raw_{file}_aln.fasta -outtree {file}.mbed -thread 9 
curate - manually removed any seq with large insertions or deletions   

I will manually remove the outlier sequences (48803, 68977, 71379, 94733, 59845, 43533, 93150, 72690) and the duplicate (38676) of the profile alignment from the aligned files (red.fasta etc.)  
  
Then run t-coffee trim on them  
SAL 200 sequences, other two 150 each  
t_coffee -other_pg seq_reformat -in [file].fasta -in2 ordered_to_keep.fasta -action +trim _seq_n200 -output fasta_seq -outfile red_trim.fasta  
    this takes the fasta file and trims it to n200 (keeps 200 sequences) while making sure that sequences in 'ordered_to_keep.fasta' are retained 

In [None]:
# Takes the trimmed clusters (not aligned), generates a list of unids, then takes the aligned files and makes a new fasta file from the intersection.
# This new fasta file holds the same sequences as the trimmed file but uses the aligned versions from the original file

def final_clust(cluster):
    """
    Writes the aligned versions of the sequences that were kept in a trimmed cluster file.

    cluster: name of the cluster; "{cluster}_trim.fasta" (trimmed, not aligned) supplies the
             unique ids to keep and "{cluster}.fasta" (aligned, not trimmed) supplies the sequences

    The result is written to "{cluster}_f.fasta" with headers of the form ">Unid", and the
    number of sequences is passed to log().
    Raises ValueError when a header of either file does not start with a number.
    """
    import re

    # The unique id is the run of digits right after '>', whatever its length
    leading_id = re.compile(r'>\s*(\d+)')

    def _unid(header, path):
        match = leading_id.match(header)
        if match is None:
            raise ValueError(f'fasta header without a numeric unique id in {path}: {header!r}')
        return int(match.group(1))

    trim_path = f'{cluster}_trim.fasta'
    with open(trim_path, 'r') as clust:
        # A set, so that every membership test below is constant time
        clust_lst = {_unid(line.rstrip('\n'), trim_path) for line in clust if line.startswith('>')}

    # Converts the original (aligned, not trimmed) fasta file into a dictionary of
    # unique id -> sequence; wrapped sequence lines are joined, blank lines are ignored
    aln_path = f'{cluster}.fasta'
    temp_dict = {}
    current = None
    with open(aln_path, 'r') as aln:
        for line in aln:
            line = line.rstrip('\n')
            if line.startswith('>'):
                current = _unid(line, aln_path)
                temp_dict[current] = ''
            elif current is not None and line:
                temp_dict[current] += line

    # Keeps the aligned records whose unique id is in the trimmed file
    fdict = {x: y for x, y in temp_dict.items() if x in clust_lst}

    # Writes a new fasta file with the unique ids found in the dictionary fdict
    with open(f'{cluster}_f.fasta', 'w') as ofile:
        for x, y in fdict.items():
            ofile.write('>' + str(x) + '\n')
            ofile.write(y + '\n')

    log(f'{cluster}_f.fasta was created with {len(fdict)} sequences from the {cluster}_trim.fasta file - part of the 200830 branch, derived from {cluster}.fasta alignment')

In [None]:
final_clust("green")

did profile alignment  
then model finder using:  
iqtree2 -s 200909_profile_aln.fasta -m MF -mset WAG,LG,JTT,UL2,UL3,EX_EHO,LG4M,LG4X,CF4 -T AUTO  




# GADI tree handling

In [None]:
import os

In [None]:
# Copies the tree files of all runs that finished. A run directory is considered when it holds
# at least 12 files (a necessary but not sufficient condition for a finished run) and is
# accepted when its log file states that the results were written out.

bulk_dir = "200916_bulk1"
log_name = '200909_profile_aln.fasta.log'
tree_name = '200909_profile_aln.fasta.treefile'

ran = set()
tree_lst = []
tree_count = 0

# Adds every directory below bulk_dir that holds more than 11 files to 'ran'
for dir_path, _sub_dirs, file_names in os.walk(bulk_dir):
    if len(file_names) > 11:
        ran.add(dir_path)

# Checks the log file of each candidate for the string 'Analysis results written to', which must
# be present for the tree building run to have completed. Completed runs are added to 'tree_lst'
# by their path relative to bulk_dir. Sorted, so that the order of the trees is the same on every run.
for dir_path in sorted(ran):
    run_name = os.path.relpath(dir_path, bulk_dir)
    log_path = os.path.join(dir_path, log_name)

    # A directory without a log file is reported as failed instead of stopping the cell
    if not os.path.isfile(log_path):
        print(str(run_name) + " failed")
        continue

    with open(log_path, 'r') as tree_log:
        if 'Analysis results written to' in tree_log.read():
            tree_lst.append(run_name)
        else:
            print(str(run_name) + " failed")

# Copies the tree file of each completed run into the single file 'all_trees.nwk' and records
# the runs, in the same order, in 'tree_lst_AU.txt'.
# NOTE: both files are opened in append mode, so running this cell again adds to them.
with open(f'{bulk_dir}/all_trees.nwk', 'a') as op_file, \
        open(f'{bulk_dir}/tree_lst_AU.txt', 'a') as record:

    for i in tree_lst:
        tree_count += 1
        with open(f'{bulk_dir}/{i}/{tree_name}', 'r') as temp_file:
            for line in temp_file:
                op_file.write(line)
            op_file.write('\n')

        record.write(str(tree_count) + '\t' + str(i) + '\n')

print(tree_count)
print(len(ran))
print(len(tree_lst))

# SSN file generation  
The cells below take the sequences used in the initial phylogeny to determine the root and creates a new version of unaligned sequences with a detailed fasta header 

In [None]:
# Opens the relevant fasta file and stores the header text (after '>') of every record in 'seq_list'

with open('200830_aln_for_ssn.fasta', 'r') as ofile:
    seq_list = [line[1:].rstrip('\n') for line in ofile if line.startswith('>')]

print(str(len(seq_list)) + " sequences total")

In [None]:
# Takes 'seq_list' from above and finds all matching entries in the master index
index = pd.read_csv('master_index.txt', sep='\t')

# Makes a dataframe, 'ssn', of the index rows whose 'Unid' is in seq_list.
# seq_list holds text, so 'Unid' is compared as text; comparing a numeric column with
# strings matches nothing on current pandas versions.
# .copy() makes ssn independent of index, so columns can be assigned to it afterwards
ssn = index[index['Unid'].astype(str).isin(seq_list)].copy()

In [None]:
# Converts the 'Unid' column to str; every column joined below must hold strings
ssn['Unid'] = ssn['Unid'].astype(str)

# Builds the column 'new_head' by joining the values of the header columns of each row with '|'
header_columns = ['Unid', 'Datasource', 'Species']
ssn['new_head'] = ssn[header_columns].agg('|'.join, axis=1)

In [None]:
# Dictionary with the values of column 'new_head' as keys and of column 'Sequence' as values
temp_dict = dict(zip(ssn['new_head'], ssn['Sequence']))

In [None]:
# Appends the records of the above dict to a fasta file (header line, then sequence line)

with open('fig_1_ssn.fasta', 'a') as alnfile:
    for k, v in temp_dict.items():
        alnfile.write('>' + str(k) + '\n')
        alnfile.write(v + '\n')

# Make working index with full taxonomic info for figures  
Rather than rely on the full index, which has ~3k more sequences than are needed, a subset index is created of only the 500 final sequences in the tree (not including the AtSAL sequences)

In [None]:
# Took final alignments of the three main clades from ~/Bioinformatics\06.restart_200830\02.Profile_alignment\02.Main_profile_alignment\01.Main_clades\02.Final_alignment_profiles
# blue_f.fasta
# green_f.fasta
# red_f.fasta

In [None]:
clust_lst = []
files = ['red', 'green', 'blue']

# Collects the Unids (as int) of all headers in each of the clade fasta files
for clade in files:
    with open(f'{clade}_f.fasta', 'r') as clust:
        clust_lst.extend(
            int(line.rstrip('\n').lstrip('>')) for line in clust if line.startswith('>')
        )

index = pd.read_csv("master_index.txt", sep="\t")

# New index dataframe holding only the rows whose Unid is in the list generated above
wrk_index = index[index['Unid'].isin(clust_lst)]

In [None]:
wrk_index.count()

In [None]:
wrk_index.to_csv('temp_index.csv', sep=',', index=False)

Below cells take the 1kp file 'annotations' of the taxonomic classifications for all species in the 1kp dataset, and merges it with the wrk_index dataframe created above, effectively 
redoing the taxonomic classification from the original index file.

In [None]:
anno = pd.read_csv("annotations.csv", sep=",")

In [None]:
anno.head()

In [None]:
# Joins wrk_index with the annotations dataframe on the 'Species' column. This has the added
# benefit of annotating the taxonomic classification of some of the Phytozome species
new_index = wrk_index.merge(anno, on=['Species'])

In [None]:
new_index.count()

In [None]:
new_index.to_csv('new_index.csv', sep=',', index=False)

In [None]:
# Manually corrected new_index, now reloading
new_index2 = pd.read_csv("new_index.csv", sep=",")

In [None]:
# Uses new_index2, the manually corrected version reloaded from new_index.csv in the cell
# above; the uncorrected new_index would discard those corrections
min_index = new_index2.drop_duplicates(subset=['Unid'])

In [None]:
min_index.count()

In [None]:
# Compares wrk_index with min_index (the version without duplicates) and builds a dataframe of the
# rows that were lost: stacking both and dropping every Unid that occurs more than once leaves the
# Unids found in only one of them. The cells below add them back in for manual correction
stacked = pd.concat([wrk_index, min_index])
missing = stacked.drop_duplicates(subset=['Unid'], keep=False)

In [None]:
missing.count()

In [None]:
missing_sp = missing['Unid'].tolist()

In [None]:
missing_index = index[index['Unid'].isin(missing_sp)]

In [None]:
wrk_index2 = pd.concat([min_index, missing_index])

In [None]:
wrk_index2.to_csv('wrk_index.csv', sep=',', index=False)

In [None]:
# After manual correction of wrk_index.csv: reloads it and collects the distinct values of
# 'Brief_Classification', for factor level ordering in R
class_lst = pd.read_csv("wrk_index.csv", sep=",")
B_class_lst = set(class_lst['Brief_Classification'].tolist())
B_class_lst

wrk_index.csv was then manually curated and saved as wrk_index2.csv which will be used going forward

#  Make a tree file of all the trees in the dir 01.top20_AU

In [None]:
# NOTE: the final consensus tree of these 20 trees did not end up being used.

def list_top_level_files(directory):
    """Return the names of the files located directly inside *directory*.

    Sub-directories are not descended into, so a stray nested folder cannot
    replace the result. An empty list is returned when the directory cannot
    be walked (for example when it does not exist).
    """
    # os.walk yields the top directory first; only that first triple is wanted
    _root, _subdirs, filenames = next(os.walk(directory), (directory, [], []))
    return filenames


# Makes a list of files in the directory
ran = list_top_level_files("01.top20_AU")

In [None]:
# Report the collected file names followed by how many there are
print('List of files: ', ran, '', 'Number of files: ', len(ran), sep='\n')

In [None]:
def append_file(name, lst, src_dir='01.top20_AU'):
    '''
    Concatenates a list of files into one Newick file.

    name    -- name of the output file without its extension; the lines are
               appended to f'{name}.nwk' (existing content is kept)
    lst     -- iterable of input file names, including any extension, that are
               located inside src_dir
    src_dir -- directory holding the input files (defaults to '01.top20_AU')

    Every input line is written followed by exactly one newline, so the last
    line of a file that lacks a trailing newline is still separated from the
    first line of the next file, and no blank lines are inserted.
    '''
    with open(f'{name}.nwk', 'a') as op_file:
        for fname in lst:
            with open(os.path.join(src_dir, fname), 'r') as temp_file:
                for line in temp_file:
                    # lines read from a file keep their own newline; strip it before adding one
                    op_file.write(line.rstrip('\n') + '\n')

In [None]:
# Takes all 20 trees and makes one tree file (top20_AU.nwk) out of them for use in consensus tree building
append_file("top20_AU", ran)

# Find species that are in all three homologues

In [None]:
# Load the curated index and add a 'Details' column that joins Species and Full_Classification with a tab
index = pd.read_csv('wrk_index2.csv', sep=',')
index["Details"] = index["Species"] + "\t" + index["Full_Classification"]
index = index.sort_values(by = 'Details')

# Refers to the final fasta files that made up the 3 clades in the 200909_profile_aln.fasta file
# that was used to build the GADI trees. Does not include the AtSALs
files = ['red', 'green', 'blue']

red_sp = []
green_sp = []
blue_sp = []

df_list = []

for i in files:
    # Makes a list of all unids (header lines without the leading '>') in fasta file i
    with open(f'10.Genes_for_expression/{i}_f.fasta', 'r') as clust:
        clust_lst = [str(line.rstrip('\n').lstrip('>')) for line in clust if line.startswith('>')]

    # Only the three known clade names are handled; anything else stops the loop
    if i not in ('red', 'green', 'blue'):
        break

    # Subset of the index for this clade, plus the list of its 'Details' values
    subset = index[index['Unid'].isin(clust_lst)]
    details = subset["Details"].to_list()

    if i == 'red':
        df_red, red_sp = subset, details
    elif i == 'green':
        df_green, green_sp = subset, details
    else:
        df_blue, blue_sp = subset, details

    print(len(clust_lst))



In [None]:
#Keep, shows species that are in all three homologues
from collections import Counter

# Pools the 'Details' entries of the three clade lists generated in the cell above
rgb = red_sp + green_sp + blue_sp

# Occurrence count of each 'Details' string. Counter counts in a single pass and keeps
# first-seen key order, so the resulting dicts equal the per-item list.count() versions.
rgb_dict = dict(Counter(rgb))
red_dict = dict(Counter(red_sp))
green_dict = dict(Counter(green_sp))
blue_dict = dict(Counter(blue_sp))

# Distinct 'Details' strings per clade
red_k = set(red_dict.keys())
green_k = set(green_dict.keys())
blue_k = set(blue_dict.keys())
rgb_k = set(rgb_dict.keys())

# Entries present in all three clades, and entries shared by red with exactly one other clade
intersect_all = red_k & green_k & blue_k
intersect_r_g = (red_k & green_k) - intersect_all
intersect_r_b = (red_k & blue_k) - intersect_all


In [None]:
def write_species_fasta(index_df, details, path):
    """Write the rows of *index_df* whose 'Details' value is in *details* to a FASTA file.

    index_df -- dataframe with Unid, Species, Full_Classification, Sequence and Details columns
    details  -- collection of 'Details' strings (Species + tab + Full_Classification) to keep
    path     -- output file; it is overwritten

    Returns the filtered dataframe (with an added 'fasta' column) and the list of FASTA records.
    """
    # The intersect_* sets hold 'Details' strings, so the filter has to use the 'Details' column;
    # comparing them against the bare 'Species' column can never match.
    # .copy() makes the subset independent so adding a column does not write into a slice of index_df.
    subset = index_df[index_df['Details'].isin(details)].copy()
    subset['fasta'] = (
        '>' + subset["Unid"].astype(str) + '|' + subset["Species"] + "|"
        + subset["Full_Classification"] + '\n' + subset["Sequence"]
    )
    records = subset['fasta'].tolist()
    with open(path, 'w') as ofile:
        for record in records:
            ofile.write(record + '\n')
    return subset, records


iall_df, iall = write_species_fasta(index, intersect_all, 'Species_in_all.fasta')
ir_g_df, ir_g = write_species_fasta(index, intersect_r_g, 'Species_in_SAL_AHL.fasta')
ir_b_df, ir_b = write_species_fasta(index, intersect_r_b, 'Species_in_SAL_Unid.fasta')

In [None]:
#Creates a file that lists which species are found in all three clades, and which are found in SAL+AHL / SAL+Unid

# Section title -> set of entries belonging to that section, in output order
sections = [
    ('Species in all homologues: ', intersect_all),
    ('Species in SAL & AHL (but not Unid): ', intersect_r_g),
    ('Species in SAL & Unid (but not AHL): ', intersect_r_b),
]

with open('Species_distro_across_homologues.txt', 'w') as ofile:
    print(date_time + '\n')
    ofile.write(date_time + '\n')
    for position, (title, members) in enumerate(sections):
        # a blank line separates consecutive sections
        if position:
            print("")
            ofile.write("\n")
        # the same title goes to the screen and to the file, so the two cannot drift apart
        print(title)
        ofile.write(title + '\n')
        # rgb_dict is iterated (rather than the set) to keep its ordering in the output
        for k, v in rgb_dict.items():
            if k in members:
                entry = str(v) + "\t" + k
                print(entry)
                ofile.write(entry + '\n')

The below cells generate a file with all the algal sequences in the phylogeny 

In [None]:
# Brief_Classification values that are kept as algae
algae_lst = ['Rhodophyta', 'Chromista', 'Glaucophyta', 'Streptophyta', 'Chlorophyta']

# Keep, for each clade dataframe, only the rows whose Brief_Classification is in algae_lst
a_red, a_green, a_blue = (
    clade_df.loc[clade_df['Brief_Classification'].isin(algae_lst)]
    for clade_df in (df_red, df_green, df_blue)
)

# Stack the three filtered clades into one dataframe
algae_df = pd.concat([a_red, a_green, a_blue], join = 'outer')

In [None]:
# Builds one FASTA record per row: a '>Unid|Species|Brief_Classification' header, a newline, then the sequence.
# NOTE(review): the concatenation assumes Unid is stored as text - confirm the dtype read from wrk_index2.csv
algae_df['Fasta'] = '>' + algae_df["Unid"] + '|' + algae_df["Species"] + "|" + algae_df["Brief_Classification"]+ '\n' + algae_df["Sequence"]

In [None]:
# Collect the FASTA records and write them, one after another, to Algae_candidates.fasta
a_lst = algae_df['Fasta'].tolist()

with open('Algae_candidates.fasta', 'w') as ofile:
    ofile.writelines(str(record) + '\n' for record in a_lst)

In [None]:
# Show how many non-missing values each column of algae_df holds
algae_df.count()

# For Ancestral Sequence Reconstruction (ASR)

In [None]:
def raw_clust2():
    """
    Takes the Unids from the alignment used to make the phylogeny, and remakes the fasta file with 
    the raw sequences taken from wrk_index2.csv.

    For each of red_f.fasta, green_f.fasta and blue_f.fasta the header lines are read as Unids and
    ASR_<name>.fasta is written with one record per matching index row. Each header has the form
    '>Unid_Species_Brief_Classification'. A log entry is written for every file created.
    """

    cluster = ['red_f', 'green_f', 'blue_f']
    index = pd.read_csv("wrk_index2.csv", sep=",")

    # FASTA headers are read as text, while read_csv may infer Unid as a number;
    # comparing on the string form makes the match independent of the inferred dtype
    unid_as_text = index['Unid'].astype(str)

    for i in cluster:
        with open(f'{i}.fasta', 'r') as clust:
            clust_lst = [line.rstrip('\n').lstrip('>') for line in clust if line.startswith('>')]

        # Makes a new index dataframe of unids found in subset; .copy() so that the new
        # columns are not written into a slice of index
        subset_df = index[unid_as_text.isin(clust_lst)].copy()

        subset_df['Unid'] = subset_df.Unid.astype(str)
        subset_df['new_head'] = '>' + subset_df["Unid"] + '_' + subset_df["Species"] + "_" + subset_df["Brief_Classification"]

        # Converts subset_df into a dictionary where the key is the new header and the value is the sequence
        fdict = dict(zip(subset_df.new_head, subset_df.Sequence))

        # Writes a new fasta file with the records found in the dictionary fdict
        with open(f'ASR_{i}.fasta', 'w') as ofile:
            for header, sequence in fdict.items():
                # new_head already starts with '>', so no second '>' is added here
                ofile.write(str(header) + '\n')
                ofile.write(sequence + '\n')

        log(f'ASR_{i}.fasta was created with {len(fdict)} sequences from the {i}.fasta file. This has the raw, unmodified sequences taken from wrk_index2.csv that are in the final phylogeny ')

In [None]:
# Writes the ASR_red_f, ASR_green_f and ASR_blue_f fasta files
raw_clust2()

In [None]:
# Refers to the final fasta files that made up the 3 clades in the 200909_profile_aln.fasta file
# that was used to build the GADI trees. Does not include the AtSALs
files = ['red', 'green', 'blue']

# Copies each aligned ASR file, cutting every header down to its first 5 characters
# (after any leading '>' or '_' characters); sequence lines are copied unchanged
for i in files:
    # both handles are managed by `with`, so they are closed even if reading fails part-way,
    # and the output file is only created once the input file has been opened
    with open(f'00.ASR_aln_files/ASR_{i}_f_aln.fasta', 'r') as clust, \
            open(f'00.ASR_aln_files/ASR_{i}.fasta', 'w') as ofile:
        for line in clust:
            if line.startswith('>'):
                # drop the newline before slicing so a short id cannot carry it into the header
                a = line.rstrip('\n').lstrip('>_')
                ofile.write('>' + a[:5] + '\n')
            else:
                ofile.write(line)

In [None]:
# Quality control script to check that all sequences in the phylogeny are in the ASR alignment

l1 = []
l2 = []

# One tree tip label per line
with open('tree_lst.txt', 'r') as tree_lst:
    for i in tree_lst:
        l1.append(str(i.rstrip('\n')))

# Header lines (without the leading '>') of the ASR alignment.
# The file is read in a single loop: nesting a second loop over the same handle
# consumes the first line in the outer loop, so the first header would be lost.
with open('ASR_lst.fasta', 'r') as ASR:
    for line in ASR:
        if line.startswith('>'):
            a = line.rstrip().lstrip('>')
            l2.append(a)
            print(a)

# The differences are printed explicitly; a bare expression in the middle of a cell is not displayed
print("Sequences in tree but not aln: ")
print(set(l1) - set(l2))

print("Sequences in aln but not tree: ")
print(set(l2) - set(l1))

# Spot check: prints the id once for every occurrence of '18271' in the tree list
for i in l1:
    if i == '18271':
        print(i)

print(len(l2))

# Makes three fasta files (SAL, AHL, UIM) from the final aligned file, but from the 3 fasta files that have the same number of sequences for Machine Learning 

In [None]:
# Takes trimmed clusters (not aligned), generates a list of unids and then takes the aligned file and makes a new fasta with the intersection.
# This new fasta file has the same sequences as the trimmed file but uses the aligned versions from the original file.

def final_clust(cluster):
    """
    Write f'{cluster}_f.fasta' with the aligned sequences of the ids listed in f'{cluster}.fasta'.

    cluster -- file name (without '.fasta') of the fasta file whose header lines give the ids to keep

    The aligned sequences are read from 200909_profile_aln.fasta, where a record is identified by
    the first 5 characters of its header. Sequences that are wrapped over several lines are joined
    back together. A log entry is written once the file has been created.
    """

    # Ids to keep: header lines of the cluster file without the leading '>'
    with open(f'{cluster}.fasta', 'r') as clust:
        clust_ids = {str(line.rstrip('\n').lstrip('>')) for line in clust if line.startswith('>')}

    # Converts the original, aligned and not trimmed, fasta file into a dictionary where the key is
    # the (shortened) fasta header and the value is the list of sequence lines of that record
    temp_dict = {}
    current = None
    with open('200909_profile_aln.fasta', 'r') as aln:
        for line in aln:
            if line.startswith('>'):
                # '>' plus the first 5 characters of the header identify the record
                current = str(line[:6].strip('\n').lstrip('>'))
                temp_dict[current] = []
            elif current is not None:
                # collect every line of a wrapped sequence instead of keeping only the last one
                temp_dict[current].append(line.strip('\n'))

    # Creates a new dictionary with only the records whose id was listed in the cluster file
    fdict = {x: ''.join(parts) for x, parts in temp_dict.items() if x in clust_ids}

    # Writes a new fasta file with the records found in the dictionary fdict
    with open(f'{cluster}_f.fasta', 'w') as ofile:
        for x, y in fdict.items():
            ofile.write('>' + str(x) + '\n')
            ofile.write(y + '\n')

    log(f'{cluster}_f.fasta was created with {len(fdict)} sequences from the 200909_profile_aln.fasta file - the final alignment. {cluster}.fasta is just a list of 150 sequences from that clade')

In [None]:
# Builds sal_ml_f.fasta from the ids listed in sal_ml.fasta
final_clust('sal_ml')

In [None]:
# Takes list of sequences from branches on the tree (from R) and then generates a fasta file of all sequences on that branch

# Header (without '>') -> full sequence, read from ASR_lst.fasta
temp_dict = {}

with open(f'ASR_lst.fasta', 'r') as fasta:
    k = None
    for line in fasta:
        if line.startswith('>'):
            k = line.rstrip().lstrip(">")
            temp_dict[k] = ''
        elif k is not None:
            # join the lines of a wrapped sequence instead of keeping only the last one
            temp_dict[k] += line.strip('\n')

# AHL_split_clades.txt holds lines starting with "Node:  " (the node id is taken from the
# last characters of the line), each followed by the tip labels belonging to that node
anc = None

with open('AHL_split_clades.txt', 'r') as tips:
    for i in tips:
        if i.startswith("Node:  "):
            anc = str(i[-5:].strip())
            # create the node file even if none of its tips is found, as the file is opened in append mode
            open(f'Node_{anc}_decendents.fasta', 'a').close()
            continue

        tip = i.strip()
        if not tip:
            continue
        if anc is None:
            raise ValueError(f'Tip {tip!r} is listed before the first "Node:  " line in AHL_split_clades.txt')

        # append the tip's record to the file of the node it is listed under
        if tip in temp_dict:
            with open(f'Node_{anc}_decendents.fasta', 'a') as fasta:
                fasta.write('>' + tip + '\n')
                fasta.write(temp_dict[tip] + '\n')

Takes a list of unids and then makes a fasta file from the original unedited sequences

In [30]:
def orig(node):
    """
    Append the sequences of the tips listed in f'node_{node}.txt' to f'Node_{node}_decendents.fasta'.

    node -- node number used to build both file names

    The tip file holds one integer Unid per line; the list of Unids is printed. The sequences are
    looked up in Phytozome_only.csv (columns Unid and Sequence) and written in the order in which
    they occur in that file, each as a '>Unid' header line followed by the sequence.
    """
    table = pd.read_csv("Phytozome_only.csv", sep=",")

    with open(f'node_{node}.txt', 'r') as tips:
        dec = [int(row.strip()) for row in tips]
    print(dec)

    # Unid -> sequence for every row of the table
    seq_by_unid = dict(zip(table.Unid, table.Sequence))

    with open(f'Node_{node}_decendents.fasta', 'a') as fasta:
        for unid, sequence in seq_by_unid.items():
            if unid in dec:
                fasta.write('>' + str(unid) + '\n')
                fasta.write(sequence + '\n')


In [31]:
# Appends the sequences of the tips listed in node_519.txt to Node_519_decendents.fasta
orig(519)

[90464, 81871, 33984, 25695, 41913, 87157, 23667, 66549, 52038, 62990, 97858, 47245, 63293, 58119, 38520, 80724, 94141, 98974, 77267, 96564, 92671, 54419, 56488, 54601, 73058, 57806, 80579, 97730, 17105, 76271, 20909, 93553, 23732, 51398, 99371, 90543, 17083, 81896, 31117, 69989, 79765, 21743, 89136, 66767, 61883, 33970, 84340, 49821, 95740, 66145, 59090, 19472, 26614, 36524, 29668, 82788, 29175, 59641, 20126, 23433, 89288, 56570, 84126, 65326, 31950, 69093, 76470, 49138, 81778, 19111, 53402, 61760, 83804, 85491, 90152, 82085, 14745, 86606, 77008, 71776, 73908, 43082, 73472, 19707, 54413, 77394, 61911, 12096, 48080, 52894, 82438, 31442, 67835, 56100, 64520, 44350, 18248, 90688, 50775, 67960, 57679, 60898, 30415, 79889, 51878, 60947, 60798, 68115, 41591, 90087, 95143, 60720, 50477, 73944, 97561, 95815, 21163, 33450, 17937, 74279, 24967, 51263, 62592, 76260, 58274, 31151, 31679, 16761, 98862, 81650, 49793, 94525, 40121, 54197, 47032, 57719, 33655, 30263, 54625, 85154, 76490, 76307, 28130

Protein analysis. Takes a list of fasta files of amino acid sequences and generates a csv with sequence attributes:
* Unid
* Sequence length
* Aromaticity 
* Instability index (iI; written to the `Instability_point` column)
* Isoelectric point (pI)
* Extinction coefficient (eC)
* Clade (the fasta file name)

In [12]:
# Takes a list of fasta files and uses BioPython to calculate sequence attributes
def seq_analysis(*args):
    """
    Write a csv of per-sequence attributes for the given clades.

    *args -- clade numbers; each one refers to a file named "Node_{number}_decendents.fasta"

    Every record of every file becomes one row holding the Unid (record id), sequence length,
    aromaticity, instability index, isoelectric point, the first value returned by
    molar_extinction_coefficient() and the clade number. The rows are written to
    'clades_{a}_{b}..._stats.csv', where the clade numbers are joined with underscores.
    """
    clade_lst = args

    # Suffix made of the clade numbers, used for labelling the final file
    clades = ''.join('_' + str(clade) for clade in clade_lst)

    # One dictionary per sequence, appended in file order
    protein_features = []

    for i in clade_lst:
        with open(f"Node_{i}_decendents.fasta") as handle:
            # Parses fasta file with BioPython
            for record in SeqIO.parse(handle, "fasta"):
                analysed_seq = ProteinAnalysis(str(record.seq))
                protein_features.append({
                    'Unid': record.id,
                    'Length': len(record.seq),
                    'Aromaticity': analysed_seq.aromaticity(),
                    'Instability_point': analysed_seq.instability_index(),
                    'Isoelectric_point': analysed_seq.isoelectric_point(),
                    'Extinction_coefficient': analysed_seq.molar_extinction_coefficient()[0],
                    'Clade': i,
                })

    # Each dictionary becomes a row of the dataframe, which is then written to csv
    df = pd.DataFrame.from_dict(protein_features)
    df.to_csv(f'clades{clades}_stats.csv', sep=',', index=False)
    
#run t-tests in R comparing the clades

In [None]:
# Writes clades_693_734_stats.csv from Node_693_decendents.fasta and Node_734_decendents.fasta
seq_analysis(693,734)

Get the localisation data for a specific set of tips from a branch

In [10]:
# This function takes a csv file 'master_loc' manually created from DeepLoc1.0 output with columns called 'Unid' and 'Localisation'
# It then writes a new localisation info csv file with a subset of Unids taken from a text file with only Unids (one per row)

def loc(node):
    """
    Append 'Unid,Localisation' rows for the tips of a node to f'Node_{node}_localisation.csv'.

    node -- node number used to build the names of the tip file (f'node_{node}.txt') and the output file

    The rows are written in the order in which the Unids occur in master_loc.csv.
    """
    table = pd.read_csv("master_loc.csv", sep=",")

    # List of Unids (tips) from the node file, one per line
    with open(f'node_{node}.txt', 'r') as tips:
        dec = [tip.strip() for tip in tips]

    # Unid (as text) -> localisation for every row of the table
    fdict = dict(zip(table.Unid.astype(str), table.Localisation))

    # Creates/appends the csv file with one comma-separated row per Unid found in 'dec'
    with open(f'Node_{node}_localisation.csv', 'a') as doc:
        for unid, place in fdict.items():
            if unid in dec:
                doc.write(unid + ',' + place + '\n')

In [12]:
# Appends the localisation rows for the tips listed in node_693.txt to Node_693_localisation.csv
loc(693)