# Generate FASTA Files

Create FASTA files for species pan-genomics: one file per sequence, grouped into one directory per species.

In [1]:
import pandas as pd
import numpy as np
import os


# Data directory: <current working directory>/data/files.
data_dir = os.path.join(os.getcwd(), 'data', 'files')

def _make_path(subd, fname):
    """Return the path of ``fname`` inside the directory ``subd``."""
    location = os.path.join(subd, fname)
    return location

# Load train_values.csv and train_labels.csv from data_dir; each table is
# indexed by its 'sequence_id' column.
tval = pd.read_csv(_make_path(data_dir, 'train_values.csv'), index_col='sequence_id')
tlab = pd.read_csv(_make_path(data_dir, 'train_labels.csv'), index_col='sequence_id')

# Display the row counts of both tables.
len(tval), len(tlab)

(63017, 63017)

In [2]:
# Names of every column of tval that contains the substring 'species'.
species_cols = [col for col in tval.columns if 'species' in col]
# Drop everything up to and including the first underscore,
# e.g. 'species_budding_yeast' -> 'budding_yeast'.
species_names = ['_'.join(col.split('_')[1:]) for col in species_cols]
# Display both lists.
species_cols, species_names

(['species_budding_yeast',
  'species_fly',
  'species_human',
  'species_mouse',
  'species_mustard_weed',
  'species_nematode',
  'species_other',
  'species_rat',
  'species_synthetic',
  'species_zebrafish'],
 ['budding_yeast',
  'fly',
  'human',
  'mouse',
  'mustard_weed',
  'nematode',
  'other',
  'rat',
  'synthetic',
  'zebrafish'])

In [6]:
import os
from io import StringIO


def filter_species(data, species):
    """Select the sequences flagged for one species.

    Keeps the rows of ``data`` whose ``species`` column equals 1 and
    returns the ``'sequence'`` column of those rows.
    """
    is_member = data[species] == 1
    return data.loc[is_member, 'sequence']


def _make_path(*args):
    """Join any number of path components into a single path."""
    joined = os.path.join(*args)
    return joined


def _split_lines(sequence, width=80):
    """Break ``sequence`` into chunks of at most ``width`` characters.

    Every chunk is returned with a leading newline, so the chunks can be
    written directly after a FASTA header line that has no trailing newline.
    An empty ``sequence`` yields an empty list.
    """
    return ["\n" + sequence[i:i + width] for i in range(0, len(sequence), width)]


def write_fasta(series, species, savedir):
    """Write every sequence in ``series`` to its own FASTA-format text file.

    Files are written to ``<savedir>/<species>/<seqname>.txt``. Each file
    holds a ``>seqname`` header line followed by the sequence wrapped at
    80 characters per line, and ends with a newline.

    Args:
        series: pandas Series whose index holds the sequence names and whose
            values hold the sequence strings.
        species: Name of the sub-directory of ``savedir`` to write into.
        savedir: Parent directory; it and the species directory are created
            if they do not exist yet.
    """
    save_path = os.path.join(savedir, species)
    # Create the target directory once, up front. makedirs also creates
    # missing parent directories and tolerates an existing directory.
    os.makedirs(save_path, exist_ok=True)

    for seqname, sequence in series.items():
        fpath = os.path.join(save_path, f'{seqname}.txt')
        with open(fpath, 'w') as file:
            file.write(f'>{seqname}')
            file.writelines(_split_lines(sequence))
            # Terminate the final line so that concatenating FASTA files
            # does not run a sequence into the next header.
            file.write('\n')

In [34]:
# Write one directory of FASTA files per species column. The directory is
# named after the column (e.g. 'species_fly') and created under data_dir.
for s in species_cols:
    ff = filter_species(tval, s)
    write_fasta(ff, s, data_dir)

## Build genomefile from directory

Create the genome file: a text file listing the full path of each FASTA-format text file, one path per line.

In [1]:
import os


# Data directory: <current working directory>/data/files.
data_dir = os.path.join(os.getcwd(), 'data', 'files')

# Species names; the per-species directory is data_dir/'species_<name>'.
species = ['budding_yeast',
  'fly',
  'human',
  'mouse',
  'mustard_weed',
  'nematode',
  'other',
  'rat',
  'synthetic',
  'zebrafish']

In [7]:
# Build the genome list for the first species only.
s = species[0]
spec_dir = os.path.join(data_dir, 'species_'+s)

genomefile_path = os.path.join(spec_dir, 'genome_list.txt')

# List the directory BEFORE opening the output file: genome_list.txt lives in
# the same directory, so listing after it has been created would put the list
# file's own path into the list. The name filter also skips a list file left
# over from a previous run. Sorting makes the output order reproducible.
genome_name = os.path.basename(genomefile_path)
fasta_names = sorted(fname for fname in os.listdir(spec_dir) if fname != genome_name)

with open(genomefile_path, 'w') as file:
    # One full path per line. os.path.join is used directly so this cell
    # only needs the `os` import.
    file.writelines(os.path.join(spec_dir, fname) + "\n" for fname in fasta_names)

In [5]:
# Display the path of the genome list file.
genomefile_path

'/mnt/c/Users/vince/Bin/ddata_geneatt/data/files/species_budding_yeast/genome_list.txt'