# Phase 1i: Separating Metadata

```
Parameters
-------------
save_dir: str  
    Path to directory for saving outputs in.

cache_dir: str 
       Path to directory for cached objects in.

metadata_path: str
       Path to csv or tsv containing metadata.

fasta_path: str
    Path to fasta file containing sequences.

xml_set_definitions : dict {str: str}
        The definitions for the xml_sets you wish to use.
        Keys:   The name used for the xml_set. Will be used to name directories, so certain characters should be
                   avoided; see https://www.mtu.edu/umc/services/websites/writing/characters-avoid/.
        Values: Will be used with pandas DataFrame.query to separate out your data; see:
                        * https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.query.html
                        * https://sparkbyexamples.com/pandas/pandas-dataframe-query-examples/
                        * https://www.slingacademy.com/article/pandas-working-with-the-dataframe-query-method-5-examples/

root_strain_names: list of strs
    IDs of sequences to be used as root.

sample_id_field: str
    Name of field in metadata_db containing sequence IDs.

collection_date_field: str
    Name of field in metadata_db containing collection dates of sequences. Should be in the format YYYY-MM-DD.

data_filter: str
    Optional; can be an empty string, None or 'None'. Additional filter applied to metadata_db when selecting 
    sequences and metadata to be used on pipeline. Must conform to [pandas documentation](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.query.html), see further [example](https://www.slingacademy.com/article/pandas-working-with-the-dataframe-query-method-5-examples/). 
```


In [None]:
# Default parameter values for this phase, grouped by role.
# The None placeholders must be replaced with real values before the later
# cells run: metadata_path is used with .endswith() and xml_set_definitions
# with .items().

# Where outputs and cached objects live.
save_dir = 'runs_of_pipeline/2025-02-05'
cache_dir = 'cache'

# Input files: metadata table (csv/tsv) and sequences (fasta).
metadata_path = None
fasta_path = None

# Mapping of xml_set name -> pandas query string selecting its rows.
xml_set_definitions = None

# Metadata column names for sequence IDs and collection dates.
sample_id_field = 'strain'
collection_date_field = 'date'

# Optional extra pandas query applied to the metadata before splitting.
data_filter = None

Import packages

In [None]:
import pandas as pd
import os
from Bio import SeqIO
import json

If pipeline_run_info.json exists, open it. If not, create an empty dict.

In [None]:
# Load the run record from save_dir when pipeline_run_info.json is already
# there; otherwise start from an empty record.
run_info_path = f'{save_dir}/pipeline_run_info.json'
if os.path.isfile(run_info_path):
    # The with-statement closes the file on exit, so no explicit close() is
    # needed afterwards.
    with open(run_info_path, "r") as file:
        pipeline_run_info = json.load(file)
else:
    pipeline_run_info = {}

## Load metadata and filter.

In [None]:
import json

# Pick the column separator from the file extension of metadata_path; any
# other extension is rejected.
if metadata_path.endswith('.tsv'):
    delimiter = '\t'
elif metadata_path.endswith('.csv'):
    delimiter = ','
else:
    raise TypeError(f"metadata_db must be a csv or tsv file, ending with the apporpraite file extension. Value given is {metadata_path}")

# Read the whole metadata table, parsing the collection-date column as dates.
metadata_all_df = pd.read_csv(metadata_path,
                              sep=delimiter,
                              parse_dates=[collection_date_field]
                              )

# The parameter description allows "no filter" to be spelled as None, an empty
# string or the string 'None'. Passing '' or 'None' to DataFrame.query would
# fail, so all three are normalised to None here; rebinding data_filter means
# later cells that test `data_filter is not None` see the normalised value.
if data_filter is None or str(data_filter).strip() in ('', 'None'):
    data_filter = None
else:
    metadata_all_df = metadata_all_df.query(data_filter)

## Separate metadata & Sequences

In [None]:
# For every xml_set definition: create its directory under save_dir, write the
# matching metadata rows to metadata.csv and the matching sequences to
# sequences.fasta. xml_set_directories maps each xml_set name to its directory.
xml_set_directories = {}

# "No filter" may be given as None, an empty string or the string 'None'
# (see the parameter description); only a real query string is combined with
# the per-set queries below.
if data_filter is None or str(data_filter).strip() in ('', 'None'):
    active_filter = None
else:
    active_filter = data_filter

for xml_set, pd_query in xml_set_definitions.items():
    xml_set_path = f'{save_dir}/{xml_set}'
    # exist_ok lets the cell be re-run against an existing save_dir; the
    # files inside are rewritten below either way.
    os.makedirs(xml_set_path, exist_ok=True)
    xml_set_directories[xml_set] = xml_set_path

    if active_filter is not None:
        # Both sides are parenthesised so that an `|`/`or` inside the set
        # definition cannot escape the data filter through operator precedence.
        pd_query = f"({active_filter}) & ({pd_query})"
    xml_set_metadata = metadata_all_df.query(pd_query)
    xml_set_metadata.to_csv(f'{xml_set_path}/metadata.csv', index=False)

    # A set gives constant-time membership tests while scanning the FASTA file.
    ids = set(xml_set_metadata[sample_id_field])
    selected_seqs = [seq_record for seq_record in SeqIO.parse(fasta_path, 'fasta') if seq_record.id in ids]
    with open(f'{xml_set_path}/sequences.fasta', 'w') as handle:
        SeqIO.write(selected_seqs, handle, 'fasta')


# Saving information to pass onto the next Phases

In [None]:
# Add this phase's results to the run record loaded (or created empty) near the
# top of the notebook. Updating in place keeps any entries already present in
# pipeline_run_info.json instead of replacing the whole record with these two
# keys.
pipeline_run_info.update({
    'xml set directories': xml_set_directories,
    'xml set definitions': xml_set_definitions,
})

# The with-statement closes the file on exit, so no explicit close() is needed.
with open(save_dir +'/pipeline_run_info.json', 'w') as fp:
    json.dump(pipeline_run_info, fp, sort_keys=True, indent=4)