# Prepare data for Nextstrain workflow

## Import `Python` modules

In [1]:
import os
import sys
import glob
import pandas as pd
from collections import defaultdict
import lzma

import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from Bio import SeqIO
from Bio.Seq import MutableSeq

## Take files with all GISAID sequences and metadata and create new files with a subset of the sequences

In [2]:
# Build the 12yr subset of the metadata and sequences, plus one reference strain.
#
# Inputs (all under profiles/dmsa-phenotype/sequences/):
#   - acknowledgements TSV from a 12yr build (provides the strain list)
#   - 6y_metadata_w_HK19.tsv.xz / 6y_sequences_w_HK19.fasta.xz (provide the reference entry)
#   - metadata.tsv.xz / sequences.fasta.xz (all sequences and metadata)
# Outputs (only written when they do not already exist):
#   - 12y_metadata.tsv and 12y_sequences.fasta
# NOTE: because of the existence guards, delete the outputs to regenerate them.

# Strain name of the reference entry appended to the 12yr subset
ref_strain = 'A/HK/45/2019'

# Get a list of strains from a recent 12yr build
ack_df = pd.read_csv('profiles/dmsa-phenotype/sequences/nextstrain_flu_seasonal_h3n2_ha_12y_acknowledgements.tsv', sep='\t')
strains_to_extract = list(ack_df['strain'].unique())
print(len(strains_to_extract), 'strains to extract')
# Set copy of the strain list so the per-record membership test below is O(1)
strains_to_extract_set = set(strains_to_extract)

# Get metadata for reference sequence
ref_metadata_df = pd.read_csv('profiles/dmsa-phenotype/sequences/6y_metadata_w_HK19.tsv.xz', sep='\t')
ref_metadata_df = ref_metadata_df[ref_metadata_df['strain'] == ref_strain]

# Read in metadata for all sequences and subset to strains from the 12yr build
all_metadata_df = pd.read_csv('profiles/dmsa-phenotype/sequences/metadata.tsv.xz', sep='\t')
metadata_12yr_df = all_metadata_df[all_metadata_df['strain'].isin(strains_to_extract)]
n_metadata_extracted = len(metadata_12yr_df)
# Always append the reference row, so that `metadata_12yr_df` has the same
# content for later cells whether or not the output file already exists.
metadata_12yr_df = pd.concat([metadata_12yr_df, ref_metadata_df])
output_f = 'profiles/dmsa-phenotype/sequences/12y_metadata.tsv'
if not os.path.isfile(output_f):
    print(f'Extracted {n_metadata_extracted} strains from metadata')
    # Write tab-separated to match the .tsv extension and the other TSVs in this notebook
    metadata_12yr_df.to_csv(output_f, sep='\t', index=False)

# Get FASTA entry for reference sequence
input_f = 'profiles/dmsa-phenotype/sequences/6y_sequences_w_HK19.fasta.xz'
ref_id = None
ref_seq = None
with lzma.open(input_f, mode='rt', encoding='utf-8') as f:
    for record in SeqIO.parse(f, 'fasta'):
        if record.id == ref_strain:
            ref_id = record.id
            ref_seq = record.seq
            # Stop at the first match; no need to decompress the rest of the file
            break

# Read in FASTA of all sequences, then write an output with subset from 12yr build plus reference
input_f = 'profiles/dmsa-phenotype/sequences/sequences.fasta.xz'
output_f = 'profiles/dmsa-phenotype/sequences/12y_sequences.fasta'
if not os.path.isfile(output_f):
    # Fail before creating the output: a partially written file would be
    # silently skipped by the existence guard on the next run.
    if ref_seq is None:
        raise ValueError(f'Reference strain {ref_strain} not found in the reference FASTA')
    seqs_extracted = 0
    with lzma.open(input_f, mode='rt', encoding='utf-8') as fasta_f, open(output_f, 'w') as f:
        for record in SeqIO.parse(fasta_f, 'fasta'):
            if record.id in strains_to_extract_set:
                f.write(f'>{record.id}\n{record.seq}\n')
                seqs_extracted += 1
        # The reference entry is appended last and is not counted in seqs_extracted
        f.write(f'>{ref_id}\n{ref_seq}\n')
    print(f'Extracted {seqs_extracted} from the FASTA file')

1478 strains to extract


Save a file with the GISAID accession number for each strain, along with data acknowledging contributing labs.

In [9]:
# Export the per-strain accession and lab columns as a tab-separated file.
ack_columns = ['strain', 'virus', 'accession_ha', 'originating_lab', 'submitting_lab']
ack_output_f = 'profiles/dmsa-phenotype/sequences/nextstrain_flu_seasonal_h3n2_ha_12y_acknowledgements_and_accession_numbers.tsv'
metadata_12yr_df[ack_columns].to_csv(ack_output_f, sep='\t', index=False)

## Curate files with mutational effects

First, read in files with all mutational effects and then make new files with only mutations that pass certain filters, such as the number of times a mutation was observed in the experiment.

In [3]:
# Minimum values a mutation's `times_seen` and `n_models` columns must reach
# for the mutation to be kept in the filtered files.
times_seen = 3
n_models = 2

# Raw per-serum files live in `mut_effects_dir`; filtered copies with the same
# file names are written to the `filtered_data/` subdirectory.
mut_effects_dir = 'profiles/dmsa-phenotype/antibody_escape/'
filtered_mut_effects_dir = os.path.join(mut_effects_dir, 'filtered_data/')
os.makedirs(filtered_mut_effects_dir, exist_ok=True)

for input_path in glob.glob(os.path.join(mut_effects_dir, '*_avg.csv')):
    filtered_path = os.path.join(filtered_mut_effects_dir, os.path.basename(input_path))
    # Existing outputs are never overwritten
    if os.path.isfile(filtered_path):
        continue
    effects_df = pd.read_csv(input_path)
    passes_filters = (
        (effects_df['times_seen'] >= times_seen)
        & (effects_df['n_models'] >= n_models)
    )
    filtered_df = effects_df.loc[passes_filters]
    # No file is written when every mutation is filtered out
    if len(filtered_df) > 0:
        filtered_df.to_csv(filtered_path, index=False)

Get data on the cohort associated with a given serum sample

In [4]:
# Serum identifiers grouped by cohort. Identifiers are stored as written here
# (some ints, some strings) and are converted to strings in the table below.
cohort_dict = {
    'children': [3944, 2389, 2323, 2388, 3973, 4299, 4584, 2367],
    'teenagers': [
        2350, 2365, 2382, 3866, 2380,
        3856,  # outlier
        3857, 3862,
    ],
    'adults': ['33C', '34C', '197C', '199C', '215C', '210C', '74C', '68C', '150C', '18C'],
    'elderly': ['AUSAB-13'],
    'infant': [2462],
}

# Long-format table with one row per serum: columns `cohort` and `serum`.
serum_to_cohort_dict = defaultdict(list)
for cohort_name, cohort_sera in cohort_dict.items():
    serum_to_cohort_dict['cohort'].extend([cohort_name] * len(cohort_sera))
    serum_to_cohort_dict['serum'].extend(str(serum_id) for serum_id in cohort_sera)
serum_to_cohort_df = pd.DataFrame(serum_to_cohort_dict)

Average mutational effects across all sera in a given cohort

In [5]:
# For each cohort, average `escape_median` per mutation across the cohort's
# sera and write the result to `avg_<cohort>.csv` in the filtered-data directory.
#
# NOTE: existing output files are never overwritten. Files produced by an
# earlier run of this cell must be deleted to be regenerated.
cohorts = [
    'children', 'teenagers', 'adults'
]
for cohort in cohorts:
    sera = cohort_dict[cohort]
    dfs = []
    for serum in sera:
        f = os.path.join(filtered_mut_effects_dir, f'{serum}_avg.csv')
        assert os.path.isfile(f), f'Missing filtered mutational-effects file: {f}'
        df = pd.read_csv(f)
        df['serum'] = serum
        dfs.append(df)
    df = pd.concat(dfs)
    # Keep the aggregated frame: the groupby result was previously discarded,
    # so the un-averaged concatenation of per-serum rows was written instead.
    # The averaged frame has only the `mutation` and `escape_median` columns.
    avg_df = df.groupby(['mutation'])[['escape_median']].mean().reset_index()
    output_f = os.path.join(filtered_mut_effects_dir, f'avg_{cohort}.csv')
    if not os.path.isfile(output_f):
        avg_df.to_csv(output_f, index=False)