Prepare sequencing data and metadata.

Sequencing data is loaded on a per-run basis. Metadata is loaded on a per-sample-set basis. Only samples present in both are carried into further analysis.

Manual steps: setting sample sets and sequencing runs; setting species taxonomy

In [1]:
%run common.ipynb

In [2]:
# input files
# per-run haplotype table; `{}` is filled with the sequencing run number when loading
RAW_SEQ_FILE = '../../../data/phylo_ampl_dada2/run{}/dada2/output/haplotypes.tsv'
# per-sample-set metadata table; `{}` is filled with the sample set code (e.g. 'ga')
SAMPLESET_FILE = 'src/samples_{}.csv'

# Sequencing data

In [3]:
# load the haplotype table of every sequencing run (all columns as strings)
# and tag each row with the run it came from
seq_data = pd.concat([
    pd.read_csv(RAW_SEQ_FILE.format(run), sep='\t', dtype=str)
      .assign(source='run{}'.format(run))
    for run in (1, 2, 9)
])
seq_data.head()

Unnamed: 0,s_Sample,target,consensus,reads,total_reads,frac_reads,source
0,3D7,10,GTGGAGAGCTGCCAGACAGAAGTTTCCGAGGTACGGTAATCGCCCT...,12,12,1.0,run1
1,3D7,11,ATCGTATCTGGAAGGGTATTTGAAAGCAGACAGAAAGCAAATGTAG...,10,20,0.5,run1
2,3D7,11,ATCGTATCTGGAAGTGTATTTGAAAGCAGACAGAAAGCAAATGTAG...,10,20,0.5,run1
3,3D7,13,GATTTTCCATTCCTCCCATTGCACCCGGTGGATTAATTAATGCGTA...,46,46,1.0,run1
4,3D7,15,AAGCGGTCGCGCATCGGGCCGGCGGTCCGCATCCTCGGCATGATCG...,21,21,1.0,run1


In [4]:
# assert sample names are not reused between runs
# (the per-sample `source` lookup built later assumes one run per sample)
is_reused = seq_data.groupby('s_Sample')['source'].nunique() > 1
assert not is_reused.any(), \
    'sample names reused between runs: {}'.format(list(is_reused[is_reused].index))
# display offenders - expected to be empty
is_reused[is_reused]

Series([], Name: source, dtype: bool)

# Sample metadata

In [5]:
# build a single metadata table out of the per-sample-set source files,
# remembering for each row which sample set it belongs to
meta_frames = []
for set_name in ('ga', 'kh', 'sa'):
    path = SAMPLESET_FILE.format(set_name)
    print(path)
    meta_frames.append(
        pd.read_csv(path, dtype=str).assign(**{'Sample Set': set_name})
    )
sample_meta = pd.concat(meta_frames, sort=False)
# table size and number of samples per set
display(sample_meta.shape)
display(sample_meta['Sample Set'].value_counts())
# available metadata fields, alphabetically
sample_meta.columns.sort_values()

src/samples_ga.csv
src/samples_kh.csv
src/samples_sa.csv


(268, 16)

kh    159
ga    106
sa      3
Name: Sample Set, dtype: int64

Index(['Collection Context', 'Collection Method', 'Collection Time', 'Country',
       'Date of Collection', 'External ID', 'Extraction Method', 'Latitude',
       'Location', 'Longitude', 'Sample Set', 'Sample Type', 'Species',
       'Specimen Sex', 'Specimen Type', 's_Sample'],
      dtype='object')

In [6]:
# number of metadata records lacking a species ID
sample_meta['Species'].isna().sum()

0

In [7]:
# every metadata record must carry a sample ID
assert not sample_meta['s_Sample'].isna().any()
# most frequent sample IDs first - any count above 1 is a duplicated ID
sample_meta['s_Sample'].value_counts().head()

Athe-2        2
VBS00091      1
VBS00045      1
VBS00106      1
Aimp-M0030    1
Name: s_Sample, dtype: int64

In [8]:
# records without a sample ID (expected to be an empty table)
sample_meta.loc[sample_meta['s_Sample'].isna()]

Unnamed: 0,s_Sample,External ID,Date of Collection,Location,Country,Latitude,Longitude,Species,Collection Context,Collection Method,Collection Time,Specimen Sex,Specimen Type,Extraction Method,Sample Type,Sample Set


# Overlap sequencing data with metadata

In [9]:
# IDs of sequenced samples that also have a metadata record
# (sequencing data was not generated for every sample in the metadata)
has_meta = seq_data['s_Sample'].isin(sample_meta['s_Sample'])
overlap_ids = seq_data.loc[has_meta, 's_Sample'].unique()
overlap_ids.shape

(135,)

In [10]:
# keep only the metadata records of samples that were actually sequenced
is_sequenced = sample_meta['s_Sample'].isin(overlap_ids)
subset_sample_meta = sample_meta.loc[is_sequenced].copy()
# compare table sizes before and after subsetting
display(sample_meta.shape, subset_sample_meta.shape)

(268, 16)

(136, 16)

In [11]:
# keep only the sequencing rows of samples that have a metadata record
in_overlap = seq_data['s_Sample'].isin(overlap_ids)
subset_seq_data = seq_data.loc[in_overlap].copy()
# compare table sizes before and after subsetting
display(seq_data.shape, subset_seq_data.shape)

(58824, 7)

(8685, 7)

## Metadata - add source

In [12]:
# copy source (sequencing run) info from sequencing data to metadata
# one run label per sample; `max` simply picks that label for each sample
sample_source_mapping = subset_seq_data.groupby('s_Sample')['source'].max().to_dict()
# `.map` is a plain dictionary lookup: unlike `.replace` it cannot silently
# leave the sample ID in place when a sample is missing from the mapping
subset_sample_meta['Source'] = subset_sample_meta.s_Sample.map(sample_source_mapping)
assert subset_sample_meta.Source.notna().all(), 'samples without sequencing source'
# number of samples per sequencing run
subset_sample_meta.Source.value_counts()

run1    71
run2    62
run9     3
Name: Source, dtype: int64

In [13]:
# write expanded sample metadata (sequenced samples only, with the added
# `Source` column) as CSV without the index column
# NOTE(review): SEQ_META_FILE is not defined in this notebook - presumably set in common.ipynb
subset_sample_meta.to_csv(SEQ_META_FILE, index=False)

## Sequencing data - remove Plasmodium amplicons, limit columns

In [14]:
# drop the columns not needed downstream; show table size before and after
display(subset_seq_data.shape)
subset_seq_data = subset_seq_data.drop(['source', 'frac_reads', 'total_reads'], axis=1)
display(subset_seq_data.shape)

(8685, 7)

(8685, 4)

In [15]:
# keep only rows whose target is one of the amplicons listed in AMPLS
# (i.e. discard non-mosquito amplicons); show table size before and after
display(subset_seq_data.shape)
is_mosquito_ampl = subset_seq_data['target'].isin(AMPLS)
subset_seq_data = subset_seq_data.loc[is_mosquito_ampl]
display(subset_seq_data.shape)

(8685, 4)

(8632, 4)

In [16]:
# write subsetted sequencing data (samples with metadata, AMPLS targets only)
# as CSV without the index column
# NOTE(review): SEQ_FILE is not defined in this notebook - presumably set in common.ipynb
subset_seq_data.to_csv(SEQ_FILE, index=False)

## Combine metadata

In [17]:
# combine metadata of the sequenced samples with the reference metadata.
# Everything is read as strings, like the other tables in this notebook, so that
# numeric-looking IDs and coordinates are not coerced to numbers (and possibly
# reformatted) on the way to the combined file.
meta = pd.concat(
    [pd.read_csv(path, dtype=str) for path in (SEQ_META_FILE, REF_META_FILE)],
    sort=False,
)
# spot-check a few random records
meta.sample(3)

Unnamed: 0,s_Sample,External ID,Date of Collection,Location,Country,Latitude,Longitude,Species,Collection Context,Collection Method,Collection Time,Specimen Sex,Specimen Type,Extraction Method,Sample Type,Sample Set,Source
28,Aten-185,185,2015-02-09,Benguia,Gabon,-1.63281,13.49217,Anopheles_tenebrosus,Outdoors,Human Landing Catch,,Female,Wild caught adult,QIAGEN,Genomic DNA,ga,run1
14,Acou-71,71,2015-02-09,Benguia,Gabon,-1.63281,13.49217,Anopheles_coustani,Outdoors,Human Landing Catch,,Female,Wild caught adult,QIAGEN,Genomic DNA,ga,run1
23,Azie-334,334,2015-09-15,Benguia,Gabon,-1.63281,13.49217,Anopheles_ziemanni,Outdoors,Human Landing Catch,,Female,Wild caught adult,QIAGEN,Genomic DNA,ga,run1


In [18]:
# any missing sample IDs?
# `not` instead of `~`: bitwise NOT of a plain Python bool is -1 or -2,
# both truthy, so `assert ~flag` could never fail
assert not meta.s_Sample.isna().any(), 'metadata records without sample ID'

In [19]:
# any missing species IDs?
# `not` instead of `~`: bitwise NOT of a plain Python bool is -1 or -2,
# both truthy, so `assert ~flag` could never fail
assert not meta.Species.isna().any(), 'metadata records without species ID'

In [20]:
# write the combined (sequenced + reference) metadata as CSV without the index column
# NOTE(review): COMB_META is not defined in this notebook - presumably set in common.ipynb
meta.to_csv(COMB_META, index=False)

## Combine sequencing data, add metadata

In [21]:
# stack the subsetted sequencing data on top of the reference sequences,
# reading every column as string
seq = pd.concat(
    [pd.read_csv(path, dtype=str) for path in (SEQ_FILE, REF_FILE)],
    sort=False,
)
# spot-check one random row
seq.sample()

Unnamed: 0,s_Sample,target,consensus,reads
7850,VBS00145,43,TTGAGTACTCGGGCGTAAGCGGGATGCTTTTCCCTCGTCCAAATTC...,93


In [22]:
# annotate every sequence with the species of its sample
sample_species = meta.set_index('s_Sample')['Species'].to_dict()
# `.map` is a plain lookup; `.replace` would silently leave the sample ID in
# the species column for any sample that has no metadata record
seq['species'] = seq.s_Sample.map(sample_species)
samples_without_meta = list(seq.loc[seq.species.isna(), 's_Sample'].unique())
assert not samples_without_meta, \
    'sequenced samples missing from metadata: {}'.format(samples_without_meta)

In [23]:
# assign IDs to unique sequences: '<target>-<n>', where n numbers the distinct
# consensus sequences of a target in order of first appearance
combuids = dict()
# group by the bare column name: grouping by a one-element list yields
# 1-tuple keys in recent pandas, which breaks the string concatenation below
for tgt, group in seq.groupby('target'):
    for (i, cons) in enumerate(group['consensus'].unique()):
        combuids[tgt+cons] = '{}-{}'.format(tgt, i)
# every target+consensus key is in the dict by construction, so a direct
# lookup with `.map` gives the same result as `.replace`, only much faster
seq['combUID'] = (seq.target + seq.consensus).map(combuids)
seq.sample(3)

Unnamed: 0,s_Sample,target,consensus,reads,species,combUID
2509,Amou-2-5,30,CCAATATGTCGAGGCTTCTTTGAAAACACATGAAAAATATTGATAA...,217,Anopheles_moucheti,30-19
1954,Amar-3-1,43,TTGAGTACTCGGGCGTAAGCGGGATGCTTTTCCCTCGTCCAAATTC...,223,Anopheles_marshallii,43-20
6494,VBS00059,20,CAAACTGCACATCGATGATGTTGGTAAACTCGCCCGGCTTCTCCAG...,260,Anopheles_minimus_A,20-45


In [24]:
# write the combined sequencing data (with `species` and `combUID` columns)
# as CSV without the index column
# NOTE(review): COMB_SEQ is not defined in this notebook - presumably set in common.ipynb
seq.to_csv(COMB_SEQ, index=False)

## Generate alignments

In [25]:
# generate alignments
for ampl in AMPLS:
    sys.stdout.write('\r' + ampl)
    # subset amplicon data
    ampl_data = seq[seq.target == ampl]
    # get only unique sequences
    ampl_data = ampl_data[~ampl_data.combUID.duplicated()]
    # write under combUID name
    with open('temp.fa', 'w') as o:
        for (i, row) in ampl_data.iterrows():
            o.write('>{}\n{}\n'.format(row.combUID,
                                       row.consensus))
    # align
    ! mafft temp.fa > {ALN_ALL.format(ampl)} 2> /dev/null
    ! rm temp.fa
print('\nDone!')

61
Done!


## Generate taxonomy

In [28]:
# ete3 interface to the NCBI Taxonomy, used throughout the taxonomy section
from ete3 import NCBITaxa
ncbi = NCBITaxa()
# refresh the local taxonomy database; judging by the saved output this
# downloads taxdump.tar.gz from NCBI and rebuilds the local sqlite file,
# so the cell needs network access and takes a while
ncbi.update_taxonomy_database()

Downloading taxdump.tar.gz from NCBI FTP site (via HTTP)...
Done. Parsing...


Loading node names...
2248558 names loaded.
218746 synonyms loaded.
Loading nodes...
2248558 nodes loaded.
Linking nodes...
Tree is loaded.
Updating database: /Users/am60/.etetoolkit/taxa.sqlite ...
 2248000 generating entries...  generating entries... 
Uploading to /Users/am60/.etetoolkit/taxa.sqlite


Inserting synonyms:      25000 




Inserting taxid merges:  55000   





Inserting taxids:       2245000 




In [29]:
# species names in NCBI style (spaces instead of underscores)
all_spp = meta['Species'].str.replace('_', ' ').unique()
# name -> taxids for every name NCBI Taxonomy recognises
ncbi_spp = ncbi.get_name_translator(all_spp)
# names unknown to NCBI Taxonomy - replaced later by the closest representative
print(set(all_spp).difference(ncbi_spp))

{'Anopheles nili ss', 'Anopheles brohieri', 'Anopheles dureni', 'Anopheles maculatus B', 'Anopheles maculatus A'}


In [30]:
# replace unparsed species IDs with the close species
# based on Harbach
def subst_sp(orig_sp, other_sp):
    '''
    Substitute species ID with the one existing in NCBI.

    Parameters
    ----------
    orig_sp : str
        Species name as used in the metadata (unknown to NCBI, or not of
        species rank).
    other_sp : str
        Stand-in species name to look up in the NCBI taxonomy instead.

    Returns
    -------
    dict
        ``{orig_sp: taxids}`` where ``taxids`` is whatever
        ``ncbi.get_name_translator`` returns for ``other_sp`` (the rest of
        the notebook indexes it with ``[0]``). An empty dict, after printing
        a message, when ``other_sp`` is not found - so the result can always
        be passed to ``dict.update``.
    '''
    found = ncbi.get_name_translator([other_sp])
    if other_sp not in found:
        # report and return a no-op update instead of a non-dict value;
        # genuine lookup errors are no longer swallowed by a bare except
        print('{} not found in NCBI taxonomy'.format(other_sp))
        return {}
    return {orig_sp: found[other_sp]}
    
    
# (name missing from NCBI, close species used in its place) - based on Harbach
for missing_sp, stand_in_sp in (
    ('Anopheles nili ss', 'Anopheles nili'),
    ('Anopheles maculatus A', 'Anopheles maculatus'),
    ('Anopheles maculatus B', 'Anopheles maculatus'),
    ('Anopheles dureni', 'Anopheles vinckei'),
    ('Anopheles brohieri', 'Anopheles hancocki'),
):
    ncbi_spp.update(subst_sp(missing_sp, stand_in_sp))
# number of species that now have a taxid
len(ncbi_spp)

58

In [31]:
# rank of the first taxid found for every species
ncbi_ranks = ncbi.get_rank([taxids[0] for taxids in ncbi_spp.values()])
# for every taxid that is not of species rank, print its complete lineage
# (name and rank of each ancestor) so a proper species can be picked by hand
non_species_taxids = [taxid for taxid, rank in ncbi_ranks.items() if rank != 'species']
for taxid in non_species_taxids:
    lineage = ncbi.get_lineage(taxid)
    names = ncbi.get_taxid_translator(lineage)
    ranks = ncbi.get_rank(lineage)
    for t in lineage:
        print(names[t], ranks[t])

root no rank
cellular organisms no rank
Eukaryota superkingdom
Opisthokonta no rank
Metazoa kingdom
Eumetazoa no rank
Bilateria no rank
Protostomia no rank
Ecdysozoa no rank
Panarthropoda no rank
Arthropoda phylum
Mandibulata no rank
Pancrustacea no rank
Hexapoda subphylum
Insecta class
Dicondylia no rank
Pterygota subclass
Neoptera infraclass
Holometabola cohort
Diptera order
Nematocera suborder
Culicomorpha infraorder
Culicoidea superfamily
Culicidae family
Anophelinae subfamily
Anopheles genus
Cellia subgenus
Neomyzomyia no rank
leucosphyrus group species group
leucosphyrus subgroup species subgroup
dirus species complex no rank
Anopheles dirus species
Anopheles dirus A no rank


In [32]:
# replace non-species IDs with upstream species:
# the rank check above reports 'Anopheles dirus A' as 'no rank', sitting
# directly below the species-rank entry 'Anopheles dirus'
ncbi_spp.update(subst_sp('Anopheles dirus A', 'Anopheles dirus'))
# the key already existed, so the number of species is expected to stay the same
len(ncbi_spp)

58

In [33]:
# extract, for every species, the taxonomy levels between species and subgenus
taxonomy = {}
for species, taxids in ncbi_spp.items():
    # lineage of the first taxid, walked from the most specific taxon upwards
    lineage = ncbi.get_lineage(taxids[0])[::-1]
    names = ncbi.get_taxid_translator(lineage)
    ranks = ncbi.get_rank(lineage)

    levels = {}
    # running number for unranked taxa with a single-word name
    unranked_seen = 1
    for t in lineage:
        rank, name = ranks[t], names[t]
        if rank == 'no rank':
            words = name.split(' ')
            if len(words) > 1:
                # multi-word unranked name (series, section, complex):
                # the last word serves as rank, the capitalised first word as name
                key, value = words[-1], words[0].capitalize()
            else:
                # single-word unranked name - should be only series
                key, value = 'no rank {}'.format(unranked_seen), name
                unranked_seen += 1
        else:
            # properly ranked taxon - stored as is
            key, value = rank, name
        levels[key] = value
        # nothing above subgenus is needed
        if rank == 'subgenus':
            break
    taxonomy[species] = levels

In [34]:
# taxonomy as a table: one row per species, one column per rank, '' where absent
td = pd.DataFrame(taxonomy).T.fillna('')
# `no rank 1` is in fact the series - fold it into the `series` column
td = td.assign(series=td['no rank 1'] + td['series'])
# shorter, consistent rank names
td = td.rename(columns={'species group': 'group',
                        'species subgroup': 'subgroup',
                        'species': 'species_ncbi'})
# keep the relevant ranks only, in taxonomic order
td = td.loc[:, ['subgenus', 'section', 'series', 'group',
                'subgroup', 'complex', 'species_ncbi']].copy()
# get title from long group/subgroup name: the word before the trailing rank word
# (names are heterogeneous - `minimus group`, but `Anopheles annularis group`)
to_title = lambda labels: labels.str.split(' ').str.get(-2).str.capitalize().fillna('')
td = td.assign(group=to_title(td['group']),
               subgroup=to_title(td['subgroup']))
# species names back to the underscore form used in the metadata
td.index = pd.Index(td.index.str.replace(' ', '_'), name='species')
# alphabetical order
td = td.sort_index()
# inspect
td.head()

Unnamed: 0_level_0,subgenus,section,series,group,subgroup,complex,species_ncbi
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Anopheles_aconitus,Cellia,,Myzomyia,Minimus,,,Anopheles aconitus
Anopheles_albimanus,Nyssorhynchus,Albimanus,Albimanus,,,,Anopheles albimanus
Anopheles_annularis,Cellia,,Neocellia,Annularis,,Annularis,Anopheles annularis
Anopheles_aquasalis,Nyssorhynchus,Albimanus,Oswaldoi,Oswaldoi,Oswaldoi,,Anopheles aquasalis
Anopheles_arabiensis,Cellia,,Pyretophorus,,,Gambiae,Anopheles arabiensis


In [35]:
# inspect species whose name contains 'quad'
td.filter(like='quad', axis=0)

Unnamed: 0_level_0,subgenus,section,series,group,subgroup,complex,species_ncbi
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Anopheles_quadriannulatus,Cellia,,Pyretophorus,,,Gambiae,Anopheles quadriannulatus


In [36]:
# values missing from NCBI, filled in by hand from Harbach:
# (species, rank column, value)
# NOTE(review): `.loc` assignment with a species label that is absent from `td`
# would append a new row instead of failing - confirm all labels exist
manual_values = [
    ('Anopheles_marshallii', 'group', 'Marshallii'),
    ('Anopheles_brohieri', 'group', 'Marshallii'),
    ('Anopheles_hancocki', 'group', 'Marshallii'),
    ('Anopheles_carnevalei', 'group', 'Ardensis'),
    ('Anopheles_carnevalei', 'complex', 'Nili'),
    ('Anopheles_theileri', 'series', 'Myzomyia'),
    ('Anopheles_rhodesiensis', 'series', 'Neomyzomyia'),
    ('Anopheles_jebudensis', 'series', 'Neomyzomyia'),
    ('Anopheles_carnevalei', 'series', 'Neomyzomyia'),
]
for sp_label, rank_col, value in manual_values:
    td.loc[sp_label, rank_col] = value


In [37]:
# write the taxonomy table as CSV; the index (species name) is kept as the first column
# NOTE(review): TAXONOMY_FILE is not defined in this notebook - presumably set in common.ipynb
td.to_csv(TAXONOMY_FILE)

## Sandbox - manual taxonomy input

In [None]:
# x = pd.DataFrame(meta.Species.unique(), columns=['species'])
# x['subgenus'] = 'Cellia'
# x.loc[x.species.isin(['Anopheles_paludis',
#            'Anopheles_ziemanni',
#            'Anopheles_tenebrosus',
#            'Anopheles_coustani',
#            'Anopheles_sinensis',
#            'Anopheles_hyrcanus',
#            'Anopheles_barbirostris',
#            'Anopheles_atroparvus',
#            'Anopheles_implexus']), 'subgenus'] = 'Anopheles'
# x.loc[x.species.isin(['Anopheles_albimanus',
#            'Anopheles_aquasalis',
#            'Anopheles_darlingi',
#            'Anopheles_oryzalimnetes']), 'subgenus'] = 'Kertezsia'
# x.loc[x.species.isin(['Anopheles_cruzii',
#            'Anopheles_bellator']), 'subgenus'] = 'Nyssorhynchus'
# x.head()

In [None]:
# x['series'] = ''
# x.loc[x.species.isin(
#     ['Anopheles_nili_ss',
#     'Anopheles_nili',
#     'Anopheles_carnevalei',
#     'Anopheles_koliensis',
#     'Anopheles_farauti',
#     'Anopheles_punctulatus',
#     'Anopheles_tessellatus',
#     'Anopheles_cracens',
#     'Anopheles_balabacensis',
#     'Anopheles_dirus_A',
#     'Anopheles_dureni',
#     'Anopheles_vinckei',
#     'Anopheles_rhodesiensis']), 'series'] = 'Neomyzomiya'
# x.loc[x.species.isin(
#     ['Anopheles_coluzzii',
#     'Anopheles_gambiae',
#     'Anopheles_arabiensis',
#     'Anopheles_quadriannulatus',
#     'Anopheles_melas',
#     'Anopheles_merus',
#     'Anopheles_epiroticus',
#     'Anopheles_sundaicus',
#     'Anopheles_vagus',
#     'Anopheles_christyi']), 'series'] = 'Pyretophorus'
# x.loc[x.species.isin(
#     ['Anopheles_maculatus_A',
#     'Anopheles_maculatus_B',
#     'Anopheles_rampae',
#     'Anopheles_maculipalpis',
#     'Anopheles_jamesii',
#     'Anopheles_annularis',
#     'Anopheles_stephensi']), 'series'] = 'Neocellia'
# x.loc[x.species.isin(
#     ['Anopheles_marshalli',
#     'Anopheles_hancocki',
#     'Anopheles_demeilloni',
#     'Anopheles_theileri',
#     'Anopheles_jebudensis',
#     'Anopheles_merus',
#     'Anopheles_epiroticus',
#     'Anopheles_sundaicus',
#     'Anopheles_vagus',
#     'Anopheles_christyi']), 'series'] = 'Pyretophorus'