# Homogenize metadata of joined datasets

In [1]:
import pandas as pd
import os
import numpy as np
from Bio import Phylo

# Work from the project root so that every path below can be written relative
# to it. The root can be overridden through the WEAVEPOP_PROJECT_DIR
# environment variable; when it is unset, the original location is used.
project_dir = os.environ.get("WEAVEPOP_PROJECT_DIR", "/FastData/czirion/WeavePop_Cneoformans/")
os.chdir(project_dir)

Input

In [2]:
# Per-dataset sample metadata tables (CSV).
metadata_desjardins_path = "Crypto_Desjardins/config/metadata.csv"
metadata_ashton_path = "Crypto_Ashton/config/metadata_all_ashton_and_vni_desj.csv"

# Tab-separated tables used further down to flag samples for exclusion.
ashton_mapping_stats_path = "Crypto_Ashton/results/02.Dataset/depth_quality/mapping_stats.tsv"
ploidy = "analyses/results/tables/ploidy.tsv"

Output

In [3]:
# Complete table: every sample, including the excluded ones.
metadata_path = "analyses/data/processed/metadata_all_H99_complete.csv"
# Samples that remain after removing the excluded ones.
metadata_final_path = "analyses/data/processed/metadata_ashton_desj_all_weavepop_final_H99.csv"
# Remaining samples restricted to lineage VNI.
metadata_vni_path = "analyses/data/processed/metadata_ashton_desj_vni_weavepop_final_H99.csv"

Get original metadata of all samples in Ashton paper and Desjardins samples.

In [4]:
# Load both raw metadata tables; the first row of each CSV holds the column names.
metadata_desjardins = pd.read_csv(metadata_desjardins_path, header=0)
metadata_ashton = pd.read_csv(metadata_ashton_path, header=0)

### Join both tables
Keeping the run label and VNI sublineage information from the Ashton metadata.

Create a `run` column that keeps the run accession of each sample, because the Ashton tree uses run accessions as tip labels for the Desjardins samples.  
Create dataset column in both tables.  
Remove VNI subdivision column from Desjardins table.

In [5]:
# Ashton table: "study" becomes "dataset", and the SRA accession is copied
# into a separate "run" column.
metadata_ashton = (
    metadata_ashton
    .rename(columns={"study": "dataset"})
    .assign(run=lambda table: table['sra_accession'])
)

# Desjardins table: tag every row with its dataset and discard its own
# VNI subdivision column.
metadata_desjardins = (
    metadata_desjardins
    .assign(dataset='Desjardins')
    .drop(columns=['vni_subdivision'])
)

Make an outer join using all the common columns.   

In [6]:
# Columns present in both tables; they serve as the join key.
common_columns = ['sample', 'lineage', 'source', 'country_of_origin','dataset']

# Outer join keeps samples that appear in only one of the tables. Any other
# column shared by both tables is disambiguated with a per-table suffix.
metadata = metadata_ashton.merge(
    metadata_desjardins,
    on=common_columns,
    how='outer',
    suffixes=('_ashton', '_desjardins'),
)

Keep the strain names from the corresponding table (because the Ashton table uses the SRA accession as the strain name for the Desjardins samples).  
Keep the sra accession from the corresponding table.

In [7]:
# Row selectors for the two source datasets.
from_desjardins = metadata["dataset"] == "Desjardins"
from_ashton = metadata["dataset"] == "Ashton"

# For each merged column, take the value from the table the sample belongs to.
for merged_col, desjardins_col, ashton_col in (
    ("strain", "strain_desjardins", "strain_ashton"),
    ("sra_accession", "sra_accession_desjardins", "sra_accession_ashton"),
):
    metadata.loc[from_desjardins, merged_col] = metadata[desjardins_col]
    metadata.loc[from_ashton, merged_col] = metadata[ashton_col]

# The suffixed per-table columns are no longer needed.
metadata = metadata.drop(
    columns=['strain_ashton', 'strain_desjardins', 'sra_accession_ashton', 'sra_accession_desjardins']
)

### Add the continent information to the Desjardins samples

Fill the continent column in the samples (Desjardins) that don't have it.  
Make a dataframe with the unique combinations of country and continent.   
Make a dataframe of the countries that do have continent information.  

In [8]:
# Every distinct (country, continent) pair found in the metadata, ordered by
# country. Pairs with a missing continent are still included here.
country_continent = (
    metadata.loc[:, ['country_of_origin', 'continent_of_origin']]
    .drop_duplicates()
    .sort_values(by='country_of_origin')
)

# Only the pairs where both the country and the continent are known.
countries = country_continent.dropna()

Make sure all countries have a continent assigned.

In [9]:
# Check that every known country has a continent in the lookup table.
# Membership is tested per country instead of comparing the two arrays
# element by element: that comparison only works when both arrays have the
# same length, which is exactly what fails when a country lacks a continent.
all_countries = country_continent['country_of_origin'].dropna().unique()
has_continent = np.isin(all_countries, countries['country_of_origin'].unique())

# Stop here with the offending names instead of silently continuing.
assert has_continent.all(), (
    f"Countries without a continent: {list(all_countries[~has_continent])}"
)

# One flag per country, all True when the check passes.
has_continent

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True])

Add the continent column to the metadata of the samples.

In [10]:
# Replace the partially filled continent column with the value looked up from
# the country, so that samples lacking it receive one as well.
metadata = metadata.drop(columns=['continent_of_origin'])

# validate='many_to_one' makes the merge fail loudly if a country appears more
# than once in the lookup table; without it, such a duplicate would silently
# duplicate the rows of every sample from that country.
metadata = metadata.merge(
    countries,
    on='country_of_origin',
    how='left',
    validate='many_to_one',
)

Reorder columns

In [11]:
# Identifying and descriptive columns go first, in this fixed order.
first_cols = ['sample', 'strain', 'run', 'lineage', 'vni_subdivision', 'vnia_subdivision', 'dataset', 'source', 'country_of_origin', 'continent_of_origin']

# All remaining columns follow, keeping their current relative order.
remaining_cols = [col for col in metadata.columns if col not in first_cols]
metadata = metadata[first_cols + remaining_cols]

### Add information about excluded samples

Add information about putative ploidy of some samples

In [12]:
# Tab-separated ploidy table, joined by sample; samples that are absent from
# it keep missing values in the added columns.
ploidy_df = pd.read_csv(ploidy, sep="\t", header=0)
metadata = pd.merge(metadata, ploidy_df, how='left', on='sample')

Add information about bad quality samples

In [13]:
# Keep only the samples that carry a quality warning, with just the two
# columns needed for the join.
quality_df = (
    pd.read_csv(ashton_mapping_stats_path, sep="\t", header=0)
    .loc[:, ['sample', 'quality_warning']]
    .dropna(subset=['quality_warning'])
)

# Samples without a warning end up with a missing value in 'quality_warning'.
metadata = pd.merge(metadata, quality_df, how='left', on='sample')

Make column with the reason for exclusion from the dataset of some samples.

In [14]:
def _exclusion_reason(row):
    """Return the reason a sample is excluded, or NaN when it is kept.

    The checks are applied in priority order; the first one that matches
    determines the returned value.
    """
    # 1. A recorded ploidy value.
    if pd.notna(row['ploidy']):
        return row['ploidy']
    # 2. A recorded quality warning.
    if pd.notna(row['quality_warning']):
        return row['quality_warning']
    # 3. Ashton samples outside VNI are labelled with their lineage.
    if row['dataset'] == 'Ashton' and row['lineage'] != 'VNI':
        return row['lineage']
    # 4. One specific run is labelled as missing.
    if row['run'] == 'ERR2624135':
        return 'missing'
    return np.nan


metadata['excluded'] = metadata.apply(_exclusion_reason, axis=1)

### Add metadata of the reference genome

Add H99 GCF_000149245

In [15]:
# Metadata record for the H99 reference (GCF_000149245). The assembly
# accession is used for both the sample and the run field.
H99 = {
    'sample': 'GCF_000149245',
    'run': 'GCF_000149245',
    'strain': 'H99',
    'lineage': 'VNI',
    'vni_subdivision': 'VNIb',
    'dataset': 'Reference',
    'source': 'Clinical',
    'mating_type': 'α',
    'country_of_origin': 'USA',
    'continent_of_origin': 'North America',
}

In [16]:
# Append the reference as one extra row. Columns it does not define are left
# missing, and the index is renumbered.
reference_row = pd.DataFrame([H99])
metadata = pd.concat([metadata, reference_row], ignore_index=True)


## Save metadata tables for multiple subsets

### All

In [17]:
# Number of rows (samples) in the complete table.
metadata.shape[0]

1087

In [18]:
# Number of samples per dataset and lineage in the complete table.
all_by_dataset_lineage = metadata.groupby(['dataset', 'lineage'], observed=True).size()
all_by_dataset_lineage.reset_index(name='counts')

Unnamed: 0,dataset,lineage,counts
0,Ashton,AD_hybrid,5
1,Ashton,VNI,678
2,Ashton,VNII,4
3,Ashton,gattii,12
4,Desjardins,VNBI,122
5,Desjardins,VNBII,64
6,Desjardins,VNI,185
7,Desjardins,VNII,16
8,Reference,VNI,1


In [19]:
# Write the complete table, without the row index.
metadata.to_csv(path_or_buf=metadata_path, index=False)

### WeavePop

In [20]:
# Retain the samples that have no exclusion reason recorded.
is_retained = metadata['excluded'].isna()
metadata_weavepop = metadata.loc[is_retained].copy()
metadata_weavepop.shape[0]

1026

In [21]:
# Retained samples per dataset and lineage; missing values form their own group.
weavepop_by_dataset_lineage = metadata_weavepop.groupby(
    ['dataset', 'lineage'], observed=True, dropna=False
).size()
weavepop_by_dataset_lineage.reset_index(name='counts')

Unnamed: 0,dataset,lineage,counts
0,Ashton,VNI,646
1,Desjardins,VNBI,120
2,Desjardins,VNBII,62
3,Desjardins,VNI,181
4,Desjardins,VNII,16
5,Reference,VNI,1


In [22]:
# Retained samples per lineage and source; missing values form their own group.
weavepop_by_lineage_source = metadata_weavepop.groupby(
    ['lineage', 'source'], observed=True, dropna=False
).size()
weavepop_by_lineage_source.reset_index(name='counts')

Unnamed: 0,lineage,source,counts
0,VNBI,Clinical,49
1,VNBI,Environmental,71
2,VNBII,Clinical,59
3,VNBII,Environmental,3
4,VNI,Clinical,800
5,VNI,Environmental,28
6,VNII,Clinical,15
7,VNII,Environmental,1


In [23]:
# Retained samples per lineage and continent; missing values form their own group.
weavepop_by_lineage_continent = metadata_weavepop.groupby(
    ['lineage', 'continent_of_origin'], observed=True, dropna=False
).size()
weavepop_by_lineage_continent.reset_index(name='counts')

Unnamed: 0,lineage,continent_of_origin,counts
0,VNBI,Africa,120
1,VNBII,Africa,62
2,VNI,Africa,265
3,VNI,Asia,524
4,VNI,Europe,26
5,VNI,North America,12
6,VNI,South America,1
7,VNII,Africa,7
8,VNII,Australasia,1
9,VNII,Europe,3


In [24]:
# Retained samples per lineage and mating type; missing values form their own group.
weavepop_by_lineage_mating_type = metadata_weavepop.groupby(
    ['lineage', 'mating_type'], observed=True, dropna=False
).size()
weavepop_by_lineage_mating_type.reset_index(name='counts')

Unnamed: 0,lineage,mating_type,counts
0,VNBI,a,33
1,VNBI,α,86
2,VNBI,,1
3,VNBII,a,8
4,VNBII,α,54
5,VNI,a,3
6,VNI,α,179
7,VNI,,646
8,VNII,α,16


In [25]:
# Write the table of retained samples, without the row index.
metadata_weavepop.to_csv(path_or_buf=metadata_final_path, index=False)

### WeavePop VNI

In [26]:
# Restrict the retained samples to lineage VNI.
is_vni = metadata_weavepop['lineage'] == 'VNI'
metadata_vni = metadata_weavepop.loc[is_vni].copy()
metadata_vni.shape[0]

828

In [27]:
# VNI samples per dataset; missing values form their own group.
vni_by_lineage_dataset = metadata_vni.groupby(
    ['lineage', 'dataset'], observed=True, dropna=False
).size()
vni_by_lineage_dataset.reset_index(name='counts')

Unnamed: 0,lineage,dataset,counts
0,VNI,Ashton,646
1,VNI,Desjardins,181
2,VNI,Reference,1


In [28]:
# Write the VNI-only table, without the row index.
metadata_vni.to_csv(path_or_buf=metadata_vni_path, index=False)

## Compare names in the Ashton phylogeny to metadata

In [29]:
# The tree is addressed relative to the project root, like every other path in
# this notebook (the working directory is set to the project root at the top).
tree_path = 'analyses/data/raw/2017.06.09.all_ours_and_desj.snp_sites.mod.fa.cln.tree'
tree = Phylo.read(tree_path, 'newick')

# Labels of all terminal nodes of the tree.
tips = [tip.name for tip in tree.get_terminals()]
print(len(tips))

# A tip counts as known when it matches either a strain name or a run
# accession. The names are collected once into a set, so each tip needs a
# single lookup instead of two scans over the metadata columns.
known_names = set(metadata['strain']).union(metadata['run'])
tips_not_in_metadata = [tip for tip in tips if tip not in known_names]
tips_not_in_metadata

865


['04CN-63-018']

The Ashton tree has the strain names for the Ashton samples and the run accession for the Desjardins VNI samples.  
There is one sample in the Ashton tree that is not in the metadata table.

In [30]:
# Samples whose strain name does not occur among the tree tips.
# NOTE(review): only the 'strain' column is compared here, so samples whose
# tip label is a run accession are counted as absent. Confirm this is intended.
strain_is_tip = metadata['strain'].isin(tips)
strains_not_in_tree = metadata.loc[~strain_is_tip].reset_index(drop=True)

# Number of such samples per lineage.
absent_by_lineage = strains_not_in_tree.groupby(['lineage'], observed=True).size()
absent_by_lineage.reset_index(name='counts')

Unnamed: 0,lineage,counts
0,AD_hybrid,5
1,VNBI,122
2,VNBII,64
3,VNI,186
4,VNII,20
5,gattii,12
