Libraries

In [1]:
import pandas as pd
import os

Paths

In [2]:
# Work from the project root so that the relative paths used in this notebook resolve.
# The location can be overridden through the WEAVEPOP_PROJECT_DIR environment variable;
# when it is unset or empty, the original default location is used.
PROJECT_DIR = os.environ.get("WEAVEPOP_PROJECT_DIR") or "/FastData/czirion/WeavePop_Cneoformans/"
os.chdir(PROJECT_DIR)

In [3]:
# --- Input tables ---
# Sample metadata from the joined WeavePop analysis.
metadata_weavepop_joined_path = "Crypto_Desjardins_Ashton/results/02.Dataset/metadata.csv"
# Metadata from the Ashton paper.
metadata_from_ashton_path = "Crypto_Ashton/config/metadata_all_ashton_and_vni_desj.csv"

# --- Output tables ---
# Merged metadata with every column.
metadata_fixed_path = "analyses/data/processed/metadata_ashton_desj_all_weavepop_complete_info.csv"
# Selected columns for all samples plus the H99 row.
metadata_all_H99_path = "analyses/data/processed/metadata_ashton_desj_all_weavepop_H99.csv"
# Selected columns for the VNI samples only.
metadata_vni_path = "analyses/data/processed/metadata_ashton_desj_vni_weavepop.csv"

Get metadata of both Desjardins and Ashton datasets from the WeavePop joined analysis.

In [4]:
# Read the WeavePop metadata table; the first row of the file holds the column names.
metadata_weavepop_joined = pd.read_csv(
    metadata_weavepop_joined_path,
    header=0,
)

Homogenize some values

In [5]:
# Use a single spelling for the environmental source label.
source_fixes = {'Environment': 'Environmental'}
metadata_weavepop_joined['source'] = metadata_weavepop_joined['source'].replace(source_fixes)

# Remove leading/trailing whitespace from the country names.
country = metadata_weavepop_joined['country_of_origin']
metadata_weavepop_joined['country_of_origin'] = country.str.strip()

Get the metadata from the Ashton paper and drop the columns that are already in the other metadata.

In [6]:
# Read the Ashton metadata and normalise the headers: lower case, spaces replaced by underscores.
metadata_from_ashton = pd.read_csv(metadata_from_ashton_path, header=0)
normalised_headers = metadata_from_ashton.columns.str.lower().str.replace(' ', '_')
metadata_from_ashton.columns = normalised_headers

# Columns removed before the merge (names are matched after header normalisation).
columns_to_discard = [
    'country_of_origin',
    'species_id_from_mash_anlaysis',
    'study',
    'hiv_status',
    'continent_of_origin',
    'year_of_origin',
    'mean_depth_of_mapping_with_mq_>_30_across_whole_genome',
    'proportion_of_genome_covered_by_at_least_5_reads_which_mapped_with_mq_>_30',
    'source',
]
metadata_from_ashton = metadata_from_ashton.drop(columns=columns_to_discard)


Merge them on the `sample` column only, keeping the samples in the WeavePop analysis (left join).

In [7]:
# Left join: every WeavePop row is kept; columns present in both tables get the
# default "_x" (WeavePop) and "_y" (Ashton) suffixes.
metadata_fixed = pd.merge(
    metadata_weavepop_joined,
    metadata_from_ashton,
    how="left",
    on="sample",
)

For the Desjardins samples keep the strain and lineage from the WeavePop analysis, and for the Ashton samples keep the strain and lineage from the Ashton metadata.

In [8]:
# Row selectors for the two datasets.
is_desjardins = metadata_fixed["dataset"] == "Desjardins"
is_ashton = metadata_fixed["dataset"] == "Ashton"

# Build the consolidated columns: Desjardins rows take the "_x" (WeavePop) value,
# Ashton rows take the "_y" (Ashton metadata) value.
for column in ("strain", "lineage"):
    metadata_fixed.loc[is_desjardins, column] = metadata_fixed[f"{column}_x"]
    metadata_fixed.loc[is_ashton, column] = metadata_fixed[f"{column}_y"]

Keep the run accessions from the Ashton metadata where available, to be able to link the Desjardins samples to the Ashton tree (which includes them).

In [9]:
# Prefer the Ashton accession ("_y"); fall back to the WeavePop one ("_x") where it is missing.
has_ashton_accession = metadata_fixed["sra_accession_y"].notna()
metadata_fixed.loc[has_ashton_accession, "run"] = metadata_fixed["sra_accession_y"]
metadata_fixed.loc[~has_ashton_accession, "run"] = metadata_fixed["sra_accession_x"]

Keep the vni_subdivision from the Ashton metadata

In [10]:
# The "_y" column is the one contributed by the Ashton table in the merge.
ashton_subdivision = {"vni_subdivision_y": "vni_subdivision"}
metadata_fixed = metadata_fixed.rename(columns=ashton_subdivision)

Cleanup

In [11]:
# Remove the suffixed merge leftovers now that the consolidated columns exist.
merge_leftovers = [
    "strain_x", "strain_y",
    "lineage_x", "lineage_y",
    "vni_subdivision_x",
    "sra_accession_x", "sra_accession_y",
]
metadata_fixed = metadata_fixed.drop(columns=merge_leftovers)

# Put the identifying columns first, followed by every other column in its current order.
leading_columns = ['sample', 'run', 'strain', 'lineage', 'vni_subdivision', 'dataset', 'source']
trailing_columns = [name for name in metadata_fixed.columns if name not in leading_columns]
metadata_fixed = metadata_fixed[leading_columns + trailing_columns]

metadata_fixed = metadata_fixed.sort_values(by='vni_subdivision')

In [12]:
# Number of samples for each dataset/lineage combination that occurs.
(
    metadata_fixed
    .groupby(['dataset', 'lineage'], observed=True)
    .size()
    .reset_index(name='counts')
)


Unnamed: 0,dataset,lineage,counts
0,Ashton,VNI,668
1,Desjardins,VNBI,122
2,Desjardins,VNBII,64
3,Desjardins,VNI,185
4,Desjardins,VNII,16


In [13]:
# Total number of rows in the merged table.
metadata_fixed.shape[0]

1055

In [14]:
# Make sure the destination directory exists so the write does not fail on a fresh checkout.
output_dir = os.path.dirname(metadata_fixed_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Save the complete merged table without the row index.
metadata_fixed.to_csv(metadata_fixed_path, index=False)

Save a version with only the useful columns

In [15]:
# Columns kept in the reduced table.
useful_columns = [
    'sample', 'run', 'strain', 'lineage', 'vni_subdivision',
    'dataset', 'source', 'mating_type', 'country_of_origin',
]
metadata = metadata_fixed[useful_columns]


Add H99 GCF_000149245

In [16]:
# Row for H99, with one value for each column of the reduced metadata table.
H99 = {
    'sample': 'GCF_000149245',
    'run': 'GCF_000149245',
    'strain': 'H99',
    'lineage': 'VNI',
    'vni_subdivision': 'VNIb',
    'dataset': 'Reference',
    'source': 'Clinical',
    'mating_type': 'α',
    'country_of_origin': 'USA',
}

In [17]:
# Append the H99 row. Any row that already carries the H99 sample id is removed first,
# so re-running this cell does not add a duplicate H99 row.
without_h99 = metadata[metadata['sample'] != H99['sample']]
metadata = pd.concat([without_h99, pd.DataFrame([H99])], ignore_index=True)

# Number of samples for each dataset/lineage combination that occurs.
metadata.groupby(['dataset', 'lineage'], observed=True).size().reset_index(name='counts')


Unnamed: 0,dataset,lineage,counts
0,Ashton,VNI,668
1,Desjardins,VNBI,122
2,Desjardins,VNBII,64
3,Desjardins,VNI,185
4,Desjardins,VNII,16
5,Reference,VNI,1


In [18]:
# Make sure the destination directory exists so the write does not fail on a fresh checkout.
output_dir = os.path.dirname(metadata_all_H99_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Save the reduced table (including the H99 row) without the row index.
metadata.to_csv(metadata_all_H99_path, index=False)

In [19]:
# Keep only the VNI rows, excluding the row whose dataset is 'Reference'.
is_not_reference = metadata['dataset'] != 'Reference'
is_vni = metadata['lineage'] == 'VNI'
metadata_ashton_desj_vni_weavepop = metadata[is_not_reference & is_vni]

# Make sure the destination directory exists so the write does not fail on a fresh checkout.
output_dir = os.path.dirname(metadata_vni_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Save the VNI-only table without the row index.
metadata_ashton_desj_vni_weavepop.to_csv(metadata_vni_path, index=False)

In [20]:
# Number of VNI samples for each dataset/lineage combination that occurs.
(
    metadata_ashton_desj_vni_weavepop
    .groupby(['dataset', 'lineage'], observed=True)
    .size()
    .reset_index(name='counts')
)


Unnamed: 0,dataset,lineage,counts
0,Ashton,VNI,668
1,Desjardins,VNI,185
