In [48]:
import pandas as pd

Get metadata of both Desjardins and Ashton datasets from the FungalPop joined analysis.

In [49]:
# Load the sample metadata of the joined FungalPop analysis (first CSV row is the header).
metadata_fungalpop_joined = pd.read_csv(
    filepath_or_buffer="/FastData/czirion/Crypto_Diversity_Pipeline/Crypto_Desjardins_Ashton/results_joined/02.Dataset/metadata.csv",
    header=0,
)

In [50]:
# Harmonise the `source` labels: values that are exactly 'Environment' become 'Environmental'.
metadata_fungalpop_joined = metadata_fungalpop_joined.assign(
    source=lambda frame: frame['source'].replace({'Environment': 'Environmental'})
)

Get the metadata from the Ashton paper and drop the columns that are already in the other metadata.

In [51]:
# Columns removed from the Ashton table, written in their normalised
# (lower-case, underscores instead of spaces) form.
# NOTE(review): 'anlaysis' is kept exactly as written; presumably it mirrors the
# spelling of the CSV header, so do not "correct" it without checking the file.
ashton_columns_to_discard = [
    'country_of_origin',
    'species_id_from_mash_anlaysis',
    'study',
    'hiv_status',
    'continent_of_origin',
    'year_of_origin',
    'mean_depth_of_mapping_with_mq_>_30_across_whole_genome',
    'proportion_of_genome_covered_by_at_least_5_reads_which_mapped_with_mq_>_30',
    'source',
]

# Read the table, normalise the header names, then drop the listed columns.
metadata_from_ashton = (
    pd.read_csv("/FastData/czirion/Crypto_Diversity_Pipeline/Crypto_Ashton/config/metadata_all_ashton_and_vni_desj.csv", header=0)
    .rename(columns=lambda name: name.lower().replace(' ', '_'))
    .drop(columns=ashton_columns_to_discard)
)


Merge the two tables on the `sample` column only, keeping every sample that is present in the FungalPop analysis (left join).

In [52]:
# Left join on `sample`: every FungalPop row is kept and Ashton columns are
# attached where the sample matches. Columns present in both tables come out
# with the `_x` (FungalPop) and `_y` (Ashton) suffixes used further down.
metadata_fixed = pd.merge(
    metadata_fungalpop_joined,
    metadata_from_ashton,
    how="left",
    on='sample',
)

For the Desjardins samples, keep the strain and lineage from the FungalPop analysis; for the Ashton samples, keep the strain and lineage from the Ashton metadata.

In [53]:
# For each dataset, fill `strain` and `lineage` from the matching merge side:
# the `_x` columns for Desjardins rows and the `_y` columns for Ashton rows.
# Rows that belong to neither dataset are left unassigned.
for dataset_name, merge_suffix in (("Desjardins", "_x"), ("Ashton", "_y")):
    rows_in_dataset = metadata_fixed["dataset"] == dataset_name
    for target_column in ("strain", "lineage"):
        metadata_fixed.loc[rows_in_dataset, target_column] = metadata_fixed[target_column + merge_suffix]

Keep the run accessions from the Ashton metadata where available, so that the Desjardins samples can be linked to the Ashton tree (which includes them).

In [54]:
# `run` takes the `sra_accession_y` value when it is present and falls back
# to `sra_accession_x` for rows where it is missing.
metadata_fixed["run"] = metadata_fixed["sra_accession_y"].fillna(metadata_fixed["sra_accession_x"])

Keep the vni_subdivision from the Ashton metadata

In [55]:
# Promote the `_y` version of the column to the plain `vni_subdivision` name.
metadata_fixed = metadata_fixed.rename(
    {"vni_subdivision_y": "vni_subdivision"},
    axis="columns",
)

Cleanup

In [56]:
# Suffixed columns left over from the merge that are no longer needed.
merge_leftover_columns = [
    "strain_x", "strain_y",
    "lineage_x", "lineage_y",
    "vni_subdivision_x",
    "sra_accession_x", "sra_accession_y",
]
# Columns placed first, in this order; all other columns follow in their existing order.
leading_columns = ['sample', 'run', 'strain', 'lineage', 'vni_subdivision', 'dataset', 'source']

metadata_fixed = metadata_fixed.drop(columns=merge_leftover_columns)
trailing_columns = [name for name in metadata_fixed.columns if name not in leading_columns]
metadata_fixed = metadata_fixed[leading_columns + trailing_columns].sort_values(by='vni_subdivision')

In [57]:
# Number of rows per (dataset, lineage) combination, displayed as the cell output.
(
    metadata_fixed
    .groupby(['dataset', 'lineage'], observed=True)
    .size()
    .reset_index(name='counts')
)


Unnamed: 0,dataset,lineage,counts
0,Ashton,VNI,668
1,Desjardins,VNBI,122
2,Desjardins,VNBII,64
3,Desjardins,VNI,185
4,Desjardins,VNII,16


In [58]:
# Total number of rows in the merged table.
metadata_fixed.shape[0]

1055

In [59]:
# Write the complete merged table to disk, without the index column.
metadata_fixed.to_csv(
    path_or_buf="/FastData/czirion/Crypto_Diversity_Pipeline/analyses/data/processed/metadata_ashton_desj_all_fungalpop_complete_info.csv",
    index=False,
)

Save a version with only the useful columns

In [60]:
# Reduced table restricted to the columns used downstream, in this order.
metadata = metadata_fixed.loc[
    :,
    [
        'sample', 'run', 'strain', 'lineage', 'vni_subdivision',
        'dataset', 'source', 'mating_type', 'country_of_origin',
    ],
]


Add the H99 reference (GCF_000149245) as an extra row.

In [61]:
# Single metadata record for the H99 reference; the keys match the columns of `metadata`.
H99 = dict(
    sample='GCF_000149245',
    run='GCF_000149245',
    strain='H99',
    lineage='VNI',
    vni_subdivision='VNIb',
    dataset='Reference',
    source='Clinical',
    mating_type='α',
    country_of_origin='USA',
)

In [62]:
# Append the H99 record as the last row. Any row that already carries the H99
# sample id is removed first, so re-running this cell does not add a duplicate.
metadata = pd.concat(
    [metadata[metadata['sample'] != H99['sample']], pd.DataFrame([H99])],
    ignore_index=True,
)
# Number of rows per (dataset, lineage) combination, displayed as the cell output.
metadata.groupby(['dataset', 'lineage'], observed=True).size().reset_index(name='counts')


Unnamed: 0,dataset,lineage,counts
0,Ashton,VNI,668
1,Desjardins,VNBI,122
2,Desjardins,VNBII,64
3,Desjardins,VNI,185
4,Desjardins,VNII,16
5,Reference,VNI,1


In [63]:
# Write the reduced table (including the H99 row) to disk, without the index column.
metadata.to_csv(
    path_or_buf="/FastData/czirion/Crypto_Diversity_Pipeline/analyses/data/processed/metadata_ashton_desj_all_fungalpop_H99.csv",
    index=False,
)

In [64]:
# Keep only the VNI rows and leave out the 'Reference' dataset (the H99 row).
metadata_ashton_desj_vni_fungalpop = metadata[(metadata['dataset'] != 'Reference') & (metadata['lineage'] == 'VNI')]
# Write the VNI-only table to disk, without the index column.
# The per-dataset counts are displayed by the following cell; the groupby that
# used to sit here was evaluated and discarded, so it has been removed.
metadata_ashton_desj_vni_fungalpop.to_csv("/FastData/czirion/Crypto_Diversity_Pipeline/analyses/data/processed/metadata_ashton_desj_vni_fungalpop.csv", index=False)


In [65]:
# Number of rows per (dataset, lineage) combination in the VNI-only table.
(
    metadata_ashton_desj_vni_fungalpop
    .groupby(['dataset', 'lineage'], observed=True)
    .size()
    .reset_index(name='counts')
)


Unnamed: 0,dataset,lineage,counts
0,Ashton,VNI,668
1,Desjardins,VNI,185
