In [38]:
import pandas as pd

Get metadata of both Desjardins and Ashton datasets from the FungalPop joined analysis.

In [39]:
# Load the per-sample metadata table written by the joined FungalPop analysis;
# the first row of the file is used as the header.
metadata_fungalpop_joined = pd.read_csv(
    filepath_or_buffer="/FastData/czirion/Crypto_Diversity_Pipeline/Crypto_Desjardins_Ashton/results_joined_241204/02.Dataset/metadata.csv",
    header=0,
)

Get the metadata from the Ashton paper, normalise its column names (lower case, underscores instead of spaces), and drop the columns that are already present in the FungalPop metadata.

In [40]:
# Read the Ashton metadata, normalise the header (lower case, spaces replaced
# by underscores) and discard the listed columns, all in one chain so the
# frame is never modified in place.
metadata_from_ashton = (
    pd.read_csv(
        "/FastData/czirion/Crypto_Diversity_Pipeline/Crypto_Ashton/config/metadata_all_ashton_and_vni_desj.csv",
        header=0,
    )
    .rename(columns=lambda name: name.lower().replace(' ', '_'))
    .drop(
        columns=[
            'country_of_origin',
            'species_id_from_mash_anlaysis',  # spelling matches the normalised header of the input file
            'study',
            'hiv_status',
            'continent_of_origin',
            'year_of_origin',
            'mean_depth_of_mapping_with_mq_>_30_across_whole_genome',
            'proportion_of_genome_covered_by_at_least_5_reads_which_mapped_with_mq_>_30',
            'source',
        ]
    )
)


Merge the two tables on the `sample` column only, keeping every sample from the FungalPop analysis (left join).

In [41]:
# Left join on "sample": every FungalPop row is kept, and the Ashton columns
# are attached where a matching sample exists. Columns present in both frames
# receive pandas' default "_x" (FungalPop) / "_y" (Ashton) suffixes.
metadata_fixed = pd.merge(
    metadata_fungalpop_joined,
    metadata_from_ashton,
    how="left",
    on="sample",
)

For the Desjardins samples keep the strain and lineage from the FungalPop analysis; for the Ashton samples keep the strain and lineage from the Ashton metadata.

In [42]:
# Build the consolidated "strain" and "lineage" columns:
#   - Ashton rows take the Ashton-side value ("_y"),
#   - Desjardins rows take the FungalPop-side value ("_x"),
#   - rows from any other dataset are left as NaN.
metadata_fixed["strain"] = metadata_fixed["strain_y"].where(
    metadata_fixed["dataset"] == "Ashton",
    metadata_fixed["strain_x"].where(metadata_fixed["dataset"] == "Desjardins"),
)
metadata_fixed["lineage"] = metadata_fixed["lineage_y"].where(
    metadata_fixed["dataset"] == "Ashton",
    metadata_fixed["lineage_x"].where(metadata_fixed["dataset"] == "Desjardins"),
)

Keep the run accessions from the Ashton metadata where available, so that the Desjardins samples can be linked to the Ashton tree (which includes them).

In [43]:
# "run" is the Ashton-side accession ("_y") when one is present and falls
# back to the FungalPop-side accession ("_x") otherwise.
metadata_fixed["run"] = metadata_fixed["sra_accession_y"].where(
    metadata_fixed["sra_accession_y"].notna(),
    metadata_fixed["sra_accession_x"],
)

Keep the `vni_subdivision` column from the Ashton metadata.

In [44]:
# The Ashton-side column (merge suffix "_y") becomes the plain
# "vni_subdivision" column.
metadata_fixed = metadata_fixed.rename(
    {"vni_subdivision_y": "vni_subdivision"},
    axis="columns",
)

Cleanup

In [45]:
metadata_fixed = (
    metadata_fixed
    # Remove the merge-suffixed columns that are no longer needed.
    .drop(
        columns=[
            "strain_x", "strain_y",
            "lineage_x", "lineage_y",
            "vni_subdivision_x",
            "sra_accession_x", "sra_accession_y",
        ]
    )
    # Put the identifying columns first; every other column follows in its
    # existing order.
    .pipe(
        lambda frame, front=('sample', 'run', 'strain', 'lineage', 'vni_subdivision', 'dataset', 'source'): frame[
            list(front) + [name for name in frame.columns if name not in front]
        ]
    )
    # Order the rows by VNI subdivision (rows without one sort last).
    .sort_values(by='vni_subdivision')
)

In [46]:
# Write the full consolidated table to disk without the row index.
metadata_fixed.to_csv(
    path_or_buf="/FastData/czirion/Crypto_Diversity_Pipeline/analyses/data/derived/metadata_fixed.csv",
    index=False,
)

Save a version with only the useful columns

In [53]:
# Reduced table: only the columns listed below, in this order.
metadata = metadata_fixed.loc[
    :,
    [
        'sample',
        'run',
        'strain',
        'lineage',
        'vni_subdivision',
        'dataset',
        'source',
        'mating_type',
        'country_of_origin',
    ],
]


Add the H99 reference (GCF_000149245) as an extra row.

In [54]:
# Single metadata record for the H99 reference; keys match the columns of the
# reduced `metadata` table.
H99 = dict(
    sample='GCF_000149245',
    run='GCF_000149245',
    strain='H99',
    lineage='VNI',
    vni_subdivision='VNIb',
    dataset='Reference',
    source='Clinical',
    mating_type='α',
    country_of_origin='USA',
)

In [55]:
# Append the H99 reference row only when its sample id is not already in the
# table. The original unconditional concat reassigns `metadata` from itself,
# so re-running this cell added another GCF_000149245 row each time, and the
# duplicate would then be written to metadata.csv by the next cell.
if not (metadata["sample"] == H99["sample"]).any():
    # ignore_index=True renumbers the rows 0..n-1 after the sort done earlier.
    metadata = pd.concat([metadata, pd.DataFrame([H99])], ignore_index=True)
metadata

Unnamed: 0,sample,run,strain,lineage,vni_subdivision,dataset,source,mating_type,country_of_origin
0,SRS404807,SRR798275,Bt28,VNI,VNIa-32,Desjardins,Clinical,α,Botswana
1,ERS1142739,ERR1671640,20427_2#42,VNI,VNIa-32,Ashton,Clinical,,Vietnam
2,ERS1142780,ERR1756549,20949_2#16,VNI,VNIa-32,Ashton,Clinical,,Vietnam
3,SRS417642,SRR836892,In2632,VNI,VNIa-32,Desjardins,Clinical,α,India
4,ERS1142749,ERR1671650,20427_2#52,VNI,VNIa-32,Ashton,Clinical,,Vietnam
...,...,...,...,...,...,...,...,...,...
1066,ERS2541042,ERR2624151,04CN-65-031,VNII,,Ashton,Clinical,,Uganda
1067,ERS2540950,ERR2624139,04CN-65-106,VNII,,Ashton,Clinical,,Uganda
1068,ERS2540925,ERR2624242,04CN-65-129,AD_hybrid,,Ashton,Clinical,,Uganda
1069,ERS2540931,ERR2624415,04CN-65-133,AD_hybrid,,Ashton,Clinical,,Uganda


In [56]:
# Write the reduced table (including the H99 row) to disk without the row index.
metadata.to_csv(
    path_or_buf="/FastData/czirion/Crypto_Diversity_Pipeline/analyses/data/derived/metadata.csv",
    index=False,
)