In [15]:
import pandas as pd

Get metadata of both Desjardins and Ashton datasets from the FungalPop joined analysis.

In [16]:
# Load the sample metadata produced by the joined FungalPop analysis
# (Desjardins + Ashton datasets). The first row of the CSV is the header.
metadata_fungalpop_joined = pd.read_csv(
    filepath_or_buffer="/FastData/czirion/Crypto_Diversity_Pipeline/Crypto_Desjardins_Ashton/results_joined/02.Dataset/metadata.csv",
    header=0,
)

Get the metadata from the Ashton paper and drop the columns that are already in the other metadata.

In [17]:
# Read the metadata table from the Ashton paper, normalise its header
# (lower case, spaces replaced by underscores) and discard the columns that
# are not taken from this table.
metadata_from_ashton = (
    pd.read_csv(
        "/FastData/czirion/Crypto_Diversity_Pipeline/Crypto_Ashton/config/metadata_all_ashton_and_vni_desj.csv",
        header=0,
    )
    .rename(columns=lambda name: name.lower().replace(' ', '_'))
    .drop(
        columns=[
            'country_of_origin',
            # (sic) the name must match the normalised input header exactly
            'species_id_from_mash_anlaysis',
            'study',
            'hiv_status',
            'continent_of_origin',
            'year_of_origin',
            'mean_depth_of_mapping_with_mq_>_30_across_whole_genome',
            'proportion_of_genome_covered_by_at_least_5_reads_which_mapped_with_mq_>_30',
            'source',
        ]
    )
)


Merge the two tables on the `sample` column only, keeping every sample that is present in the FungalPop analysis (left join).

In [18]:
# Left join on "sample": every FungalPop row is kept and the Ashton columns
# are attached where the sample id matches. Column names present in both
# tables receive pandas' default suffixes: "_x" (FungalPop) and "_y" (Ashton).
metadata_fixed = pd.merge(
    metadata_fungalpop_joined,
    metadata_from_ashton,
    how="left",
    on="sample",
)

For the Desjardins samples, keep the strain and lineage from the FungalPop analysis; for the Ashton samples, keep the strain and lineage from the Ashton metadata.

In [19]:
# Build the final "strain" and "lineage" columns: Desjardins rows take the
# "_x" (FungalPop) value, Ashton rows take the "_y" (Ashton metadata) value.
# Rows belonging to neither dataset are left as NaN.
for dataset_name, suffix in (("Desjardins", "_x"), ("Ashton", "_y")):
    in_dataset = metadata_fixed["dataset"] == dataset_name
    for field in ("strain", "lineage"):
        metadata_fixed.loc[in_dataset, field] = metadata_fixed[field + suffix]

Keep the run accessions from the Ashton metadata where available, to be able to link the Desjardins samples to the Ashton tree (which includes them). Where the Ashton metadata has no accession, fall back to the one from the FungalPop metadata.

In [20]:
# "run" is the Ashton accession ("_y") when one exists, otherwise the
# FungalPop accession ("_x").
has_ashton_accession = metadata_fixed["sra_accession_y"].notna()
metadata_fixed["run"] = metadata_fixed["sra_accession_y"].where(
    has_ashton_accession,
    metadata_fixed["sra_accession_x"],
)

Keep the vni_subdivision from the Ashton metadata

In [21]:
# The "_y" copy of vni_subdivision is the one contributed by the Ashton
# table; give it the plain column name.
metadata_fixed = metadata_fixed.rename(
    {"vni_subdivision_y": "vni_subdivision"},
    axis="columns",
)

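Clean up: drop the intermediate merge columns, move the key columns to the front, and sort the rows by `vni_subdivision`.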

In [22]:
# Remove the suffixed merge columns, which are no longer needed once
# "strain", "lineage", "run" and "vni_subdivision" have been consolidated.
metadata_fixed = metadata_fixed.drop(
    columns=[
        "strain_x",
        "strain_y",
        "lineage_x",
        "lineage_y",
        "vni_subdivision_x",
        "sra_accession_x",
        "sra_accession_y",
    ]
)

# Put the identifying columns first; every other column follows in its
# existing order.
metadata_fixed = metadata_fixed[
    ['sample', 'run', 'strain', 'lineage', 'vni_subdivision', 'dataset', 'source']
    + [
        name
        for name in metadata_fixed.columns
        if name not in {'sample', 'strain', 'lineage', 'source', 'dataset', 'vni_subdivision', 'run'}
    ]
]

# Order the rows by VNI subdivision.
metadata_fixed = metadata_fixed.sort_values(by='vni_subdivision')

In [23]:
# Write the full consolidated table to disk, without the DataFrame index.
metadata_fixed.to_csv(
    path_or_buf="/FastData/czirion/Crypto_Diversity_Pipeline/analyses/data/derived/metadata_fixed.csv",
    index=False,
)

Save a version with only the useful columns

In [24]:
# Reduced table: all rows, restricted to the columns used downstream.
metadata = metadata_fixed.loc[
    :,
    [
        'sample',
        'run',
        'strain',
        'lineage',
        'vni_subdivision',
        'dataset',
        'source',
        'mating_type',
        'country_of_origin',
    ],
]


Add the H99 reference (GCF_000149245) as an extra row.

In [25]:
# Metadata record for the H99 reference; the keys mirror the columns of the
# reduced `metadata` table.
H99 = dict(
    sample='GCF_000149245',
    run='GCF_000149245',
    strain='H99',
    lineage='VNI',
    vni_subdivision='VNIb',
    dataset='Reference',
    source='Clinical',
    mating_type='α',
    country_of_origin='USA',
)

In [26]:
# Append the H99 reference record as the last row and renumber the index.
# Any row that already carries the H99 sample id is removed first, so
# re-running this cell does not stack duplicate H99 rows (the cell rebinds
# `metadata` to its own output).
metadata = pd.concat(
    [metadata[metadata["sample"] != H99["sample"]], pd.DataFrame([H99])],
    ignore_index=True,
)
metadata

Unnamed: 0,sample,run,strain,lineage,vni_subdivision,dataset,source,mating_type,country_of_origin
0,ERS1142739,ERR1671640,20427_2#42,VNI,VNIa-32,Ashton,Clinical,,Vietnam
1,ERS2540972,ERR2624485,04CN-64-065,VNI,VNIa-32,Ashton,Clinical,,Uganda
2,ERS2541156,ERR2624113,BMD915,VNI,VNIa-32,Ashton,Clinical,,Vietnam
3,ERS542302,ERR842482,14892_1#7,VNI,VNIa-32,Ashton,Clinical,,Vietnam
4,ERS2541105,ERR2624207,BMD942,VNI,VNIa-32,Ashton,Clinical,,Vietnam
...,...,...,...,...,...,...,...,...,...
1051,SRS520177,"SRX400063, SRX400064",MW-RSA1327,VNII,,Desjardins,Clinical,α,S. Africa
1052,SRS520179,"SRX400068, SRX400067",MW-RSA3956,VNII,,Desjardins,Clinical,α,S. Africa
1053,SRS520181,"SRX400073, SRX400071, SRX400072",MW-RSA913,VNBII,,Desjardins,Clinical,α,S. Africa
1054,SRS520182,"SRX400075, SRX400076",WM626,VNII,,Desjardins,Clinical,α,Australia


In [27]:
# Write the reduced table (including the H99 row) to disk, without the
# DataFrame index.
metadata.to_csv(
    path_or_buf="/FastData/czirion/Crypto_Diversity_Pipeline/analyses/data/derived/metadata.csv",
    index=False,
)