# Prepare reference genomes
* Gather reference genomes reoriented in `cryptococcus_reference_genomes` repository.  
* Rename accessions.  
* Create `chromosomes.csv` file.   
* Rename files.   

The reorientation of the chromosomes of each reference genome created chromosome IDs (accessions) that are in the format of RefID_QueryID (H99_lineage), but we want them in the format of QueryID_R if the chromosome is inverted (reverse complement of original), or QueryID if it is not inverted.

In [3]:
import pandas as pd
import os
from pathlib import Path

# Run everything from the project root; the %%bash cells use paths relative to it.
project_root = Path("/FastData/czirion/WeavePop_Cneoformans/")
os.chdir(project_root)


In [4]:
%%bash
# Create the output directories used by the later cells: data/references receives
# the GFF/FASTA copies and config receives the CSV tables.
mkdir -p Crypto_Desjardins/data/references Crypto_Desjardins/config

In [5]:
# Table of reference genomes (species, lineage, strain, accession) from the shared repository.
reference_table = "/FastData/shared/gits/cryptococcus_reference_genomes/reference_genomes.csv"
ref_genomes = pd.read_csv(reference_table)
ref_genomes

Unnamed: 0,Species,Lineage,Strain,Accession
0,C.neoformans,VNI,H99,GCA_000149245.3
1,C.neoformans,VNII,Cockatoo,GCA_022832995.1
2,C.neoformans,VNBI,Bt22,
3,C.neoformans,VNBII,Bt89,GCA_023650575.1
4,C.deneoformans,VNIV,JEC21,GCA_000091045.1
5,C.gattii,VGI,WM276,GCA_000185945.1
6,C.deuterogattii,VGII,R265,GCA_002954075.1
7,C.bacillisporus,VGIII,CA1280,GCA_000836335.2
8,C.tetragattii,VGIV,IND107,GCA_000835755.2
9,C.hyraxii,VGV,MF34,GCA_009650685.1


Copy VNI GFF file to the data directory.

In [6]:
%%bash 
# Source and destination are both local paths, so a plain cp is used.
cp /FastData/shared/gits/cryptococcus_reference_genomes/ncbi_datasets/GCA_000149245.3_dataset/ncbi_dataset/data/GCA_000149245.3/genomic.gff Crypto_Desjardins/data/references/VNI.gff

Create a table with the accessions, their correspondence with the H99 chromosomes, and the reorientation information.

In [7]:
%%bash
set -euo pipefail  # stop at the first failing command instead of writing a partial table

out=/FastData/czirion/WeavePop_Cneoformans/Crypto_Desjardins/config/chromosome_alignments.csv
cd /FastData/shared/gits/cryptococcus_reference_genomes/alignments_to_H99/

# Start the table with its header; each data row is the lineage followed by the
# first three columns of the per-lineage alignment file.
echo "lineage,QueryID,RefID,Inverted" > "$out"
for file in C.neoformans_VNI_H99.csv C.neoformans_VNII_Cockatoo.csv C.neoformans_VNBI_Bt22.csv C.neoformans_VNBII_Bt89.csv
do
    # The lineage is the second "_"-separated field of the file name.
    lineage=$(echo "$file" | cut -d'_' -f2)
    # Skip the input header line and prepend the lineage to every row.
    tail -n +2 "$file" | awk -v lineage="$lineage" -v FS=, -v OFS=, '{print lineage,$1,$2,$3}' >> "$out"
done

In [8]:
# Load the alignment table written by the preceding bash cell.
alignments_csv = "/FastData/czirion/WeavePop_Cneoformans/Crypto_Desjardins/config/chromosome_alignments.csv"
chrom_align = pd.read_csv(alignments_csv)

Make a column with the accessions that are in the files.

In [9]:
# Accessions currently in the files have the form <RefID>_<QueryID>.
chrom_align["current_accession"] = chrom_align["RefID"] + "_" + chrom_align["QueryID"]

Make a column with the accessions that we want to use instead of the current accessions.

In [10]:
# Desired accession: the QueryID with "_R" appended when the chromosome is inverted,
# and the bare QueryID otherwise. Column-wise operations replace the row-by-row apply;
# the comparison with True is kept so only values equal to True count as inverted.
is_inverted = chrom_align["Inverted"].eq(True)
chrom_align["accession"] = chrom_align["QueryID"].mask(is_inverted, chrom_align["QueryID"] + "_R")

In [11]:
# Overwrite the alignment table so it also holds the two accession columns; the index is not written.
alignments_csv = "/FastData/czirion/WeavePop_Cneoformans/Crypto_Desjardins/config/chromosome_alignments.csv"
chrom_align.to_csv(alignments_csv, index=False)

Copy the FASTA files to the data directory.

In [12]:
%%bash
set -euo pipefail  # abort if the cd or any copy fails

dest=/FastData/czirion/WeavePop_Cneoformans/Crypto_Desjardins/data/references
cd /FastData/shared/gits/cryptococcus_reference_genomes/alignments_to_H99/

for file in C.neoformans_VNI_H99_reoriented.fasta C.neoformans_VNII_Cockatoo_reoriented.fasta C.neoformans_VNBI_Bt22_reoriented.fasta C.neoformans_VNBII_Bt89_reoriented.fasta
do
    # The lineage is the second "_"-separated field of the file name; it names the copy.
    lineage=$(echo "$file" | cut -d'_' -f2)
    # Local-to-local copy, so cp is used.
    cp "$file" "${dest}/${lineage}.fasta"
done

In each FASTA file, replace the current accessions (column `current_accession`) with the new accessions (column `accession`).

In [13]:
%%bash
set -euo pipefail  # a failing sed (e.g. missing FASTA file) stops the cell

# Fields are read in the column order used by the original cut calls:
# 1 = lineage, 5 = current_accession, 6 = accession; the others are unused here.
tail -n +2 Crypto_Desjardins/config/chromosome_alignments.csv |
while IFS=, read -r lineage _query_id _ref_id _inverted current_accession new_accession _rest
do
    echo "Updating ${current_accession} to ${new_accession} in ${lineage}.fasta"
    # Escape the dots so they match literally instead of acting as regex wildcards.
    # NOTE(review): assumes accessions contain no other regex metacharacters or "/".
    pattern=${current_accession//./\\.}
    sed -i "s/${pattern}/${new_accession}/g" "Crypto_Desjardins/data/references/${lineage}.fasta"
done

Updating CP003820.1_CP003820.1 to CP003820.1 in VNI.fasta
Updating CP003821.1_CP003821.1 to CP003821.1 in VNI.fasta
Updating CP003822.1_CP003822.1 to CP003822.1 in VNI.fasta
Updating CP003823.1_CP003823.1 to CP003823.1 in VNI.fasta
Updating CP003824.1_CP003824.1 to CP003824.1 in VNI.fasta
Updating CP003825.1_CP003825.1 to CP003825.1 in VNI.fasta
Updating CP003826.1_CP003826.1 to CP003826.1 in VNI.fasta
Updating CP003827.1_CP003827.1 to CP003827.1 in VNI.fasta
Updating CP003828.1_CP003828.1 to CP003828.1 in VNI.fasta
Updating CP003829.1_CP003829.1 to CP003829.1 in VNI.fasta
Updating CP003830.1_CP003830.1 to CP003830.1 in VNI.fasta
Updating CP003831.1_CP003831.1 to CP003831.1 in VNI.fasta
Updating CP003832.1_CP003832.1 to CP003832.1 in VNI.fasta
Updating CP003833.2_CP003833.2 to CP003833.2 in VNI.fasta
Updating CP003834.1_CP003834.1 to CP003834.1 in VNI.fasta
Updating CP003820.1_CP091247.1 to CP091247.1 in VNII.fasta
Updating CP003830.1_CP091248.1 to CP091248.1_R in VNII.fasta
Updating C

In [14]:
# Lookup from RefID to chromosome name, built from the VNI rows:
# the first 14 rows are numbered 1-14 and the 15th is labelled "mitochondrion".
is_vni = chrom_align["lineage"] == "VNI"
chrom_names = chrom_align.loc[is_vni, ["RefID"]].copy()
chrom_names["chromosome"] = [*range(1, 15), "mitochondrion"]

In [15]:
# Display the RefID-to-chromosome lookup built from the VNI rows.
chrom_names

Unnamed: 0,RefID,chromosome
0,CP003820.1,1
1,CP003821.1,2
2,CP003822.1,3
3,CP003823.1,4
4,CP003824.1,5
5,CP003825.1,6
6,CP003826.1,7
7,CP003827.1,8
8,CP003828.1,9
9,CP003829.1,10


In [None]:
# Attach the chromosome name of each RefID. A "chromosome" column left by an earlier
# run of this cell is dropped first, so re-running does not produce suffixed duplicates
# (chromosome_x / chromosome_y). validate raises if a RefID occurs more than once in
# chrom_names, which would otherwise silently duplicate rows.
chrom_align = chrom_align.drop(columns="chromosome", errors="ignore").merge(
    chrom_names, on="RefID", how="left", validate="many_to_one"
)


In [None]:
# Per-lineage table of the accession and chromosome name, without the mitochondrion.
# The mitochondrion row is removed before sorting so the sort never has to compare
# a chromosome number with the string "mitochondrion".
is_nuclear = chrom_align["chromosome"] != "mitochondrion"
chromosomes = (
    chrom_align.loc[is_nuclear, ["lineage", "accession", "chromosome"]]
    .sort_values(by=["lineage", "chromosome"])
)
chromosomes.to_csv("/FastData/czirion/WeavePop_Cneoformans/Crypto_Desjardins/config/chromosomes.csv", index=False)