In [1]:
## Load in needed mods
import numpy as np, pandas as pd
from Bio import SeqIO

## Locations of the H99 reference genome (fasta) and the F1 progeny SNP genotypes (gzipped csv)
refpath = '../DATA/FungiDB-46_CneoformansH99_Genome.fasta'
gt_path = '../GENOTYPE/Bt65xH99_F1_progeny-SNPS.csv.gz'

## Read every record of the reference fasta into a list
REF = list(SeqIO.parse(refpath,'fasta'))

## Read the genotype dataframe, using the first column of the file as the index
SNP = pd.read_csv(gt_path,index_col=0)

In [2]:
## Make a dataframe of sequence ids and lengths, one row per reference sequence (file order)
clens = pd.DataFrame([(s.id,len(s.seq)) for s in REF],columns=['Seqid','Length'])

## Make a contig and chrom dataframe; the chromosome number is the
## integer after the last underscore of the contig name (Chr_11 -> 11)
chrom_remap = pd.DataFrame([(c,int(c.split("_")[-1]))
                            for c in sorted(SNP.Contig.unique())],
                           columns = ['Contig','Chrom'])

## Chromosome k is written to the k-th row of the length dataframe, so every
## chromosome number must be unique and point at an existing row; an out of
## range label would otherwise make .loc silently append a new row
assert chrom_remap.Chrom.is_unique, 'Two contigs share a chromosome number'
assert chrom_remap.Chrom.between(1,len(clens)).all(), 'Chromosome number has no matching reference sequence'

## Initialize contig and chrom columns as object columns holding the
## mitochondrial label 'M', so strings and integers can be written into
## them without an incompatible-dtype upcast of an integer column
clens['Contig'] = pd.Series('M',index=clens.index,dtype=object)
clens['Chrom'] = pd.Series('M',index=clens.index,dtype=object)

## Iterate over rows and add contig and chrom to length dataframe
for contig,chrom in chrom_remap.itertuples(index=False):
    clens.loc[chrom-1,'Contig'] = contig
    clens.loc[chrom-1,'Chrom'] = int(chrom)

## Check our work: every genotyped contig was placed exactly once
## NOTE(review): any sequence without a genotyped contig keeps the label 'M';
## this presumes the mitochondrial genome is the only such sequence -- confirm
## if the reference or genotype files change
assert clens.Contig.isin(chrom_remap.Contig).sum() == len(chrom_remap)

## Add the cumulative lengths of the preceding chromosomes (offset of each sequence start)
clens['Cumsum'] = clens.Length.cumsum() - clens.Length

## Add mid points
clens['Midpts'] = clens.Length/2 + clens.Cumsum

## Calculate the number of SNPs per chromosome in a single pass over the
## genotype table; sequences with no SNPs (label 'M') get zero
snp_counts = SNP.Contig.value_counts()
clens['Nsnps'] = clens.Contig.map(snp_counts).fillna(0).astype('int64')

## Save out dataframe as gzipped csv file
clens.to_csv('../DATA/H99_chrommap.csv.gz',index=False)

## View tail
clens.tail()

Unnamed: 0,Seqid,Length,Contig,Chrom,Cumsum,Midpts,Nsnps
10,CP003830.1,1561994,Chr_11,11,14855526,15636523.0,18069
11,CP003831.1,774062,Chr_12,12,16417520,16804551.0,9015
12,CP003832.1,756744,Chr_13,13,17191582,17569954.0,8509
13,CP003833.2,942867,Chr_14,14,17948326,18419759.5,11172
14,CP003834.1,24919,M,M,18891193,18903652.5,0
