In [1]:
import glob
import os
import sys

import numpy as np
import pandas as pd

import hypermutatorqtl as hypf

In [2]:
## Input file locations
## Genotype table for the Bt65 x H99 F1 progeny (one row per SNP site)
gt_path = '../GENOTYPE/Bt65xH99_F1_progeny-SNPS.csv.gz'

## Chromosome map
cmpath = '../DATA/H99_chrommap.csv.gz'

## Reference genome sequence and its annotation
refpath = '../DATA/FungiDB-46_CneoformansH99_Genome.fasta'
gffpath = '../DATA/FungiDB-46_CneoformansH99.gff'

## QTL boundaries (left and right coordinate) on chromosome 3
qtl3left = 881808
qtl3righ = 1118629

## QTL boundaries (left and right coordinate) on chromosome 11
qtl11left = 991049
qtl11righ = 1189364

## Description strings that mark a gene as a hypothetical protein
hypos = [
    'hypothetical protein',
    'hypothetical protein hypothetical protein variant',
    'hypothetical protein hypothetical protein variant 1 hypothetical protein variant 2',
]

In [3]:
## Read the chromosome map and discard its final row
## NOTE(review): the reason for dropping the last entry is not stated here -
## presumably it is a non-chromosomal contig; confirm against the map file
clens = pd.read_csv(cmpath).iloc[:-1]
clens.tail()

Unnamed: 0,Seqid,Length,Contig,Chrom,Cumsum,Midpts,Nsnps
9,CP003829.1,1059964,Chr_10,10,13795562,14325544.0,10982
10,CP003830.1,1561994,Chr_11,11,14855526,15636523.0,18069
11,CP003831.1,774062,Chr_12,12,16417520,16804551.0,9015
12,CP003832.1,756744,Chr_13,13,17191582,17569954.0,8509
13,CP003833.2,942867,Chr_14,14,17948326,18419759.5,11172


In [4]:
## Read the per-site genotype table (first CSV column becomes the index)
## and attach the chromosome-map columns to every site.
## NOTE(review): the merge uses whatever columns the two tables share as
## the join key - confirm which column(s) that is
SNP = pd.read_csv(gt_path, index_col=0).merge(clens)

## Store chromosome numbers as integers
SNP['Chrom'] = SNP['Chrom'].apply(int)

## Preview the first rows
SNP.head()

Unnamed: 0,41,17,36,14,12,29,34,26,13,18,...,Altlen,Qual,NALT,AF,Seqid,Length,Chrom,Cumsum,Midpts,Nsnps
0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,...,1,4.526492,1,0.464286,CP003820.1,2291499,1,0,1145749.5,25192
1,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1,4.516699,1,0.428571,CP003820.1,2291499,1,0,1145749.5,25192
2,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,...,1,4.731814,1,0.5,CP003820.1,2291499,1,0,1145749.5,25192
3,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,...,1,4.59201,1,0.428571,CP003820.1,2291499,1,0,1145749.5,25192
4,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1,4.520617,1,0.214286,CP003820.1,2291499,1,0,1145749.5,25192


In [5]:
## Keep only sites whose Altlen is below 3, then split out
## the chromosome 3 and chromosome 11 sites
short_alt = SNP['Altlen'] < 3
SNP3 = SNP.loc[(SNP['Chrom'] == 3) & short_alt]
SNP11 = SNP.loc[(SNP['Chrom'] == 11) & short_alt]
SNP3.shape, SNP11.shape

((15618, 44), (15514, 44))

In [6]:
## Load the genome annotation (GFF) through the project helper.
## Later cells index the result as a table with Type, Start, End,
## Seqid, Gene and Attribute columns.
gff = hypf.loadgff(gffpath)

In [7]:
## Look up the sequence id used for chromosome 3 and chromosome 11
## (the smallest Seqid value found among each chromosome's sites)
chr3name, chr11name = (sites['Seqid'].min() for sites in (SNP3, SNP11))

chr3name, chr11name

('CP003822.1', 'CP003830.1')

In [8]:
## Collect the genes on chromosome 3 that overlap the QTL interval:
## a gene overlaps when it ends at or after the left bound and
## starts at or before the right bound
in_qtl3 = (
    (gff['Seqid'] == chr3name)
    & (gff['Type'] == 'gene')
    & (gff['Start'] <= qtl3righ)
    & (gff['End'] >= qtl3left)
)
chr3_genes = list(gff.loc[in_qtl3, 'Gene'].unique())

len(chr3_genes)

108

In [9]:
## Collect the genes on chromosome 11 that overlap the QTL interval:
## a gene overlaps when it ends at or after the left bound and
## starts at or before the right bound
in_qtl11 = (
    (gff['Seqid'] == chr11name)
    & (gff['Type'] == 'gene')
    & (gff['Start'] <= qtl11righ)
    & (gff['End'] >= qtl11left)
)
chr11_genes = list(gff.loc[in_qtl11, 'Gene'].unique())

len(chr11_genes)

85

In [10]:
## All annotation rows (every feature type) belonging to the QTL genes
gff3 = gff.loc[gff['Gene'].isin(chr3_genes)]
gff11 = gff.loc[gff['Gene'].isin(chr11_genes)]

## Gene-level rows only, copied so a column can be added safely
gff3genes = gff3.loc[gff3['Type'] == 'gene'].copy()
gff11genes = gff11.loc[gff11['Type'] == 'gene'].copy()

## Pull the description out of each attribute string: keep the text after
## the last 'description=' and strip the URL-encoded commas ('%2C')
gff3genes['Description'] = [
    attr.rpartition('description=')[2].replace('%2C', '')
    for attr in gff3genes['Attribute']
]
gff11genes['Description'] = [
    attr.rpartition('description=')[2].replace('%2C', '')
    for attr in gff11genes['Attribute']
]

In [11]:
## Run snp effect on progeny 19 for the chromosome 3 QTL genes.
## The path passed to snpeffect is '../GENOTYPE/GENES/P19/' (the same directory
## hypf.snpresults reads later); it is created here if missing so the notebook
## runs from a fresh checkout without a manual mkdir.
## NOTE(review): presumably snpeffect writes its per-gene results there - confirm
p19_outdir = '../GENOTYPE/GENES/P19/'
os.makedirs(p19_outdir, exist_ok=True)
respath3 = hypf.snpeffect('19',SNP3,p19_outdir,gff3,refpath,verbos=False)

In [12]:
## Run snp effect on progeny 19 for the chromosome 11 QTL genes.
## NOTE: the directory '../GENOTYPE/GENES/P19/' passed below must already exist
## (presumably snpeffect writes its results there - confirm)
respath11 = hypf.snpeffect('19',SNP11,'../GENOTYPE/GENES/P19/',gff11,refpath,verbos=False)

In [13]:
## Gather the SNP results for progeny 19 and run the summary function
snpspath = '../GENOTYPE/GENES/P19/'
snpres = hypf.snpresults(snpspath)

## Annotate each result row with its gene's strand, coordinates, sequence id
## and description. The join key is stated explicitly so that a column later
## shared by both tables cannot silently become part of the key.
gene_info = pd.concat([gff3genes,gff11genes])[
    ['Gene','Strand','Start','End','Seqid','Description']]
snpres = snpres.merge(gene_info, on='Gene')

## Save out dataframe
snpres.to_csv('../GENOTYPE/GENES/SNP_results.csv',index=False)

## Print shape and the unique seq ids; there should be two
snpres.shape, snpres.Seqid.unique()

((174, 20), array(['CP003830.1', 'CP003822.1'], dtype=object))

In [14]:
## Size of the result subset whose Altstop value is anything other than one
## (the filter keeps counts above one as well as a count of zero)
snpres.loc[snpres['Altstop'] != 1].shape

(8, 20)

In [15]:
## How many distinct genes have an Altstop value other than one?
## (a gene can contribute more than one row to the results)
snpres.loc[snpres['Altstop'] != 1, 'Gene'].unique().shape

(7,)

In [16]:
## Display the result rows whose Altstop value is not exactly one
snpres.loc[snpres['Altstop'] != 1]

Unnamed: 0,Strain,Gene,Parent,Expected,Ref,Alt,Refstop,Altstop,Nonsyn,Nvars,Utr5,Utr3,Exon,Intron,Lengthdif,Strand,Start,End,Seqid,Description
26,19,CNAG_01836,CNAG_01836-t26_1,521.0,521,503,1,2,18,31,0,0,23,8,0.034549,1,1042367,1044262,CP003830.1,long-chain acyl-CoA synthetase
72,19,CNAG_01883,CNAG_01883-t26_1,322.0,322,144,1,2,14,43,0,0,20,23,0.552795,-1,1171247,1173092,CP003830.1,hypothetical protein
127,19,CNAG_02700,CNAG_02700-t26_1,1561.0,1561,95,1,2,22,62,3,3,53,3,0.939142,-1,970996,976412,CP003822.1,C2H2 type zinc finger transcription factor
128,19,CNAG_02700,CNAG_02700-t26_2,1562.0,1562,95,1,2,22,62,3,3,53,3,0.939181,-1,970996,976412,CP003822.1,C2H2 type zinc finger transcription factor
166,19,CNAG_07528,CNAG_07528-t26_1,463.0,463,404,1,2,69,26,0,1,23,2,0.12743,-1,1070491,1072131,CP003822.1,hypothetical protein
170,19,CNAG_07610,CNAG_07610-t26_1,108.0,108,109,1,0,4,8,0,3,3,2,0.009259,1,1119077,1119717,CP003830.1,hypothetical protein
171,19,CNAG_07611,CNAG_07611-t26_1,67.0,67,40,1,2,6,6,0,0,3,3,0.402985,-1,1170371,1170866,CP003830.1,hypothetical protein
173,19,CNAG_07969,CNAG_07969-t26_1,309.0,309,34,1,8,67,25,0,0,22,3,0.889968,1,930460,931520,CP003822.1,hypothetical protein
