In [1]:
import numpy as np, pandas as pd, sys
sys.path.insert(0, '/home/croth/QTL-functions/SCRIPTS/')
import snpy

In [2]:
## Set paths 
## to genotype dataframes
gt_path = '../GENOTYPE/Bt65xH99_F1_progeny-SNPS.csv.gz'
#gffpath = '/home/croth/Hypermutator/FungiDB-43_CneoformansH99.gff'
#refpath = '/home/croth/Desktop/SSK1/FungiDB-43_CneoformansH99_Genome.fasta'

refpath = '/home/croth/Hypermutator_mac/DATA/FungiDB-46_CneoformansH99_Genome.fasta'
gffpath = '/home/croth/QTL-functions/DATA/REFERENCE/FungiDB-46_CneoformansH99.gff.gz'
cmpath = '../GENOTYPE/H99_chrommap.csv.gz'

In [3]:
## Load in chromosome lengths
clens = pd.read_csv(cmpath)
clens = clens[:-1]
clens.tail()

Unnamed: 0,Seqid,Length,Contig,Chrom,Cumsum,Midpts,Nsnps
9,CP003829.1,1059964,Chr_10,10,13795562,14325544.0,10982
10,CP003830.1,1561994,Chr_11,11,14855526,15636523.0,18069
11,CP003831.1,774062,Chr_12,12,16417520,16804551.0,9015
12,CP003832.1,756744,Chr_13,13,17191582,17569954.0,8509
13,CP003833.2,942867,Chr_14,14,17948326,18419759.5,11172


In [4]:
## Load in genotype data
## Genotypes per site
SNP = pd.read_csv(gt_path,index_col=0)

## Merge with clens
SNP = SNP.merge(clens)

SNP['Chrom'] = SNP.Chrom.apply(int)

## View head
SNP.head()

Unnamed: 0,41,17,36,14,12,29,34,26,13,18,...,Altlen,Qual,NALT,AF,Seqid,Length,Chrom,Cumsum,Midpts,Nsnps
0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,...,1,4.526492,1,0.464286,CP003820.1,2291499,1,0,1145749.5,25192
1,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1,4.516699,1,0.428571,CP003820.1,2291499,1,0,1145749.5,25192
2,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,...,1,4.731814,1,0.5,CP003820.1,2291499,1,0,1145749.5,25192
3,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,...,1,4.59201,1,0.428571,CP003820.1,2291499,1,0,1145749.5,25192
4,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1,4.520617,1,0.214286,CP003820.1,2291499,1,0,1145749.5,25192


In [5]:
SNP3 = SNP[(SNP.Chrom==3) & (SNP.Altlen<3)]
SNP11 = SNP[(SNP.Chrom==11) & (SNP.Altlen<3)]
SNP3.shape,SNP11.shape

((15618, 44), (15514, 44))

In [6]:
## Set chromosome 3 CI bounds +/- 1kb
qtl3left, qtl3righ = 881808, 1118629

qtl11left,qtl11righ = 991049, 1189364

In [7]:
## Bring in gff file
gff = snpy.loadgff(gffpath)

In [8]:
## Gather chromnames
chr3name = SNP3.Seqid.min()

chr11name = SNP11.Seqid.min()

chr3name,chr11name

('CP003822.1', 'CP003830.1')

In [9]:
chr3_genes = list(gff[(gff.Type=='gene') &
                 (gff.End>=qtl3left) & 
                  (gff.Start<=qtl3righ) & 
                 (gff.Seqid==chr3name)
                ].Gene.unique())

len(chr3_genes)

108

In [10]:
chr11_genes = list(gff[(gff.Type=='gene') &
                 (gff.End>=qtl11left) & 
                  (gff.Start<=qtl11righ) & 
                 (gff.Seqid==chr11name)
                ].Gene.unique())

len(chr11_genes)

85

In [11]:
gff3 = gff[(gff.Gene.isin(chr3_genes))]
gff11 = gff[(gff.Gene.isin(chr11_genes))]

In [12]:
gff3genes = gff3[(gff3.Type=='gene')].copy()
gff11genes = gff11[(gff11.Type=='gene')].copy()

In [13]:
gff3genes['Description'] = [''.join(g.split('description=')[-1].split('%2C')) 
                           for g in gff3genes.Attribute.tolist()]

gff11genes['Description'] = [''.join(g.split('description=')[-1].split('%2C')) 
                           for g in gff11genes.Attribute.tolist()]

In [14]:
respath3 = snpy.snpeffect('19',SNP3,'../GENOTYPE/GENES/P19/',gff3,refpath,verbos=False)

In [15]:
respath11 = snpy.snpeffect('19',SNP11,'../GENOTYPE/GENES/P19/',gff11,refpath,verbos=False)

In [16]:
import glob
import importlib
importlib.reload(snpy)
snpspath = glob.glob('../GENOTYPE/GENES/P19/*.csv.gz')
snpres = snpy.snpresults(snpspath)

In [17]:
snpres.shape

(174, 14)

In [18]:
snpres = snpres.merge(pd.concat([gff3genes,gff11genes])
            [['Gene','Strand','Start','End','Seqid','Description']])

snpres['Lengthdif'] = np.abs(snpres.Alt - snpres.Ref)/snpres.Ref

In [19]:
gff3genes.shape

(108, 12)

In [20]:
gff11genes.shape

(85, 12)

In [21]:
snpres.shape

(174, 20)

In [22]:
snpres.to_csv('../GENOTYPE/GENES/SNP_results.csv',index=False)

In [23]:
snpres.Seqid.unique()

array(['CP003822.1', 'CP003830.1'], dtype=object)

In [24]:
snpres[(snpres.Altstop!=1)]

Unnamed: 0,Strain,Gene,Parent,Expected,Ref,Alt,Refstop,Altstop,Nonsyn,Nvars,Utr5,Utr3,Exon,Intron,Strand,Start,End,Seqid,Description,Lengthdif
46,19,CNAG_02700,CNAG_02700-t26_1,1561.0,1561,95,1,2,20,66,3,3,57,3,-1,970996,976412,CP003822.1,C2H2 type zinc finger transcription factor,0.939142
47,19,CNAG_02700,CNAG_02700-t26_2,1562.0,1562,95,1,2,20,66,3,3,57,3,-1,970996,976412,CP003822.1,C2H2 type zinc finger transcription factor,0.939181
55,19,CNAG_01883,CNAG_01883-t26_1,322.0,322,144,1,2,14,43,0,0,20,23,-1,1171247,1173092,CP003830.1,hypothetical protein,0.552795
70,19,CNAG_07611,CNAG_07611-t26_1,67.0,67,40,1,2,6,6,0,0,3,3,-1,1170371,1170866,CP003830.1,hypothetical protein,0.402985
76,19,CNAG_07610,CNAG_07610-t26_1,108.0,108,109,1,0,4,8,0,3,3,2,1,1119077,1119717,CP003830.1,hypothetical protein,0.009259
108,19,CNAG_07969,CNAG_07969-t26_1,309.0,309,34,1,8,67,25,0,0,22,3,1,930460,931520,CP003822.1,hypothetical protein,0.889968
124,19,CNAG_07528,CNAG_07528-t26_1,463.0,463,404,1,2,69,27,0,1,24,2,-1,1070491,1072131,CP003822.1,hypothetical protein,0.12743
125,19,CNAG_01836,CNAG_01836-t26_1,521.0,521,503,1,2,20,33,0,0,25,8,1,1042367,1044262,CP003830.1,long-chain acyl-CoA synthetase,0.034549


In [23]:
snpres[(snpres.Altstop!=1)].Gene.unique().shape

(7,)

In [51]:
all_genes = gff3genes.Gene.tolist() + gff11genes.Gene.tolist()

In [52]:
missing = [g for g in all_genes if g not in snpres.Gene.tolist()]
len(missing)

111

In [21]:
gff[(gff.Type=='gene') & (gff.Gene=='CNAG_02680')]

Unnamed: 0,Seqid,Source,Type,Start,End,Score,Strand,Phase,Attribute,Parent,Gene
93511,CP003822.1,EuPathDB,gene,1038093,1043768,.,-1,.,ID=CNAG_02680;description=VPS15 protein kinase,CNAG_02680,CNAG_02680


In [22]:
SNP[(SNP.Pos.isin(np.arange(1038093,1043769))) & (SNP.Chrom==3)][['Alt','Ref','Pos','Altlen']].tail()

Unnamed: 0,Alt,Ref,Pos,Altlen
114904,A.G,A,1043157,1
114905,A.T,A,1043297,1
114906,G.T,G,1043486,1
114907,G.A,G,1043684,1
114908,ACCCCGCCGCA.ACCCGCCGCA,ACCCCGCCGCA,1043753,10


In [23]:
gff[(gff.Gene.isin(missing))]

Unnamed: 0,Seqid,Source,Type,Start,End,Score,Strand,Phase,Attribute,Parent,Gene
634,CP003822.1,EuPathDB,gene,1104486,1105762,.,-1,.,ID=CNAG_12298;description=unspecified product,CNAG_12298,CNAG_12298
635,CP003822.1,EuPathDB,ncRNA,1104486,1105762,.,-1,.,ID=CNAG_12298-t26_1;Parent=CNAG_12298;descript...,CNAG_12298,CNAG_12298
636,CP003822.1,EuPathDB,exon,1104486,1105296,.,-1,.,ID=exon_CNAG_12298-E2;Parent=CNAG_12298-t26_1,CNAG_12298-t26_1,CNAG_12298
637,CP003822.1,EuPathDB,exon,1105353,1105762,.,-1,.,ID=exon_CNAG_12298-E1;Parent=CNAG_12298-t26_1,CNAG_12298-t26_1,CNAG_12298
1078,CP003822.1,EuPathDB,gene,1012931,1013442,.,1,.,ID=CNAG_12288;description=unspecified product,CNAG_12288,CNAG_12288
...,...,...,...,...,...,...,...,...,...,...,...
132809,CP003822.1,EuPathDB,exon,999875,1001950,.,1,.,ID=exon_CNAG_12286-E3;Parent=CNAG_12286-t26_1,CNAG_12286-t26_1,CNAG_12286
133722,CP003820.1,EuPathDB,gene,1183354,1184742,.,-1,.,ID=CNAG_12059;description=unspecified product,CNAG_12059,CNAG_12059
133723,CP003820.1,EuPathDB,ncRNA,1183354,1184742,.,-1,.,ID=CNAG_12059-t26_1;Parent=CNAG_12059;descript...,CNAG_12059,CNAG_12059
133724,CP003820.1,EuPathDB,exon,1183354,1184133,.,-1,.,ID=exon_CNAG_12059-E2;Parent=CNAG_12059-t26_1,CNAG_12059-t26_1,CNAG_12059


In [24]:
snpres[(snpres.Altstop!=1)]

Unnamed: 0,Strain,Gene,Parent,Expected,Ref,Alt,Refstop,Altstop,Nonsyn,Nvars,Utr5,Utr3,Exon,Intron,Strand,Start,End,Seqid,Description,Lengthdif
46,19,CNAG_02700,CNAG_02700-t26_1,1561.0,1561,95,1,2,22,62,3,3,53,3,-1,970996,976412,CP003822.1,C2H2 type zinc finger transcription factor,0.939142
47,19,CNAG_02700,CNAG_02700-t26_2,1562.0,1562,95,1,2,22,62,3,3,53,3,-1,970996,976412,CP003822.1,C2H2 type zinc finger transcription factor,0.939181
55,19,CNAG_01883,CNAG_01883-t26_1,322.0,322,144,1,2,14,43,0,0,20,23,-1,1171247,1173092,CP003830.1,hypothetical protein,0.552795
70,19,CNAG_07611,CNAG_07611-t26_1,67.0,67,40,1,2,6,6,0,0,3,3,-1,1170371,1170866,CP003830.1,hypothetical protein,0.402985
76,19,CNAG_07610,CNAG_07610-t26_1,108.0,108,109,1,0,4,8,0,3,3,2,1,1119077,1119717,CP003830.1,hypothetical protein,0.009259
108,19,CNAG_07969,CNAG_07969-t26_1,309.0,309,34,1,8,67,25,0,0,22,3,1,930460,931520,CP003822.1,hypothetical protein,0.889968
124,19,CNAG_07528,CNAG_07528-t26_1,463.0,463,404,1,2,69,26,0,1,23,2,-1,1070491,1072131,CP003822.1,hypothetical protein,0.12743
125,19,CNAG_01836,CNAG_01836-t26_1,521.0,521,503,1,2,18,31,0,0,23,8,1,1042367,1044262,CP003830.1,long-chain acyl-CoA synthetase,0.034549


In [25]:
plt.plot(snpres.Altstop,snpres.Lengthdif,'.');

NameError: name 'plt' is not defined

In [None]:
qtl = SNP3[(SNP3.Pos>=qtlleft) & (SNP3.Pos<=qtlrigh)]

In [None]:
snpres.Gene.unique().shape[0]

In [None]:
snpres.shape

In [None]:
missing = [g for g in chr3_genes if g not in snpres.Gene.tolist()]
len(missing)

In [None]:
gff3[(gff3.Gene=='CNAG_12291')]

In [None]:
gffgenes[(gffgenes.Gene.isin(missing))]

In [None]:
hypos = ['hypothetical protein','hypothetical protein hypothetical protein variant',
         'hypothetical protein hypothetical protein variant 1 hypothetical protein variant 2']

In [None]:
snpres[(snpres.Description.isin(hypos))].shape

In [None]:
snpres[~(snpres.Description.isin(hypos))].shape

In [None]:
from matplotlib import pyplot as plt

In [None]:
k = snpres[~(snpres.Description.isin(hypos))].Nonsyn.values

In [None]:
counts = [len(k[(k == i)]) for i in range(11)] + [len(k[(k>=11)])]

In [None]:
plt.vlines(list(np.arange(11))+[11],0,counts);

In [None]:
snpres[(snpres.Nonsyn>11)]

In [None]:
len(k[(k>8)])

In [None]:
gff3[(gff3.Gene.isin(snpres[(snpres.Altstop!=1)].Gene.tolist())) & (gff3.Type=='gene')]

In [None]:
snpres[(snpres.Gene=='CNAG_07529')]

In [None]:
'CNAG_07529' in chr3_genes

In [None]:
gff[(gff.Type=='gene') & (gff.Gene=='CNAG_07529')]

In [None]:
chr3name

In [None]:
qtlleft

In [None]:
qtlrigh

In [None]:
test = snpy.loadgene('../GENOTYPE/GENES/P19/CNAG_07529-t26_1.csv.gz')

In [None]:
test.Pos.min()

In [None]:
test.Pos.max()

In [None]:
gff[(gff.Gene=='CNAG_07529') & (gff.Type.isin(foi))].Start.min()

In [None]:
gff[(gff.Gene=='CNAG_07529') & (gff.Type.isin(foi))].End.max()

In [None]:
test

In [None]:
znf3start = gff3[(gff3.Gene=='CNAG_02700') & (gff3.Type=='gene')].Start.min()
znf3end = gff3[(gff3.Gene=='CNAG_02700') & (gff3.Type=='gene')].End.max()

znf3start,znf3end

In [None]:
temp = SNP[(SNP.Seqid==chr3_name) & (SNP.Pos>=znf3start) & (SNP.Pos<=znf3end)]

In [None]:
temp[['19','Pos']]