In [28]:
## Bring in needed mods
import numpy as np, pandas as pd, scipy.stats as ss, seaborn as sns, sys, os, glob
from matplotlib import pyplot as plt
sys.path.insert(0, '/home/croth/QTL-functions/SCRIPTS/')
import snpy

In [2]:
## Set path to GFF file
gffpath ='/home/croth/Downloads/B3502/REF/FungiDB-48_CneoformansJEC21.gff.gz'
## Load in gff
gff = snpy.loadgff(gffpath)

## Set path to reference file
refpath = '/home/croth/Downloads/B3502/REF/FungiDB-48_CneoformansJEC21_Genome.fasta'

## Set genotype data path
geno_path = '../GENOTYPE/B3502_parent_variants.csv.gz'

## Bring in genotype data
genos = pd.read_csv(geno_path,index_col=0)
genos['Ref'] = [p.split('.')[0] for  p in genos.Alleles]
genos['Alt'] = genos.Alleles
genos.head()

Unnamed: 0,Seqid,Chrom,Pos,Gene,Alleles,B3502_A1,B3502_B1,B3502_B7,Ref,Alt
8887,AE017341.1,1,461149,,G.C,0.0,1.0,0.0,G,G.C
27604,AE017341.1,1,1430787,,G.T,0.0,1.0,0.0,G,G.T
38830,AE017341.1,1,2149478,,AATTT.CATTC,0.0,1.0,0.0,AATTT,AATTT.CATTC
53417,AE017342.1,2,428582,,A.T,1.0,0.0,1.0,A,A.T
53419,AE017342.1,2,428598,,T.C,1.0,0.0,1.0,T,T.C


In [6]:
## Gather samples
samples = [s for s in genos.columns if s.split('_')[0]=='B3502']

## Gather info columns
info_cols = [c for c in genos.columns if c not in samples]

## View samples and # of info columns
samples,len(info_cols)

(['B3502_A1', 'B3502_B1', 'B3502_B7'], 7)

In [24]:
## Gather a specific sample
SI = 2
samplename = samples[SI]

## Set savepath
savepath = '../GENES/%s/'%samplename
print(savepath)

## Make a genotype data frame for a given sample
## Gather columns
new_col = info_cols + [samplename]

## subset to dataframe
tempgeno = genos[new_col]

../GENES/B3502_B7/


In [26]:
## Take gff with variants in genes
tempgff = gff[(gff.Gene.isin(tempgeno.Gene.dropna()))]

In [27]:
## Conduct snp imputation
respath = snpy.snpeffect(samplename,tempgeno,savepath,tempgff,refpath)

In [31]:
## Check results
loadpaths = glob.glob('../GENES/*B1/*.csv.gz')
loadpaths

['../GENES/B3502_B1/CNL04210-t26_1.csv.gz',
 '../GENES/B3502_B1/CNF04940-t26_1.csv.gz',
 '../GENES/B3502_B1/CNE05190-t26_1.csv.gz',
 '../GENES/B3502_B1/CNM00880-t26_1.csv.gz',
 '../GENES/B3502_B1/CNL04090-t26_1.csv.gz',
 '../GENES/B3502_B1/CNJ00010-t26_1.csv.gz',
 '../GENES/B3502_B1/CNJ03440-t26_1.csv.gz',
 '../GENES/B3502_B1/CNE05380-t26_1.csv.gz',
 '../GENES/B3502_B1/CNJ01150-t26_1.csv.gz']

In [33]:
snpres = snpy.snpresults(loadpaths)
snpres

Unnamed: 0,Strain,Gene,Parent,Expected,Ref,Alt,Refstop,Altstop,Nonsyn,Nvars,Utr5,Utr3,Exon,Intron
0,B3502_B1,CNL04210,CNL04210-t26_1,970.0,970,969,1,0,120,4,0,0,4,0
1,B3502_B1,CNF04940,CNF04940-t26_1,526.0,526,526,1,1,0,5,0,1,4,0
2,B3502_B1,CNE05190,CNE05190-t26_1,1059.0,1059,1059,1,1,0,1,0,0,1,0
3,B3502_B1,CNM00880,CNM00880-t26_1,343.0,343,343,1,1,0,3,0,3,0,0
4,B3502_B1,CNL04090,CNL04090-t26_1,1477.0,1477,1477,1,1,0,1,0,1,0,0
5,B3502_B1,CNJ00010,CNJ00010-t26_1,1400.0,1400,1400,1,1,1,2,0,0,2,0
6,B3502_B1,CNJ03440,CNJ03440-t26_1,990.0,990,990,1,1,0,2,1,0,1,0
7,B3502_B1,CNE05380,CNE05380-t26_1,938.0,938,938,1,1,0,1,0,0,1,0
8,B3502_B1,CNJ01150,CNJ01150-t26_1,763.0,763,763,1,1,0,1,0,0,1,0


In [36]:
gff[(gff.Gene.isin(snpres.Gene)) & (gff.Type=='gene')]

Unnamed: 0,Seqid,Source,Type,Start,End,Score,Strand,Phase,Attribute,Parent,Gene
10033,AE017345.1,VEuPathDB,gene,1456271,1461020,.,-1,.,ID=CNE05190;description=conserved hypothetical...,CNE05190,CNE05190
16408,AE017352.1,VEuPathDB,gene,113190,119528,.,-1,.,ID=CNL04090;description=phosphatidylinositol 3...,CNL04090,CNL04090
26856,AE017350.1,VEuPathDB,gene,1081812,1085004,.,-1,.,ID=CNJ03440;description=conserved hypothetical...,CNJ03440,CNJ03440
38341,AE017345.1,VEuPathDB,gene,1502425,1506397,.,-1,.,ID=CNE05380;description=hypothetical protein,CNE05380,CNE05380
40809,AE017350.1,VEuPathDB,gene,172,4804,.,1,.,ID=CNJ00010;description=conserved hypothetical...,CNJ00010,CNJ00010
56120,AE017346.1,VEuPathDB,gene,1437068,1438867,.,-1,.,ID=CNF04940;description=hypothetical protein,CNF04940,CNF04940
74031,AE017353.1,VEuPathDB,gene,256057,258338,.,-1,.,ID=CNM00880;description=DNA polymerase process...,CNM00880,CNM00880
77364,AE017350.1,VEuPathDB,gene,322864,325698,.,-1,.,ID=CNJ01150;description=putative GTPase-activa...,CNJ01150,CNJ01150
96822,AE017352.1,VEuPathDB,gene,163732,167381,.,1,.,ID=CNL04210;description=hypothetical protein,CNL04210,CNL04210


In [16]:
## Check results
#noresults = pd.read_csv(respath)
#noresults.head()

In [17]:
## Check tail
#noresults.tail()

In [18]:
## How many failed?
#noresults.shape

In [19]:
## Check a repressor
#mygene = 'CNF04940-t26_1'
#mygene in noresults.Parent.tolist()