In [1]:
## QTL analysis
## Bring in needed mods
import pandas as pd, numpy as np, statsmodels.api as sm
from statsmodels.formula.api import ols
from matplotlib import pyplot as plt
%matplotlib inline

## Bring in needed ftn for QTL analysis
from scipy.stats import kruskal

## Write function for QTL analysis
def crypto_kruskal(site,pheno):
    """
    Kruskal-Wallis (non-parametric ANOVA) association score for one genotype site.

    For the data from the C. deneoformans cross: strains are split by their
    allele at `site` (0 or 1) and the phenotype values of the two groups are
    compared with scipy's `kruskal`.

    Parameters
    ----------
    site : pandas.Series
        Genotype calls for one site, indexed by strain name. Only calls equal
        to 0 or 1 are used; any other value (including NaN) is ignored.
    pheno : pandas.Series
        Phenotype values indexed by strain name. Missing values are dropped and
        the genotypes are looked up by the remaining index labels.

    Returns
    -------
    float
        -log10 of the Kruskal-Wallis p-value (NOT the raw p-value), or NaN when
        the test cannot be computed (one allele group is empty, or `kruskal`
        raises ValueError).

    Raises
    ------
    AssertionError
        If no phenotype values remain after dropping missing data, or the
        genotype and phenotype vectors end up with different lengths.
    """
    pheno = pheno.dropna()
    assert len(pheno) > 0, 'no phenotype values left after dropping NaNs'
    ## Pull the genotypes in the same strain order as the phenotypes
    site = site[pheno.index.tolist()]
    assert len(site) == len(pheno), 'genotype and phenotype lengths differ'
    values = np.asarray(pheno)
    alleles = np.asarray(site)
    group0 = values[alleles == 0]
    group1 = values[alleles == 1]
    ## With only one allele present there is nothing to compare; return NaN
    ## explicitly rather than relying on how scipy treats an empty sample
    if len(group0) == 0 or len(group1) == 0:
        return np.nan
    try:
        pval = -np.log10(kruskal(group0, group1, nan_policy='omit')[1])
    except ValueError:
        pval = np.nan
    return pval

In [2]:
## Set date of phenotype data to use
filedate = 'Dec172018'

In [3]:
## Set path to the phenotype file; its name should end in "curated_baselined_median51_AUC_Xstrain.csv"
file_path = '../FILES/Tecan_qtlruns_%s_curated_baselined_median51_AUC_Xstrain.csv'%(
    filedate)

In [4]:
## Set path to genotype data
geno_path = '../FILES/CDx-ill-SNP-INDEL-df-104-blocked.csv'

In [5]:
## Load data
data = pd.read_csv(file_path,index_col=0)

In [6]:
## Check value of parental strain SS-A837
data[(data.strain=='SS-A837') & (data.temp==37) & (data.amphB==0.125)]

Unnamed: 0,temp,amphB,strain,median_AUC
714,37,0.125,SS-A837,8.383125


In [7]:
## What is the shape of the dataframe
data.shape

(1428, 4)

In [8]:
## Check for nan's
data.dropna(axis=0,how='any').shape

(1428, 4)

In [9]:
## Bring in genotype data
bgeno = pd.read_csv(geno_path)

In [10]:
bgeno.head()

Unnamed: 0,Chrom,Pos,Qual,Callrate,Ref,Alt,Altlen,Dp,Type,Vcfix,...,SS-B598,SS-B565,SS-B600,SS-B574,SS-B872_cor,SS-B873_cor,SS-B360,SS-B397,SS-B564,SS-B382
0,Chr01,5016,104492.0,1.0,C,A,1,6510,snp,289,...,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
1,Chr01,5299,112647.0,1.0,T,C,1,9711,snp,293,...,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
2,Chr01,5464,112658.0,1.0,T,C,1,9375,snp,294,...,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
3,Chr01,6120,109003.0,1.0,T,C,1,9311,snp,311,...,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
4,Chr01,6166,114638.0,1.0,G,A,1,9269,snp,312,...,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0


In [11]:
## Check its shape
bgeno.shape

(92103, 125)

In [12]:
unique_blocks = bgeno.Block.unique()

In [13]:
len(unique_blocks)

3108

In [14]:
## Gather a list of the chromosomes
chrlist = sorted(bgeno.Chrom.unique());

In [15]:
## make sure there are at least 14
assert len(chrlist) >= 14

In [16]:
## 'Block' should be the last info column in the geno df; strain columns come after it
bgcol = bgeno.columns.tolist()
## Position just past the last bgeno info column
bginfoix = bgcol.index('Block')+1
## Gather info from bgeno
bgeno_info = bgcol[:bginfoix]
## Gather strain names
bgeno_original_strains = bgcol[bginfoix:]
## How many?
len(bgeno_original_strains)

104

In [17]:
## Check first 5
bgeno_original_strains[:5]

['SS-B407', 'SS-B316', 'SS-B898_cor', 'SS-B369', 'SS-B997']

In [18]:
## Rename the genotyped strains so they match the names used in the phenotype data (DM):
## a '_Cor' suffix is normalised to '_cor', and two strains get their XL280 names
strain_aliases = {'SS-A853': 'XL280alpha', 'SS-B830': 'XL280a'}
bgeno_strains = []
for s in bgeno_original_strains:
    if '_Cor' in s:
        ## keep only the part before the first '_Cor', then add the lowercase suffix
        new_name = s.split('_Cor')[0] + '_cor'
    else:
        ## aliased strains are renamed; everything else passes through untouched
        new_name = strain_aliases.get(s, s)
    bgeno_strains.append(new_name)

In [19]:
## Check first 5 of new names
bgeno_strains[:5]

['SS-B407', 'SS-B316', 'SS-B898_cor', 'SS-B369', 'SS-B997']

In [20]:
## Assign new names to bgeno columns
bgeno.columns = bgeno_info + bgeno_strains

In [21]:
## Save data back with new names
#bgeno.to_csv(geno_path)

In [22]:
## How many haplotypes are in the geno df
print([len(bgeno[bgeno.Chrom==chrom]) for chrom in chrlist])

[11530, 7111, 9696, 8893, 7027, 7334, 6633, 5878, 5874, 5021, 5407, 4190, 4011, 3498]


In [23]:
## Find strains in both our phenotype and genotype data
tecan_strains = data.strain.unique().tolist()

In [24]:
## Take the segregants that are in the genotype data
segs = [s for s in tecan_strains if s in bgeno_strains]

In [34]:
## How many do we have?
len(segs)

104

In [35]:
no_segs = [s for s in tecan_strains if s not in segs]

In [33]:
len(no_segs) 

15

In [31]:
no_segs

['SS-B308',
 'SS-B377',
 'SS-B410',
 'SS-B869',
 'SS-B873',
 'SS-B890',
 'SS-B892',
 'SS-B896',
 'SS-B898',
 'SS-B901',
 'SS-B960',
 'SS-C026',
 'SS-C029',
 'SS-C030',
 'SS-C031']

In [36]:
'SS-B879' in segs

False

In [26]:
## make an index to these data 
segs_in_data = data.strain.isin(segs); 

In [38]:
## Take the phenotype w/ strains in genotype data
phenotypes = data.loc[segs_in_data,:].copy() 

In [39]:
## Print the shape of the phenotype data
phenotypes.shape

(1248, 4)

In [40]:
## Check our work, assert that we didn't leave any strains behind! 
assert len(phenotypes.strain.unique()) == len(segs)

In [41]:
## Check value of parental strain SS-A837
phenotypes[(phenotypes.strain=='SS-A837') & 
           (phenotypes.temp==37) & 
           (phenotypes.amphB==0.125)]

Unnamed: 0,temp,amphB,strain,median_AUC
714,37,0.125,SS-A837,8.383125


In [42]:
## Set the phenotype data's index to the strain name
phenotypes.index = phenotypes.strain

In [43]:
## Drop the strain column (the strain names now live in the index).
## Rebinding instead of inplace=True, plus errors='ignore', keeps this cell
## safe to re-run: a second run no longer raises KeyError.
phenotypes = phenotypes.drop(columns='strain', errors='ignore')

In [44]:
## Check value of parental strain SS-A837
phenotypes.loc['SS-A837',:]

Unnamed: 0_level_0,temp,amphB,median_AUC
strain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SS-A837,30,0.0,72.846625
SS-A837,30,0.075,6.39675
SS-A837,30,0.125,1.581
SS-A837,30,0.175,2.054625
SS-A837,37,0.0,60.49275
SS-A837,37,0.075,39.584125
SS-A837,37,0.125,8.383125
SS-A837,37,0.175,0.471125
SS-A837,38,0.0,49.26175
SS-A837,39,0.0,17.089875


In [45]:
## Save this filtered data frame 
phenotypes.to_csv('../FILES/'+file_path.split('/')[-1].split('.csv')[0]+'_noclone.csv')

In [46]:
## Set phenotype of interest
pheno_look_at = 'median_AUC'

In [49]:
## Index label of the first variant in each haplotype block (one test per block),
## in order of first appearance. A single pass replaces one full-frame boolean
## scan per block.
to_test_ix = bgeno.drop_duplicates(subset='Block', keep='first').index.tolist()

In [51]:
len(to_test_ix)

3108

In [53]:
## conduct QTL analysis: one Kruskal-Wallis scan per temperature x amphB condition
temp_pvals_df = [] ## initialize list of per-condition result Series
pval_cols = [] ## "..." for new pvalue column names
## Genotypes of the tested variants, strain columns only. This is the same for
## every condition, so slice it once. `to_test_ix` holds index labels, hence .loc
## for the rows; `bginfoix` is a column position, hence .iloc for the columns.
test_genos = bgeno.loc[to_test_ix].iloc[:, bginfoix:]
for t in phenotypes.temp.sort_values().unique(): ## For each temp
    temp_rows = phenotypes[phenotypes.temp == t]
    for d in temp_rows.amphB.sort_values().unique(): ## For each drug concentration at this temp
        pval_cols.append(str(t)+'C_'+str(d)) ## append column name
        temp_pheno = temp_rows.loc[temp_rows.amphB == d, pheno_look_at] ## Take the phenotype data
        temp_pval = test_genos.apply(crypto_kruskal, axis=1,
                                     args=(temp_pheno,)) ## conduct QTL analysis
        temp_pvals_df.append(temp_pval) ## Append the -log10(p) values for this condition

In [63]:
## concat dataframes
pvaldf = pd.concat([bgeno.loc[to_test_ix,'Block']]+temp_pvals_df,axis=1)

In [73]:
pvaldf.columns = ['Block'] + pval_cols

In [74]:
bgenop = bgeno.merge(pvaldf,on=['Block'])

In [76]:
bgenop.head()

Unnamed: 0,Chrom,Pos,Qual,Callrate,Ref,Alt,Altlen,Dp,Type,Vcfix,...,30C_0.125,30C_0.175,37C_0.0,37C_0.075,37C_0.125,37C_0.175,38C_0.0,39C_0.0,39C_0.075,39C_0.125
0,Chr01,5016,104492.0,1.0,C,A,1,6510,snp,289,...,0.92899,0.655625,0.170498,0.081444,0.516895,0.060467,0.473662,0.078778,0.411585,0.095533
1,Chr01,5299,112647.0,1.0,T,C,1,9711,snp,293,...,0.92899,0.655625,0.170498,0.081444,0.516895,0.060467,0.473662,0.078778,0.411585,0.095533
2,Chr01,5464,112658.0,1.0,T,C,1,9375,snp,294,...,0.92899,0.655625,0.170498,0.081444,0.516895,0.060467,0.473662,0.078778,0.411585,0.095533
3,Chr01,6120,109003.0,1.0,T,C,1,9311,snp,311,...,0.92899,0.655625,0.170498,0.081444,0.516895,0.060467,0.473662,0.078778,0.411585,0.095533
4,Chr01,6166,114638.0,1.0,G,A,1,9269,snp,312,...,0.92899,0.655625,0.170498,0.081444,0.516895,0.060467,0.473662,0.078778,0.411585,0.095533


In [77]:
## Make the saving path
savepath = '../FILES/'+file_path.split('/')[-1].split('.csv')[0]+'_pvalues.csv'

In [78]:
## Print the save path
savepath

'../FILES/Tecan_qtlruns_Dec172018_curated_baselined_median51_AUC_Xstrain_pvalues.csv'

In [82]:
## Save data
bgenop.to_csv(savepath)

In [80]:
## Variance explained! 
## First approximation
phenotypes.shape

(1248, 3)

In [None]:
## Pick one condition and the largest value in its column of pvaldf
i = 4 ## position of the condition of interest among the columns selected below
## NOTE(review): pvaldf's columns are 'Block' followed by the condition columns,
## so `[10:]` skips 'Block' plus the first nine conditions. Confirm this offset
## is intended; if fewer than five columns remain, index `i` is out of range.
max_by_condition = pvaldf[pvaldf.columns.tolist()[10:]].max()
max_con = max_by_condition.index[i]
## .iloc for positional access: Series[int] on a label index is deprecated
max_pval = max_by_condition.iloc[i]

In [None]:
max_con

In [None]:
max_pval_ix = pvaldf[pvaldf[max_con] == max_pval].index
max_pval_ix

In [None]:
bgeno.loc[max_pval_ix]

In [None]:
## Genotypes at the first variant carrying the maximum value, strain columns only.
## The strain columns start at position `bginfoix`; the previous hard-coded
## offset of 10 also pulled in non-strain info columns.
max_genos = bgeno.loc[[max_pval_ix[0]]].iloc[:, bginfoix:]
max_genos.index = ['geno'] ## becomes the 'geno' column once transposed

In [None]:
max_phenos = phenotypes[(phenotypes.temp==int(max_con.split('C_')[0])) & 
           (phenotypes.amphB==float(max_con.split('C_')[-1]))]

In [None]:
max_geno_pheno_df = pd.concat([max_phenos,max_genos.T],axis=1,sort=True)

In [None]:
#plt.hist(np.log(max_geno_pheno_df.median_AUC.values));

In [None]:
#max_geno_pheno_df['log_auc'] = np.log(max_geno_pheno_df.median_AUC)

In [None]:
max_geno_pheno_df.tail()

In [None]:
model = ols('median_AUC ~ C(geno)',data=max_geno_pheno_df)

In [None]:
res = model.fit()

In [None]:
-np.log10(res.pvalues)

In [None]:
res.summary()

In [None]:
import seaborn as sns

In [None]:
max_geno_pheno_df['geno_shift'] = abs(max_geno_pheno_df['geno'] - 1)

In [None]:
## Plot median AUC against the flipped genotype code (`geno_shift`), with
## horizontal jitter on the x values
ax = sns.lmplot(x='geno_shift',y='median_AUC',x_jitter=.2,data=max_geno_pheno_df);
## Label the two x positions: 0 as '431', 1 as 'XL280'
plt.xticks([0,1],['431','XL280'],fontsize=18)
plt.xlabel('Allele',fontsize=16)
plt.ylabel('Median\nArea Under the Curve',fontsize=18);
## Overlay strain SS-A837 as a red marker at x=0
plt.plot([0],max_geno_pheno_df.loc['SS-A837']['median_AUC'],'o',ms=10,color='red');
## Overlay XL280a and XL280alpha as black markers at x=1
plt.plot([1,1],max_geno_pheno_df.loc[['XL280a','XL280alpha']]['median_AUC'],'ko',ms=10);
## NOTE(review): absolute, user-specific output path; the rest of the notebook
## uses relative '../FILES/' paths. Consider making this relative/configurable.
plt.savefig('/Users/croth/Desktop/CRYPTO_QTL/FIGURES/QTL2_AUC_distirbutions.png',
            dpi=300);

In [None]:
plt.hist(res.resid);