First, import the usual libraries
- `math`: basic math operations
- `os`: enable file manipulation with the OS
- `sys`: enable interaction with commandline
- `glob`: find files whose names match a pattern
- `matplotlib.pyplot`: default plotter (I personally like ggplot waaaaay better. E)
    - `inline`: so that plots are shown in the notebook
- `seaborn`: nicer plots
- `numpy`: all number crunching done here
- `pandas`: data wrangling

In [1]:
import math  
import os   
import glob 
import matplotlib.pyplot as plt 
%matplotlib inline
import seaborn as sns
import numpy as np
import pandas as pd
from scipy import stats


In [2]:
# Path to the folder where all the raw data is located.
# Input file names below are appended to it by plain string concatenation,
# so the trailing slash is required.
src = '/home/ejam/documents/css893/raw_data/'

# Path where we will save outputs (same trailing-slash convention as src)
dst = '/home/ejam/documents/css893/output/'

# Matrix with raw SNP data (currently unused)
#file_snp_raw = 'snp_imputed_chr10_sample.csv'

# Matrix with imputed SNP data, given relative to src
file_snp_imp = 'B73_plus_RTAs_snp_matrix_imputed/widiv_942g_979873SNPs_imputed_filteredGenos_withRTA_AGPv4.hmp.txt'

# Table of genes with their expression values, given relative to src.
# NOTE(review): this was labelled FPKM, but the file name suggests zTPM -- confirm.
file_gen_high_z = 'high_ztpm.csv'

We only need the gene regions right now. Ignore the zTPM values. This saves a lot of memory.

In [3]:
# Read only the columns that describe where each gene sits on the genome;
# every other column in the file is skipped at parse time.
gene_region_columns = [
    'gene',
    'chromosome',
    'feature_type',
    'position_left',
    'position_right',
]
gen_high_z = pd.read_csv(src + file_gen_high_z, usecols=gene_region_columns)
gen_high_z

Unnamed: 0,gene,chromosome,feature_type,position_left,position_right
0,Zm00001d032887,1,gene,241511969,241512341
1,Zm00001d030902,1,gene,166630764,166633673
2,Zm00001d027695,1,gene,11304566,11304881
3,Zm00001d033400,1,gene,261836341,261838185
4,Zm00001d033662,1,gene,269968475,269969325
...,...,...,...,...,...
673,Zm00001d026498,10,gene,147196017,147196308
674,Zm00001d025350,10,gene,115137908,115138444
675,Zm00001d023387,10,gene,4374701,4375148
676,Zm00001d023145,10,lincRNA_gene,97217143,97218735


Same for the SNPs: right now we only care about their positions. Instead of using 16 GB to load the data, we use less than 500 MB!

In [4]:
# Load just the chromosome and position of each SNP from the imputed matrix.
# Only the first 899784 data rows are read, and 'chrom' is parsed as int.
# NOTE(review): presumably the row limit stops before rows whose 'chrom'
# value is not an integer -- confirm against the file.
position_columns = ['chrom', 'pos']
snp_matrix_imputed = pd.read_table(
    src + file_snp_imp,
    usecols=position_columns,
    nrows=899784,
    dtype={'chrom': int},
)
snp_matrix_imputed

Unnamed: 0,chrom,pos
0,1,44306
1,1,44441
2,1,44879
3,1,45948
4,1,46521
...,...,...
899779,10,150833697
899780,10,150833709
899781,10,150833722
899782,10,150833724


For each chromosome, select only the genes and SNPs located on that chromosome.

Sort the genes by left position. Loop only once through all the SNPs and gene regions. We assume that genes do not overlap.

Keep a list with all the SNP indices that are contained in highly varying genes.

In [5]:
# For every chromosome, walk once through its SNPs and its genes (sorted by
# left position) and record each SNP whose position falls inside a gene.
#
# The four lists are filled in lockstep, one entry per matching SNP:
#   SNP        -- row position of the SNP *within its own chromosome*
#   gene       -- name of the gene that contains the SNP
#   gene_idx   -- index label of that gene in gen_high_z
#   chromosome -- chromosome number of the match
#
# The scan relies on two assumptions: genes on a chromosome do not overlap,
# and the SNP rows of a chromosome are ordered by increasing position
# (the gene cursor `current` only ever moves forward).
SNP = []
gene = []
gene_idx = []
chromosome = []
for chrom in range(1, 11):

    pos_sorted = gen_high_z[gen_high_z['chromosome'] == chrom]
    pos_sorted = pos_sorted.sort_values(by=['position_left'], ascending=True)
    if pos_sorted.empty:
        # No gene on this chromosome, hence nothing can match; skipping also
        # avoids indexing into an empty table below.
        continue

    # Extract the columns once as arrays so the loops below index plain
    # arrays instead of doing a pandas row lookup on every iteration.
    lefts = pos_sorted['position_left'].values
    rights = pos_sorted['position_right'].values
    names = pos_sorted['gene'].values
    labels = pos_sorted.index.values

    # Span covered by the genes: from the first gene's left edge to the
    # last gene's right edge.
    start = lefts[0]
    end = rights[-1]

    snps = snp_matrix_imputed[snp_matrix_imputed['chrom'] == chrom]

    # Index of the first gene that can still contain the current SNP.
    current = 0

    for i, pos in enumerate(snps['pos'].values):
        if pos < start or pos > end:
            # Outside the region covered by any gene on this chromosome.
            continue
        for j in range(current, len(lefts)):
            if pos <= rights[j]:
                # First gene ending at or after this SNP: later SNPs can
                # resume the search from here.
                current = j
                if pos >= lefts[j]:
                    SNP.append(i)
                    gene.append(names[j])
                    gene_idx.append(labels[j])
                    chromosome.append(chrom)
                break

Unfortunately, the SNP indices are not global: each one is relative to the start of its own chromosome. They need to be shifted appropriately.

In [6]:
# Turn the four parallel result lists into NumPy arrays so that they can be
# compared and shifted element-wise in the following cells.
SNP, gene, gene_idx, chromosome = (
    np.array(values) for values in (SNP, gene, gene_idx, chromosome)
)

In [7]:
# chr_count[k]: number of recorded in-gene SNP matches on chromosome k + 1.
chr_count = [np.sum(chromosome == number) for number in range(1, 11)]

# snp_count[k]: total number of SNP rows on chromosome k + 1.
snp_count = [
    len(snp_matrix_imputed[snp_matrix_imputed['chrom'] == number])
    for number in range(1, 11)
]

In [8]:
print(chr_count,'\n',snp_count)

# SNP holds row positions counted from the start of each chromosome, grouped
# chromosome by chromosome. Shifting every entry by the number of SNP rows on
# all preceding chromosomes turns it into a row position in the full table.
# This assumes the table is ordered by chromosome 1..10, as its printed
# head and tail suggest.
offsets = np.concatenate(([0], np.cumsum(snp_count)[:-1]))

# chr_count[k] entries of SNP belong to chromosome k + 1, so repeat each
# offset that many times. Keep an integer dtype: these values are used as
# positional indexers.
trueSNP = (np.asarray(SNP) + np.repeat(offsets, chr_count)).astype(np.intp)

# Select the matching SNP rows and attach the gene each one falls in.
snps_high_z = snp_matrix_imputed.iloc[trueSNP]
snps_high_z = snps_high_z.assign(gene=gene, gene_idx=gene_idx)
snps_high_z

[485, 399, 403, 279, 291, 498, 356, 523, 532, 271] 
 [141951, 108976, 100791, 84532, 108966, 74084, 73575, 79005, 66442, 61462]


Unnamed: 0,chrom,pos,gene,gene_idx
6055,1,5698946,Zm00001d027457,14
6056,1,5698973,Zm00001d027457,14
6057,1,5698991,Zm00001d027457,14
6058,1,5698994,Zm00001d027457,14
6059,1,5699162,Zm00001d027457,14
...,...,...,...,...
894148,10,147354710,Zm00001d026509,644
894149,10,147354719,Zm00001d026509,644
894150,10,147354742,Zm00001d026509,644
894151,10,147354782,Zm00001d026509,644


Verify that the code above does what it is supposed to.

In [9]:
# Spot check: pick one matched SNP and print its position next to the
# boundaries of the gene it was assigned to; the position should lie between
# position_left and position_right.
val = 727
row = snps_high_z.iloc[val]
idx = row['gene_idx']
print(row['pos'])
print()
# gene_idx stores index *labels* of gen_high_z, so look the gene up by label
# (.loc) rather than by position (.iloc).
print(gen_high_z.loc[idx][['position_left', 'position_right']])

180248147

position_left     180240055
position_right    180257166
Name: 104, dtype: object


Now load all the SNPs and keep only those that are present in highly varying genes. Remember to drop the metadata columns that are not needed (most of them are empty).

In [10]:
# Read the SNP table again, this time with every column, and keep only the
# rows at the positions listed in trueSNP.
snps = pd.read_table(
    src + file_snp_imp,
    nrows=899784,
    dtype={'chrom': int},
).iloc[trueSNP]
snps

Unnamed: 0,rs,alleles,chrom,pos,strand,assembly,center,protLSID,assayLSID,panel,...,YANG,YE_4,YE-CHI-HUNG,YELLOW_3-4,YING-55,Yong_28,Yu796_NS,ZS01250,ZS1791,ZS635
6055,rs1_5698946,,1,5698946,,,,,,,...,G,G,G,G,G,G,G,G,G,G
6056,rs1_5698973,,1,5698973,,,,,,,...,N,N,N,N,N,N,C,G,C,C
6057,rs1_5698991,,1,5698991,,,,,,,...,G,G,G,G,G,G,G,G,G,G
6058,rs1_5698994,,1,5698994,,,,,,,...,G,G,G,G,G,G,G,G,G,G
6059,rs1_5699162,,1,5699162,,,,,,,...,C,C,C,C,C,C,C,C,C,C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
894148,rs10_147354710,,10,147354710,,,,,,,...,T,T,T,T,T,T,G,T,T,T
894149,rs10_147354719,,10,147354719,,,,,,,...,C,C,C,C,C,C,C,C,C,C
894150,rs10_147354742,,10,147354742,,,,,,,...,G,G,G,G,G,G,G,G,G,G
894151,rs10_147354782,,10,147354782,,,,,,,...,C,N,C,C,C,C,C,A,C,C


In [11]:
# Remove the metadata columns, label every SNP row with its gene, move the
# 'gene' column to the front and write the result to the output folder.
metadata_columns = [
    'rs', 'alleles', 'strand',
    'assembly', 'center', 'protLSID',
    'assayLSID', 'panel', 'QCcode',
]
snps = snps.drop(columns=metadata_columns).assign(gene=gene)

# 'gene' was appended as the last column; list it first, followed by all
# the other columns in their existing order.
gene_first = ['gene', *snps.columns[:-1]]
snps = snps[gene_first]
snps.to_csv(dst + 'high_z_snps_all_chr.csv')