First, import the usual libraries
- `math`: basic math operations
- `os`: enable file manipulation with the OS
- `sys`: enable interaction with commandline
- `glob`: find files whose names match a wildcard pattern
- `matplotlib.pyplot`: default plotter (I personally like ggplot waaaaay better. E)
    - `inline`: so that plots are shown in the notebook
- `seaborn`: nicer plots
- `numpy`: all number crunching done here
- `pandas`: data wrangling

In [3]:
import math  
import os   
import glob 
import matplotlib.pyplot as plt 
%matplotlib inline
import seaborn as sns
import numpy as np
import pandas as pd
from scipy import stats


In [22]:
# --- Directories -----------------------------------------------------------
# Both end with a slash on purpose: later cells build full paths by plain
# string concatenation (e.g. `src + file_snp_imp`).
src = "/home/ejam/documents/css893/raw_data/"   # where the raw data lives
dst = "/home/ejam/documents/css893/output/"     # where outputs are written

# --- Input file names ------------------------------------------------------
# Raw (non-imputed) SNP matrix -- currently disabled:
#file_snp_raw = 'snp_imputed_chr10_sample.csv'

# Imputed SNP matrix
file_snp_imp = "1231.txt"

# Gene expression matrix (FPKM, per the original note -- the file name
# suggests zTPM; TODO confirm which unit is correct)
file_gen_high_z = "high_ztpm.csv"

# SNP table read in a later cell through its 'gene', 'chrom' and 'pos' columns
file_snp_high_z = "high_z_snps_all_chr.csv"

We only need the gene regions right now. Ignore the zTPM values. This saves a lot of memory.

In [58]:
# Parse only the three columns that locate each SNP (gene id, chromosome,
# position); every other column in the CSV is skipped to keep memory low.
region_columns = ['gene', 'chrom', 'pos']
snp_high_z = pd.read_csv(src + file_snp_high_z, usecols=region_columns)
snp_high_z

Unnamed: 0,gene,chrom,pos
0,Zm00001d027457,1,5698946
1,Zm00001d027457,1,5698973
2,Zm00001d027457,1,5698991
3,Zm00001d027457,1,5698994
4,Zm00001d027457,1,5699162
...,...,...,...
4032,Zm00001d026509,10,147354710
4033,Zm00001d026509,10,147354719
4034,Zm00001d026509,10,147354742
4035,Zm00001d026509,10,147354782


In [59]:
# Build a marker id of the form 'rs<chrom>_<pos>' for every row.
chrom_text = snp_high_z['chrom'].map(str)
pos_text = snp_high_z['pos'].map(str)
snp_high_z['rs'] = 'rs' + chrom_text + '_' + pos_text

# Plain Python list of those ids; a later cell passes it as `usecols` when
# reading the imputed SNP matrix.
high_rs = snp_high_z['rs'].tolist()
snp_high_z

Unnamed: 0,gene,chrom,pos,rs
0,Zm00001d027457,1,5698946,rs1_5698946
1,Zm00001d027457,1,5698973,rs1_5698973
2,Zm00001d027457,1,5698991,rs1_5698991
3,Zm00001d027457,1,5698994,rs1_5698994
4,Zm00001d027457,1,5699162,rs1_5699162
...,...,...,...,...
4032,Zm00001d026509,10,147354710,rs10_147354710
4033,Zm00001d026509,10,147354719,rs10_147354719
4034,Zm00001d026509,10,147354742,rs10_147354742
4035,Zm00001d026509,10,147354782,rs10_147354782


Sample: read only the first few lines of the imputed SNP file to inspect its header.

In [6]:
# Grab the first 11 lines of the imputed SNP file without loading the rest.
# `readline()` returns '' once the file is exhausted, so a shorter file
# simply yields empty strings for the remaining entries.
sample_snps = []
with open(src + file_snp_imp) as snp_file:
    sample_snps.extend(snp_file.readline() for _ in range(11))

In [12]:
# The first sampled line is the header: split it on tabs to list its fields
# ('<Marker>' followed by the marker ids).
header_line = sample_snps[0]
header_line.split('\t')

['<Marker>',
 'rs1_44306',
 'rs1_44441',
 'rs1_44879',
 'rs1_45948',
 'rs1_46521',
 'rs1_46538',
 'rs1_47157',
 'rs1_47221',
 'rs1_47854',
 'rs1_47872',
 'rs1_48071',
 'rs1_48081',
 'rs1_48140',
 'rs1_48208',
 'rs1_48213',
 'rs1_48381',
 'rs1_48395',
 'rs1_48548',
 'rs1_48714',
 'rs1_48831',
 'rs1_48879',
 'rs1_48890',
 'rs1_48910',
 'rs1_48921',
 'rs1_49094',
 'rs1_49346',
 'rs1_49372',
 'rs1_49373',
 'rs1_49498',
 'rs1_49527',
 'rs1_49616',
 'rs1_51021',
 'rs1_51037',
 'rs1_51053',
 'rs1_51898',
 'rs1_52753',
 'rs1_52778',
 'rs1_53064',
 'rs1_53109',
 'rs1_53802',
 'rs1_55279',
 'rs1_55373',
 'rs1_55419',
 'rs1_55449',
 'rs1_55589',
 'rs1_55654',
 'rs1_55664',
 'rs1_56073',
 'rs1_56125',
 'rs1_108937',
 'rs1_109011',
 'rs1_109355',
 'rs1_138573',
 'rs1_139436',
 'rs1_153876',
 'rs1_154000',
 'rs1_154048',
 'rs1_155203',
 'rs1_155222',
 'rs1_172236',
 'rs1_172386',
 'rs1_172600',
 'rs1_172806',
 'rs1_172921',
 'rs1_173202',
 'rs1_173287',
 'rs1_173317',
 'rs1_173324',
 'rs1_173480',
 

Same for the SNPs: right now we only need the columns for the SNP positions selected above. Instead of using 16 GB to load the whole matrix, we use less than 500 MB!

In [39]:
# Read only the marker columns named in `high_rs` from the tab-delimited
# imputed SNP file, then double every value. The call is chained so that no
# extra reference to the un-doubled frame is kept alive.
# NOTE(review): the doubling presumably converts a 0/1 coding into 0/2 --
# confirm against the file format.
snp_matrix_imputed_raw = pd.read_table(
    src + file_snp_imp,
    usecols=high_rs,
).mul(2)
snp_matrix_imputed_raw

Unnamed: 0,rs1_5698946,rs1_5698973,rs1_5698991,rs1_5698994,rs1_5699162,rs1_5699203,rs1_9355640,rs1_11304590,rs1_11304666,rs1_11304694,...,rs10_144991120,rs10_147354508,rs10_147354557,rs10_147354610,rs10_147354614,rs10_147354710,rs10_147354719,rs10_147354742,rs10_147354782,rs10_147354788
0,2,2.0,2,2,2,0.0,2,2,2,2,...,2,2,2.0,2.0,2.0,2.0,2,2.0,2.0,2.0
1,2,2.0,2,2,2,2.0,2,2,0,0,...,2,2,2.0,2.0,2.0,2.0,2,2.0,2.0,2.0
2,2,0.0,2,2,2,2.0,2,2,2,2,...,2,2,2.0,2.0,2.0,2.0,2,2.0,2.0,2.0
3,2,2.0,2,2,2,0.0,2,2,2,2,...,2,2,2.0,2.0,2.0,2.0,2,2.0,2.0,2.0
4,2,2.0,2,2,2,2.0,2,2,2,2,...,2,2,2.0,2.0,2.0,2.0,2,2.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
937,2,,2,2,2,2.0,2,2,2,2,...,2,2,2.0,2.0,2.0,2.0,2,2.0,2.0,2.0
938,2,2.0,2,2,2,0.0,2,2,2,2,...,2,2,2.0,2.0,2.0,0.0,2,2.0,2.0,0.0
939,2,0.0,2,2,2,2.0,2,2,2,2,...,2,2,2.0,0.0,0.0,2.0,2,2.0,0.0,2.0
940,2,2.0,2,2,2,0.0,2,2,2,2,...,2,2,2.0,2.0,2.0,2.0,2,2.0,2.0,2.0


In [60]:
# Re-read the high-z SNP table with all of its columns but only the first
# 10 rows; the following cell uses this frame's column labels.
high_z_csv = src + file_snp_high_z
snp_high_z_alt = pd.read_csv(high_z_csv, nrows=10)

In [48]:
# Everything after the first four header entries is used as a cultivar label.
# NOTE(review): presumably the four skipped columns are an unnamed index
# column plus 'gene', 'chrom' and 'pos' -- confirm against the CSV header.
# NOTE(review): labels are attached by position, so this assumes the rows of
# the SNP matrix are in the same cultivar order as these columns -- confirm.
cultivar_names = snp_high_z_alt.columns.values[4:]
snp_matrix_imputed_raw['cultivars'] = cultivar_names
snp_matrix_imputed_raw

Unnamed: 0,rs1_5698946,rs1_5698973,rs1_5698991,rs1_5698994,rs1_5699162,rs1_5699203,rs1_9355640,rs1_11304590,rs1_11304666,rs1_11304694,...,rs10_147354508,rs10_147354557,rs10_147354610,rs10_147354614,rs10_147354710,rs10_147354719,rs10_147354742,rs10_147354782,rs10_147354788,cultivars
0,2,2.0,2,2,2,0.0,2,2,2,2,...,2,2.0,2.0,2.0,2.0,2,2.0,2.0,2.0,764
1,2,2.0,2,2,2,2.0,2,2,0,0,...,2,2.0,2.0,2.0,2.0,2,2.0,2.0,2.0,779
2,2,0.0,2,2,2,2.0,2,2,2,2,...,2,2.0,2.0,2.0,2.0,2,2.0,2.0,2.0,787
3,2,2.0,2,2,2,0.0,2,2,2,2,...,2,2.0,2.0,2.0,2.0,2,2.0,2.0,2.0,790
4,2,2.0,2,2,2,2.0,2,2,2,2,...,2,2.0,2.0,2.0,2.0,2,2.0,2.0,2.0,793
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
937,2,,2,2,2,2.0,2,2,2,2,...,2,2.0,2.0,2.0,2.0,2,2.0,2.0,2.0,Yong_28
938,2,2.0,2,2,2,0.0,2,2,2,2,...,2,2.0,2.0,2.0,0.0,2,2.0,2.0,0.0,Yu796_NS
939,2,0.0,2,2,2,2.0,2,2,2,2,...,2,2.0,0.0,0.0,2.0,2,2.0,0.0,2.0,ZS01250
940,2,2.0,2,2,2,0.0,2,2,2,2,...,2,2.0,2.0,2.0,2.0,2,2.0,2.0,2.0,ZS1791


In [54]:
# Promote the cultivar labels to the row index, then transpose so that each
# row is one SNP and each column is one cultivar. Running this cell twice
# fails, because `set_index` removes the 'cultivars' column.
snp_matrix_imputed_raw = (
    snp_matrix_imputed_raw
    .set_index('cultivars')
    .transpose()
)
snp_matrix_imputed_raw

cultivars,764,779,787,790,793,904,911,912,1538,2369,...,YANG,YE_4,YE-CHI-HUNG,YELLOW_3-4,YING-55,Yong_28,Yu796_NS,ZS01250,ZS1791,ZS635
rs1_5698946,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
rs1_5698973,2.0,2.0,0.0,2.0,2.0,2.0,2.0,0.0,2.0,2.0,...,,,,,,,2.0,0.0,2.0,2.0
rs1_5698991,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
rs1_5698994,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
rs1_5699162,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
rs10_147354710,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,0.0,2.0,2.0,2.0
rs10_147354719,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
rs10_147354742,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
rs10_147354782,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,,2.0,2.0,2.0,2.0,2.0,0.0,2.0,2.0


For each chromosome, select only those genes and SNPs present in that chromosome.

Sort the genes by left position. Loop only once through all the SNPs and gene regions. We assume that genes do not overlap.

Keep a list with all the SNP indices that are contained in highly varying genes.

The SNP indices are unfortunately not global but they are with respect to their chromosome. They need to be shifted appropriately.

array([ 1,  1,  1, ..., 10, 10, 10])

In [62]:
# Attach chromosome and position to every SNP row, then export for TASSEL.
#
# The rows of `snp_matrix_imputed_raw` are labelled with marker ids and come
# from `read_table(usecols=high_rs)`, which returns columns in *file* order
# (the order of the `usecols` list is ignored) and only once per name. Copying
# chrom/pos over purely by row position is therefore only correct when
# `snp_high_z` happens to be duplicate-free and sorted exactly like the file.
# Matching on the 'rs' label removes that hidden assumption.
rs_lookup = snp_high_z.drop_duplicates(subset='rs').set_index('rs')[['chrom', 'pos']]

# Fail loudly rather than writing NaN coordinates for unknown markers.
unknown_rs = snp_matrix_imputed_raw.index.difference(rs_lookup.index)
if len(unknown_rs) > 0:
    raise KeyError(
        'SNP ids missing from snp_high_z: ' + ', '.join(map(str, unknown_rs[:10]))
    )
rs_lookup = rs_lookup.reindex(snp_matrix_imputed_raw.index)

# Put 'chrom' and 'pos' in front of the cultivar columns.
snps = snp_matrix_imputed_raw.copy()
snps.insert(0, 'pos', rs_lookup['pos'].values)
snps.insert(0, 'chrom', rs_lookup['chrom'].values)

# Final column order; kept as a variable in case later cells refer to it.
cols = snps.columns.values

# The marker-id index is intentionally not written: chrom + pos identify a row.
snps.to_csv(dst + 'high_z_snps_tassel.csv', index=False)

In [63]:
# Second export of the same table, this time writing missing values as the
# literal text 'NA' instead of leaving the field empty.
snps.to_csv(
    dst + 'high_z_snps_tassel_NA.csv',
    index=False,
    na_rep='NA',
)