# Convert LD scores and annotations produced by ldsc.py to MATLAB .mat format

In [1]:
import glob
import pandas as pd
import numpy as np
import scipy.io as sio
# Print pandas and numpy values with 4 digits of precision.
# The fully qualified 'display.precision' key is used because the bare
# 'precision' pattern is ambiguous in recent pandas releases (it also matches
# the Styler precision option) and is rejected there.
pd.set_option('display.precision', 4)
np.set_printoptions(precision=4)

# Whitespace-delimited reference table; convert_to_mat() aligns its output
# rows to the 'SNP' column of this table.
REF_PATH = r'H:\Dropbox\analysis\2016_09_September_19_LDScoreRegression\1m\1m.ref'

# Load the reference only once per interactive session: re-running the cell
# keeps the already-loaded frame instead of re-reading the file.
if 'ref' not in globals():
    print('load reference')
    # sep=r'\s+' parses exactly like the deprecated delim_whitespace=True.
    ref = pd.read_csv(REF_PATH, sep=r'\s+')

def _read_per_chromosome(file_chr, suffix, chromosomes, **read_csv_kwargs):
    """Read ``{file_chr}{chromosome}{suffix}`` for every chromosome and stack the rows."""
    frames = [
        # sep=r'\s+' parses exactly like the deprecated delim_whitespace=True.
        pd.read_csv('{f}{c}{s}'.format(f=file_chr, c=chrom, s=suffix),
                    sep=r'\s+', **read_csv_kwargs)
        for chrom in chromosomes
    ]
    return pd.concat(frames)


def _drop_present_columns(frame, names):
    """Return ``frame`` without those of ``names`` that are actually among its columns."""
    return frame.drop(columns=[name for name in names if name in frame.columns])


def _sum_count_files(file_chr, suffix, chromosomes):
    """Sum header-less per-chromosome count files column-wise into a 1 x K array."""
    counts = _read_per_chromosome(file_chr, suffix, chromosomes, header=None)
    return np.atleast_2d(counts.sum().values)


def convert_to_mat(file_chr, out, M=True, M_5_50=True, annot_files=True,
                   chromosomes=range(1, 23)):
    """Collect per-chromosome ldsc.py outputs and save them as one .mat file.

    Rows of the saved matrices follow the order of the module-level ``ref``
    table: every table read here is left-merged onto ``ref[['SNP']]``, so SNPs
    missing from the input become NaN rows.
    NOTE(review): a left merge repeats a ``ref`` row when the input contains
    the same SNP more than once; inputs are assumed to have unique SNP ids.

    Parameters
    ----------
    file_chr : str
        Path prefix; the chromosome number and a file suffix are appended,
        e.g. ``{file_chr}{c}.l2.ldscore.gz``.
    out : str
        Name of the .mat file to write (used verbatim, no extension appended).
    M : bool
        Also read ``{file_chr}{c}.l2.M`` and store the column sums as ``'M'``.
    M_5_50 : bool
        Also read ``{file_chr}{c}.l2.M_5_50`` and store the column sums as
        ``'M_5_50'``.
    annot_files : bool
        Also read ``{file_chr}{c}.annot.gz`` and store the merged values as
        ``'annomat_bin'``.
    chromosomes : iterable
        Chromosome labels substituted into the file names; defaults to 1..22.

    The saved dictionary always contains ``'annonames'`` (LD score column
    names) and ``'annomat'`` (LD score values aligned to ``ref``).
    """
    # Materialise once so a one-shot iterator can be reused for every file kind.
    chromosomes = list(chromosomes)
    print('Shape of ref file: {shape}'.format(shape=ref.shape))

    # Coordinate columns that are dropped; only SNP and the score columns stay.
    cnames = ['CHR', 'BP', 'CM', 'MAF']

    df_ldscore = _read_per_chromosome(file_chr, '.l2.ldscore.gz', chromosomes)
    df_ldscore = _drop_present_columns(df_ldscore, cnames)
    print('Shape of ldscore file: {shape}'.format(shape=df_ldscore.shape))
    df_ldscore = pd.merge(ref[['SNP']], df_ldscore, how='left', on='SNP')
    df_ldscore = df_ldscore.drop(columns='SNP')
    print('Shape of ldscore file after merge: {shape}'.format(shape=df_ldscore.shape))
    save_dict = {'annonames': list(df_ldscore.columns), 'annomat': df_ldscore.values}

    if M_5_50:
        m_5_50 = _sum_count_files(file_chr, '.l2.M_5_50', chromosomes)
        print('M_5_50={}'.format(m_5_50))
        save_dict['M_5_50'] = m_5_50

    if M:
        m = _sum_count_files(file_chr, '.l2.M', chromosomes)
        print('M={}'.format(m))
        save_dict['M'] = m

    if annot_files:
        df_annot = _read_per_chromosome(file_chr, '.annot.gz', chromosomes)
        df_annot = _drop_present_columns(df_annot, cnames)
        print('Shape of annots file: {shape}'.format(shape=df_annot.shape))
        df_annot = pd.merge(ref[['SNP']], df_annot, how='left', on='SNP')
        # Reported before the SNP key is removed, so the shape includes it.
        print('Shape of annots file after merge: {shape}'.format(shape=df_annot.shape))
        save_dict['annomat_bin'] = df_annot.drop(columns='SNP').values

    sio.savemat(out, save_dict, format='5', do_compression=False, oned_as='column', appendmat=False)
    print('{c} columns written to {f}'.format(c=list(df_ldscore.columns), f=out))

# Active conversion: the eur_w_ld_chr LD scores. The .l2.M and .annot.gz
# inputs are switched off for this run; M_5_50 keeps its default (enabled).
convert_to_mat(
    file_chr='eur_w_ld_chr\\',
    out='eur_w_ld.mat',
    M=False,
    annot_files=False,
)
# Other conversions, kept for reference; uncomment the one that is needed.
#convert_to_mat(r'1000G_Phase3_baselineLD_ldscores\baselineLD.', 'baselineLD.mat')
#convert_to_mat(r'1000G_EUR_Phase3_baseline\baseline.', 'baseline.mat')
#convert_to_mat(r'1000G_Phase3_weights_hm3_no_MHC\weights.hm3_noMHC.', '1000G_Phase3_weights_hm3_no_MHC.mat', M=False, M_5_50=False, annot_files=False)
#for ctg in range(1,11):
#    convert_to_mat(r'1000G_Phase3_cell_type_groups\cell_type_group.{}.'.format(ctg), 'cell_type_group.{}'.format(ctg))


load reference
Shape of ref file: (1190321, 6)
Shape of ldscore file: (1290028, 2)
Shape of ldscore file after merge: (1190321, 1)
M_5_50=[[1173569]]
['L2'] columns written to eur_w_ld.mat
