Skip to content

Commit

Permalink
add support for num chromosomes different from 22
Browse files Browse the repository at this point in the history
  • Loading branch information
omerwe committed Mar 18, 2024
1 parent 5c29ab1 commit af252e4
Show file tree
Hide file tree
Showing 5 changed files with 33 additions and 25 deletions.
9 changes: 5 additions & 4 deletions ldsc_polyfun/jackknife.py
Original file line number Diff line number Diff line change
Expand Up @@ -573,7 +573,7 @@ class Jackknife_Ridge(Jackknife):
def __init__(self, x, y, n_blocks=None, separators=None, chr_num=None, verbose=True,
num_lambdas=100, approx_ridge=False,
ridge_lambda=None, use_1se=False, has_intercept=False, standardize=True,
skip_ridge_jackknife=True, num_chr_sets=2):
skip_ridge_jackknife=True, num_chr_sets=2, num_chr=22):

#sanity checks
assert chr_num is not None
Expand All @@ -586,6 +586,7 @@ def __init__(self, x, y, n_blocks=None, separators=None, chr_num=None, verbose=T
self.use_1se = use_1se
self.verbose=verbose
self.has_intercept = has_intercept
self.num_chr = num_chr

###define chromosome sets
assert num_chr_sets>1
Expand All @@ -596,8 +597,8 @@ def __init__(self, x, y, n_blocks=None, separators=None, chr_num=None, verbose=T
self.chromosome_sets = []
self.chromosome_sets.append(chromosomes[chromosomes%2==0])
self.chromosome_sets.append(chromosomes[chromosomes%2!=0])
elif num_chr_sets == 22:
self.chromosome_sets = [np.array([c]) for c in range(1,23)]
elif num_chr_sets == self.num_chr:
self.chromosome_sets = [np.array([c]) for c in range(1, self.num_chr+1)]
else:
chr_sizes = np.bincount(chr_num)[1:]
assert num_chr_sets<=len(chr_sizes)
Expand Down Expand Up @@ -705,7 +706,7 @@ def __init__(self, x, y, n_blocks=None, separators=None, chr_num=None, verbose=T

def _divide_chromosomes_to_sets(self, chr_sizes, num_sets):
chr_order = np.argsort(chr_sizes)[::-1] #np.arange(len(chr_sizes))
chr_assignments = np.zeros(22, dtype=np.int64) - 1
chr_assignments = np.zeros(self.num_chr, dtype=np.int64) - 1
chr_assignments[chr_order[:num_sets]] = np.arange(num_sets)
set_sizes = chr_sizes[chr_order[:num_sets]].copy()
for c_i in chr_order[num_sets : len(chr_sizes)]:
Expand Down
10 changes: 6 additions & 4 deletions ldsc_polyfun/regressions.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,8 @@ def __init__(self, y, x, w, N, M, n_blocks, intercept=None, slow=False, step1_ii
skip_ridge_jackknife=True,
num_chr_sets=2,
evenodd_split=False,
nnls_exact=False
nnls_exact=False,
num_chr=22
):

for i in [y, x, w, M, N]:
Expand Down Expand Up @@ -254,7 +255,7 @@ def __init__(self, y, x, w, N, M, n_blocks, intercept=None, slow=False, step1_ii
chr_num=chr_num, verbose=verbose, ridge_lambda=ridge_lambda,
has_intercept=(not self.constrain_intercept),
standardize=standardize_ridge, approx_ridge=approx_ridge,
skip_ridge_jackknife=skip_ridge_jackknife, num_chr_sets=num_chr_sets)
skip_ridge_jackknife=skip_ridge_jackknife, num_chr_sets=num_chr_sets, num_chr=num_chr)

else:
assert not loco
Expand Down Expand Up @@ -393,7 +394,7 @@ def __init__(self, y, x, w, N, M, n_blocks=200, intercept=None, slow=False,
twostep=None, old_weights=False,
chr_num=None, verbose=True,
approx_ridge=False, loco=False, ridge_lambda=None, standardize_ridge=True, skip_ridge_jackknife=True, keep_large=False,
num_chr_sets=22, evenodd_split=False, nn=False, nnls_exact=False):
num_chr_sets=22, evenodd_split=False, nn=False, nnls_exact=False, num_chr=22):
step1_ii = None
if twostep is not None:
step1_ii = y < twostep
Expand All @@ -411,7 +412,8 @@ def __init__(self, y, x, w, N, M, n_blocks=200, intercept=None, slow=False,
evenodd_split=evenodd_split,
nn=nn,
keep_large=keep_large,
nnls_exact=nnls_exact
nnls_exact=nnls_exact,
num_chr=num_chr
)
self.mean_chisq, self.lambda_gc = self._summarize_chisq(y)
if not self.constrain_intercept:
Expand Down
8 changes: 5 additions & 3 deletions ldsc_polyfun/sumstats.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,13 +149,13 @@ def _read_w_ld(args, log):


def _read_chr_split_files(chr_arg, not_chr_arg, log, noun, parsefunc, **kwargs):
'''Read files split across 22 chromosomes (annot, ref_ld, w_ld).'''
'''Read files split across chromosomes (annot, ref_ld, w_ld).'''
try:
if not_chr_arg:
log.log('Reading {N} from {F} ...'.format(F=not_chr_arg, N=noun))
out = parsefunc(_splitp(not_chr_arg), **kwargs)
elif chr_arg:
f = ps.sub_chr(chr_arg, '[1-22]')
f = ps.sub_chr(chr_arg, '[1-%d]'%(_N_CHR))
log.log('Reading {N} from {F} ...'.format(F=f, N=noun))
out = parsefunc(_splitp(chr_arg), _N_CHR, **kwargs)
except ValueError as e:
Expand Down Expand Up @@ -282,6 +282,7 @@ def _read_ld_sumstats(args, log, fh, alleles=True, dropna=True):
def estimate_h2(args, log):
'''Estimate h2 and partitioned h2.'''
args = copy.deepcopy(args)
_N_CHR = args.num_chr
if args.samp_prev is not None and args.pop_prev is not None:
args.samp_prev, args.pop_prev = list(map(
float, [args.samp_prev, args.pop_prev]))
Expand Down Expand Up @@ -334,7 +335,8 @@ def estimate_h2(args, log):
evenodd_split=args.evenodd_split,
nn=args.nn,
keep_large=args.keep_large,
nnls_exact=args.nnls_exact
nnls_exact=args.nnls_exact,
num_chr=args.num_chr
)

if args.print_cov:
Expand Down
24 changes: 13 additions & 11 deletions polyfun.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,13 +124,13 @@ def check_files(args):
if args.compute_h2_L2:
if not os.path.exists(args.sumstats):
raise IOError('Cannot find sumstats file %s'%(args.sumstats))
for chr_num in range(1,23):
for chr_num in range(1, args.num_chr+1):
get_file_name(args, 'ref-ld', chr_num, verify_exists=True, allow_multiple=True)
get_file_name(args, 'w-ld', chr_num, verify_exists=True)
get_file_name(args, 'annot', chr_num, verify_exists=True, allow_multiple=True)

if args.compute_ldscores:
if args.chr is None: chr_range = range(1,23)
if args.chr is None: chr_range = range(1, args.num_chr+1)
else: chr_range = range(args.chr, args.chr+1)

for chr_num in chr_range:
Expand All @@ -143,7 +143,7 @@ def check_files(args):
get_file_name(args, 'bins', chr_num, verify_exists=True)

if args.compute_h2_bins and not args.compute_ldscores:
for chr_num in range(1,23):
for chr_num in range(1, args.num_chr+1):
get_file_name(args, 'w-ld', chr_num, verify_exists=True)
if not args.compute_h2_L2:
get_file_name(args, 'bins', chr_num, verify_exists=True)
Expand Down Expand Up @@ -228,7 +228,8 @@ def run_ldsc(self, args, use_ridge, nn, keep_large, evenodd_split, n_blocks=2):
evenodd_split=evenodd_split,
nn=nn,
keep_large=keep_large,
nnls_exact=args.nnls_exact
nnls_exact=args.nnls_exact,
num_chr=args.num_chr
)

#save the results object
Expand Down Expand Up @@ -309,7 +310,7 @@ def compute_snpvar_chr(self, args, chr_num, use_ridge):
#make sure that the chromosome exists in one set
found_chrom = np.any([chr_num in chr_set for chr_set in jknife.chromosome_sets])
if not found_chrom:
raise ValueError('not all chromosomes have a taus estimate - please make sure that the intersection of SNPs with sumstats and with annotations data spans all 22 human chromosomes')
raise ValueError('not all chromosomes have a taus estimate - please make sure that the intersection of SNPs with sumstats and with annotations data spans all chromosomes')

#find the relevant set number
set_num=None
Expand All @@ -326,8 +327,8 @@ def compute_snpvar_chr(self, args, chr_num, use_ridge):
else:
hsqhat = self.hsqhat
jknife = hsqhat.jknife
if len(jknife.est_loco) != 22:
raise ValueError('not all chromosomes have a taus estimate - please make sure that the intersection of SNPs with sumstats and with annotations data spans all 22 human chromosomes')
if len(jknife.est_loco) != args.num_chr:
raise ValueError('not all chromosomes have a taus estimate - please make sure that the intersection of SNPs with sumstats and with annotations data spans all chromosomes')
taus = jknife.est_loco[chr_num-1][:hsqhat.n_annot] / hsqhat.Nbar

#save the taus to disk
Expand All @@ -349,7 +350,7 @@ def compute_snpvar(self, args, use_ridge):

#iterate over chromosomes
df_snpvar_chr_list = []
for chr_num in tqdm(range(1,23)):
for chr_num in tqdm(range(1, args.num_chr+1)):
df_snpvar_chr = self.compute_snpvar_chr(args, chr_num, use_ridge=use_ridge)
df_snpvar_chr_list.append(df_snpvar_chr)
df_snpvar = pd.concat(df_snpvar_chr_list, axis=0)
Expand Down Expand Up @@ -519,7 +520,7 @@ def partition_snps_to_bins(self, args, use_ridge):

def save_bins_to_disk(self, args):
logging.info('Saving SNP-bins to disk')
for chr_num in tqdm(range(1,23)):
for chr_num in tqdm(range(1, args.num_chr+1)):

#save bins file to disk
df_bins_chr = self.df_bins.query('CHR==%d'%(chr_num))
Expand Down Expand Up @@ -576,7 +577,7 @@ def save_snpvar_to_disk(self, args, use_ridge, constrain_range):
raise ValueError(error_message + '. If you wish to omit the missing SNPs, please use the flag --allow-missing')

#iterate over chromosomes
for chr_num in tqdm(range(1,23)):
for chr_num in tqdm(range(1, args.num_chr+1)):

#define output file name
output_fname = 'snpvar'
Expand Down Expand Up @@ -616,7 +617,7 @@ def load_bins_chr(self, args, chr_num):
def compute_ld_scores(self, args):
#define the range of chromosomes to iterate over
if args.chr is None:
chr_range = range(1,23)
chr_range = range(1, args.num_chr+1)
else:
chr_range = range(args.chr, args.chr+1)

Expand Down Expand Up @@ -818,6 +819,7 @@ def polyfun_main(self, args):
parser.add_argument('--ld-dir', default=None, help='The path of a directory with UKB LD files (if not specified PolyFun will create a temporary directory)')
parser.add_argument('--output-prefix', required=True, help='Prefix of all PolyFun output file names')
parser.add_argument('--allow-missing', default=False, action='store_true', help='If specified, PolyFun will not terminate if some SNPs with sumstats are not found in the annotations files')
parser.add_argument('--num-chr', type=int, default=22, help='Number of chromosomes for your target organism (default is 22)')

#LDSC parameters
parser.add_argument('--nnls-exact', default=False, action='store_true', help='If specified, S-LDSC will estimate non-negative taus using an exact instead of an approximate solver (this will be slower but slightly more accurate)')
Expand Down
7 changes: 4 additions & 3 deletions polyloc.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ def check_files(args):

#check that required input files exist
if args.compute_ldscores or args.compute_partitions:
if args.chr is None: chr_range = range(1,23)
if args.chr is None: chr_range = range(1, args.num_chr+1)
else: chr_range = range(args.chr, args.chr+1)

for chr_num in chr_range:
Expand All @@ -108,7 +108,7 @@ def check_files(args):
get_file_name(args, 'bins', chr_num, verify_exists=True)

if args.compute_polyloc:
for chr_num in range(1,23):
for chr_num in range(1, args.num_chr+1):
get_file_name(args, 'w-ld', chr_num, verify_exists=True)
if not args.compute_partitions:
get_file_name(args, 'bins', chr_num, verify_exists=True)
Expand Down Expand Up @@ -152,7 +152,7 @@ def polyloc_partitions(self, args):
#add another partition for all SNPs not in the posterior file
if args.bfile_chr is not None:
df_bim_list = []
for chr_num in range(1,23):
for chr_num in range(1, args.num_chr+1):
df_bim_chr = pd.read_table(args.bfile_chr+'%d.bim'%(chr_num), sep='\s+', names=['CHR', 'SNP', 'CM', 'BP', 'A1', 'A2'], header=None)
df_bim_list.append(df_bim_chr)
df_bim = pd.concat(df_bim_list, axis=0)
Expand Down Expand Up @@ -328,6 +328,7 @@ def polyloc_main(self, args):
parser.add_argument('--output-prefix', required=True, help='Prefix of all PolyLoc out file names')
parser.add_argument('--ld-ukb', default=False, action='store_true', help='If specified, PolyLoc will use UKB LD matrices to compute LD-scores')
parser.add_argument('--ld-dir', default=None, help='The path of a directory with UKB LD files (if not specified PolyLoc will create a temporary directory)')
parser.add_argument('--num-chr', type=int, default=22, help='Number of chromosomes for your target organism (default is 22)')



Expand Down

0 comments on commit af252e4

Please sign in to comment.