add support for num chromosomes different from 22

omerwe · Mar 18, 2024 · af252e4 · af252e4
1 parent 5c29ab1
commit af252e4
Show file tree

Hide file tree

Showing 5 changed files with 33 additions and 25 deletions.
diff --git a/ldsc_polyfun/jackknife.py b/ldsc_polyfun/jackknife.py
@@ -573,7 +573,7 @@ class Jackknife_Ridge(Jackknife):
     def __init__(self, x, y, n_blocks=None, separators=None, chr_num=None, verbose=True,
         num_lambdas=100, approx_ridge=False, 
         ridge_lambda=None, use_1se=False, has_intercept=False, standardize=True,
-        skip_ridge_jackknife=True, num_chr_sets=2):
+        skip_ridge_jackknife=True, num_chr_sets=2, num_chr=22):
 
         #sanity checks
         assert chr_num is not None
@@ -586,6 +586,7 @@ def __init__(self, x, y, n_blocks=None, separators=None, chr_num=None, verbose=T
         self.use_1se = use_1se
         self.verbose=verbose        
         self.has_intercept = has_intercept
+        self.num_chr = num_chr
 
         ###define chromosome sets
         assert num_chr_sets>1
@@ -596,8 +597,8 @@ def __init__(self, x, y, n_blocks=None, separators=None, chr_num=None, verbose=T
             self.chromosome_sets = []
             self.chromosome_sets.append(chromosomes[chromosomes%2==0])
             self.chromosome_sets.append(chromosomes[chromosomes%2!=0])
-        elif num_chr_sets == 22:
-            self.chromosome_sets = [np.array([c]) for c in range(1,23)]
+        elif num_chr_sets == self.num_chr:
+            self.chromosome_sets = [np.array([c]) for c in range(1, self.num_chr+1)]
         else:        
             chr_sizes = np.bincount(chr_num)[1:]
             assert num_chr_sets<=len(chr_sizes)
@@ -705,7 +706,7 @@ def __init__(self, x, y, n_blocks=None, separators=None, chr_num=None, verbose=T
 
     def _divide_chromosomes_to_sets(self, chr_sizes, num_sets):
         chr_order = np.argsort(chr_sizes)[::-1]     #np.arange(len(chr_sizes))
-        chr_assignments = np.zeros(22, dtype=np.int64) - 1
+        chr_assignments = np.zeros(self.num_chr, dtype=np.int64) - 1
         chr_assignments[chr_order[:num_sets]] = np.arange(num_sets)
         set_sizes = chr_sizes[chr_order[:num_sets]].copy()    
         for c_i in chr_order[num_sets : len(chr_sizes)]:

diff --git a/ldsc_polyfun/regressions.py b/ldsc_polyfun/regressions.py
@@ -152,7 +152,8 @@ def __init__(self, y, x, w, N, M, n_blocks, intercept=None, slow=False, step1_ii
         skip_ridge_jackknife=True,
         num_chr_sets=2,
         evenodd_split=False,
-        nnls_exact=False
+        nnls_exact=False,
+        num_chr=22
         ):
 
         for i in [y, x, w, M, N]:
@@ -254,7 +255,7 @@ def __init__(self, y, x, w, N, M, n_blocks, intercept=None, slow=False, step1_ii
                     chr_num=chr_num, verbose=verbose, ridge_lambda=ridge_lambda,
                     has_intercept=(not self.constrain_intercept),
                     standardize=standardize_ridge, approx_ridge=approx_ridge,
-                    skip_ridge_jackknife=skip_ridge_jackknife, num_chr_sets=num_chr_sets)
+                    skip_ridge_jackknife=skip_ridge_jackknife, num_chr_sets=num_chr_sets, num_chr=num_chr)
 
         else:
             assert not loco
@@ -393,7 +394,7 @@ def __init__(self, y, x, w, N, M, n_blocks=200, intercept=None, slow=False,
             twostep=None, old_weights=False, 
             chr_num=None, verbose=True,            
             approx_ridge=False, loco=False, ridge_lambda=None, standardize_ridge=True, skip_ridge_jackknife=True, keep_large=False,
-            num_chr_sets=22, evenodd_split=False, nn=False, nnls_exact=False):
+            num_chr_sets=22, evenodd_split=False, nn=False, nnls_exact=False, num_chr=22):
         step1_ii = None
         if twostep is not None:
             step1_ii = y < twostep
@@ -411,7 +412,8 @@ def __init__(self, y, x, w, N, M, n_blocks=200, intercept=None, slow=False,
                                      evenodd_split=evenodd_split,
                                      nn=nn,
                                      keep_large=keep_large,
-                                     nnls_exact=nnls_exact
+                                     nnls_exact=nnls_exact,
+                                     num_chr=num_chr
                                      )
         self.mean_chisq, self.lambda_gc = self._summarize_chisq(y)
         if not self.constrain_intercept:

diff --git a/ldsc_polyfun/sumstats.py b/ldsc_polyfun/sumstats.py
@@ -149,13 +149,13 @@ def _read_w_ld(args, log):
 
 
 def _read_chr_split_files(chr_arg, not_chr_arg, log, noun, parsefunc, **kwargs):
-    '''Read files split across 22 chromosomes (annot, ref_ld, w_ld).'''
+    '''Read files split across chromosomes (annot, ref_ld, w_ld).'''
     try:
         if not_chr_arg:
             log.log('Reading {N} from {F} ...'.format(F=not_chr_arg, N=noun))
             out = parsefunc(_splitp(not_chr_arg), **kwargs)
         elif chr_arg:
-            f = ps.sub_chr(chr_arg, '[1-22]')
+            f = ps.sub_chr(chr_arg, '[1-%d]'%(_N_CHR))
             log.log('Reading {N} from {F} ...'.format(F=f, N=noun))
             out = parsefunc(_splitp(chr_arg), _N_CHR, **kwargs)
     except ValueError as e:
@@ -282,6 +282,7 @@ def _read_ld_sumstats(args, log, fh, alleles=True, dropna=True):
 def estimate_h2(args, log):
     '''Estimate h2 and partitioned h2.'''
     args = copy.deepcopy(args)
+    _N_CHR = args.num_chr
     if args.samp_prev is not None and args.pop_prev is not None:
         args.samp_prev, args.pop_prev = list(map(
             float, [args.samp_prev, args.pop_prev]))
@@ -334,7 +335,8 @@ def estimate_h2(args, log):
                     evenodd_split=args.evenodd_split,
                     nn=args.nn,
                     keep_large=args.keep_large,
-                    nnls_exact=args.nnls_exact
+                    nnls_exact=args.nnls_exact,
+                    num_chr=args.num_chr
                     )
 
     if args.print_cov:

diff --git a/polyfun.py b/polyfun.py
@@ -124,13 +124,13 @@ def check_files(args):
     if args.compute_h2_L2:
         if not os.path.exists(args.sumstats):
             raise IOError('Cannot find sumstats file %s'%(args.sumstats))
-        for chr_num in range(1,23):
+        for chr_num in range(1, args.num_chr+1):
             get_file_name(args, 'ref-ld', chr_num, verify_exists=True, allow_multiple=True)
             get_file_name(args, 'w-ld', chr_num, verify_exists=True)
             get_file_name(args, 'annot', chr_num, verify_exists=True, allow_multiple=True)
 
     if args.compute_ldscores:
-        if args.chr is None: chr_range = range(1,23)            
+        if args.chr is None: chr_range = range(1, args.num_chr+1)            
         else: chr_range = range(args.chr, args.chr+1)
 
         for chr_num in chr_range:
@@ -143,7 +143,7 @@ def check_files(args):
                 get_file_name(args, 'bins', chr_num, verify_exists=True)
 
     if args.compute_h2_bins and not args.compute_ldscores:
-        for chr_num in range(1,23):
+        for chr_num in range(1, args.num_chr+1):
             get_file_name(args, 'w-ld', chr_num, verify_exists=True)
             if not args.compute_h2_L2:
                 get_file_name(args, 'bins', chr_num, verify_exists=True)
@@ -228,7 +228,8 @@ def run_ldsc(self, args, use_ridge, nn, keep_large, evenodd_split, n_blocks=2):
             evenodd_split=evenodd_split,
             nn=nn,
             keep_large=keep_large,
-            nnls_exact=args.nnls_exact
+            nnls_exact=args.nnls_exact,
+            num_chr=args.num_chr
             )
 
         #save the results object
@@ -309,7 +310,7 @@ def compute_snpvar_chr(self, args, chr_num, use_ridge):
             #make sure that the chromosome exists in one set
             found_chrom = np.any([chr_num in chr_set for chr_set in jknife.chromosome_sets])
             if not found_chrom:
-                raise ValueError('not all chromosomes have a taus estimate - please make sure that the intersection of SNPs with sumstats and with annotations data spans all 22 human chromosomes')
+                raise ValueError('not all chromosomes have a taus estimate - please make sure that the intersection of SNPs with sumstats and with annotations data spans all chromosomes')
 
             #find the relevant set number
             set_num=None
@@ -326,8 +327,8 @@ def compute_snpvar_chr(self, args, chr_num, use_ridge):
         else:
             hsqhat = self.hsqhat
             jknife = hsqhat.jknife
-            if len(jknife.est_loco) != 22:
-                raise ValueError('not all chromosomes have a taus estimate - please make sure that the intersection of SNPs with sumstats and with annotations data spans all 22 human chromosomes')
+            if len(jknife.est_loco) != args.num_chr:
+                raise ValueError('not all chromosomes have a taus estimate - please make sure that the intersection of SNPs with sumstats and with annotations data spans all chromosomes')
             taus = jknife.est_loco[chr_num-1][:hsqhat.n_annot] / hsqhat.Nbar
 
         #save the taus to disk
@@ -349,7 +350,7 @@ def compute_snpvar(self, args, use_ridge):
 
         #iterate over chromosomes
         df_snpvar_chr_list = []
-        for chr_num in tqdm(range(1,23)):
+        for chr_num in tqdm(range(1, args.num_chr+1)):
             df_snpvar_chr = self.compute_snpvar_chr(args, chr_num, use_ridge=use_ridge)            
             df_snpvar_chr_list.append(df_snpvar_chr)
         df_snpvar = pd.concat(df_snpvar_chr_list, axis=0)
@@ -519,7 +520,7 @@ def partition_snps_to_bins(self, args, use_ridge):
 
     def save_bins_to_disk(self, args):
         logging.info('Saving SNP-bins to disk')
-        for chr_num in tqdm(range(1,23)):
+        for chr_num in tqdm(range(1, args.num_chr+1)):
 
             #save bins file to disk
             df_bins_chr = self.df_bins.query('CHR==%d'%(chr_num))
@@ -576,7 +577,7 @@ def save_snpvar_to_disk(self, args, use_ridge, constrain_range):
                 raise ValueError(error_message + '. If you wish to omit the missing SNPs, please use the flag --allow-missing')
 
         #iterate over chromosomes 
-        for chr_num in tqdm(range(1,23)):
+        for chr_num in tqdm(range(1, args.num_chr+1)):
 
             #define output file name
             output_fname = 'snpvar'
@@ -616,7 +617,7 @@ def load_bins_chr(self, args, chr_num):
     def compute_ld_scores(self, args):    
         #define the range of chromosomes to iterate over
         if args.chr is None:
-            chr_range = range(1,23)
+            chr_range = range(1, args.num_chr+1)
         else:
             chr_range = range(args.chr, args.chr+1)
 
@@ -818,6 +819,7 @@ def polyfun_main(self, args):
     parser.add_argument('--ld-dir', default=None, help='The path of a directory with UKB LD files (if not specified PolyFun will create a temporary directory)')
     parser.add_argument('--output-prefix', required=True, help='Prefix of all PolyFun output file names')    
     parser.add_argument('--allow-missing', default=False, action='store_true', help='If specified, PolyFun will not terminate if some SNPs with sumstats are not found in the annotations files')
+    parser.add_argument('--num-chr', type=int, default=22, help='Number of chromosomes for your target organism (default is 22)')
 
     #LDSC parameters
     parser.add_argument('--nnls-exact', default=False, action='store_true', help='If specified, S-LDSC will estimate non-negative taus using an exact instead of an approximate solver (this will be slower but slightly more accurate)')

diff --git a/polyloc.py b/polyloc.py
@@ -93,7 +93,7 @@ def check_files(args):
 
     #check that required input files exist
     if args.compute_ldscores or args.compute_partitions:
-        if args.chr is None: chr_range = range(1,23)            
+        if args.chr is None: chr_range = range(1, args.num_chr+1)            
         else: chr_range = range(args.chr, args.chr+1)
 
         for chr_num in chr_range:
@@ -108,7 +108,7 @@ def check_files(args):
                 get_file_name(args, 'bins', chr_num, verify_exists=True)
 
     if args.compute_polyloc:    
-        for chr_num in range(1,23):
+        for chr_num in range(1, args.num_chr+1):
             get_file_name(args, 'w-ld', chr_num, verify_exists=True)
             if not args.compute_partitions:
                 get_file_name(args, 'bins', chr_num, verify_exists=True)
@@ -152,7 +152,7 @@ def polyloc_partitions(self, args):
         #add another partition for all SNPs not in the posterior file
         if args.bfile_chr is not None:
             df_bim_list = []
-            for chr_num in range(1,23):
+            for chr_num in range(1, args.num_chr+1):
                 df_bim_chr = pd.read_table(args.bfile_chr+'%d.bim'%(chr_num), sep='\s+', names=['CHR', 'SNP', 'CM', 'BP', 'A1', 'A2'], header=None)
                 df_bim_list.append(df_bim_chr)
             df_bim = pd.concat(df_bim_list, axis=0)
@@ -328,6 +328,7 @@ def polyloc_main(self, args):
     parser.add_argument('--output-prefix', required=True, help='Prefix of all PolyLoc out file names')    
     parser.add_argument('--ld-ukb', default=False, action='store_true', help='If specified, PolyLoc will use UKB LD matrices to compute LD-scores')
     parser.add_argument('--ld-dir', default=None, help='The path of a directory with UKB LD files (if not specified PolyLoc will create a temporary directory)')
+    parser.add_argument('--num-chr', type=int, default=22, help='Number of chromosomes for your target organism (default is 22)')