## OSCA meta-QTL analysis

##### https://cnsgenomics.com/software/osca/#Overview
##### Zhang F, Chen W, Zhu Z, Zhang Q, Nabais, MF, Qi T, Deary IJ, Wray NR, Visscher PM, McRae AF, Yang J (2019) OSCA: a tool for omic-data-based complex trait analysis. Genome Biol, 20:107.

#### April 28, 2021

### Set up environment

In [2]:
import pandas as pd
import numpy as np
from numpy import argsort
import dask.dataframe as dd
import pyarrow.parquet as pq
import csv
import time
from gtfparse import read_gtf

import os

import warnings
#warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

In [12]:
# directories
wrk_dir = f'/labshare/anni/eqtl/osca'
script_dir = f'{wrk_dir}/scripts'
tensorqtl_dir = f'{wrk_dir}/tensorqtl'
results_dir = f'{wrk_dir}/results'

# input files
gencode_file1 = '/labshare/anni/eqtl/gencode.v32.annotation.gtf'
gencode_file2 = '/labshare/raph/datasets/gtex/ExpressionFiles/gencode.v26.GRCh38.genes.gtf'

# variables
timept = 'BLM0T1'
cell_types_list = ['Lymphocytes',
                   'Neutrophils', 'Basophils',
                   'Eosinophils','Monocytes']
cohort_list = ['ppmi','pdbp']

# biowulf variables
username = 'mooreank'
biowulf_dir = '/data/LNG/anni/osca'

# constant values
alpha_value = 0.05

In [None]:
##download osca and smr
!wget https://cnsgenomics.com/software/osca/download/osca_Linux.zip
!wget https://cnsgenomics.com/software/smr/download/smr_Linux.zip

### copy over tensorqtl results files from the cloud

In [None]:
##tensorqtl parquet files
!gsutil -m cp gs://nihnialngcbg-eqtl/ppmi/tensorqtl/* /labshare/anni/eqtl/tensorqtl_meta/ppmi/tensorqtl/
!gsutil -m cp gs://nihnialngcbg-eqtl/pdbp/tensorqtl/* /labshare/anni/eqtl/tensorqtl_meta/pdbp/tensorqtl/
#!gsutil -m cp gs://nihnialngcbg-eqtl/gtex/tensorqtl/* /labshare/anni/eqtl/tensorqtl_meta/gtex/tensorqtl/

##geno info files
!gsutil -m cp gs://nihnialngcbg-eqtl/ppmi/genotypes/ppmi.amppdv1.bfile.bim /labshare/anni/eqtl/tensorqtl_meta/ppmi/genotypes/
!gsutil -m cp gs://nihnialngcbg-eqtl/pdbp/genotypes/pdbp.amppdv1.bfile.bim /labshare/anni/eqtl/tensorqtl_meta/pdbp/genotypes/
#!gsutil -m cp gs://nihnialngcbg-eqtl/gtex/genotypes/gtex.v8.bfile.bim /labshare/anni/eqtl/tensorqtl_meta/gtex/genotypes/


#### remove other tensorqtl results from the biowulf folder and add working timepoint

In [None]:
timept = 
print('rm /data/LNG/anni/osca/tensorqtl/*')

print(f'scp /labshare/anni/eqtl/tensorqtl_meta/ppmi/tensorqtl/ppmi.{timept}* {username}@helix.nih.gov:/{biowulf_dir}/tensorqtl/')
print(f'scp /labshare/anni/eqtl/tensorqtl_meta/ppmi/tensorqtl/pdbp.{timept}* {username}@helix.nih.gov:/{biowulf_dir}/tensorqtl/')


### Generate esd, flist files to create besd files

In [16]:
##script generating esd files per gene

##tensorqtl_osca_esd_cell.py
# #def make_esd_files(cohort, version, cohort_build, cell_type):
# cohort = 'pdbp'
# version = 'amppdv1'
# cohort_build = 'pdbp'
# cell_type = 'Monocytes'
# print(f'{cell_type}')
# parquet_dir = f'/labshare/anni/eqtl/tensorqtl_meta/{cohort}/tensorqtl'
# cell_files = f'{parquet_dir}/{cohort_build}.{cell_type}.cis_qtl_pairs.chr*.parquet'
# cieqtl_df = dd.read_parquet(cell_files)
# cieqtl_df = cieqtl_df.drop_duplicates()
# cieqtl_df['new_gene'] = cieqtl_df['phenotype_id'].str.partition('.')[0]
# probe_ids = list(set(cieqtl_df['new_gene']))
# #probe_ids = cieqtl_df['new_gene'].tolist()
# print(f'genes: {len(probe_ids)}')

# #print(cieqtl_df.shape)
# #display(cieqtl_df.head())
# bim_dir = f'/labshare/anni/eqtl/tensorqtl_meta/{cohort}/genotypes'
# genotype_df = dd.read_csv(f'{bim_dir}/{cohort}.{version}.bfile.bim', sep = '\t', header=None)
# genotype_df = genotype_df.rename(columns={0:'chr',1:'variant_id',3:'pos',4:'ref',5:'alt'})

# genotype_df['variant_id'] = genotype_df['variant_id'].str.replace('_b38','')
# genotype_df['variant_id'] = genotype_df['variant_id'].str.replace('_',':')
# #genotype_df.head()

# merge_df = dd.merge(cieqtl_df,genotype_df, on='variant_id')
# #display(merge_df.head())
# ##subset columns

#updating gene ids from v26 to v32 when needed
#read in gencode files
# gencode = read_gtf(f'{gencode_file1}')
# print('loaded gencode v32.')
# gencode = gencode[gencode['feature'] == 'gene']
# gencode[['new_gene','end']] = gencode['gene_id'].str.split('.',n=2,expand=True)
# gencode = gencode[['gene_id','new_gene','seqname','strand','start']]
# gen_gene = gencode['new_gene'].tolist()

# # v26 = read_gtf(f'{gencode_file2}')
# # print('loaded gencode v26.')
# # v26 = v26[v26['feature'] == 'gene']
# # v26[['new_gene','part']] = v26['gene_id'].str.split('.',expand=True)
# # v26 = v26[['gene_id','new_gene','seqname','strand','start']]

# ##get missing genes from v32
# overlap = list(set(gen_gene) & set(probe_ids))
# print(f'in v32: {len(overlap)}')
# left = list(set(probe_ids) - set(overlap))
# print(f'adding with v26: {len(left)}')

#add v26 info to missing genes
# left_v26 = merge_df[merge_df['new_gene'].isin(left)]
# left_v26 = left_v26.merge(v26, on='new_gene')

# merge_df = dd.merge(merge_df,gencode, on='new_gene', how='inner')
# merge_df = merge_df.drop_duplicates()
# # display(merge_df.head())
# all_df = merge_df
# #merge all genes with info
# #all_df = dd.concat([merge_df,left_v26])
# #print(all_df.shape)

# esd_df = all_df[['gene_id','chr','variant_id','pos','ref','alt','maf','b_gi','b_gi_se','pval_gi']]
# esd_df = esd_df.drop_duplicates()


# ##convert to numpy array to speed up subsetting
# all_array = esd_df.compute().to_numpy()
# new_ids = list(set([item[0] for item in all_array]))
# all_array

# print('subsetting genes...')
# for gene in new_ids:
#     ##make subset array per gene
#     probe_array = all_array[np.in1d(all_array[:,0],gene)]

#     ##remove gene id column 
#     #probe_array = np.delete(probe_array, 1, 0)
#     probe_array = [i[1:] for i in probe_array]

#     ##save to textfile
#     filename = f'/labshare/anni/eqtl/tensorqtl_meta/{cohort}/osca/{cohort}.{cell_type}.{gene}.{cell_type}.esd'
#     with open(filename,"w+") as my_csv:
#         print('Chr\tSNP\tBp\tA1\tA2\tFreq\tBeta\tse\tp', file=my_csv)
#         csvWriter = csv.writer(my_csv,delimiter='\t')
#         csvWriter.writerows(probe_array)

# print(f'{cell_type}: done.')

#### make swarm file to make esd files for all cells

In [16]:
### generate swarm file to run in biowulf

def make_esd_swarm(timept):
    file = f'{script_dir}/wb.{timept}.make.esd.swarm'
    with open(file, "w") as text_file:
        for cohort in cohort_list:
            for cell_type in cell_types_list:
                text_file.write(f'python tensorqtl_osca_esd_cell.py {cohort} amppdv1 wb {cell_type} \
                > {cohort}.{cell_type}.log\n')

make_esd_swarm(time)

### make flist file

In [11]:

def make_flist_file(cohort, version, cohort_build, cell_type):
    parquet_dir = f'/labshare/anni/eqtl/tensorqtl_meta/{cohort}/tensorqtl'
    cell_files = f'{parquet_dir}/{cohort_build}.{cell_type}.cis_qtl_pairs.chr*.parquet'
    cieqtl_df = dd.read_parquet(cell_files)
    cieqtl_df = cieqtl_df.drop_duplicates(subset=['phenotype_id'])
    cieqtl_df['new_gene'] = cieqtl_df['phenotype_id'].str.partition('.')[0]
    probe_ids = list(set(cieqtl_df['new_gene']))
    print(f'genes: {len(probe_ids)}')
    #display(cieqtl_df.head())

    ##read in gencode files
    gencode = read_gtf(f'{gencode_file1}')
    print('loaded gencode v32.')
    gencode = gencode[gencode['feature'] == 'gene']
    #gencode[['gene','part']] = gencode['gene_id'].str.split('.',expand=True)
    gencode['seqname'] = gencode['seqname'].str.replace("chr", "")
    #gencode = gencode.drop_duplicates(subset=['gene'])
    gencode[['new_gene','end']] = gencode['gene_id'].str.split('.',n=2,expand=True)
    gencode = gencode[['seqname','gene_id', 'strand','start','new_gene']]
    gen_gene = gencode['new_gene'].tolist()

    v26 = read_gtf(f'{gencode_file2}')
    print('loaded gencode v26.')
    v26 = v26[v26['feature'] == 'gene']
    v26[['new_gene','part']] = v26['gene_id'].str.split('.',expand=True)
    v26['seqname'] = v26['seqname'].str.replace("chr", "")

    ##get missing genes from v32
    overlap = list(set(gen_gene) & set(probe_ids))
    print(f'in v32: {len(overlap)}')
    left = list(set(probe_ids) - set(overlap))
    print(f'adding with v26: {len(left)}')

    #add v26 info to missing genes
    cieqtl_v26 = cieqtl_df[cieqtl_df['new_gene'].isin(left)]
    cieqtl_v26 = cieqtl_v26.merge(v26, on='new_gene')
    cieqtl_v26 = cieqtl_v26[['seqname','gene_id', 'strand','start','new_gene']]
    #print(cieqtl_v26.shape)
    #display(cieqtl_v26.head())

    merge_df = dd.merge(cieqtl_df,gencode, on='new_gene', how='inner')
    #merge all genes with info
    all_df = dd.concat([merge_df,cieqtl_v26])
    #print(all_df.shape)
    #display(all_df.head())

    # ##subset columns
    all_df = all_df[['seqname','gene_id','start','strand']]
    all_df = all_df.rename(columns={'seqname':'Chr','gene_id':'ProbeID','start':'ProbeBp','strand':'Orientation'})
    all_df['GeneticDistance'] = '0'
    all_df['Gene'] =  all_df['ProbeID']
    all_df['Chr'] = all_df['Chr'].str.replace("chr", "")
    all_df['PathOfEsd'] = f'/data/LNG/anni/osca/besd_input/{cohort}.{cell_type}.'+all_df['ProbeID']+'.esd'
    all_df = all_df[['Chr','ProbeID','GeneticDistance','ProbeBp','Gene','Orientation','PathOfEsd']]
    a = all_df.shape
    all_df = all_df.compute()
    print(f'final shape: {a[0].compute(),a[1]}')
    out_dir = f'/labshare/anni/eqtl/osca/besd_input'
    all_df.to_csv(f'{out_dir}/{cohort}.{cell_type}.flist', sep='\t', index=False)


In [12]:
##generate flist files
for cell in cell_types_list:
    make_flist_file('ppmi','amppdv1','ppmi',cell)
    make_flist_file('pdbp','amppdv1','pdbp',cell)
    #make_flist_file('gtex','v8','gtex.v8.wb',cell)

genes: 14921


INFO:root:Extracted GTF attributes: ['gene_id', 'gene_type', 'gene_name', 'level', 'hgnc_id', 'havana_gene', 'transcript_id', 'transcript_type', 'transcript_name', 'transcript_support_level', 'tag', 'havana_transcript', 'exon_number', 'exon_id', 'ont', 'protein_id', 'ccdsid']


loaded gencode v32.


INFO:root:Extracted GTF attributes: ['gene_id', 'transcript_id', 'gene_type', 'gene_name', 'transcript_type', 'transcript_name', 'level', 'havana_gene', 'exon_id', 'exon_number', 'tag']


loaded gencode v26.
in v32: 14921
adding with v26: 0
final shape: (14921, 7)
genes: 13810


INFO:root:Extracted GTF attributes: ['gene_id', 'gene_type', 'gene_name', 'level', 'hgnc_id', 'havana_gene', 'transcript_id', 'transcript_type', 'transcript_name', 'transcript_support_level', 'tag', 'havana_transcript', 'exon_number', 'exon_id', 'ont', 'protein_id', 'ccdsid']


loaded gencode v32.


INFO:root:Extracted GTF attributes: ['gene_id', 'transcript_id', 'gene_type', 'gene_name', 'transcript_type', 'transcript_name', 'level', 'havana_gene', 'exon_id', 'exon_number', 'tag']


loaded gencode v26.
in v32: 13810
adding with v26: 0
final shape: (13810, 7)
genes: 14921


INFO:root:Extracted GTF attributes: ['gene_id', 'gene_type', 'gene_name', 'level', 'hgnc_id', 'havana_gene', 'transcript_id', 'transcript_type', 'transcript_name', 'transcript_support_level', 'tag', 'havana_transcript', 'exon_number', 'exon_id', 'ont', 'protein_id', 'ccdsid']


loaded gencode v32.


INFO:root:Extracted GTF attributes: ['gene_id', 'transcript_id', 'gene_type', 'gene_name', 'transcript_type', 'transcript_name', 'level', 'havana_gene', 'exon_id', 'exon_number', 'tag']


loaded gencode v26.
in v32: 14921
adding with v26: 0
final shape: (14921, 7)
genes: 13810


INFO:root:Extracted GTF attributes: ['gene_id', 'gene_type', 'gene_name', 'level', 'hgnc_id', 'havana_gene', 'transcript_id', 'transcript_type', 'transcript_name', 'transcript_support_level', 'tag', 'havana_transcript', 'exon_number', 'exon_id', 'ont', 'protein_id', 'ccdsid']


loaded gencode v32.


INFO:root:Extracted GTF attributes: ['gene_id', 'transcript_id', 'gene_type', 'gene_name', 'transcript_type', 'transcript_name', 'level', 'havana_gene', 'exon_id', 'exon_number', 'tag']


loaded gencode v26.
in v32: 13810
adding with v26: 0
final shape: (13810, 7)
genes: 14921


INFO:root:Extracted GTF attributes: ['gene_id', 'gene_type', 'gene_name', 'level', 'hgnc_id', 'havana_gene', 'transcript_id', 'transcript_type', 'transcript_name', 'transcript_support_level', 'tag', 'havana_transcript', 'exon_number', 'exon_id', 'ont', 'protein_id', 'ccdsid']


loaded gencode v32.


INFO:root:Extracted GTF attributes: ['gene_id', 'transcript_id', 'gene_type', 'gene_name', 'transcript_type', 'transcript_name', 'level', 'havana_gene', 'exon_id', 'exon_number', 'tag']


loaded gencode v26.
in v32: 14921
adding with v26: 0
final shape: (14921, 7)
genes: 13810


INFO:root:Extracted GTF attributes: ['gene_id', 'gene_type', 'gene_name', 'level', 'hgnc_id', 'havana_gene', 'transcript_id', 'transcript_type', 'transcript_name', 'transcript_support_level', 'tag', 'havana_transcript', 'exon_number', 'exon_id', 'ont', 'protein_id', 'ccdsid']


loaded gencode v32.


INFO:root:Extracted GTF attributes: ['gene_id', 'transcript_id', 'gene_type', 'gene_name', 'transcript_type', 'transcript_name', 'level', 'havana_gene', 'exon_id', 'exon_number', 'tag']


loaded gencode v26.
in v32: 13810
adding with v26: 0
final shape: (13810, 7)


In [14]:
# ## set up osca commands
# cohort_list = ['ppmi','pdbp']

# #for cell in cell_types_list:
# for cohort in cohort_list:
#     for cell in cell_types_list:
#         in_dir = f'/data/LNG/anni/osca/besd_input'
#         out_dir = f'/data/LNG/anni/osca/besd_files'
#         print(f'./smr_Linux --eqtl-flist {in_dir}/{cohort}.{cell}.flist --make-besd --out {out_dir}/{cohort}.{cell}')
#     print('')

#### make swarm file to create besd files from esd, flist files

In [23]:
def make_besd_swarm(timept):
    file = f'{script_dir}/wb.{timept}.make.besd.swarm'
    with open(file, "w") as text_file:
        for cohort in cohort_list:
            for cell_type in cell_types_list:
                text_file.write(f'{biowulf_dir}/smr_Linux --eqtl-flist {biowulf_dir}/besd_input/{cohort}.{cell_type}.flist\
                --make-besd --out {biowulf_dir}/besd_files/{cohort}.{timept}.{cell_type}\n')

make_besd_swarm(timept)

#### make swarm files to upate besd files
#### (Need to update samples sizes )

In [39]:
def update_besd_swarm(timept):
    file = f'{script_dir}/wb.{timept}.update.besd.swarm'
    with open(file, "w") as text_file:
        for cell_type in cell_types_list:
            text_file.write(f'{biowulf_dir}/smr_Linux --beqtl-summary {biowulf_dir}/besd_files/ppmi.{timept}.{cell_type} \
--add-n 1193 --make-besd --out {biowulf_dir}/besd_files/ppmi.{timept}.{cell_type}.test\n')
            text_file.write(f'{biowulf_dir}/smr_Linux --beqtl-summary {biowulf_dir}/besd_files/pdbp.{timept}.{cell_type} \
--add-n 1221 --make-besd --out {biowulf_dir}/besd_files/pdbp.{timept}.{cell_type}.test\n')

update_besd_swarm(timept)



In [None]:
# timept = 'BLM0T1'
# cell_type = 'Lymphocytes'
# print(f'{biowulf_dir}/smr_Linux --beqtl-summary {biowulf_dir}/besd_files/ppmi.{cell_type} \
# --add-n 1193 --make-besd --out {biowulf_dir}/besd_files/ppmi.{timept}.{cell_type}')
# print(f'{biowulf_dir}/smr_Linux --beqtl-summary {biowulf_dir}/besd_files/pdbp.{cell_type} \
# --add-n 1221 --make-besd --out {biowulf_dir}/besd_files/pdbp.{timept}.{cell_type}')



#### prep for meta analysis

In [12]:
##make file list
cohort_list = ['ppmi','pdbp']

def make_cell_besd_list(timept):
    for cell in cell_types_list:
        file = f'{script_dir}/wb.{timept}.{cell}.besd.flist'
        with open(file, "w") as text_file:
            for cohort in cohort_list:
                text_file.write(f'{biowulf_dir}/besd_files/{cohort}.{timept}.{cell}\n')
            #print(f'{file} &')


make_cell_besd_list(timept)

#### make swarm file to run meta analysis for all cell types

In [14]:
## run meta analysis

def make_meta_list(cell_list, timept):
    file = f'{script_dir}/wb.{timept}.run.meta.swarm'
    with open(file, "w") as text_file:
        for cell in cell_list:
            text_file.write(f'{biowulf_dir}/osca_Linux --besd-flist {biowulf_dir}/meta_scripts/wb.{timept}.{cell}.besd.flist \
            --meta --out {biowulf_dir}/results/wb.{timept}.{cell}\n')
#     with open(file, "w") as text_file:
# print(f'./osca_Linux --besd-flist /data/LNG/anni/osca/meta_scripts/wb.{cell_type}.besd.flist \
# --meta --out /data/LNG/anni/osca/results/wb.{cell_type} &\n')



make_meta_list(cell_types_list, timept)

### Copy over generated files to biowulf

In [29]:
## copy over files to biowulf

#copy over tools
print(f'scp {wrk_dir}/*_Linux {username}@helix.nih.gov:{biowulf_dir}/')

#copy ove scripts
print(f'scp /labshare/anni/notebooks/eqtl/meta/tensorqtl_osca_esd_cell* {username}@helix.nih.gov:{biowulf_dir}/meta_scripts/')
print(f'scp {script_dir}/* {username}@helix.nih.gov:{biowulf_dir}/meta_scripts/')
print(f'scp {wrk_dir}/besd_input/*flist {username}@helix.nih.gov:{biowulf_dir}/besd_input/')


scp /labshare/anni/eqtl/osca/*_Linux mooreank@helix.nih.gov:/data/LNG/anni/osca/
scp /labshare/anni/notebooks/eqtl/meta/tensorqtl_osca_esd_cell* mooreank@helix.nih.gov:/data/LNG/anni/osca/meta_scripts/
scp /labshare/anni/eqtl/osca/scripts/* mooreank@helix.nih.gov:/data/LNG/anni/osca/meta_scripts/
scp /labshare/anni/eqtl/osca/besd_input/*flist mooreank@helix.nih.gov:/data/LNG/anni/osca/besd_input/


### Generate osca commands to run in biowulf

#### run swarm to create esd files

In [22]:
print(f'swarm -f wb.{timept}.make.esd.swarm -g 50 --module python --time 24:00:00')

swarm -f wb.BLM0T1.make.esd.swarm -g 50 --module python --time 24:00:00


#### run swarm to create besd files

In [19]:
print(f'swarm -f wb.{timept}.make.besd.swarm -g 50 --time 04:00:00')

swarm -f wb.BLM0T1.make.besd.swarm -g 50 --time 04:00:00


In [40]:
##run to update sample size in besd file

print(f'swarm -f wb.{timept}.update.besd.swarm -g 50 --time 04:00:00')

swarm -f wb.BLM0T1.update.besd.swarm -g 50 --time 04:00:00


#### run meta analysis

In [32]:
print(f'swarm -f wb.{timept}.run.meta.swarm -g 50 --time 04:00:00')

swarm -f wb.BLM0T1.run.meta.swarm -g 50 --time 04:00:00


#### look at results

In [16]:
## get results
def get_top_snps(time):
    file = f'{script_dir}/wb.{timept}.top.results.swarm'
    with open(file, "w") as text_file:
        for cell_type in cell_types_list:
            text_file.write(f'/data/LNG/anni/osca/smr_Linux --beqtl-summary {biowulf_dir}/results/wb.{timept}.{cell_type} \
    --query {alpha_value} --out {biowulf_dir}/results/wb.{timept}.{cell_type}.top_assoc\n')

get_top_snps(timept)

In [17]:
print(f'swarm -f wb.{timept}.top.results.swarm -g 20')

swarm -f wb.BLM0T1.top.results.swarm -g 20
