Download data for house mouse crosses from Ballinger, Mack et al. (https://doi.org/10.1073/pnas.2214614120).

In [None]:
# wget data 
# ! wget -P ../data/ https://raw.githubusercontent.com/malballinger/BallingerMack_PNAS_2023/refs/heads/main/data/processed/BAT.MALE.COLD.categories.forplot.txt
# ! wget -P ../data/ https://raw.githubusercontent.com/malballinger/BallingerMack_PNAS_2023/refs/heads/main/data/processed/BAT.MALE.WARM.categories.forplot.txt
# ! wget -P ../data/ https://raw.githubusercontent.com/malballinger/BallingerMack_PNAS_2023/refs/heads/main/data/processed/Liver.MALE.COLD.categories.forplot.txt
# ! wget -P ../data/ https://raw.githubusercontent.com/malballinger/BallingerMack_PNAS_2023/refs/heads/main/data/processed/Liver.MALE.WARM.categories.forplot.txt
# ! wget -P ../data/ https://raw.githubusercontent.com/malballinger/BallingerMack_PNAS_2023/refs/heads/main/data/raw/ReadCounts/all_parents_counts.txt
# ! wget -P ../data/ https://raw.githubusercontent.com/malballinger/BallingerMack_PNAS_2023/refs/heads/main/data/raw/ReadCounts/all_parents_sample_info.txt 

In [9]:
import os

import numpy as np
import pandas as pd


In [10]:
# Load every input table used below: parental raw counts and sample sheet,
# hybrid allele-specific read counts per tissue, and the per-condition gene lists.
data_path = "../data/"


def _read_gene_indexed(path):
    """Read a tab-separated table and index it by its 'gene' column."""
    return pd.read_csv(path, sep='\t').set_index('gene')


def _sorted_first_column(path):
    """Return the sorted values of the first column of a space-separated table."""
    table = pd.read_csv(path, sep=' ')
    return np.sort(table.iloc[:, 0].values)


# parents: raw counts (indexed by gene) and the sample sheet (left unindexed)
all_parents_counts = _read_gene_indexed(f'{data_path}/all_parents_counts.txt')
all_parents_sample_info = pd.read_csv(f'{data_path}/all_parents_sample_info.txt', sep='\t')

# hybrids: allele-specific read counts, one table per tissue
BAT_ASEreads_coldwarm_femalemale = _read_gene_indexed(
    f'{data_path}/BallingerPNAS_readcounts/BAT_ASEreads_coldwarm_femalemale.txt')
liver_ASEreads_coldwarm_femalemale = _read_gene_indexed(
    f'{data_path}/BallingerPNAS_readcounts/liver_ASEreads_warmcold_malefemale.txt')

# sorted gene identifiers (first column) of each tissue/temperature category file
genes1 = _sorted_first_column(f'{data_path}/BAT.MALE.COLD.categories.forplot.txt')
genes2 = _sorted_first_column(f'{data_path}/BAT.MALE.WARM.categories.forplot.txt')
genes3 = _sorted_first_column(f'{data_path}/Liver.MALE.COLD.categories.forplot.txt')
genes4 = _sorted_first_column(f'{data_path}/Liver.MALE.WARM.categories.forplot.txt')

# genes present in all four lists
shared_genes = set(genes1)
for gene_list in (genes2, genes3, genes4):
    shared_genes = shared_genes.intersection(set(gene_list))
genes_overlap = list(shared_genes)

# lookup used by the matrix-building cells: '<tissue>_<temperature>' -> gene list
genes_to_fit = dict(
    BAT_cold=genes1,
    BAT_warm=genes2,
    Liver_cold=genes3,
    Liver_warm=genes4,
)



In [11]:
# Build the sorted gene-name array from the two gProfiler exports.
gene_names_raw1 = pd.read_csv(f"{data_path}/gProfiler_gene_names_1.csv")
gene_names_raw = pd.read_csv(f"{data_path}/gProfiler_gene_names.csv")

# stack both exports (first file on top), then sort the 'name' column
gprofiler_tables = [gene_names_raw1, gene_names_raw]
gene_names_df = pd.concat(gprofiler_tables)
name_column = gene_names_df.name
gene_names = np.sort(name_column.values)

In [12]:
# Labels of the two parental populations. The matrix-building cells compare
# these against the `population` column of all_parents_sample_info to pick
# the P1 and P2 parent samples.
P1 = 'Brazil'
P2 = 'NewYork'


In [13]:
# BAT / Cold count matrix (samples x genes), male samples only.
# Row layout: 0-5 P1 parents, 6-11 P2 parents, 12-17 hybrid 'geno1' columns,
# 18-23 hybrid 'geno2' columns. Rows that receive no sample keep the initial 1.0.

tissue = 'BAT'
temp = 'Cold'
fit_genes = genes_to_fit[f'{tissue}_{temp.lower()}']
m_counts_ = np.ones((24, len(fit_genes)))

# male parents of this tissue/temperature, then split by population;
# sample ids live in the 'Unnamed: 0' column of the sample sheet
male_tissue_temp = (
    (all_parents_sample_info.sex == 'M')
    & (all_parents_sample_info.condition == tissue)
    & (all_parents_sample_info.temperature == temp)
)
p1_samples = all_parents_sample_info[
    male_tissue_temp & (all_parents_sample_info.population == P1)
]['Unnamed: 0'].values
p2_samples = all_parents_sample_info[
    male_tissue_temp & (all_parents_sample_info.population == P2)
]['Unnamed: 0'].values

# hybrid columns are selected by substring match on the column name
hybrid_columns = BAT_ASEreads_coldwarm_femalemale.columns
h1_samples = [col for col in hybrid_columns if f'{temp.lower()}M_geno1' in col]
h2_samples = [col for col in hybrid_columns if f'{temp.lower()}M_geno2' in col]

# fill one matrix row per sample: parents first, then hybrids
row_blocks = (
    (0, all_parents_counts, p1_samples),
    (6, all_parents_counts, p2_samples),
    (12, BAT_ASEreads_coldwarm_femalemale, h1_samples),
    (18, BAT_ASEreads_coldwarm_femalemale, h2_samples),
)
for first_row, counts_table, sample_ids in row_blocks:
    for k, sample_id in enumerate(sample_ids):
        m_counts_[first_row + k, :] = counts_table.loc[fit_genes, f'{sample_id}'].values


# save
# np.savetxt(f'{data_path}/male_{tissue}_{temp.lower()}_X.txt',m_counts_, delimiter='\t', fmt='%d')

In [14]:
# Wrap the BAT/Cold matrix as a genes x samples frame.
# Column order follows the matrix rows; hybrid column names get an 'X' prefix.
batcold_columns = [*p1_samples, *p2_samples]
batcold_columns += ['X' + h for h in h1_samples]
batcold_columns += ['X' + h for h in h2_samples]
BATcold_df = pd.DataFrame(m_counts_.T, columns=batcold_columns, index=genes1)

In [7]:
# Export the BAT / Cold example (counts + sample metadata) for XgeneR testing.
#
# This cell labels whatever is currently in m_counts_ / p*_samples / h*_samples
# as BAT cold. Those names are overwritten by every later matrix-building cell,
# so refuse to run unless the last matrix built really was BAT / Cold.
assert (tissue, temp) == ('BAT', 'Cold'), (
    f"m_counts_ currently holds {tissue}/{temp}; re-run the BAT/Cold cell before exporting"
)

parent_samples = list(p1_samples) + list(p2_samples)

# hybrid column names get an 'X' prefix, same as in BATcold_df
hybrid_samples = list(h1_samples) + list(h2_samples)
hybrid_samples = ['X' + h for h in hybrid_samples]

# one row per sample, in matrix row order: 6 x P1, 6 x P2, 6 x H1, 6 x H2
metadata = pd.DataFrame({
    'Sample': parent_samples + hybrid_samples,
    'Allele': ['P1']*6 + ['P2']*6 + ['H1']*6 + ['H2']*6,
})
metadata = metadata.set_index('Sample')

# genes x samples counts, columns in the same order as the metadata rows
counts_all = pd.DataFrame(m_counts_.T, index=genes_to_fit['BAT_cold'],
                          columns=parent_samples + hybrid_samples)

# Output directory of the XgeneR package data. The default is the original
# location; set the XGENER_EXTDATA_DIR environment variable to write elsewhere
# (e.g. on another machine) without editing the notebook.
XGENER_EXTDATA_DIR = os.environ.get('XGENER_EXTDATA_DIR', '/home/maria/XgeneR/inst/extdata')

# save example for XgeneR testing -- BAT tissue
metadata.to_csv(os.path.join(XGENER_EXTDATA_DIR, 'BATcold_ballinger_metadata.csv'))
counts_all.to_csv(os.path.join(XGENER_EXTDATA_DIR, 'BATcold_ballinger_counts.csv'))

In [16]:
# BAT / Warm count matrix (samples x genes), male samples only.
# Row layout: 0-5 P1 parents, 6-11 P2 parents, 12-17 hybrid 'geno1' columns,
# 18-23 hybrid 'geno2' columns. Rows that receive no sample keep the initial 1.0.

tissue = 'BAT'
temp = 'Warm'

fit_genes = genes_to_fit[f'{tissue}_{temp.lower()}']
m_counts_ = np.ones((24, len(fit_genes)))

# male parents of this tissue/temperature, then split by population;
# sample ids live in the 'Unnamed: 0' column of the sample sheet
male_tissue_temp = (
    (all_parents_sample_info.sex == 'M')
    & (all_parents_sample_info.condition == tissue)
    & (all_parents_sample_info.temperature == temp)
)
p1_samples = all_parents_sample_info[
    male_tissue_temp & (all_parents_sample_info.population == P1)
]['Unnamed: 0'].values
p2_samples = all_parents_sample_info[
    male_tissue_temp & (all_parents_sample_info.population == P2)
]['Unnamed: 0'].values

# hybrid columns are selected by substring match on the column name
hybrid_columns = BAT_ASEreads_coldwarm_femalemale.columns
h1_samples = [col for col in hybrid_columns if f'{temp.lower()}M_geno1' in col]
h2_samples = [col for col in hybrid_columns if f'{temp.lower()}M_geno2' in col]

# fill one matrix row per sample: parents first, then hybrids
row_blocks = (
    (0, all_parents_counts, p1_samples),
    (6, all_parents_counts, p2_samples),
    (12, BAT_ASEreads_coldwarm_femalemale, h1_samples),
    (18, BAT_ASEreads_coldwarm_femalemale, h2_samples),
)
for first_row, counts_table, sample_ids in row_blocks:
    for k, sample_id in enumerate(sample_ids):
        m_counts_[first_row + k, :] = counts_table.loc[fit_genes, f'{sample_id}'].values

# genes x samples frame; hybrid column names get an 'X' prefix
batwarm_columns = [*p1_samples, *p2_samples]
batwarm_columns += ['X' + h for h in h1_samples]
batwarm_columns += ['X' + h for h in h2_samples]
BATwarm_df = pd.DataFrame(m_counts_.T, columns=batwarm_columns, index=genes2)

# save
# np.savetxt(f'{data_path}/male_{tissue}_{temp.lower()}_X.txt',m_counts_, delimiter='\t', fmt='%d')

In [17]:
# create count matrix for Liver, cold -- only using male samples (6 per P1, P2, H1, H2)
# Row layout of m_counts_: 0-5 P1 parents, 6-11 P2 parents, 12-17 hybrid 'geno1'
# columns, 18-23 hybrid 'geno2' columns. Rows that receive no sample keep the initial 1.0.

tissue = 'Liver'
temp = 'Cold'
# Gene list for this tissue/temperature. It defines the matrix columns AND the
# row labels of Livercold_df below, so both always agree.
liver_cold_genes = genes_to_fit[f'{tissue}_{temp.lower()}']
m_counts_ = np.ones((24, len(liver_cold_genes)))

# Parent samples: the sample sheet is matched on the upper-cased tissue name here.
male_tissue_temp = (
    (all_parents_sample_info.sex == 'M')
    & (all_parents_sample_info.condition == tissue.upper())
    & (all_parents_sample_info.temperature == temp)
)
p1_samples = all_parents_sample_info[
    male_tissue_temp & (all_parents_sample_info.population == P1)
]['Unnamed: 0'].values
p2_samples = all_parents_sample_info[
    male_tissue_temp & (all_parents_sample_info.population == P2)
]['Unnamed: 0'].values

# Hybrid columns are selected by substring match on the column name.
h1_samples = [s for s in liver_ASEreads_coldwarm_femalemale.columns if f'{temp.lower()}M_geno1' in s]
h2_samples = [s for s in liver_ASEreads_coldwarm_femalemale.columns if f'{temp.lower()}M_geno2' in s]

# add parental counts
for i, p1_ in enumerate(p1_samples):
    m_counts_[i, :] = all_parents_counts.loc[liver_cold_genes, f'{p1_}'].values

for i, p2_ in enumerate(p2_samples):
    m_counts_[6+i, :] = all_parents_counts.loc[liver_cold_genes, f'{p2_}'].values

# add hybrid counts
for i, h1_ in enumerate(h1_samples):
    m_counts_[12+i, :] = liver_ASEreads_coldwarm_femalemale.loc[liver_cold_genes, f'{h1_}'].values

for i, h2_ in enumerate(h2_samples):
    m_counts_[18+i, :] = liver_ASEreads_coldwarm_femalemale.loc[liver_cold_genes, f'{h2_}'].values

# genes x samples frame; hybrid column names get an 'X' prefix.
# The index must be the Liver/Cold gene list, because that is the order in which
# the rows of m_counts_.T were filled. It was previously genes1 (the BAT/Cold
# list), which mislabels the rows or fails when the two lists differ in length.
Livercold_df = pd.DataFrame(m_counts_.T,
             columns = list(p1_samples)+list(p2_samples)+['X'+h for h in h1_samples]+\
                                                         ['X'+h for h in h2_samples],
             index = liver_cold_genes)


# save
# np.savetxt(f'{data_path}/male_{tissue}_{temp.lower()}_X.txt',m_counts_, delimiter='\t', fmt='%d')
# counts_all.to_csv('/home/maria/XgeneR/inst/extdata/Livercold_ballinger_counts.csv')

In [18]:
# Combine the cold-condition liver and BAT frames (genes x samples) side by side.
# join='inner' keeps only genes present in both frames, the same rule count_df
# uses for the all-conditions table; the default outer join would insert NaN
# counts for genes found in one tissue only. When both frames share the same
# index the result is unchanged.
cold_df = pd.concat([Livercold_df, BATcold_df], axis=1, join='inner')
# cold_df.to_csv('/home/maria/XgeneR/inst/extdata/cold_ballinger_counts.csv')

In [19]:
# create count matrix for Liver, warm -- only using male samples (6 per P1, P2, H1, H2)
# Row layout of m_counts_: 0-5 P1 parents, 6-11 P2 parents, 12-17 hybrid 'geno1'
# columns, 18-23 hybrid 'geno2' columns. Rows that receive no sample keep the initial 1.0.

tissue = 'Liver'
temp = 'Warm'
# Gene list for this tissue/temperature. It defines the matrix columns AND the
# row labels of Liverwarm_df below, so both always agree.
liver_warm_genes = genes_to_fit[f'{tissue}_{temp.lower()}']
m_counts_ = np.ones((24, len(liver_warm_genes)))

# Parent samples: the sample sheet is matched on the upper-cased tissue name here.
male_tissue_temp = (
    (all_parents_sample_info.sex == 'M')
    & (all_parents_sample_info.condition == tissue.upper())
    & (all_parents_sample_info.temperature == temp)
)
p1_samples = all_parents_sample_info[
    male_tissue_temp & (all_parents_sample_info.population == P1)
]['Unnamed: 0'].values
p2_samples = all_parents_sample_info[
    male_tissue_temp & (all_parents_sample_info.population == P2)
]['Unnamed: 0'].values

# Hybrid columns are selected by substring match on the column name.
h1_samples = [s for s in liver_ASEreads_coldwarm_femalemale.columns if f'{temp.lower()}M_geno1' in s]
h2_samples = [s for s in liver_ASEreads_coldwarm_femalemale.columns if f'{temp.lower()}M_geno2' in s]

# add parental counts
for i, p1_ in enumerate(p1_samples):
    m_counts_[i, :] = all_parents_counts.loc[liver_warm_genes, f'{p1_}'].values

for i, p2_ in enumerate(p2_samples):
    m_counts_[6+i, :] = all_parents_counts.loc[liver_warm_genes, f'{p2_}'].values

# add hybrid counts
for i, h1_ in enumerate(h1_samples):
    m_counts_[12+i, :] = liver_ASEreads_coldwarm_femalemale.loc[liver_warm_genes, f'{h1_}'].values

for i, h2_ in enumerate(h2_samples):
    m_counts_[18+i, :] = liver_ASEreads_coldwarm_femalemale.loc[liver_warm_genes, f'{h2_}'].values


# genes x samples frame; hybrid column names get an 'X' prefix.
# The index must be the Liver/Warm gene list, because that is the order in which
# the rows of m_counts_.T were filled. It was previously genes2 (the BAT/Warm
# list), which mislabels the rows or fails when the two lists differ in length.
Liverwarm_df = pd.DataFrame(m_counts_.T,
             columns = list(p1_samples)+list(p2_samples)+['X'+h for h in h1_samples]+\
                                                         ['X'+h for h in h2_samples],
             index = liver_warm_genes)


# save
# np.savetxt(f'{data_path}/male_{tissue}_{temp.lower()}_X.txt',m_counts_, delimiter='\t', fmt='%d')

In [20]:
# Combine the warm-condition liver and BAT frames (genes x samples) side by side.
# join='inner' keeps only genes present in both frames, the same rule count_df
# uses for the all-conditions table; the default outer join would write NaN
# counts for genes found in one tissue only. When both frames share the same
# index the result is unchanged.
warm_df = pd.concat([Liverwarm_df, BATwarm_df], axis=1, join='inner')

# Output directory of the XgeneR package data. The default is the original
# location; set the XGENER_EXTDATA_DIR environment variable to write elsewhere
# (e.g. on another machine) without editing the notebook.
XGENER_EXTDATA_DIR = os.environ.get('XGENER_EXTDATA_DIR', '/home/maria/XgeneR/inst/extdata')
warm_df.to_csv(os.path.join(XGENER_EXTDATA_DIR, 'warm_ballinger_counts.csv'))

In [29]:
# All four condition frames side by side, restricted to the genes they share.
condition_frames = [Liverwarm_df, BATwarm_df, Livercold_df, BATcold_df]
count_df = pd.concat(condition_frames, axis=1, join='inner')
# count_df.to_csv('/home/maria/XgeneR/inst/extdata/ballinger_counts.csv')

In [36]:
# Design matrices for the one-condition, log-additive model ("model C").
# Rows follow the count-matrix sample order: 6 x P1, 6 x P2, 6 x H1, 6 x H2.

# boolean row selectors, one per sample group
p1_index = [row // 6 == 0 for row in range(24)]
p2_index = [row // 6 == 1 for row in range(24)]
h1_index = [row // 6 == 2 for row in range(24)]
h2_index = [row // 6 == 3 for row in range(24)]

# model C
# One template row per group, repeated for the 6 samples of that group.
# Column 0 is all ones. Column 1 is the term removed in the 'no_cis' matrix
# and column 2 the term removed in the 'no_trans' matrix.
group_rows = np.array([
    [1, 0, 1],      # P1
    [1, 1, 0],      # P2
    [1, 0, 0.5],    # H1
    [1, 1, 0.5],    # H2
])
design_matrix_modelC = np.repeat(group_rows, 6, axis=0)

# reduced models: drop column 1 (no cis) or column 2 (no trans)
design_matrix_modelC_no_cis = design_matrix_modelC[:, [0, 2]]
design_matrix_modelC_no_trans = design_matrix_modelC[:, [0, 1]]

# write all three matrices as tab-separated floats
for out_name, matrix in (
    (f'{data_path}/male_design_oneCond.txt', design_matrix_modelC),
    (f'{data_path}/male_design_oneCond_no_cis.txt', design_matrix_modelC_no_cis),
    (f'{data_path}/male_design_oneCond_no_trans.txt', design_matrix_modelC_no_trans),
):
    np.savetxt(out_name, matrix, delimiter='\t', fmt='%f')

Now, create design matrices for specific hypotheses.