# Systems genetics 2020 - Final Project  

#### Note!
Our project is devided into several python modules which are loaded and excecuted from this notebook. <br>
Most of our code's project can be found under the /modules directory. 

Import public packges

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import f
import matplotlib.pyplot as plt
import os.path
import sys
from scipy import stats 
import warnings
warnings.filterwarnings('ignore')

Import custom packges (our python modules)

In [2]:
sys.path.append('modules')

from utils import print_dim, print_stats
from preprocessing import data_annotations_merge
from regression import eqtl_analysis, qtl_analysis
from results_analysis import get_associations, cis_trans_annotation
from causality_analysis import generate_models_df, get_LR, get_shuffled_df

In [3]:
bold_s = '\033[1m'
bold_e = '\033[0m'

Load data files

In [4]:
liver_exp = pd.read_csv('data/GSE17522_series_matrix_liver.txt', sep = "\t")
brain_exp = pd.read_csv('data/GSE36674_series_matrix_hypothalamus.txt', sep = "\t")

liver_annotations = pd.read_csv('data/annotations_liver_GPL6466-9752.txt', sep = "\t")
brain_annotations = pd.read_csv('data/annotation_brain.annot', sep = "\t")

genotypes = pd.read_excel('data/genotypes.xls', headers=None)
phenotypes = pd.read_excel('data/phenotypes.xls')

mgi = pd.read_csv('data/MGI_Coordinates.Build37.rpt.txt', sep = "\t", error_bad_lines=False, warn_bad_lines=False)

### 2. Gene expression data preprocessing

#### A. Merge data file with annotation file to get your input matrix

Brain

In [5]:
brain_annotations = brain_annotations.rename(columns={'Gene symbol' : 'GENE_SYMBOL'})
brain_matrix = data_annotations_merge(brain_exp, brain_annotations)

Liver

In [6]:
liver_matrix = data_annotations_merge(liver_exp, liver_annotations)

# Keep BXD columns and identifier columns only
liver_bxd_cols = list(liver_matrix.filter(regex=("BXD*")).columns)
id_cols = ['ID', 'GENE_SYMBOL']
liver_matrix = liver_matrix[id_cols + liver_bxd_cols]

# Rename BXD columns
liver_matrix = liver_matrix.rename(columns={col : col.split('_')[1] + '_' + col.split('_')[2] \
                                            for col in liver_matrix.columns.drop(id_cols)})

#### B. Keep only BXD columns that exists in the four files: genotypes, phenotypes and each tissue seperately

In [7]:
genotypes_bxd_cols = list(genotypes.filter(regex=("BXD*")).columns)
phenotypes_bxd_cols = list(phenotypes.filter(regex=("BXD*")).columns)

genotypes_baseline_cols = ["Locus", "Chr_Build37", "Build37_position"]
phenotypes_baseline_cols = ["Phenotype", "Authors", "Year", "Pubmed Id"]

# liver common columns
liver_matrix = liver_matrix.rename(columns = {col : col.split('_')[0] for col in liver_matrix.drop(columns = id_cols)}) 
liver_bxd_cols = list(liver_matrix.filter(regex='BXD').columns)
liver_common_bxd = list(set(genotypes_bxd_cols) & set(phenotypes_bxd_cols) &  set(liver_bxd_cols))
liver_common_bxd.sort()
liver_matrix = liver_matrix[['GENE_SYMBOL'] + liver_common_bxd]
liver_genotypes = genotypes[genotypes_baseline_cols + liver_common_bxd]
liver_phenotypes = phenotypes[phenotypes_baseline_cols + liver_common_bxd]

# brain common columns
brain_matrix = brain_matrix.rename(columns = {col : col.split('_')[0] for col in brain_matrix.drop(columns = id_cols)}) 
brain_bxd_cols = list(brain_matrix.filter(regex='BXD').columns)
brain_common_bxd = list(set(genotypes_bxd_cols) & set(phenotypes_bxd_cols) &  set(brain_bxd_cols))
brain_common_bxd.sort()
brain_matrix = brain_matrix[['GENE_SYMBOL'] + brain_common_bxd]
brain_genotypes = genotypes[genotypes_baseline_cols + brain_common_bxd]
brain_phenotypes = phenotypes[phenotypes_baseline_cols + brain_common_bxd]

# drop empty lines from phenotypes file
brain_phenotypes = brain_phenotypes.iloc[brain_phenotypes[brain_common_bxd].dropna(how='all').index]
liver_phenotypes = liver_phenotypes.iloc[liver_phenotypes[liver_common_bxd].dropna(how='all').index]

#### C. Use only representative genomic loci - Drop duplicated rows (of neighboring loci)

In [8]:
# liver genotypes filtering
snps_org = len(liver_genotypes)

bxd_data = liver_genotypes[liver_common_bxd]
duplicates_indx = bxd_data[bxd_data.shift() != bxd_data].dropna(how='all').index
liver_genotypes = liver_genotypes.loc[duplicates_indx]

print("LIVER Genotypes: Drop duplications:\n#SNPs before: %d   -->   #SNPs after: %d" % (snps_org, len(liver_genotypes)))
liver_genotypes.head(5)

# Brain genotypes filtering
snps_org = len(brain_genotypes)

bxd_data = brain_genotypes[brain_common_bxd]
duplicates_indx = bxd_data[bxd_data.shift() != bxd_data].dropna(how='all').index
brain_genotypes = brain_genotypes.loc[duplicates_indx]

print("LIVER Genotypes: Drop duplications:\n#SNPs before: %d   -->   #SNPs after: %d" % (snps_org, len(brain_genotypes)))
brain_genotypes.head(5)

LIVER Genotypes: Drop duplications:
#SNPs before: 3796   -->   #SNPs after: 1403
LIVER Genotypes: Drop duplications:
#SNPs before: 3796   -->   #SNPs after: 1598


Unnamed: 0,Locus,Chr_Build37,Build37_position,BXD1,BXD100,BXD101,BXD102,BXD103,BXD11,BXD12,...,BXD80,BXD83,BXD84,BXD85,BXD87,BXD89,BXD90,BXD95,BXD97,BXD99
0,rs6269442,1,3482276,B,B,U,U,U,B,D,...,D,H,B,D,B,B,B,D,B,B
2,rs6376963,1,5008090,B,B,U,U,U,B,D,...,D,H,B,D,B,B,B,D,B,B
3,rs3677817,1,5176059,B,B,U,U,U,B,D,...,D,H,B,D,B,B,B,D,B,B
4,rs8236463,1,5579194,B,B,U,U,U,D,D,...,D,H,B,D,B,B,B,D,B,B
6,rs6298633,1,6820242,B,B,U,U,U,D,D,...,D,H,B,D,B,B,B,D,B,B


#### • Remove rows with no gene identifier, <br> • Remove rows with low maximal value.  <br> • Remove rows with low variance.  <br> • Average multiple rows


Liver

In [9]:
max_treshold = 0.7
var_treshold = 0.2

# Remove rows with no gene identifier
liver_matrix = liver_matrix[~liver_matrix['GENE_SYMBOL'].isna()]
liver_bxd_cols = list(liver_matrix.filter(regex=("BXD*")).columns)

for col in liver_bxd_cols:
    liver_matrix[col] = liver_matrix[col].astype('float64')

# Filter by maximal value
liver_matrix['max'] = liver_matrix.drop(columns=['GENE_SYMBOL']).max(axis=1)
liver_matrix = liver_matrix[liver_matrix['max'] >= max_treshold]
liver_matrix = liver_matrix.drop(columns = 'max')
print("Num of rows after removing all rows with maximal values less than %.2f: %d " % (max_treshold, len(liver_matrix)))

# Filter by variance
liver_matrix['var'] = liver_matrix.drop(columns=['GENE_SYMBOL']).var(axis=1)
liver_matrix = liver_matrix[liver_matrix['var'] >= var_treshold]
liver_matrix = liver_matrix.drop(columns = 'var')
print("Num of rows after removing all rows with variance less than %.2f: %d " % (var_treshold, len(liver_matrix)))

# Group multiple rows by mean
liver_matrix = liver_matrix.groupby('GENE_SYMBOL').agg('mean').reset_index()
print("Num of rows after removing duplicated rows: %d ", len(liver_matrix))

Num of rows after removing all rows with maximal values less than 0.70: 7772 
Num of rows after removing all rows with variance less than 0.20: 1308 
Num of rows after removing duplicated rows: %d  1285


Brain

In [10]:
max_treshold = 9
var_treshold = 0.04

# Remove rows with no gene identifier
brain_matrix = brain_matrix[~brain_matrix['GENE_SYMBOL'].isna()]
brain_bxd_cols = list(brain_matrix.filter(regex=("BXD*")).columns)

for col in brain_bxd_cols:
    brain_matrix[col] = brain_matrix[col].astype('float64')

# Filter by maximal value
brain_matrix['max'] = brain_matrix.drop(columns=['GENE_SYMBOL']).max(axis=1)
brain_matrix = brain_matrix[brain_matrix['max'] >= max_treshold]
brain_matrix = brain_matrix.drop(columns = 'max')
print("Num of rows after removing all rows with maximal values less than %.2f: %d " % (max_treshold, len(brain_matrix)))

# Filter by variance
brain_matrix['var'] = brain_matrix.drop(columns=['GENE_SYMBOL']).var(axis=1)
brain_matrix = brain_matrix[brain_matrix['var'] >= var_treshold]
brain_matrix = brain_matrix.drop(columns = 'var')
print("Num of rows after removing all rows with variance less than %.2f: %d " % (var_treshold, len(brain_matrix)))

# Group multiple rows by mean
brain_matrix = brain_matrix.groupby('GENE_SYMBOL').agg('mean').reset_index()
print("Num of rows after removing duplicated rows: %d ", len(brain_matrix))

Num of rows after removing all rows with maximal values less than 9.00: 9893 
Num of rows after removing all rows with variance less than 0.04: 1444 
Num of rows after removing duplicated rows: %d  1247


#### • Average across different individuals of the same strain (Females and males)

In [11]:
liver_matrix = liver_matrix.rename(columns = {col : col.split('_')[0] for col in liver_matrix.drop(columns = ['GENE_SYMBOL'])}) 
liver_matrix = liver_matrix.set_index(['GENE_SYMBOL']) 
liver_matrix = liver_matrix.groupby(by=liver_matrix.columns, axis=1).mean()
liver_matrix = liver_matrix.reset_index()

brain_matrix = brain_matrix.rename(columns = {col : col.split('_')[0] for col in brain_matrix.drop(columns = ['GENE_SYMBOL'])}) 
brain_matrix = brain_matrix.set_index(['GENE_SYMBOL']) 
brain_matrix = brain_matrix.groupby(by=brain_matrix.columns, axis=1).mean()
brain_matrix = brain_matrix.reset_index()

### 3. eQTL analysis

#### Run  Regression

In [12]:
print_dim(len(liver_matrix), len(liver_genotypes), title="Liver: ")
print_dim(len(brain_matrix), len(brain_genotypes), title="Brain: ")

Liver: 
* Expression matrix size: 1285
* Genotype matrix size: 1403
* Expected num of tests: 1,802,855

Brain: 
* Expression matrix size: 1247
* Genotype matrix size: 1598
* Expected num of tests: 1,992,706



In [13]:
# Brain eqtl
if not os.path.exists("brain_reg_results.csv"):
    brain_eqtls = eqtl_analysis(brain_matrix, brain_genotypes, file_prefix="brain_")
else:
    brain_eqtls = pd.read_csv("brain_reg_results.csv")
    brain_eqtls = brain_eqtls.drop(columns='Unnamed: 0')
    
# Liver eqtl
if not os.path.exists("liver_reg_results.csv"):
    liver_eqtls = eqtl_analysis(liver_matrix, liver_genotypes, file_prefix="liver_")
else:
    liver_eqtls = pd.read_csv("liver_reg_results.csv")
    liver_eqtls = liver_eqtls.drop(columns='Unnamed: 0')

#### Cis/Trans annotation

In [14]:
brain_eqtls = cis_trans_annotation(brain_eqtls.copy(), mgi)
liver_eqtls = cis_trans_annotation(liver_eqtls.copy(), mgi)

100%|██████████████████████████████████████████████████████████████████████████████| 1247/1247 [04:04<00:00,  5.11it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1285/1285 [04:41<00:00,  4.57it/s]


#### Multiple test correction and associations filtering

In [15]:
brain_associations, brain_num_tests = get_associations(brain_eqtls) # by bonfferoni corrections
brain_associations.to_csv("brain_assoc_eqtl.csv")

liver_associations, liver_num_tests = get_associations(liver_eqtls) # by bonfferoni corrections
liver_associations.to_csv("liver_assoc_eqtl.csv")

print_stats(brain_associations, brain_num_tests, "1. Brain")
print_stats(liver_associations, liver_num_tests, "\n\n2. Liver")

[1m1. Brain[0m
[1mNumber of tests: [0m 1992706

[1mNumber of different significant eQTLs: [0m 472
From which: 
 260 - cis-acting 
 231 - trans-acting 

[1mNumber of total significant eQTLs: [0m 868
From which: 
 349 - cis-acting 
 337 - trans-acting 
[1m

2. Liver[0m
[1mNumber of tests: [0m 1802855

[1mNumber of different significant eQTLs: [0m 481
From which: 
 261 - cis-acting 
 278 - trans-acting 

[1mNumber of total significant eQTLs: [0m 840
From which: 
 325 - cis-acting 
 468 - trans-acting 


### 4. QTL analysis

* Currently looks like no morphine phenotype is significant..

In [60]:
#brain_phenotypes.to_excel('lool_brain.xlsx`
# brain_associations.join(brain_genotypes, on = 'Locus')
# brain_associations.head()
temp = pd.merge(brain_associations['SNP'], brain_genotypes, how='left', left_on=['SNP'], right_on=['Locus']).drop('SNP', axis=1)
temp[temp['Locus'].duplicated()==True]

brain_associations

Unnamed: 0,SNP,chromosome,position,gene,p-value,minus_log_p-value,closeness
10625,rs13481466,12,57107315,1700047I17Rik2///Fam177a,1.614445e-08,7.791977,Unknown
21243,rs6163111,5,74219748,2310040G07Rik,2.270665e-10,9.643847,trans
21244,rs3153753,5,76574922,2310040G07Rik,1.110223e-16,15.954590,cis
21245,CEL-5_76522694,5,78208834,2310040G07Rik,3.034129e-12,11.517966,cis
21246,rs13478357,5,82835236,2310040G07Rik,1.342021e-10,9.872241,trans
...,...,...,...,...,...,...,...
1979677,rs13482864,17,9565936,Zfp983,8.058357e-10,9.093753,Unknown
1979678,rs13482868,17,10586984,Zfp983,7.858147e-11,10.104680,Unknown
1979679,rs13482873,17,11606730,Zfp983,1.028067e-13,12.987979,Unknown
1979680,CEL-17_20354682,17,21827381,Zfp983,4.858505e-10,9.313497,Unknown


In [26]:
liver_phenotypes[liver_phenotypes['Phenotype'].str.contains("Morphine")]


Unnamed: 0,Phenotype,Authors,Year,Pubmed Id,BXD1,BXD11,BXD12,BXD13,BXD14,BXD15,...,BXD60,BXD62,BXD69,BXD73,BXD77,BXD8,BXD85,BXD86,BXD9,BXD92
15,"Morphine response (dose ip), hypothermia [slop...","Belknap JK, Crabbe JC",1992,1632590.0,-7.060,,-2.020,-3.430,-3.750,-1.8800,...,,,,,,-7.36,,,-2.930,
16,"Morphine response (dose ip), analgesia, slope ...","Belknap JK, Crabbe JC",1992,1632590.0,50.100,,33.900,25.800,31.200,25.5000,...,,,,,,62.70,,,27.600,
17,"Morphine response (dose mg/kg ip), Straub tail...","Belknap JK, Crabbe JC",1992,1632590.0,1.370,,0.810,0.940,1.370,0.9100,...,,,,,,0.42,,,0.750,
28,"Morphine response (16 mg/kg), analgesia relati...","Belknap JK, Mogil JS, Helms ML, Richards SP, O...",1995,7643715.0,63.000,,10.000,20.000,23.000,32.0000,...,,,,,,72.00,,,50.000,
466,"Morphine response (0.3 to 0.7 mg/ml p.o.), con...","Phillips TJ, Belknap JK, Crabbe JC",1991,2065120.0,16.000,,97.000,29.000,23.000,148.0000,...,,,,,,27.00,,,27.000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1517,"Morphine response (50 mg/kg ip), vertical acti...","Philip VM, Ansah TA, Blaha CD, Cook MN, Hamre ...",2010,19958391.0,469.538,227.143,382.571,103.222,322.500,61.2500,...,2.07692,68.6667,42.5000,142.000,79.75,9.00,435.250,228.8,256.909,306.333
1518,"Morphine response (50 mg/kg ip), vertical acti...","Philip VM, Ansah TA, Blaha CD, Cook MN, Hamre ...",2010,19958391.0,397.154,321.143,603.000,225.778,450.375,53.0833,...,5.53846,147.3330,54.5714,206.188,33.00,67.50,516.167,338.4,422.727,361.444
1519,"Morphine response (50 mg/kg ip), vertical acti...","Philip VM, Ansah TA, Blaha CD, Cook MN, Hamre ...",2010,19958391.0,512.154,349.000,656.286,280.556,581.625,49.1667,...,14.53850,188.7500,83.0000,323.875,16.75,99.50,618.417,503.3,583.455,373.111
1520,"Morphine response (50 mg/kg ip), vertical acti...","Philip VM, Ansah TA, Blaha CD, Cook MN, Hamre ...",2010,19958391.0,584.923,332.571,603.286,320.778,790.625,47.0833,...,20.46150,192.9170,113.7140,391.875,47.00,192.50,627.333,641.8,651.909,375.111


In [27]:
logpval_brain = qtl_analysis(brain_phenotypes[brain_phenotypes['Phenotype'].str.contains("Morphine")], brain_genotypes, file_prefix="brain_morphine_")
logpval_liver = qtl_analysis(liver_phenotypes[liver_phenotypes['Phenotype'].str.contains("Morphine")], liver_genotypes, file_prefix="liver_morphine_")

100%|████████████████████████████████████████████████████████████████████████████████| 115/115 [09:57<00:00,  5.20s/it]
100%|████████████████████████████████████████████████████████████████████████████████| 115/115 [07:31<00:00,  3.92s/it]


In [38]:
assoc_qtl_l, ntests_qtl_l = get_associations(logpval_brain) # by bonfferoni corrections
assoc_qtl_b, ntests_qtl_b = get_associations(logpval_liver) # by bonfferoni corrections

In [35]:
assoc_qtl_b

Unnamed: 0,SNP,chromosome,position,gene,p-value,phenotype,minus_log_p-value


### 6. Causality Test

In [18]:
# Choose a triplet L, R, C for the causality analysis
locus_idx = 241      
gene_exp_idx = 288 
phenotype_idx = 1842

locus = liver_genotypes.loc[[locus_idx]].replace({'B': 0, 'b': 0, 'D':1, 'H': np.nan, 'U': np.nan})[liver_common_bxd]
gene_exp = liver_matrix.loc[[gene_exp_idx]][liver_common_bxd]
phenotype = liver_phenotypes.loc[[phenotype_idx]][liver_common_bxd]

print("Selected triplet for causality analysis:\n1. Locus (L): %s\n2. Gene expression (R): %s\n3. Phenotype (C): %s"
     % (liver_genotypes.iloc[locus_idx]['Locus'],
        liver_matrix.iloc[gene_exp_idx]['GENE_SYMBOL'],
        liver_phenotypes.iloc[phenotype_idx]['Phenotype']))

# Create input table 
input_table = pd.concat([locus, gene_exp, phenotype]) # concate data types
input_table = input_table.T.reset_index()             # transpose
input_table.columns = ['Individual', 'L', 'R', 'C']
input_table = input_table.dropna()                    # ignore nulls
input_table = input_table.sort_values(by='L')         # sort by lucus (genotype 0 or 2)

Selected triplet for causality analysis:
1. Locus (L): rs13477308
2. Gene expression (R): Cenpl
3. Phenotype (C): Anxiety assay, restraint stress [15 min] + ethanol treated [1.8 g/kg i.p.] (RSE group), activity in closed quadrants using an elevated zero maze in 60 to 120-day-old males and females during last 5 min [n beam breaks]


In [19]:
m1_df, m2_df, m3_df = generate_models_df(input_table)
m1_df.head(5)

Unnamed: 0,Individual,L,R,C,R|L_mean,R|L_var,C|R_mean,C|R_var,P(R|L),P(C|R),P(L) * P(C|R) * P(R|L)
0,BXD1,0.0,0.7185,787.0,0.641457,0.346496,585.057,14697.1,0.671956,0.000822,0.000276
36,BXD85,0.0,1.2175,605.67,0.641457,0.346496,642.945,14697.1,0.419863,0.003139,0.000659
35,BXD8,0.0,0.899,566.0,0.641457,0.346496,605.996,14697.1,0.615875,0.003116,0.00096
34,BXD77,0.0,0.619,361.23,0.641457,0.346496,573.515,14697.1,0.677243,0.00071,0.000241
33,BXD73,0.0,1.2045,572.5,0.641457,0.346496,641.437,14697.1,0.428931,0.002799,0.0006


### Calculate The Models Likelihood and Likelihood Ratio:

Likelihood = L(θ;M) = p(data|θm)

In [20]:
LR = get_LR(m1_df, m2_df, m3_df)
print("The Likelihood-Ratio is: %.2f" % LR)

Best model:  1
The Likelihood-Ratio is: 2.53


### Statistical estimation

TODO: one sample t-test is good?
TODO: should we correct the pval? (p/100)

In [21]:
n_tests = 100
LR_dist = []
for i in range(0,n_tests):
    shuffled_df = get_shuffled_df(input_table.copy())
    m1_df, m2_df, m3_df = generate_models_df(shuffled_df)
    LR_i = get_LR(m1_df, m2_df, m3_df)
    LR_dist.append(LR_i)
    
pval = stats.ttest_1samp(LR_dist, LR)[1]
if pval <= 0.05:
    print("Our LR is significant with pval ", pval)
else:
    print("Our LR is not significant ", pval)

Best model:  3
Best model:  3
Best model:  2
Best model:  2
Best model:  3
Best model:  3
Best model:  3
Best model:  3
Best model:  2
Best model:  2
Best model:  3
Best model:  1
Best model:  1
Best model:  2
Best model:  1
Best model:  2
Best model:  2
Best model:  1
Best model:  3
Best model:  3
Best model:  3
Best model:  2
Best model:  3
Best model:  1
Best model:  2
Best model:  2
Best model:  2
Best model:  2
Best model:  2
Best model:  3
Best model:  3
Best model:  2
Best model:  2
Best model:  2
Best model:  3
Best model:  3
Best model:  2
Best model:  3
Best model:  2
Best model:  2
Best model:  3
Best model:  1
Best model:  3
Best model:  3
Best model:  2
Best model:  3
Best model:  2
Best model:  3
Best model:  2
Best model:  1
Best model:  2
Best model:  3
Best model:  1
Best model:  3
Best model:  3
Best model:  1
Best model:  3
Best model:  3
Best model:  3
Best model:  2
Best model:  2
Best model:  2
Best model:  3
Best model:  2
Best model:  1
Best model:  3
Best model