# Systems genetics 2020 - Final Project  

Open issues - find phenotypes

Import public packages

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import f
import matplotlib.pyplot as plt
import os.path
import sys
from tqdm import tqdm

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

Import custom packages

In [2]:
# Make the project-local 'modules' directory importable; the path is relative,
# so it resolves against the current working directory.
sys.path.append('modules')

# Project helpers used by the cells below.
from utils import print_dim, print_stats
from preprocessing import data_annotations_merge
from regression import eqtl_analysis, qtl_analysis
from results_analysis import get_associations, cis_trans_annotation

In [3]:
# ANSI escape sequences: bold_s switches bold text on, bold_e resets formatting.
bold_s, bold_e = '\033[1m', '\033[0m'

Load data files

In [4]:
# Tab-separated expression series-matrix files, one per tissue.
liver_exp = pd.read_csv('data/GSE17522_series_matrix_liver.txt', sep="\t")
brain_exp = pd.read_csv('data/GSE36674_series_matrix_hypothalamus.txt', sep="\t")

# Tab-separated annotation tables that accompany the expression files.
liver_annotations = pd.read_csv('data/annotations_liver_GPL6466-9752.txt', sep="\t")
brain_annotations = pd.read_csv('data/annotation_brain.annot', sep="\t")

# The first row of each sheet must remain the header: later cells select
# genotype columns by name ('Locus', 'Chr_Build37', 'BXD...').
# The call used to pass `headers=None`, a misspelling of `header` that older
# pandas silently ignored and newer pandas rejects with a TypeError, so the
# keyword is dropped instead of being "corrected" to header=None.
genotypes = pd.read_excel('data/genotypes.xls')
phenotypes = pd.read_excel('data/phenotypes.xls')

# Malformed lines in the MGI coordinates report are skipped without a warning.
# NOTE(review): error_bad_lines / warn_bad_lines were deprecated in pandas 1.3
# in favour of on_bad_lines='skip'; switch once the minimum pandas version allows.
mgi = pd.read_csv('data/MGI_Coordinates.Build37.rpt.txt', sep="\t",
                  error_bad_lines=False, warn_bad_lines=False)

### 2. Gene expression data preprocessing

#### A. Merge data file with annotation file to get your input matrix

Brain

In [5]:
# Rename the brain annotation column 'Gene symbol' to 'GENE_SYMBOL', the
# gene-identifier column name that the later cells rely on.
brain_annotations = brain_annotations.rename(columns={'Gene symbol' : 'GENE_SYMBOL'})
# Combine expression values with their annotations (project helper from modules/preprocessing).
brain_matrix = data_annotations_merge(brain_exp, brain_annotations)

Liver

In [6]:
# Combine expression values with their annotations (project helper from modules/preprocessing).
liver_matrix = data_annotations_merge(liver_exp, liver_annotations)

# Keep BXD columns and identifier columns only.
# The pattern is 'BXD', not 'BXD*': as a regular expression 'BXD*' means
# "BX followed by zero or more D" and would also pick up any column whose
# name merely contains "BX".
liver_bxd_cols = list(liver_matrix.filter(regex='BXD').columns)
id_cols = ['ID', 'GENE_SYMBOL']
liver_matrix = liver_matrix[id_cols + liver_bxd_cols]

# Rename BXD columns: keep only the 2nd and 3rd '_'-separated fields of each name.
liver_matrix = liver_matrix.rename(
    columns={
        col: col.split('_')[1] + '_' + col.split('_')[2]
        for col in liver_matrix.columns.drop(id_cols)
    }
)

#### B. Keep only BXD columns that exist in the genotypes file, the phenotypes file and each tissue's expression file (separately per tissue)

In [7]:
# Strain columns present in the genotype and phenotype tables.
# 'BXD' is used instead of 'BXD*', which as a regex would match any name containing "BX".
genotypes_bxd_cols = list(genotypes.filter(regex='BXD').columns)
phenotypes_bxd_cols = list(phenotypes.filter(regex='BXD').columns)

# Non-strain columns that are carried along with every subset.
genotypes_baseline_cols = ["Locus", "Chr_Build37", "Build37_position"]
phenotypes_baseline_cols = ["Phenotype", "Authors", "Year", "Pubmed Id"]

# liver common columns
# Truncate each expression column name at its first '_' so that it can be
# compared with the strain names of the genotype / phenotype tables.
liver_matrix = liver_matrix.rename(
    columns={col: col.split('_')[0] for col in liver_matrix.drop(columns=id_cols)}
)
liver_bxd_cols = list(liver_matrix.filter(regex='BXD').columns)
liver_common_bxd = sorted(
    set(genotypes_bxd_cols) & set(phenotypes_bxd_cols) & set(liver_bxd_cols)
)
liver_matrix = liver_matrix[['GENE_SYMBOL'] + liver_common_bxd]
liver_genotypes = genotypes[genotypes_baseline_cols + liver_common_bxd]
liver_phenotypes = phenotypes[phenotypes_baseline_cols + liver_common_bxd]

# brain common columns
brain_matrix = brain_matrix.rename(
    columns={col: col.split('_')[0] for col in brain_matrix.drop(columns=id_cols)}
)
brain_bxd_cols = list(brain_matrix.filter(regex='BXD').columns)
brain_common_bxd = sorted(
    set(genotypes_bxd_cols) & set(phenotypes_bxd_cols) & set(brain_bxd_cols)
)
brain_matrix = brain_matrix[['GENE_SYMBOL'] + brain_common_bxd]
brain_genotypes = genotypes[genotypes_baseline_cols + brain_common_bxd]
brain_phenotypes = phenotypes[phenotypes_baseline_cols + brain_common_bxd]

# drop empty lines from phenotypes file
# `.index` yields index labels, so the rows are selected with label-based
# `.loc`; positional `.iloc` is only correct while the index happens to be 0..n-1.
brain_phenotypes = brain_phenotypes.loc[brain_phenotypes[brain_common_bxd].dropna(how='all').index]
liver_phenotypes = liver_phenotypes.loc[liver_phenotypes[liver_common_bxd].dropna(how='all').index]

#### C. Use only representative genomic loci - Drop duplicated rows (of neighboring loci)

In [None]:
# liver genotypes filtering
snps_org = len(liver_genotypes)

# Compare every row with the row directly above it. Cells equal to the previous
# row are masked to NaN, and rows left with no differing non-missing cell are
# dropped. Despite its name, `duplicates_indx` holds the index labels of the
# rows that are KEPT.
bxd_data = liver_genotypes[liver_common_bxd]
duplicates_indx = bxd_data[bxd_data.shift() != bxd_data].dropna(how='all').index
liver_genotypes = liver_genotypes.loc[duplicates_indx]

print("LIVER Genotypes: Drop duplications:\n#SNPs before: %d   -->   #SNPs after: %d" % (snps_org, len(liver_genotypes)))
# Not rendered by Jupyter: only the last expression of a cell is displayed.
liver_genotypes.head(5)

# Brain genotypes filtering
snps_org = len(brain_genotypes)

bxd_data = brain_genotypes[brain_common_bxd]
duplicates_indx = bxd_data[bxd_data.shift() != bxd_data].dropna(how='all').index
brain_genotypes = brain_genotypes.loc[duplicates_indx]

# This report is for the brain table; it used to be labelled "LIVER".
print("BRAIN Genotypes: Drop duplications:\n#SNPs before: %d   -->   #SNPs after: %d" % (snps_org, len(brain_genotypes)))
brain_genotypes.head(5)

#### • Remove rows with no gene identifier, <br> • Remove rows with low maximal value.  <br> • Remove rows with low variance.  <br> • Average multiple rows


Liver

In [None]:
# Liver-specific cut-offs: minimal per-row maximum and minimal per-row variance.
max_treshold = 0.7
var_treshold = 0.2

# Remove rows with no gene identifier
liver_matrix = liver_matrix[~liver_matrix['GENE_SYMBOL'].isna()]
# 'BXD' instead of 'BXD*', which as a regex would match any name containing "BX".
liver_bxd_cols = list(liver_matrix.filter(regex='BXD').columns)

# Make sure every expression column is numeric before computing max / variance.
for col in liver_bxd_cols:
    liver_matrix[col] = liver_matrix[col].astype('float64')

# Filter by maximal value (row-wise maximum over all non-identifier columns).
# The mask is applied directly, so no temporary 'max' column is added and removed.
liver_matrix = liver_matrix[liver_matrix.drop(columns=['GENE_SYMBOL']).max(axis=1) >= max_treshold]
print("Num of rows after removing all rows with maximal values less than %.2f: %d " % (max_treshold, len(liver_matrix)))

# Filter by variance (row-wise variance over all non-identifier columns).
liver_matrix = liver_matrix[liver_matrix.drop(columns=['GENE_SYMBOL']).var(axis=1) >= var_treshold]
print("Num of rows after removing all rows with variance less than %.2f: %d " % (var_treshold, len(liver_matrix)))

# Group multiple rows by mean
liver_matrix = liver_matrix.groupby('GENE_SYMBOL').agg('mean').reset_index()
# The row count is interpolated with '%'; passing it as a second print argument
# printed the literal "%d" followed by the number.
print("Num of rows after removing duplicated rows: %d " % len(liver_matrix))

Brain

In [None]:
# Brain-specific cut-offs: minimal per-row maximum and minimal per-row variance.
max_treshold = 9
var_treshold = 0.04

# Remove rows with no gene identifier
brain_matrix = brain_matrix[~brain_matrix['GENE_SYMBOL'].isna()]
# 'BXD' instead of 'BXD*', which as a regex would match any name containing "BX".
brain_bxd_cols = list(brain_matrix.filter(regex='BXD').columns)

# Make sure every expression column is numeric before computing max / variance.
for col in brain_bxd_cols:
    brain_matrix[col] = brain_matrix[col].astype('float64')

# Filter by maximal value (row-wise maximum over all non-identifier columns).
# The mask is applied directly, so no temporary 'max' column is added and removed.
brain_matrix = brain_matrix[brain_matrix.drop(columns=['GENE_SYMBOL']).max(axis=1) >= max_treshold]
print("Num of rows after removing all rows with maximal values less than %.2f: %d " % (max_treshold, len(brain_matrix)))

# Filter by variance (row-wise variance over all non-identifier columns).
brain_matrix = brain_matrix[brain_matrix.drop(columns=['GENE_SYMBOL']).var(axis=1) >= var_treshold]
print("Num of rows after removing all rows with variance less than %.2f: %d " % (var_treshold, len(brain_matrix)))

# Group multiple rows by mean
brain_matrix = brain_matrix.groupby('GENE_SYMBOL').agg('mean').reset_index()
# The row count is interpolated with '%'; passing it as a second print argument
# printed the literal "%d" followed by the number.
print("Num of rows after removing duplicated rows: %d " % len(brain_matrix))

#### • Average across different individuals of the same strain (Females and males)

In [None]:
# Collapse columns that share a strain name into their mean, per tissue.
#
# Column names are first truncated at the first '_' so that all columns of one
# strain carry the same label. The averaging is done by transposing, grouping
# the (now row) labels and transposing back. This is equivalent to
# `groupby(by=columns, axis=1).mean()`, but does not use the `axis=1` argument
# of DataFrame.groupby, which pandas has deprecated.
liver_matrix = liver_matrix.rename(
    columns={col: col.split('_')[0] for col in liver_matrix.drop(columns=['GENE_SYMBOL'])}
)
liver_matrix = liver_matrix.set_index(['GENE_SYMBOL'])
liver_matrix = liver_matrix.T.groupby(level=0).mean().T
liver_matrix = liver_matrix.reset_index()

brain_matrix = brain_matrix.rename(
    columns={col: col.split('_')[0] for col in brain_matrix.drop(columns=['GENE_SYMBOL'])}
)
brain_matrix = brain_matrix.set_index(['GENE_SYMBOL'])
brain_matrix = brain_matrix.T.groupby(level=0).mean().T
brain_matrix = brain_matrix.reset_index()

### 3. eQTL analysis

#### Run  Regression

In [None]:
# Pass the row counts of each tissue's expression matrix and genotype table to
# the project helper print_dim, liver first and then brain.
for _title, _expression, _genotype_table in (
    ("Liver: ", liver_matrix, liver_genotypes),
    ("Brain: ", brain_matrix, brain_genotypes),
):
    print_dim(len(_expression), len(_genotype_table), title=_title)

In [None]:
def _eqtl_from_cache_or_regression(expression, genotype_table, prefix, cache_file):
    """Return the eQTL regression table for one tissue.

    When `cache_file` exists it is read back with pandas and its
    'Unnamed: 0' column is dropped; otherwise `eqtl_analysis` is run on the
    given expression matrix and genotype table with `file_prefix=prefix`.
    """
    if os.path.exists(cache_file):
        cached = pd.read_csv(cache_file)
        return cached.drop(columns='Unnamed: 0')
    return eqtl_analysis(expression, genotype_table, file_prefix=prefix)


# Brain eqtl
brain_eqtls = _eqtl_from_cache_or_regression(
    brain_matrix, brain_genotypes, "brain_", "brain_reg_results.csv"
)

# Liver eqtl
liver_eqtls = _eqtl_from_cache_or_regression(
    liver_matrix, liver_genotypes, "liver_", "liver_reg_results.csv"
)

#### Cis/Trans annotation

In [None]:
# Run the project helper cis_trans_annotation on each tissue's eQTL table together
# with the MGI coordinates table. A copy is passed in, so the helper never receives
# the table object currently bound to brain_eqtls / liver_eqtls.
brain_eqtls = cis_trans_annotation(brain_eqtls.copy(), mgi)
liver_eqtls = cis_trans_annotation(liver_eqtls.copy(), mgi)

#### Multiple test correction and associations filtering

In [None]:
# Select the associations of each tissue with the project helper get_associations
# (Bonferroni correction, according to the original author's note) and write
# each tissue's table to its own CSV file.
brain_associations, brain_num_tests = get_associations(brain_eqtls)
brain_associations.to_csv("brain_assoc_eqtl.csv")

liver_associations, liver_num_tests = get_associations(liver_eqtls)
# Liver results get their own file; writing them to "brain_assoc_eqtl.csv"
# overwrote the brain table saved just above.
liver_associations.to_csv("liver_assoc_eqtl.csv")

print_stats(brain_associations, brain_num_tests, "1. Brain")
print_stats(liver_associations, liver_num_tests, "\n\n2. Liver")

# SNPs that appear in the association tables of both tissues.
shared_assoc_snp = set(liver_associations['SNP'].unique()) & set(brain_associations['SNP'].unique())
print((bold_s + "\n\nIn total, there are %d common SNPs" + bold_e) % len(shared_assoc_snp))

### 4. QTL analysis

* Currently, it looks like no morphine phenotype is significant.

In [None]:
# Get all phenotypes related to Morphine by naive text search
morphine_phenotypes_b = brain_phenotypes[brain_phenotypes['Phenotype'].str.contains("Morphine")]
morphine_phenotypes_l = liver_phenotypes[liver_phenotypes['Phenotype'].str.contains("Morphine")]

In [None]:
# Run the project helper qtl_analysis on the morphine phenotype rows of each
# tissue's strain subset, against the matching genotype table.
# NOTE(review): the variable names suggest the result holds log p-values — confirm in modules/regression.
logpval_brain = qtl_analysis(morphine_phenotypes_b, brain_genotypes, file_prefix="brain_morphine_")
logpval_liver = qtl_analysis(morphine_phenotypes_l, liver_genotypes, file_prefix="liver_morphine_")

In [None]:
# Select QTL associations per tissue with the project helper get_associations
# (Bonferroni correction, according to the original author's note).
# The '_b' names receive the brain results and the '_l' names the liver results;
# the two inputs were previously swapped.
assoc_qtl_b, ntests_qtl_b = get_associations(logpval_brain)
assoc_qtl_l, ntests_qtl_l = get_associations(logpval_liver)