# Systems genetics 2020 - Final Project  

Import public packages

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import f
import matplotlib.pyplot as plt
import os.path
import sys
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

Import custom packages

In [2]:
sys.path.append('modules')

from preprocessing import data_annotations_merge
from regression import run_regression

Load data files

In [3]:
# Expression series matrices (tab-separated), one per tissue.
liver_exp = pd.read_csv('data/GSE17522_series_matrix_liver.txt', sep="\t")
brain_exp = pd.read_csv('data/GSE36674_series_matrix_hypothalamus.txt', sep="\t")

# Probe annotation tables (tab-separated), one per tissue.
liver_annotations = pd.read_csv('data/annotations_liver_GPL6466-9752.txt', sep="\t")
brain_annotations = pd.read_csv('data/annotation_brain.annot', sep="\t")

# Both spreadsheets are read with the default header handling (first row
# becomes the column names); later cells select columns such as "Locus" and
# the BXD strains by name, so the header row must be kept.
# The former `headers=None` argument was dropped: `headers` is not a
# read_excel parameter (the real one is `header`), so it was never applied.
genotypes = pd.read_excel('data/genotypes.xls')
phenotypes = pd.read_excel('data/phenotypes.xls')

# Malformed lines in the MGI coordinates report are skipped without warnings.
# NOTE(review): `error_bad_lines` / `warn_bad_lines` were deprecated in
# pandas 1.3 in favour of `on_bad_lines='skip'` - switch once the pinned
# pandas version is confirmed.
mgi = pd.read_csv('data/MGI_Coordinates.Build37.rpt.txt', sep="\t",
                  error_bad_lines=False, warn_bad_lines=False)

### 2. Gene expression data preprocessing

#### • Make sure that the data is normalized

In [4]:
# Display the '!Sample_data_processing' entry of one liver sample to check
# how the expression values were normalized.
liver_exp.loc[liver_exp['!Sample_title'] == '!Sample_data_processing', 'Liver_C57BL6J_M_B1_rep1'].iloc[0]

'Data were extracted from the scanned image using Agilent Feature Extraction software version 6.1. A total of 122 arrays were run in 8 batches. The samples were semirandomly distributed throughout the batches prior to microarray analysis in order to separate sexes and strains, and to minimize between- and within- batch bias. Technical and biological replicates were run both within each batch and between batches. The microarray data was deposited in the UNC Microarray Database and extracted using Log2 ratios of the mean red channel intensity over the mean green channel intensity. This was followed by LOWESS normalization to remove the intensity dependent dye bias3. Neither the genes nor the arrays were centered. Inter-batch normalization was carried out using a nested ANOVA mixed model with samples within each batch crossed with sex and strain.'

In [5]:
# Display the '!Sample_data_processing' entry of one brain sample to check
# how the expression values were normalized.
brain_exp.loc[brain_exp['!Sample_title'] == '!Sample_data_processing', 'BXD44_F'].iloc[0]

'Probe intensity values were extracted using the Affymetrix GeneChip Operating Software and RMA normalized'

#### • Merge data file with annotation file to get your input matrix

Brain

In [6]:
# Give the brain annotation's gene-symbol column the name 'GENE_SYMBOL',
# which the rest of the notebook uses (presumably also what
# data_annotations_merge expects - verify against modules/preprocessing).
brain_annotations = brain_annotations.rename({'Gene symbol': 'GENE_SYMBOL'}, axis='columns')

# Join the brain expression values with their annotations.
brain_matrix = data_annotations_merge(brain_exp, brain_annotations)

Liver

In [7]:
# Join the liver expression values with their annotations.
liver_matrix = data_annotations_merge(liver_exp, liver_annotations)

# Keep BXD columns and identifier columns only.
# DataFrame.filter(regex=...) takes a regular expression, not a glob: the
# former pattern "BXD*" meant "BX followed by zero or more D" and therefore
# matched every column containing "BX". "BXD" selects exactly the columns
# whose name contains the strain prefix.
id_cols = ['ID', 'GENE_SYMBOL']
liver_bxd_cols = list(liver_matrix.filter(regex="BXD").columns)
liver_matrix = liver_matrix[id_cols + liver_bxd_cols]

# Rename BXD columns to the 2nd and 3rd underscore-separated fields of the
# sample title (a title shaped like 'Liver_C57BL6J_M_B1_rep1' would become
# 'C57BL6J_M'). Several samples can end up with the same column name.
liver_matrix = liver_matrix.rename(columns={
    col: col.split('_')[1] + '_' + col.split('_')[2]
    for col in liver_matrix.columns.drop(id_cols)
})

#### Keep only the BXD columns shared by the genotypes file, the phenotypes file and the expression matrix, for each tissue separately

In [8]:
# Strain columns available in the genotype and phenotype tables.
# The pattern is a regular expression, not a glob: the former "BXD*" matched
# any column containing "BX"; "BXD" agrees with the expression-matrix
# filters below.
genotypes_bxd_cols = list(genotypes.filter(regex="BXD").columns)
phenotypes_bxd_cols = list(phenotypes.filter(regex="BXD").columns)

# Non-strain columns that are always carried along.
genotypes_baseline_cols = ["Locus", "Chr_Build37", "Build37_position"]
phenotypes_baseline_cols = ["Phenotype", "Authors", "Year", "Pubmed Id"]

# liver common columns
# Reduce every sample column to the text before its first underscore so it
# can be compared with the genotype / phenotype column names.
liver_matrix = liver_matrix.rename(
    columns={col: col.split('_')[0] for col in liver_matrix.drop(columns=id_cols)})
liver_bxd_cols = list(liver_matrix.filter(regex='BXD').columns)
liver_common_bxd = sorted(
    set(genotypes_bxd_cols) & set(phenotypes_bxd_cols) & set(liver_bxd_cols))
# Selecting by label keeps every expression column carrying that label, so
# repeated samples of one strain are all retained here.
liver_matrix = liver_matrix[['GENE_SYMBOL'] + liver_common_bxd]
liver_genotypes = genotypes[genotypes_baseline_cols + liver_common_bxd]
liver_phenotypes = phenotypes[phenotypes_baseline_cols + liver_common_bxd]

# brain common columns
brain_matrix = brain_matrix.rename(
    columns={col: col.split('_')[0] for col in brain_matrix.drop(columns=id_cols)})
brain_bxd_cols = list(brain_matrix.filter(regex='BXD').columns)
brain_common_bxd = sorted(
    set(genotypes_bxd_cols) & set(phenotypes_bxd_cols) & set(brain_bxd_cols))
brain_matrix = brain_matrix[['GENE_SYMBOL'] + brain_common_bxd]
brain_genotypes = genotypes[genotypes_baseline_cols + brain_common_bxd]
brain_phenotypes = phenotypes[phenotypes_baseline_cols + brain_common_bxd]

#### • Remove rows with no gene identifier, <br> • Remove rows with low maximal value.  <br> • Remove rows with low variance.  <br> • Average multiple rows


Liver

In [9]:
# Rows whose largest value / variance fall below these cut-offs are dropped.
max_treshold = 0.7
var_treshold = 0.2

# Remove rows with no gene identifier. The copy makes the column assignments
# below act on an independent frame instead of a slice of the previous one.
liver_matrix = liver_matrix[liver_matrix['GENE_SYMBOL'].notna()].copy()
# "BXD" is a regular expression; the former "BXD*" matched anything with "BX".
liver_bxd_cols = list(liver_matrix.filter(regex="BXD").columns)

# Make sure the expression values are numeric before computing statistics.
for col in liver_bxd_cols:
    liver_matrix[col] = liver_matrix[col].astype('float64')

# Filter by maximal value
row_max = liver_matrix.drop(columns=['GENE_SYMBOL']).max(axis=1)
liver_matrix = liver_matrix[row_max >= max_treshold]
print("Num of rows after removing all rows with maximal values less than %.2f: %d " % (max_treshold, len(liver_matrix)))

# Filter by variance
row_var = liver_matrix.drop(columns=['GENE_SYMBOL']).var(axis=1)
liver_matrix = liver_matrix[row_var >= var_treshold]
print("Num of rows after removing all rows with variance less than %.2f: %d " % (var_treshold, len(liver_matrix)))

# Group multiple rows by mean
liver_matrix = liver_matrix.groupby('GENE_SYMBOL').agg('mean').reset_index()
# The count is now interpolated with %; it used to be passed as a second
# print argument, which printed a literal "%d".
print("Num of rows after removing duplicated rows: %d " % len(liver_matrix))

Num of rows after removing all rows with maximal values less than 0.70: 7772 
Num of rows after removing all rows with variance less than 0.20: 1308 
Num of rows after removing duplicated rows: %d  1285


Brain

In [10]:
# Rows whose largest value / variance fall below these cut-offs are dropped.
max_treshold = 9
var_treshold = 0.04

# Remove rows with no gene identifier. The copy makes the column assignments
# below act on an independent frame instead of a slice of the previous one.
brain_matrix = brain_matrix[brain_matrix['GENE_SYMBOL'].notna()].copy()
# "BXD" is a regular expression; the former "BXD*" matched anything with "BX".
brain_bxd_cols = list(brain_matrix.filter(regex="BXD").columns)

# Make sure the expression values are numeric before computing statistics.
for col in brain_bxd_cols:
    brain_matrix[col] = brain_matrix[col].astype('float64')

# Filter by maximal value
row_max = brain_matrix.drop(columns=['GENE_SYMBOL']).max(axis=1)
brain_matrix = brain_matrix[row_max >= max_treshold]
print("Num of rows after removing all rows with maximal values less than %.2f: %d " % (max_treshold, len(brain_matrix)))

# Filter by variance
row_var = brain_matrix.drop(columns=['GENE_SYMBOL']).var(axis=1)
brain_matrix = brain_matrix[row_var >= var_treshold]
print("Num of rows after removing all rows with variance less than %.2f: %d " % (var_treshold, len(brain_matrix)))

# Group multiple rows by mean
brain_matrix = brain_matrix.groupby('GENE_SYMBOL').agg('mean').reset_index()
# The count is now interpolated with %; it used to be passed as a second
# print argument, which printed a literal "%d".
print("Num of rows after removing duplicated rows: %d " % len(brain_matrix))

Num of rows after removing all rows with maximal values less than 9.00: 9893 
Num of rows after removing all rows with variance less than 0.04: 1444 
Num of rows after removing duplicated rows: %d  1247


#### • Average across different individuals of the same strain (Females and males)

In [11]:
def _mean_per_strain(matrix):
    """Average the columns of *matrix* that share a strain name.

    Every column except 'GENE_SYMBOL' is renamed to the text before its
    first underscore; columns that end up with the same name are replaced
    by their row-wise mean. 'GENE_SYMBOL' is returned as a regular column.
    """
    by_gene = matrix.set_index(['GENE_SYMBOL'])
    by_gene = by_gene.rename(columns=lambda name: name.split('_')[0])
    averaged = by_gene.groupby(by=by_gene.columns, axis=1).mean()
    return averaged.reset_index()


liver_matrix = _mean_per_strain(liver_matrix)
brain_matrix = _mean_per_strain(brain_matrix)

### 3. eQTL analysis

#### Run  Regression

In [14]:
def eqtl_analysis(exp_df, genotype_df, output_file="reg_results.xlsx"):
    """Run the marker regression for every gene and collect the p-values.

    Parameters
    ----------
    exp_df : pandas.DataFrame
        One row per gene: a 'GENE_SYMBOL' column plus the expression columns.
    genotype_df : pandas.DataFrame
        One row per marker: 'Locus', 'Chr_Build37' and 'Build37_position'
        plus the genotype columns. The first column is handed to
        run_regression as the marker identifiers.
    output_file : str or None, optional
        Path of the Excel file the results are written to. Defaults to
        "reg_results.xlsx" (the previously hard-coded path); pass None to
        skip writing.

    Returns
    -------
    pandas.DataFrame
        One row per (gene, marker) pair with the columns 'SNP',
        'chromosome', 'position', 'gene', 'p-value' and
        'minus_log_p-value'. Row labels repeat the marker labels of
        genotype_df for every gene.
    """
    marker_cols = ['Locus', 'Chr_Build37', 'Build37_position']
    result_cols = ['SNP', 'chromosome', 'position', 'gene', 'p-value',
                   'minus_log_p-value']

    e_data = exp_df.drop(columns='GENE_SYMBOL')
    gene_symbols = exp_df['GENE_SYMBOL']
    num_genes = len(exp_df)

    gene_locus = genotype_df.iloc[:, 0]
    marker_values = genotype_df.drop(columns=marker_cols)

    # Encode genotypes numerically: B/b -> 0, D -> 2. Heterozygous (H) and
    # unknown (U) calls become NaN; the markers themselves are kept.
    marker_values = marker_values.replace(
        {'B': 0, 'b': 0, 'D': 2, 'H': np.nan, 'U': np.nan})

    # Marker description shared by every gene's result table; built once.
    marker_info = genotype_df[marker_cols].copy()
    marker_info.columns = ['SNP', 'chromosome', 'position']

    # Collect the per-gene tables and concatenate once at the end; growing a
    # DataFrame with pd.concat inside the loop copies all earlier rows on
    # every iteration.
    per_gene_results = []
    for i in tqdm(range(num_genes)):
        # run_regression receives the gene as a one-row DataFrame.
        logpval_list, pval_list = run_regression(
            marker_values, gene_locus, e_data.iloc[[i]])

        # store results
        values = marker_info.copy()
        values['gene'] = gene_symbols.iloc[i]
        values['p-value'] = np.array(pval_list)
        values['minus_log_p-value'] = np.array(logpval_list)
        per_gene_results.append(values)

    if per_gene_results:
        reg_results = pd.concat(per_gene_results)
    else:
        # No genes: return an empty table with the same columns as a
        # populated result.
        reg_results = pd.DataFrame(columns=result_cols)

    if output_file is not None:
        # NOTE(review): an .xlsx sheet holds at most 1,048,576 rows; confirm
        # that genes x markers stays below that, or to_excel will fail.
        reg_results.to_excel(output_file)
    return reg_results

In [15]:
# TODO: SORT
# Run the eQTL regression on the brain expression matrix against the brain
# genotype table.
brain_reg_results = eqtl_analysis(exp_df=brain_matrix, genotype_df=brain_genotypes)

  0%|                                                                              | 1/1247 [00:45<15:38:34, 45.20s/it]


KeyboardInterrupt: 

In [None]:
# if not os.path.exists(regression_file):
#     reg_results = run_regression_eqtl(lps_df, genotype_df)
# else:
#     reg_results = pd.read_excel(regression_file)
#     reg_results = reg_results.drop(columns='Unnamed: 0')