# Systems genetics 2020
## Final Project  

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

Load data files

In [2]:
# Expression series-matrix files (tab-separated), one per tissue
liver_exp = pd.read_csv('data/GSE17522_series_matrix_liver.txt', sep="\t")
brain_exp = pd.read_csv('data/GSE36674_series_matrix_hypothalamus.txt', sep="\t")

# Probe annotation tables used to map probe IDs to gene symbols
liver_annotations = pd.read_csv('data/annotations_liver_GPL6466-9752.txt', sep="\t")
brain_annotations = pd.read_csv('data/annotation_brain.annot', sep="\t")

# `read_excel` has no `headers` keyword (the real one is `header`), so the stray
# `headers=None` is removed. The default first-row header is what the later cells
# need, because they select genotype columns by name.
genotypes = pd.read_excel('data/genotypes.xls')
phenotypes = pd.read_excel('data/phenotypes.xls')

# Malformed lines in the MGI coordinates report are skipped silently.
# NOTE(review): `error_bad_lines` / `warn_bad_lines` are superseded by
# `on_bad_lines='skip'` in newer pandas releases - switch once the pandas version
# used for this notebook is confirmed.
mgi = pd.read_csv('data/MGI_Coordinates.Build37.rpt.txt', sep="\t", error_bad_lines=False, warn_bad_lines=False)

### 2. Gene expression data preprocessing

#### • Make sure that the data is normalized

In [3]:
# Show the text stored in the '!Sample_data_processing' row for one liver sample,
# to check how the liver expression values were processed.
liver_exp.loc[liver_exp['!Sample_title'] == '!Sample_data_processing', 'Liver_C57BL6J_M_B1_rep1'].iloc[0]

'Data were extracted from the scanned image using Agilent Feature Extraction software version 6.1. A total of 122 arrays were run in 8 batches. The samples were semirandomly distributed throughout the batches prior to microarray analysis in order to separate sexes and strains, and to minimize between- and within- batch bias. Technical and biological replicates were run both within each batch and between batches. The microarray data was deposited in the UNC Microarray Database and extracted using Log2 ratios of the mean red channel intensity over the mean green channel intensity. This was followed by LOWESS normalization to remove the intensity dependent dye bias3. Neither the genes nor the arrays were centered. Inter-batch normalization was carried out using a nested ANOVA mixed model with samples within each batch crossed with sex and strain.'

In [4]:
# Show the text stored in the '!Sample_data_processing' row for one brain sample,
# to check how the brain expression values were processed.
brain_exp.loc[brain_exp['!Sample_title'] == '!Sample_data_processing', 'BXD44_F'].iloc[0]

'Probe intensity values were extracted using the Affymetrix GeneChip Operating Software and RMA normalized'

#### • Merge data file with annotation file to get your input matrix

In [5]:
def data_annotations_merge(df_exp, df_annotations):
    """Attach gene symbols to the expression rows of a series-matrix dataframe.

    Rows of ``df_exp`` whose '!Sample_title' value contains '!Sample_' or
    '!series_', or equals 'ID_REF', are discarded. The '!Sample_title' column
    is then renamed to 'ID' and inner-joined with the 'ID' / 'GENE_SYMBOL'
    columns of ``df_annotations``.

    Returns a dataframe with 'ID', 'GENE_SYMBOL' and then the remaining
    columns of ``df_exp``.
    """
    titles = df_exp['!Sample_title']

    # A row is kept only if it is neither a metadata row nor the repeated header row
    not_sample_meta = ~titles.str.contains('!Sample_', na=False)
    not_series_meta = ~titles.str.contains('!series_', na=False)
    not_header_row = titles != 'ID_REF'
    expression_rows = df_exp[not_sample_meta & not_series_meta & not_header_row]

    # Use the same key column name as the annotation table
    expression_rows = expression_rows.rename(columns={'!Sample_title': 'ID'})

    # Only the probe ID and its gene symbol are needed from the annotations
    gene_symbols = df_annotations[['ID', 'GENE_SYMBOL']]
    return gene_symbols.merge(expression_rows, on='ID')

Brain

In [6]:
# The brain annotation table names its symbol column 'Gene symbol'; rename it to
# 'GENE_SYMBOL' so both tissues go through the same merge helper.
brain_annotations = brain_annotations.rename({'Gene symbol': 'GENE_SYMBOL'}, axis='columns')
brain_matrix = data_annotations_merge(df_exp=brain_exp, df_annotations=brain_annotations)

Liver

In [7]:
liver_matrix = data_annotations_merge(liver_exp, liver_annotations)

# Keep BXD columns and identifier columns only.
# `DataFrame.filter(regex=...)` performs a regular-expression search, so the plain
# substring "BXD" is the intended pattern (the glob-like "BXD*" means "BX" followed
# by any number of "D").
id_cols = ['ID', 'GENE_SYMBOL']
liver_bxd_cols = list(liver_matrix.filter(regex="BXD").columns)
liver_matrix = liver_matrix[id_cols + liver_bxd_cols]

# Rename BXD columns to their 2nd and 3rd underscore-separated fields
# (a name shaped like 'Liver_C57BL6J_M_B1_rep1' would become 'C57BL6J_M')
liver_matrix = liver_matrix.rename(columns={col: col.split('_')[1] + '_' + col.split('_')[2]
                                            for col in liver_matrix.columns.drop(id_cols)})

#### • Remove rows with no gene identifier, <br> • Remove rows with low maximal value.  <br> • Remove rows with low variance.  <br> • Average multiple rows


In [8]:
max_treshold = 0.7
var_treshold = 0.2

# Remove rows with no gene identifier. The explicit copy makes this frame independent
# of the merged matrix, so the column assignments below are not chained assignments.
liver_matrix = liver_matrix[liver_matrix['GENE_SYMBOL'].notna()].copy()
# `filter(regex=...)` is a regular-expression search; "BXD" is the intended pattern
liver_bxd_cols = list(liver_matrix.filter(regex="BXD").columns)

for col in liver_bxd_cols:
    liver_matrix[col] = liver_matrix[col].astype('float64')

# Filter by maximal value (per row, over the numeric non-identifier columns)
row_max = liver_matrix.drop(columns=['ID', 'GENE_SYMBOL']).max(axis=1, numeric_only=True)
liver_matrix = liver_matrix[row_max >= max_treshold]
print("Num of rows after removing all rows with maximal values less than %.2f: %d " % (max_treshold, len(liver_matrix)))

# Filter by variance (per row, over the numeric non-identifier columns)
row_var = liver_matrix.drop(columns=['ID', 'GENE_SYMBOL']).var(axis=1, numeric_only=True)
liver_matrix = liver_matrix[row_var >= var_treshold]
print("Num of rows after removing all rows with variance less than %.2f: %d " % (var_treshold, len(liver_matrix)))

# Group multiple rows of the same gene by their mean. `numeric_only=True` states
# explicitly that non-numeric columns are left out of the average instead of
# relying on pandas dropping them implicitly.
liver_matrix = liver_matrix.groupby('GENE_SYMBOL').mean(numeric_only=True).reset_index()
# `%` (not `,`) so the row count is actually substituted into the message
print("Num of rows after removing duplicated rows: %d " % len(liver_matrix))

Num of rows after removing all rows with maximal values less than 0.70: 7790 
Num of rows after removing all rows with variance less than 0.20: 1310 
Num of rows after removing duplicated rows: %d  1288


In [9]:
max_treshold = 9
var_treshold = 0.03

# Remove rows with no gene identifier. The explicit copy makes this frame independent
# of the merged matrix, so the column assignments below are not chained assignments.
brain_matrix = brain_matrix[brain_matrix['GENE_SYMBOL'].notna()].copy()
# `filter(regex=...)` is a regular-expression search; "BXD" is the intended pattern
brain_bxd_cols = list(brain_matrix.filter(regex="BXD").columns)

for col in brain_bxd_cols:
    brain_matrix[col] = brain_matrix[col].astype('float64')

# Filter by maximal value. Only the BXD columns were cast to float above, so
# `numeric_only=True` keeps any remaining non-numeric column out of the reduction.
row_max = brain_matrix.drop(columns=['ID', 'GENE_SYMBOL']).max(axis=1, numeric_only=True)
brain_matrix = brain_matrix[row_max >= max_treshold]
print("Num of rows after removing all rows with maximal values less than %.2f: %d " % (max_treshold, len(brain_matrix)))

# Filter by variance (same column restriction as for the maximum)
row_var = brain_matrix.drop(columns=['ID', 'GENE_SYMBOL']).var(axis=1, numeric_only=True)
brain_matrix = brain_matrix[row_var >= var_treshold]
print("Num of rows after removing all rows with variance less than %.2f: %d " % (var_treshold, len(brain_matrix)))

# Group multiple rows of the same gene by their mean. `numeric_only=True` states
# explicitly that non-numeric columns are left out of the average instead of
# relying on pandas dropping them implicitly.
brain_matrix = brain_matrix.groupby('GENE_SYMBOL').mean(numeric_only=True).reset_index()
# `%` (not `,`) so the row count is actually substituted into the message
print("Num of rows after removing duplicated rows: %d " % len(brain_matrix))

Num of rows after removing all rows with maximal values less than 9.00: 9899 
Num of rows after removing all rows with variance less than 0.03: 2069 
Num of rows after removing duplicated rows: %d  1792


#### • Average across different individuals of the same strain (Females and males)

In [10]:
# Collect the distinct strain names, i.e. the part of each BXD column name before
# the first underscore. `dict.fromkeys` removes duplicates while keeping first-seen
# order, so the resulting column order is the same on every run (a plain `set` is not).
liver_bxd_unique = list(dict.fromkeys(col.split("_")[0] for col in liver_bxd_cols))

# Average across different individuals of the same strain
for strain in liver_bxd_unique:
    # Exact match on the strain prefix rather than an unanchored regex search
    individuals_cols = [col for col in liver_bxd_cols if col.split("_")[0] == strain]
    liver_matrix[strain] = liver_matrix[individuals_cols].astype(float).mean(axis=1)

# The per-individual columns are no longer needed once the strain means exist
liver_matrix = liver_matrix.drop(columns=liver_bxd_cols)
liver_matrix.head(5)

Unnamed: 0,GENE_SYMBOL,BXD8,BXD43,BXD23,BXD40,BXD14,BXD33,BXD42,BXD32,BXD16,...,BXD44,BXD73,BXD15,BXD5,BXD28,BXD29,BXD77,BXD12,BXD60,BXD19
0,0610010K14Rik,1.5065,0.5045,0.4035,0.918,0.327,0.758,0.7685,0.9875,1.0265,...,0.6915,0.307,0.644,0.4025,0.562,0.7515,0.956,0.304,0.7985,1.2945
1,0610012H03Rik,0.389,-0.7505,-0.767,0.008,-0.495,0.0535,-0.0915,0.1305,0.122,...,-0.162,-0.605,-0.2175,-0.058,-0.0135,0.3105,-0.185,-0.552,-0.006,0.497
2,100043147,-0.415,-0.0725,-0.3685,-0.6245,-0.083,-0.1215,0.0935,-0.446,-0.2865,...,-0.0525,-0.4255,0.0405,-0.115,-0.0455,-0.043,-0.1875,-0.1525,0.0105,-0.5065
3,1110003O08Rik,-0.6825,-0.675,-0.0095,-0.546,0.171,-0.4605,-0.4325,-0.4465,-0.5155,...,0.7385,0.026,-0.534,0.323,0.2935,-0.6305,-0.6255,0.3205,-0.2795,-0.655
4,1110006O24Rik,0.877,1.4545,0.7595,0.876,0.4645,0.49,1.2365,0.993,0.2105,...,0.8675,0.194,0.773,1.513,0.8,0.544,0.654,0.798,0.798,0.509


In [11]:
# Collect the distinct strain names, i.e. the part of each BXD column name before
# the first underscore. `dict.fromkeys` removes duplicates while keeping first-seen
# order, so the resulting column order is the same on every run (a plain `set` is not).
brain_bxd_unique = list(dict.fromkeys(col.split("_")[0] for col in brain_bxd_cols))

# Average across different individuals of the same strain
for strain in brain_bxd_unique:
    # Exact match on the strain prefix rather than an unanchored regex search
    individuals_cols = [col for col in brain_bxd_cols if col.split("_")[0] == strain]
    brain_matrix[strain] = brain_matrix[individuals_cols].astype(float).mean(axis=1)

# The per-individual columns are no longer needed once the strain means exist
brain_matrix = brain_matrix.drop(columns=brain_bxd_cols)
brain_matrix.head(5)

Unnamed: 0,GENE_SYMBOL,BXD70,BXD27,BXD56,BXD43,BXD80,BXD92A,BXD75,BXD71,BXD40,...,BXD44,BXD73,BXD29,BXD103,BXD12,BXD83,BXD89,BXD60,BXD87,BXD102
0,0610010K14Rik///0610010K14Rik,9.99,9.999,9.7745,9.805,9.572,9.6865,9.907,9.876,9.698,...,9.952,9.796,9.5015,9.8585,9.703,9.598,9.713,9.722,10.0685,10.072
1,0610012G03Rik,9.582,9.7345,9.7195,9.958,9.881,9.707,9.5755,9.6915,9.728,...,9.7785,9.7145,9.5925,9.8305,9.592,9.4545,9.7735,9.826,9.6745,9.851
2,1110008P14Rik,8.8995,9.099,8.869,9.328,9.1575,8.928,9.0025,9.0405,9.0145,...,9.2085,9.0,8.9055,9.085,8.637,8.733,8.8845,9.127,9.045,9.206
3,1110012L19Rik,8.768,8.807,8.871,9.479,9.3385,8.952,8.898,8.6895,9.0385,...,8.937,9.0625,8.807,9.101,8.947,8.7905,8.8075,9.2555,8.8065,9.064
4,1110059E24Rik,8.904,9.0695,8.9055,9.314,9.062,8.911,8.8345,8.9885,8.9425,...,9.084,8.9585,8.773,9.09,8.838,8.649,9.102,9.129,8.8795,9.01


#### Keep only the BXD columns that exist in all of the relevant files: genotypes, phenotypes, and the expression matrix of each tissue separately

In [14]:
# `filter(regex=...)` is a regular-expression search; "BXD" is the intended pattern
genotypes_bxd_cols = list(genotypes.filter(regex="BXD").columns)
phenotypes_bxd_cols = list(phenotypes.filter(regex="BXD").columns)

# Non-strain columns that are carried along unchanged
genotypes_baseline_cols = ["Locus", "Chr_Build37", "Build37_position"]
phenotypes_baseline_cols = ["Phenotype", "Authors", "Year", "Pubmed Id"]

# Liver: strains present in the genotypes, the phenotypes and the liver expression
# matrix. Sorted so the three liver tables share one column order that is identical
# on every run (the iteration order of a set intersection is not).
liver_common_bxd = sorted(set(genotypes_bxd_cols) & set(phenotypes_bxd_cols) & set(liver_bxd_unique))
liver_matrix = liver_matrix[['GENE_SYMBOL'] + liver_common_bxd]
liver_genotypes = genotypes[genotypes_baseline_cols + liver_common_bxd]
liver_phenotypes = phenotypes[phenotypes_baseline_cols + liver_common_bxd]

# Brain: same selection against the brain expression matrix
brain_common_bxd = sorted(set(genotypes_bxd_cols) & set(phenotypes_bxd_cols) & set(brain_bxd_unique))
brain_matrix = brain_matrix[['GENE_SYMBOL'] + brain_common_bxd]
brain_genotypes = genotypes[genotypes_baseline_cols + brain_common_bxd]
# Assigned to `brain_phenotypes`: the original line wrote this brain subset into
# `liver_phenotypes`, overwriting the liver table and leaving the brain one undefined.
brain_phenotypes = phenotypes[phenotypes_baseline_cols + brain_common_bxd]

### 3. eQTL analysis