**Ex No. 3: Identification of Differentially Expressed Genes from Affymetrix Data**

Dr J Arunachalam

SASTRA Deemed University

In this exercise we will identify differentially expressed genes from the GEO data set 'GSE20986'. Please gather further information about the experiment from its GEO page and the corresponding publications. The code in this tutorial is applicable to other GEO data sets as well, with minimal changes.

We will use the GEOparse library for parsing the GEO data

Install GEOparse with the following command

In [None]:
! pip install GEOparse

Import all necessary libraries

In [None]:
import GEOparse
import pandas as pd
import pylab as pl
import seaborn as sns
import numpy as np
import networkx as nx
import scipy
import json
import itertools

Download GEO data

In [None]:
# Fetch the GEO series GSE20986 through GEOparse; the returned object is the
# `gse` that the cells below read samples and platform tables from.
gse = GEOparse.get_GEO(geo="GSE20986")

In [None]:
def filter_genes(gse_obj, cut_off, log2, condition_list):
  """Select the samples of one condition and drop weakly expressed probes.

  Parameters
  ----------
  gse_obj : object exposing ``pivot_samples('VALUE')``
      Presumably a GEOparse GSE object; the pivoted table must have one
      column per sample so that ``condition_list`` can select columns.
  cut_off : float
      Quantile (0..1) of the per-probe median expression; probes whose
      median is below that quantile are removed.
  log2 : bool
      True if the values are already log2-transformed.
  condition_list : list of str
      Sample names (columns) belonging to the condition of interest.

  Returns
  -------
  DataFrame with the ``condition_list`` columns. When ``log2`` is False the
  values are log2-transformed and only the probes passing the expression
  threshold are kept.
  """
  values = gse_obj.pivot_samples('VALUE')[condition_list]
  if log2:
    # NOTE(review): with log2=True the function returns here, so no
    # expression filtering is applied and nothing is printed. The t-test
    # cells of this notebook rely on that (both groups keep the same probe
    # index); confirm whether pre-logged data was meant to be filtered too.
    return values

  pivoted_samples = np.log2(values)
  # Per-probe expression level, summarised as the median over the samples.
  pivoted_samples_average = pivoted_samples.median(axis=1)
  print("Number of probes before filtering: ", len(pivoted_samples_average))
  expression_threshold = pivoted_samples_average.quantile(cut_off)
  expressed_probes = pivoted_samples_average[pivoted_samples_average >= expression_threshold]
  # Reuse the table taken from `gse_obj` rather than re-reading a notebook
  # global: the function now works for whichever series object is passed in.
  samples = pivoted_samples.loc[expressed_probes.index]
  print("Number of probes after filtering: ", len(samples))
  return samples

In [None]:
def get_ttest(control_samples, post_samples):
  """Run an independent two-sample t-test for every row (probe).

  Parameters
  ----------
  control_samples, post_samples : DataFrame
      One row per probe and one column per sample. Both frames must have
      the same number of rows in the same order; the result is labelled
      with the index of ``control_samples``.

  Returns
  -------
  DataFrame with columns ``stat`` (t statistic) and ``pvalue``.
  """
  # Import the submodule explicitly: a bare `import scipy` does not
  # guarantee that `scipy.stats` has been loaded on every SciPy version.
  from scipy import stats

  stat, pvalue = stats.ttest_ind(control_samples, post_samples, axis=1)
  return pd.DataFrame({"stat": stat, "pvalue": pvalue}, index=control_samples.index)

In [None]:
# All sample accessions of the series, in sorted order.
sample_ids = sorted(gse.gsms)
sample_ids

In [None]:
# Sample accessions of each group. Iris and retina are listed explicitly.
iris = ['GSM524662', 'GSM524665', 'GSM524667']
retina = ['GSM524663', 'GSM524664', 'GSM524666']
# Choroid and HUVEC are taken by position from the sorted accession list.
choroid = sample_ids[6:9]
huvec = sample_ids[9:12]
# Display one group as a check; put iris, retina or choroid here to view the others.
huvec

In [None]:
# With log2=True, filter_genes hands back the chosen sample columns as they
# are, so both frames share the same probe index for the row-wise t-test.
huvec_samples = filter_genes(gse, cut_off=0.25, log2=True, condition_list=huvec)
iris_smaples = filter_genes(gse, cut_off=0.25, log2=True, condition_list=iris)

In [None]:
# Per-probe t-test: HUVEC as the control group, iris as the comparison group.
ttest_df = get_ttest(control_samples=huvec_samples, post_samples=iris_smaples)

In [None]:
ttest_df

In [None]:
from statsmodels.stats import multitest

In [None]:
def get_FDR(ttest_df, method='bonferroni', alpha=0.05):
  """Correct the t-test p-values for multiple testing.

  Parameters
  ----------
  ttest_df : DataFrame
      Must contain a ``pvalue`` column; its index labels the result.
  method : str, optional
      Correction method passed to ``multipletests``. The default
      ``'bonferroni'`` keeps the previous behaviour; note that Bonferroni
      controls the family-wise error rate, so pass ``'fdr_bh'`` for a
      Benjamini-Hochberg false discovery rate.
  alpha : float, optional
      Significance level used for the ``Rejected`` flag.

  Returns
  -------
  DataFrame with columns ``Rejected`` (bool, null hypothesis rejected at
  ``alpha``) and ``FDR`` (the adjusted p-value).
  """
  corrected_pvalue = multitest.multipletests(pvals=ttest_df['pvalue'], method=method, alpha=alpha)
  print(corrected_pvalue)
  # The first two entries of the result are the reject flags and the
  # adjusted p-values.
  rejected, adjusted = corrected_pvalue[0], corrected_pvalue[1]
  return pd.DataFrame({'Rejected': rejected, 'FDR': adjusted}, index=ttest_df.index)

In [None]:
# Adjust the per-probe p-values for multiple testing.
FDR = get_FDR(ttest_df=ttest_df)

In [None]:
# Keep the probes whose adjusted p-value is below 0.1.
passes_cutoff = FDR['FDR'] < 0.1
selected = FDR.loc[passes_cutoff]
selected

In [None]:
def get_selected_df(gse_obj, selected_FDR):
  """Return the pivoted 'VALUE' table limited to the rows indexed by ``selected_FDR``."""
  wanted_probes = selected_FDR.index
  value_table = gse_obj.pivot_samples('VALUE')
  return value_table.loc[wanted_probes]

In [None]:
# Expression values (all samples) of the probes that passed the cutoff.
selected_df = get_selected_df(gse_obj=gse, selected_FDR=selected)
selected_df

In [None]:
# Log2-transform the selected expression values.
# NOTE: this rebinds `selected_df`, so running the cell a second time applies
# log2 again; re-run the previous cell first if this one must be repeated.
selected_df = np.log2(selected_df)
selected_df

In [None]:
def get_lfc(control_samples, post_samples):
  """Return a one-column ('LFC') frame: row mean of ``post_samples`` minus row mean of ``control_samples``.

  The difference goes through a plain dict before the frame is built, so the
  result's index carries the row labels but no index name.
  """
  difference = post_samples.mean(axis=1) - control_samples.mean(axis=1)
  return pd.DataFrame({"LFC": difference.to_dict()})

In [None]:
# The probes in `selected_df` were chosen with a HUVEC-vs-iris t-test, so the
# log fold change is computed for the same pair of groups (HUVEC as control).
huvec_samples = selected_df[huvec]
iris_samples = selected_df[iris]
# Still defined in case cells further down refer to it.
choroid_samples = selected_df[choroid]
LFCs = get_lfc(huvec_samples, iris_samples)
LFCs

In [None]:
def get_annotation(gene_dataframe, data_flatform, leftkey, gse_obj=None):
  """Attach gene annotation to probe-level values and collapse them per gene.

  Parameters
  ----------
  gene_dataframe : DataFrame
      Probe-level values indexed by probe ID.
  data_flatform : str
      Key of the platform in ``gse_obj.gpls`` whose table supplies the
      ``ID``, ``ENTREZ_GENE_ID`` and ``Gene Symbol`` columns.
  leftkey : str
      Name of the column produced by ``gene_dataframe.reset_index()`` that
      holds the probe IDs (``'index'`` when the index is unnamed).
  gse_obj : optional
      Series object providing ``gpls``. Defaults to the notebook-level
      ``gse`` so existing calls keep working.

  Returns
  -------
  DataFrame indexed by ``Gene Symbol`` holding the median of every numeric
  column over the probes of that gene.
  """
  if gse_obj is None:
    gse_obj = gse  # notebook-level series object

  platform_table = gse_obj.gpls[data_flatform].table[["ID", "ENTREZ_GENE_ID", "Gene Symbol"]]
  gene_annotated = (
    gene_dataframe.reset_index()
    .merge(platform_table, left_on=leftkey, right_on="ID")
    .set_index(leftkey)
    .drop(columns="ID", errors="ignore")
  )
  # remove probes without ENTREZ
  gene_annotated = gene_annotated.dropna(subset=["ENTREZ_GENE_ID"])
  # remove probes with more than one gene assigned: such entries are not a
  # single number, so the coercion turns them into NaN and they are dropped
  gene_annotated = gene_annotated.assign(
    ENTREZ_GENE_ID=pd.to_numeric(gene_annotated["ENTREZ_GENE_ID"], errors="coerce")
  ).dropna(how="any")
  gene_annotated = gene_annotated.assign(
    ENTREZ_GENE_ID=gene_annotated["ENTREZ_GENE_ID"].astype('int').astype('str')
  )
  # for each gene take the median over its probes; numeric_only keeps the
  # string ENTREZ_GENE_ID column out of the aggregation, which would
  # otherwise raise on pandas >= 2
  return gene_annotated.groupby("Gene Symbol").median(numeric_only=True)

In [None]:
# Annotate the fold changes with the GPL570 platform table; 'index' is the
# probe-ID column that reset_index() creates from the unnamed LFC index.
LFCs_annotated = get_annotation(gene_dataframe=LFCs, data_flatform='GPL570', leftkey='index')

In [None]:
LFCs_annotated