**Ex N0 3 : Identification of Differentially Expressed Genes from Affymatrix Data**

Dr J Arunachalam

SASTRA Deemed University

In this exercise we will identify differentially expressed genes from the GEO data 'GSE20986'. You are requested to gather further information about the experiment from the GEO page & corresponding publications. The code in this tutorial is applicable for other GEO data sets as well with minimum changes. 

We will use the GEOparse library for parsing the GEO data

Install GEOparse with the following command

In [1]:
! pip install GEOparse

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting GEOparse
  Downloading GEOparse-2.0.3.tar.gz (278 kB)
[K     |████████████████████████████████| 278 kB 18.2 MB/s 
Building wheels for collected packages: GEOparse
  Building wheel for GEOparse (setup.py) ... [?25l[?25hdone
  Created wheel for GEOparse: filename=GEOparse-2.0.3-py3-none-any.whl size=29064 sha256=647f4a708ad235ea3c37dd45a58fcdcabf07cce48f3f33244323554ed4cb2981
  Stored in directory: /root/.cache/pip/wheels/7a/75/14/727a857e321c5e5590d7e58efe9f028d753d340cea4ee540f0
Successfully built GEOparse
Installing collected packages: GEOparse
Successfully installed GEOparse-2.0.3


Import all necessary libraries

In [2]:
import GEOparse
import pandas as pd
import pylab as pl
import seaborn as sns
import numpy as np
import networkx as nx
import scipy
import json
import itertools

Download GEO data

In [3]:
gse = GEOparse.get_GEO(geo="GSE22255")

13-Dec-2022 17:05:38 DEBUG utils - Directory ./ already exists. Skipping.
DEBUG:GEOparse:Directory ./ already exists. Skipping.
13-Dec-2022 17:05:38 INFO GEOparse - Downloading ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE22nnn/GSE22255/soft/GSE22255_family.soft.gz to ./GSE22255_family.soft.gz
INFO:GEOparse:Downloading ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE22nnn/GSE22255/soft/GSE22255_family.soft.gz to ./GSE22255_family.soft.gz
100%|██████████| 32.4M/32.4M [00:00<00:00, 124MB/s] 
13-Dec-2022 17:05:39 DEBUG downloader - Size validation passed
DEBUG:GEOparse:Size validation passed
13-Dec-2022 17:05:39 DEBUG downloader - Moving /tmp/tmphcs1vn36 to /content/GSE22255_family.soft.gz
DEBUG:GEOparse:Moving /tmp/tmphcs1vn36 to /content/GSE22255_family.soft.gz
13-Dec-2022 17:05:39 DEBUG downloader - Successfully downloaded ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE22nnn/GSE22255/soft/GSE22255_family.soft.gz
DEBUG:GEOparse:Successfully downloaded ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE22nnn/GSE22

In [4]:
def filter_genes(gse_obj, cut_off, log2, condition_list):
  if log2:
    pivoted_samples = gse_obj.pivot_samples('VALUE')[condition_list]
    return pivoted_samples
  else:
    pivoted_samples = np.log2(gse_obj.pivot_samples('VALUE')[condition_list])
  pivoted_samples_average = pivoted_samples.median(axis=1)
  print("Number of probes before filtering: ", len(pivoted_samples_average))
  expression_threshold = pivoted_samples_average.quantile(cut_off)
  expressed_probes = pivoted_samples_average[pivoted_samples_average >= expression_threshold]
  samples = np.log2(gse.pivot_samples("VALUE").loc[expressed_probes.keys()])
  samples = samples[condition_list]
  print("Number of probes after filtering: ", len(samples))
  return samples

In [5]:
def get_ttest(control_samples, post_samples):
  ttest_result = scipy.stats.ttest_ind(control_samples, post_samples, axis=1)
  ttest = pd.DataFrame({"stat": ttest_result[0], "pvalue": ttest_result[1]}, index=control_samples.index)

  return ttest

In [6]:
sample_ids = sorted([key for key in gse.gsms.keys()])
sample_ids

['GSM554014',
 'GSM554015',
 'GSM554016',
 'GSM554017',
 'GSM554018',
 'GSM554019',
 'GSM554020',
 'GSM554021',
 'GSM554022',
 'GSM554023',
 'GSM554024',
 'GSM554025',
 'GSM554026',
 'GSM554027',
 'GSM554028',
 'GSM554029',
 'GSM554030',
 'GSM554031',
 'GSM554032',
 'GSM554033',
 'GSM554034',
 'GSM554035',
 'GSM554036',
 'GSM554037',
 'GSM554038',
 'GSM554039',
 'GSM554040',
 'GSM554041',
 'GSM554042',
 'GSM554043',
 'GSM554044',
 'GSM554045',
 'GSM554046',
 'GSM554047',
 'GSM554048',
 'GSM554049',
 'GSM554050',
 'GSM554051',
 'GSM554052',
 'GSM554053']

In [10]:
control = sample_ids[0:20]
ischemic_stroke = sample_ids[20:]	
# choroid=sample_ids[6:9]
# huvec=sample_ids[9:12]
# #iris
# #retina
# #choroid
control


['GSM554014',
 'GSM554015',
 'GSM554016',
 'GSM554017',
 'GSM554018',
 'GSM554019',
 'GSM554020',
 'GSM554021',
 'GSM554022',
 'GSM554023',
 'GSM554024',
 'GSM554025',
 'GSM554026',
 'GSM554027',
 'GSM554028',
 'GSM554029',
 'GSM554030',
 'GSM554031',
 'GSM554032',
 'GSM554033']

In [11]:
control_samples = filter_genes(gse, 0.25, True, control)
ischemic_smaples = filter_genes(gse, 0.25, True, ischemic_stroke)

In [12]:
ttest_df = get_ttest(control_samples, ischemic_smaples)

In [13]:
ttest_df

Unnamed: 0_level_0,stat,pvalue
ID_REF,Unnamed: 1_level_1,Unnamed: 2_level_1
1007_s_at,-0.098940,0.921706
1053_at,1.648757,0.107442
117_at,1.164843,0.251343
121_at,-0.301076,0.764998
1255_g_at,-0.362737,0.718811
...,...,...
AFFX-r2-Ec-bioC-5_at,-0.785910,0.436794
AFFX-r2-Ec-bioD-3_at,-1.172536,0.248280
AFFX-r2-Ec-bioD-5_at,-0.972928,0.336742
AFFX-r2-P1-cre-3_at,-0.317624,0.752509


In [14]:
from statsmodels.stats import multitest

In [15]:
def get_FDR(ttest_df):
  corrected_pvalue = multitest.multipletests(pvals=ttest_df['pvalue'], method='bonferroni', alpha=0.05)
  print(corrected_pvalue)
  FDR = pd.DataFrame({'Rejected': corrected_pvalue[0], 'FDR': corrected_pvalue[1]}, index= ttest_df.index)

  return FDR

In [16]:
FDR = get_FDR(ttest_df)

(array([False, False, False, ..., False, False, False]), array([1., 1., 1., ..., 1., 1., 1.]), 9.381485199799755e-07, 9.144947416552355e-07)


In [17]:
selected = FDR.loc[FDR['FDR'] < 0.1]
selected

Unnamed: 0_level_0,Rejected,FDR
ID_REF,Unnamed: 1_level_1,Unnamed: 2_level_1
1567009_at,False,0.054826


In [18]:
def get_selected_df(gse_obj, selected_FDR):
  pivoted_samples = gse_obj.pivot_samples('VALUE').loc[selected_FDR.index]
  return pivoted_samples

In [19]:
selected_df = get_selected_df(gse, selected)
selected_df

name,GSM554014,GSM554015,GSM554016,GSM554017,GSM554018,GSM554019,GSM554020,GSM554021,GSM554022,GSM554023,...,GSM554044,GSM554045,GSM554046,GSM554047,GSM554048,GSM554049,GSM554050,GSM554051,GSM554052,GSM554053
ID_REF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1567009_at,2.99433,2.8593,3.18044,3.06269,2.8978,2.96311,3.14702,2.91434,2.98669,2.92412,...,3.25096,3.20366,3.24605,3.24617,3.33791,3.5345,3.13338,3.44088,3.05369,3.36339


In [20]:
selected_df = np.log2(selected_df)
selected_df

name,GSM554014,GSM554015,GSM554016,GSM554017,GSM554018,GSM554019,GSM554020,GSM554021,GSM554022,GSM554023,...,GSM554044,GSM554045,GSM554046,GSM554047,GSM554048,GSM554049,GSM554050,GSM554051,GSM554052,GSM554053
ID_REF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1567009_at,1.582233,1.515662,1.669226,1.614799,1.534958,1.567112,1.653986,1.543169,1.578548,1.548003,...,1.700866,1.679721,1.698685,1.698739,1.738945,1.821506,1.64772,1.782778,1.610554,1.749916


In [21]:
def get_lfc(control_samples, post_samples):
  control_mean = control_samples.mean(axis=1)
  post_mean = post_samples.mean(axis=1)
  LFCs = pd.DataFrame({"LFC": (post_mean - control_mean).to_dict()})
  return LFCs

In [22]:
huvec_samples = selected_df[control]
choroid_samples = selected_df[ischemic_stroke]
LFCs = get_lfc(huvec_samples, choroid_samples)
LFCs

Unnamed: 0,LFC
1567009_at,0.122949


In [23]:
def get_annotation(gene_dataframe, data_flatform, leftkey):
  gene_annotated = gene_dataframe.reset_index().merge(gse.gpls[data_flatform].table[["ID", "ENTREZ_GENE_ID", "Gene Symbol"]],
                                left_on=leftkey, right_on="ID").set_index(leftkey)
  del gene_annotated["ID"]
  # remove probes without ENTREZ
  gene_annotated = gene_annotated.dropna(subset=["ENTREZ_GENE_ID"])
  # remove probes with more than one gene assigned
  gene_annotated['ENTREZ_GENE_ID'] = pd.to_numeric(gene_annotated['ENTREZ_GENE_ID'], errors="coerce")
  gene_annotated.dropna(how="any", inplace=True)
  gene_annotated['ENTREZ_GENE_ID'] = gene_annotated.ENTREZ_GENE_ID.astype('int').astype('str')
  # for each gene average LFC over probes
  gene_annotated = gene_annotated.groupby("Gene Symbol").median()

  return gene_annotated

In [24]:
LFCs_annotated = get_annotation(LFCs, 'GPL570', 'index')

In [25]:
LFCs_annotated

Unnamed: 0_level_0,LFC
Gene Symbol,Unnamed: 1_level_1
