# **Summary of Workflow for Integrative approach:**


1.Based on the binding profiles for STAT1, STAT2, IRF1 and IRF9, TF-specific gene lists were prepared (e.g. STAT1-target genes).

2.The motif file containing GAS-only, ISRE-only and composite containing genes were generated (by others).

3.The integration analysis was performed using up-regulated, P-adjusted(P<0.05) genes from RNA-seq and the peaks from CHIP-seq (From: /media/dell/Elements/MAH/dhmg/beta/TFs/New_analysis). Briefly, BETA tool was run using this script (nested_all_beta.sh), followed by extraction of upregulated genes from all time points (using beta_genes.sh) and combining all files into a single one (cat *.txt | sort | uniq > IFNa_genes.txt). Then, 
the integrative gene list was initially refined by the identification of overlapping genes between the integrative list and the motif list from step 2, followed by the detection of integrative genes that are common with gene lists from step 1. Accordingly, the number of refined integrative genes were 330 and 308, in IFNa- and IFNy-treated groups, respectively. Please check "clusters.ipynb" script (Refined gene list) for the latest integrative gene list.



In [1]:
import pandas as pd
import numpy as np
import os
import glob
import fnmatch

In [2]:
# List all files in a specific directory

for (root,dirs,files) in os.walk('Peaks/final_peaks/', topdown=False):
    for name in files:
        print(os.path.join(root,name))
    

Peaks/final_peaks/.ipynb_checkpoints/hs_HuhWT_IFNa_IRF1_allPeaksSet_nR_topScore_10_final-checkpoint.xls
Peaks/final_peaks/allPeaksSet_nR_allAntibodies_IFNa_topScore_final.bed
Peaks/final_peaks/allPeaksSet_nR_allAntibodies_IFNa_topScore_final.txt
Peaks/final_peaks/allPeaksSet_nR_allAntibodies_IFNy_topScore.bed
Peaks/final_peaks/allPeaksSet_nR_allAntibodies_IFNy_topScore.txt
Peaks/final_peaks/hs_HuhWT_IFNa_IRF1_allPeaksSet_nR_topScore_10_final.txt
Peaks/final_peaks/hs_HuhWT_IFNa_IRF1_allPeaksSet_nR_topScore_10_final.xls
Peaks/final_peaks/hs_HuhWT_IFNa_IRF9_allPeaksSet_nR_topScore_5_final.txt
Peaks/final_peaks/hs_HuhWT_IFNa_pSTAT1_allPeaksSet_nR_topScore_20_final.txt
Peaks/final_peaks/hs_HuhWT_IFNa_pSTAT2_allPeaksSet_nR_topScore_15_final.txt
Peaks/final_peaks/hs_HuhWT_IFNy_IRF1_allPeaksSet_nR_topScore_10_final.txt
Peaks/final_peaks/hs_HuhWT_IFNy_IRF9_allPeaksSet_nR_topScore_5_final.txt
Peaks/final_peaks/hs_HuhWT_IFNy_pSTAT1_allPeaksSet_nR_topScore_20_final.txt


# Preparing the TF-specific genes (To make annotate folder)


In [3]:
# working directory for this script
!pwd

/media/dell/Elements/MAH/dhmg/beta/TFs/IDR/Networks_motifs/New_analysis


In [3]:
# List of Files in a specific directory with specific extension
# Ref: https://www.pythonpool.com/python-loop-through-files-in-directory/ 

path_of_the_directory = 'Peaks/final_peaks/'
ext = ('final.txt') # It is not allowed to use wildcards
for files in os.scandir(path_of_the_directory):
    if files.path.endswith(ext):
        print(files)

<DirEntry 'allPeaksSet_nR_allAntibodies_IFNa_topScore_final.txt'>
<DirEntry 'hs_HuhWT_IFNa_IRF1_allPeaksSet_nR_topScore_10_final.txt'>
<DirEntry 'hs_HuhWT_IFNa_IRF9_allPeaksSet_nR_topScore_5_final.txt'>
<DirEntry 'hs_HuhWT_IFNa_pSTAT1_allPeaksSet_nR_topScore_20_final.txt'>
<DirEntry 'hs_HuhWT_IFNa_pSTAT2_allPeaksSet_nR_topScore_15_final.txt'>
<DirEntry 'hs_HuhWT_IFNy_IRF1_allPeaksSet_nR_topScore_10_final.txt'>
<DirEntry 'hs_HuhWT_IFNy_IRF9_allPeaksSet_nR_topScore_5_final.txt'>
<DirEntry 'hs_HuhWT_IFNy_pSTAT1_allPeaksSet_nR_topScore_20_final.txt'>


In [46]:
# Preparing the file for annotatePeaks.pl command from homer
# This code bloc is repeated for each TF
# Another solution @ https://stackoverflow.com/questions/64479604/dataframe-to-csv-in-a-for-loop-with-different-file-names

in_dir = 'Peaks/final_peaks/'
out_dir = 'Peaks/For_homer/'

for file in os.listdir(in_dir):
    if fnmatch.fnmatch(file, 'hs*.txt'):
        print('Running {}'.format(file))
        fileBaseName = os.path.basename(file).split('.')[0]
        newFilename = '{}{}.tsv'.format(out_dir, fileBaseName)
        tf = pd.read_csv(in_dir + file, sep="\t", header=None, skiprows=[0])
        tf_bd = tf.iloc[:,0:6].copy()
        tf_bd.to_csv(newFilename, sep="\t", header=None, index=False)  
        print(type(tf_bd)); print(tf_bd.shape)


Running hs_HuhWT_IFNa_IRF1_allPeaksSet_nR_topScore_10_final.txt
<class 'pandas.core.frame.DataFrame'>
(3498, 6)
Running hs_HuhWT_IFNa_IRF9_allPeaksSet_nR_topScore_5_final.txt
<class 'pandas.core.frame.DataFrame'>
(1178, 6)
Running hs_HuhWT_IFNa_pSTAT1_allPeaksSet_nR_topScore_20_final.txt
<class 'pandas.core.frame.DataFrame'>
(3874, 6)
Running hs_HuhWT_IFNa_pSTAT2_allPeaksSet_nR_topScore_15_final.txt
<class 'pandas.core.frame.DataFrame'>
(4218, 6)
Running hs_HuhWT_IFNy_IRF1_allPeaksSet_nR_topScore_10_final.txt
<class 'pandas.core.frame.DataFrame'>
(6217, 6)
Running hs_HuhWT_IFNy_IRF9_allPeaksSet_nR_topScore_5_final.txt
<class 'pandas.core.frame.DataFrame'>
(358, 6)
Running hs_HuhWT_IFNy_pSTAT1_allPeaksSet_nR_topScore_20_final.txt
<class 'pandas.core.frame.DataFrame'>
(2796, 6)


In [47]:
%%bash

# Running Homer

for i in Peaks/For_homer/*IRF* Peaks/For_homer/*pST* ; do perl ../../../../../../homer/bin/annotatePeaks.pl $i hg38 > ${i%%_final.tsv}_genes.tsv ; done 


	Peak file = Peaks/For_homer/hs_HuhWT_IFNa_IRF1_allPeaksSet_nR_topScore_10_final.tsv
	Genome = hg38
	Organism = human
	Peak/BED file conversion summary:
		BED/Header formatted lines: 3498
		peakfile formatted lines: 0
		Duplicated Peak IDs: 0

	Peak File Statistics:
		Total Peaks: 3498
		Redundant Peak IDs: 0
		Peaks lacking information: 0 (need at least 5 columns per peak)
		Peaks with misformatted coordinates: 0 (should be integer)
		Peaks with misformatted strand: 0 (should be either +/- or 0/1)

	Peak file looks good!

	Reading Positions...
	-----------------------
	Finding Closest TSS...
	Annotating:.......................
		Annotation	Number of peaks	Total size (bp)	Log2 Ratio (obs/exp)	LogP enrichment (+values depleted)
		3UTR	26.0	26786192	-0.251	1.538
		miRNA	0.0	97618	-0.154	0.113
		ncRNA	13.0	6998912	0.686	-2.694
		TTS	44.0	32227484	0.242	-1.895
		pseudo	1.0	2085537	-1.268	1.182
		Exon	36.0	37015031	-0.248	1.780
		Intron	1591.0	1253662019	0.136	-14.469
		Intergenic	1361.0	1

In [55]:
# iterate over files, extract file name, pass to pandas logic, and change the name! 

in_dir = 'Peaks/For_homer/'
out_dir = 'Peaks/'

for file in os.listdir(in_dir):
    if fnmatch.fnmatch(file, '*genes.tsv'):
        print('Running {}'.format(in_dir + file))
        hmr = pd.read_csv(in_dir + file, sep='\t')
        hmr_g = hmr[hmr['Annotation'].str.contains("promoter|5' UTR", regex=True)]
        hmr_g = hmr_g[['Gene Name']]
        hmr_g.to_csv(out_dir + str('Homr_') + file, index=False, header=None, sep='\t')
        print("dim before: ", hmr.shape ,"\n", "dim after: ", hmr_g.shape)    

Running Peaks/For_homer/hs_HuhWT_IFNa_IRF1_allPeaksSet_nR_topScore_10_genes.tsv
dim before:  (3498, 19) 
 dim after:  (426, 1)
Running Peaks/For_homer/hs_HuhWT_IFNa_IRF9_allPeaksSet_nR_topScore_5_genes.tsv
dim before:  (1178, 19) 
 dim after:  (286, 1)
Running Peaks/For_homer/hs_HuhWT_IFNa_pSTAT1_allPeaksSet_nR_topScore_20_genes.tsv
dim before:  (3874, 19) 
 dim after:  (757, 1)
Running Peaks/For_homer/hs_HuhWT_IFNa_pSTAT2_allPeaksSet_nR_topScore_15_genes.tsv
dim before:  (4218, 19) 
 dim after:  (689, 1)
Running Peaks/For_homer/hs_HuhWT_IFNy_IRF1_allPeaksSet_nR_topScore_10_genes.tsv
dim before:  (6217, 19) 
 dim after:  (496, 1)
Running Peaks/For_homer/hs_HuhWT_IFNy_IRF9_allPeaksSet_nR_topScore_5_genes.tsv
dim before:  (358, 19) 
 dim after:  (139, 1)
Running Peaks/For_homer/hs_HuhWT_IFNy_pSTAT1_allPeaksSet_nR_topScore_20_genes.tsv
dim before:  (2796, 19) 
 dim after:  (547, 1)


In [59]:
# Optional

# Change the file names

# From: https://pynative.com/python-rename-file/

folder = 'Peaks/test1/'

# iterate all files from a directory
for file_name in os.listdir(folder):
    if file.endswith('.tsv'):
        # Construct old file name
        source = folder + file_name

        # Adding the count to the new file name and extension
        destination = folder + "Homr_" +  file_name

        # Renaming the file
        os.rename(source, destination)
    
print('All Files Renamed')

print('New Names are')

# verify the result
res = os.listdir(folder)
print(res)

All Files Renamed
New Names are
['Homr_homr_hmr_IFNa_IRF1_genes.tsv', 'Homr_homr_hmr_IFNa_IRF9_genes.tsv', 'Homr_homr_hmr_IFNa_pSTAT1_genes.tsv', 'Homr_homr_hmr_IFNa_pSTAT2_genes.tsv', 'Homr_IFNa_IRF1_genes.tsv', 'Homr_IFNa_IRF9_genes.tsv', 'Homr_IFNa_pSTAT1_genes.tsv', 'Homr_IFNa_pSTAT2_genes.tsv']


# Prepare the motif files

In [26]:
# Load the latest version of motif file

dfm = pd.read_csv("motifs_IFNag_latest.tsv", sep="\t")
print(dfm.shape); print(dfm.columns)
dfm.rename(columns={'Gene':'Symbol'}, inplace=True)
dfm.head(3)

(220, 2)
Index(['Gene', 'Motif'], dtype='object')


Unnamed: 0,Symbol,Motif
0,APOL1,composite
1,ACOT7,ISRE
2,A2M,GAS


In [5]:
# Optional

# Giving colors to Motifs
# Creating IF condition
dfm.loc[dfm['Motif'] == "composite", "motfcol"] = "blue"
dfm.loc[dfm['Motif'] == "GAS", "motfcol"] = "green"
dfm.loc[dfm['Motif'] == "ISRE", "motfcol"] = "orange"

dfm['Symbol'] = dfm['Symbol'].str.strip()
dfm.head()

Unnamed: 0,Symbol,Motif,motfcol
0,APOL1,composite,blue
1,ACOT7,ISRE,orange
2,A2M,GAS,green
3,APOL2,composite,blue
4,ACSL5,ISRE,orange


# Finding Mutual genes between the integrative list & motif genes

In [4]:
# Read BETA files

## (Recommended) 1st solution using Next function and dictionary and specific names 

path, dirs, files = next(os.walk('./BETA_files/'))
print(files)

### Create empty dictionary
df_dict = {}

for i, df in enumerate(files):
    df_dict[files[i].split('.')[0]] = pd.read_csv('./BETA_files/'+files[i], sep='\t', header=None, names=['Symbol'])
    print("File name is: ", files[i])

for item in df_dict.items():
    print(item)

['IFNa_genes.txt', 'IFNy_genes.txt']
File name is:  IFNa_genes.txt
File name is:  IFNy_genes.txt
('IFNa_genes',            Symbol
0             A2M
1            AAAS
2         AADACP1
3            AASS
4           ABCA9
...           ...
2088        ZNRF2
2089       ZRANB2
2090  ZSCAN16-AS1
2091       ZSCAN2
2092       ZWILCH

[2093 rows x 1 columns])
('IFNy_genes',       Symbol
0        A2M
1       AAAS
2       AAMP
3      ABCA9
4      ABCB4
...      ...
1906   ZNRD1
1907   ZNRF2
1908  ZRANB2
1909  ZSWIM6
1910  ZWILCH

[1911 rows x 1 columns])


In [20]:
# Optional
# Read BETA files

# 2nd solution using Next function & list
# Ref: https://www.geeksforgeeks.org/read-multiple-csv-files-into-separate-dataframes-in-python/

# Assign path
path, dirs, files = next(os.walk("./BETA_files/"))
file_count = len(files)

# create empty list
dataframes_list = []

# append dfs to the list
for i in range(file_count):
    temp_df = pd.read_csv("./BETA_files/"+files[i], sep="\t", header=None, names=['Symbol'])
    print("File name is: ", files[i], "The dimension is : ", temp_df.shape)
    dataframes_list.append(temp_df)

# display dataframes or access the individual dataframes with list_of_dfs[0], list_of_dfs[1]
#for df in dataframes_list:
    #display(df)

    

File name is:  IFNa_genes.txt The dimension is :  (2093, 1)
File name is:  IFNy_genes.txt The dimension is :  (1911, 1)


In [24]:
# Optional
# Read BETA files

# 3rd solution using For loop & list

path = os.getcwd()
files = glob.glob(os.path.join(r'./BETA_files/', "*.txt" ))

df_list= []
for f in files:
    temp_df = pd.read_csv(f, sep="\t", header=None, names=['Symbol'])
    print('File name: ', f.split('/')[-1], 'the Dimension is: ', temp_df.shape)
    df_list.append(temp_df)


File name:  IFNa_genes.txt the Dimension is:  (2093, 1)
File name:  IFNy_genes.txt the Dimension is:  (1911, 1)


In [5]:
# load the motif file (prepared above)
# dfm
print(dfm.shape); print(dfm.columns)

(220, 2)
Index(['Symbol', 'Motif'], dtype='object')


In [7]:
# Ref: https://realpython.com/iterate-through-dictionary-python/
# The type of "value" is a dataframe

for value in df_dict.values():
    print(type(value))
    
# The type of "key" is string
for key in df_dict.keys():
    print(type(key))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'str'>
<class 'str'>


In [28]:
# Otional: when using list

for i in df_list:
    df_mrg = pd.merge(i, dfm, on='Symbol')
    df_mrg.to_csv(i)
    print(df_mrg.shape)
    

(217, 2)
(216, 2)


In [9]:
# Merging using dictionary

for key, value in df_dict.items():
    print('The file is: {}'.format(key))
    df_mrg = pd.merge(value, dfm, on='Symbol')  # This is merging!
    print('The dimention is: {} and the columns are : {}'.format(df_mrg.shape, df_mrg.columns))
    df_mrg.to_csv('Mrgd_' + key + '.tsv', index=False, sep='\t') # Save
    

The file is: IFNa_genes
The dimention is: (217, 2) and the columns are : Index(['Symbol', 'Motif'], dtype='object')
The file is: IFNy_genes
The dimention is: (216, 2) and the columns are : Index(['Symbol', 'Motif'], dtype='object')


In [None]:
# Genes are not in the motif list 
# # To get unique values see https://stackoverflow.com/questions/23460345/selecting-unique-rows-between-two-dataframes-in-pandas
# # How to implement 'in' and 'not in' for a pandas DataFrame 
# https://stackoverflow.com/questions/19960077/how-to-filter-pandas-dataframe-using-in-and-not-in-like-in-sql

for key, value in df_dict.items():
    print('Genes that are not in motif list for {}'. format(key))
    print(dfm[~dfm['Symbol'].isin(value['Symbol'])])
    

In [34]:
# Load Mrgd Files

NewAna_dict = {}

path, dirs, files = next(os.walk('.'))

for i, df in enumerate(files):
    if fnmatch.fnmatch(files[i], 'Mrg*'):
        temp_df = files[i].split('.')[0]
        NewAna_dict[temp_df] = pd.read_csv(files[i], sep='\t')
        
for key, value in NewAna_dict.items():
    print('Name: {}, Size: {} \n'.format(key, value.shape))
    
    
ovlp_IFNay = pd.merge(NewAna_dict['Mrgd_IFNa_genes'], NewAna_dict['Mrgd_IFNy_genes'], how='inner', on='Symbol')

ovlp_IFNay.rename(columns={"Motif_x" : "Motif"}, inplace=True)
ovlp_IFNay = ovlp_IFNay[['Symbol', 'Motif']]
ovlp_IFNay.shape

Name: Mrgd_IFNa_genes, Size: (217, 2) 

Name: Mrgd_IFNy_genes, Size: (216, 2) 



(214, 2)

# Compare Old analysis with new one (This is NOT part of analysis!)

In [54]:
# Create txt files with one column

folder = 'Old_New_analysis/'

path, dirs, files = next(os.walk(folder))

for i, df in enumerate(files):
    if files[i].endswith('.tsv'):
        df_temp = pd.read_csv(folder + files[i], sep='\t')
        df_temp['Symbol'].to_csv(folder + files[i].split('.')[0] + '.txt', index=False, header=None) 
        

In [7]:
# load old & new files 

folder = 'Old_New_analysis/'

all_dict = {}
path, dirs, files = next(os.walk(folder))

for i, df in enumerate(files):
    if files[i].endswith('.txt'):
        temp_df = files[i].split('.')[0]
        all_dict[temp_df] = pd.read_csv(folder + files[i], header=None, names=['Symbol'])
        print('Running {}'. format(files[i]))

for key, value in all_dict.items():
    print(key)
    value['Symbol'] = value['Symbol'].str.strip()

for item in all_dict.items():
    print(item)


Running Mrgd_IFNa_genes.txt
Running Mrgd_IFNy_genes.txt
Running Old_IFNa_geneList.txt
Running Old_IFNy_geneList.txt
Mrgd_IFNa_genes
Mrgd_IFNy_genes
Old_IFNa_geneList
Old_IFNy_geneList
('Mrgd_IFNa_genes',       Symbol
0        A2M
1      ACOT7
2      ACSL5
3       ACY3
4       ADAR
..       ...
212    YPEL2
213   ZC3H7B
214  ZC3HAV1
215  ZFYVE26
216    ZNF24

[217 rows x 1 columns])
('Mrgd_IFNy_genes',       Symbol
0        A2M
1      ACOT7
2      ACSL5
3       ACY3
4       ADAR
..       ...
211    YPEL2
212   ZC3H7B
213  ZC3HAV1
214  ZFYVE26
215    ZNF24

[216 rows x 1 columns])
('Old_IFNa_geneList',       Symbol
0     EPSTI1
1     ANKFY1
2      BAZ2A
3    ZC3HAV1
4      ZNF24
..       ...
325   BTN3A2
326   ZC3H7B
327    PRRG4
328    HADHB
329    CCND1

[330 rows x 1 columns])
('Old_IFNy_geneList',        Symbol
0      EPSTI1
1    TMEM126B
2     ZC3HAV1
3       ZNF24
4      RNF213
..        ...
303    SUCLG1
304   CAPRIN1
305     CCND1
306      VTA1
307      PSPH

[308 rows x 1 column

In [8]:
# Statistical report

# Compare all to "Mrgd_IFNa_genes"

folder = 'Old_New_analysis/Comparison/'

for key, value in all_dict.items():
    print('The file is: {}'.format(key))
    tmp = value[~value['Symbol'].isin(all_dict['Mrgd_IFNa_genes']['Symbol'])]
    display(tmp)
    #tmp.to_csv(folder + key + str('_vs_Mrgd_IFNa_genes.tsv'), sep='\t', index=False) # Save
    

The file is: Mrgd_IFNa_genes


Unnamed: 0,Symbol


The file is: Mrgd_IFNy_genes


Unnamed: 0,Symbol
197,TLCD2
201,TP53BP1


The file is: Old_IFNa_geneList


Unnamed: 0,Symbol
1,ANKFY1
2,BAZ2A
16,SRSF4
19,CREM
21,NADK
...,...
313,NT5C3A
316,APH1A
317,FKBP11
321,RPS8


The file is: Old_IFNy_geneList


Unnamed: 0,Symbol
1,TMEM126B
6,IL18BP
7,SULT1C2
11,ATAD5
18,NEK8
...,...
302,PHB2
303,SUCLG1
304,CAPRIN1
306,VTA1


In [9]:
# Statistical report

# Compare all to "Mrgd_IFNy_genes"

folder = 'Old_New_analysis/Comparison/'

for key, value in all_dict.items():
    print('The file is: {}'.format(key))
    tmp = value[~value['Symbol'].isin(all_dict['Mrgd_IFNy_genes']['Symbol'])]
    display(tmp)
    #tmp.to_csv(folder + key + str('_vs_Mrgd_IFNy_genes.tsv'), sep='\t', index=False) # Save
    

The file is: Mrgd_IFNa_genes


Unnamed: 0,Symbol
58,EPSTI1
199,TMEM62
203,TRIM22


The file is: Mrgd_IFNy_genes


Unnamed: 0,Symbol


The file is: Old_IFNa_geneList


Unnamed: 0,Symbol
0,EPSTI1
1,ANKFY1
2,BAZ2A
16,SRSF4
19,CREM
...,...
313,NT5C3A
316,APH1A
317,FKBP11
321,RPS8


The file is: Old_IFNy_geneList


Unnamed: 0,Symbol
0,EPSTI1
1,TMEM126B
6,IL18BP
7,SULT1C2
11,ATAD5
...,...
302,PHB2
303,SUCLG1
304,CAPRIN1
306,VTA1


In [10]:
# Statistical report

# Finding overlaps

folder = 'Old_New_analysis/Comparison/'

for key, value in all_dict.items():
    print(key)

print('*************************')

ovrlp_a = pd.merge(all_dict['Mrgd_IFNa_genes'], all_dict['Mrgd_IFNy_genes'], how='inner', on='Symbol')
print('New analysis: The size of overlaping for IFNa: ', ovrlp_a.shape, 'The size of IFNa: ', all_dict['Mrgd_IFNa_genes'].shape,
     'The size of IFNy: ', all_dict['Mrgd_IFNy_genes'].shape)
ovrlp_a.to_csv(folder + str('New_ovrlp_IFay.tsv'), sep='\t', index = False)

print('*************************')
ovrlp_y = pd.merge(all_dict['Old_IFNa_geneList'], all_dict['Old_IFNy_geneList'], how='inner', on='Symbol')
print('Old analysis: The size of overlaping for IFNa: ', ovrlp_y.shape, 'The size of IFNa: ', all_dict['Old_IFNa_geneList'].shape,
     'The size of IFNy: ', all_dict['Old_IFNy_geneList'].shape)
ovrlp_y.to_csv(folder + str('Old_ovrlp_IFay.tsv'), sep='\t', index = False)

print('*************************')
ovrlp_ay = pd.merge(ovrlp_a, ovrlp_y, how='inner', on='Symbol')
print('The overlaps between Old vs New analysis: ', ovrlp_ay.shape)

Mrgd_IFNa_genes
Mrgd_IFNy_genes
Old_IFNa_geneList
Old_IFNy_geneList
*************************
New analysis: The size of overlaping for IFNa:  (214, 1) The size of IFNa:  (217, 1) The size of IFNy:  (216, 1)
*************************
Old analysis: The size of overlaping for IFNa:  (220, 1) The size of IFNa:  (330, 1) The size of IFNy:  (308, 1)
*************************
The overlaps between Old vs New analysis:  (214, 1)


# Overlapping antibody-specific genes with the integrative list

In [35]:
## IFNa

in_dir = 'Peaks/'
out_dir = 'TFs_4_Networks/'

for file in os.listdir(in_dir):
    if fnmatch.fnmatch(file, '*IFNa*'):
        tf_tmp = pd.read_csv(in_dir + file, sep='\t', header=None, names=['Symbol'] )
        print('The file is: ', file, 'The dimension is: ', tf_tmp.shape)
        tf_tmp['Symbol'] = tf_tmp['Symbol'].str.strip()
        tf_mrg = pd.merge(ovlp_IFNay, tf_tmp, on='Symbol')  # This is merging!
        tf_mrg.drop_duplicates(subset=['Symbol'], inplace=True, ignore_index=True)
        print('After merging the size is: ', tf_mrg.shape)
        tf_mrg.to_csv(out_dir + str('Integ_') + file, sep='\t', index=False) ########SAVE
        

The file is:  Homr_hs_HuhWT_IFNa_IRF1_allPeaksSet_nR_topScore_10_genes.tsv The dimension is:  (426, 1)
After merging the size is:  (123, 2)
The file is:  Homr_hs_HuhWT_IFNa_IRF9_allPeaksSet_nR_topScore_5_genes.tsv The dimension is:  (286, 1)
After merging the size is:  (90, 2)
The file is:  Homr_hs_HuhWT_IFNa_pSTAT1_allPeaksSet_nR_topScore_20_genes.tsv The dimension is:  (757, 1)
After merging the size is:  (182, 2)
The file is:  Homr_hs_HuhWT_IFNa_pSTAT2_allPeaksSet_nR_topScore_15_genes.tsv The dimension is:  (689, 1)
After merging the size is:  (143, 2)


In [3]:
## IFNy

in_dir = 'Peaks/'
out_dir = 'TFs_4_Networks/'

for file in os.listdir(in_dir):
    if fnmatch.fnmatch(file, '*IFNy*'):
        tf_tmp = pd.read_csv(in_dir + file, sep='\t', header=None, names=['Symbol'] )
        print('The file is: ', file, 'The dimension is: ', tf_tmp.shape)
        tf_tmp['Symbol'] = tf_tmp['Symbol'].str.strip()
        tf_mrg = pd.merge(ovlp_IFNay, tf_tmp, on='Symbol') # This is merging!
        tf_mrg.drop_duplicates(subset=['Symbol'], inplace=True, ignore_index=True)
        print('After merging the size is: ', tf_mrg.shape)
        tf_mrg.to_csv(out_dir + str('Integ_') + file, sep='\t', index=False) #######SAVE

The file is:  Homr_hs_HuhWT_IFNy_IRF1_allPeaksSet_nR_topScore_10_genes.tsv The dimension is:  (496, 1)
After merging the size is:  (126, 2)
The file is:  Homr_hs_HuhWT_IFNy_IRF9_allPeaksSet_nR_topScore_5_genes.tsv The dimension is:  (139, 1)
After merging the size is:  (43, 2)
The file is:  Homr_hs_HuhWT_IFNy_pSTAT1_allPeaksSet_nR_topScore_20_genes.tsv The dimension is:  (547, 1)
After merging the size is:  (137, 2)


# Task: Networks 

## IFNa

In [36]:

in_dir = 'TFs_4_Networks/'
out_dir = 'TFs_4_Networks/Cytoscape_files/'

In [37]:
# Create a dictionary

path, dirs, files = next(os.walk(in_dir))
Integ_dict_a = {}

for i, df in enumerate(files):
    if fnmatch.fnmatch(files[i], '*IFNa*'):
        Integ_dict_a[files[i].split('.')[0]] = pd.read_csv(in_dir + files[i], sep='\t', header=None, skiprows=[0])
        print('File name is: {}'.format(files[i]))
        
for key, value in Integ_dict_a.items():
    print(key)
    print(value.head(1))
    print(value.shape)

File name is: Integ_Homr_hs_HuhWT_IFNa_IRF1_allPeaksSet_nR_topScore_10_genes.tsv
File name is: Integ_Homr_hs_HuhWT_IFNa_IRF9_allPeaksSet_nR_topScore_5_genes.tsv
File name is: Integ_Homr_hs_HuhWT_IFNa_pSTAT1_allPeaksSet_nR_topScore_20_genes.tsv
File name is: Integ_Homr_hs_HuhWT_IFNa_pSTAT2_allPeaksSet_nR_topScore_15_genes.tsv
Integ_Homr_hs_HuhWT_IFNa_IRF1_allPeaksSet_nR_topScore_10_genes
       0     1
0  ACOT7  ISRE
(123, 2)
Integ_Homr_hs_HuhWT_IFNa_IRF9_allPeaksSet_nR_topScore_5_genes
       0     1
0  ACOT7  ISRE
(90, 2)
Integ_Homr_hs_HuhWT_IFNa_pSTAT1_allPeaksSet_nR_topScore_20_genes
       0     1
0  ACOT7  ISRE
(182, 2)
Integ_Homr_hs_HuhWT_IFNa_pSTAT2_allPeaksSet_nR_topScore_15_genes
       0     1
0  ACOT7  ISRE
(143, 2)


In [38]:
# Change the name of dictionary key
Integ_dict_a['ir1_ov_a'] = Integ_dict_a.pop('Integ_Homr_hs_HuhWT_IFNa_IRF1_allPeaksSet_nR_topScore_10_genes')
Integ_dict_a['ir9_ov_a'] = Integ_dict_a.pop ('Integ_Homr_hs_HuhWT_IFNa_IRF9_allPeaksSet_nR_topScore_5_genes')
Integ_dict_a['st1_ov_a'] = Integ_dict_a.pop('Integ_Homr_hs_HuhWT_IFNa_pSTAT1_allPeaksSet_nR_topScore_20_genes')
Integ_dict_a['st2_ov_a'] = Integ_dict_a.pop('Integ_Homr_hs_HuhWT_IFNa_pSTAT2_allPeaksSet_nR_topScore_15_genes')

for key, value in Integ_dict_a.items():
    print(key)
    print(value.head(1))
    print(value.shape)


ir1_ov_a
       0     1
0  ACOT7  ISRE
(123, 2)
ir9_ov_a
       0     1
0  ACOT7  ISRE
(90, 2)
st1_ov_a
       0     1
0  ACOT7  ISRE
(182, 2)
st2_ov_a
       0     1
0  ACOT7  ISRE
(143, 2)


# Create node files for each Ab

In [8]:
#INa ST1

columns = ["#node1", "node2"]
dfs1 = pd.DataFrame(columns=columns)
dfs1['node2'] = Integ_dict_a['st1_ov_a'][0]
dfs1['#node1'] = "STAT1"

# Saving Network File
dfs1.to_csv(out_dir + "IFNa_ST1_ov.tsv", sep="\t", index=False)

In [9]:
# IFNa ST2

columns = ["#node1", "node2"]
dfs2 = pd.DataFrame(columns=columns)
dfs2['node2'] = Integ_dict_a['st2_ov_a'][0]
dfs2["#node1"] = "STAT2"

dfs2.to_csv(out_dir + "IFNa_ST2_ov.tsv", sep="\t", index=False)

In [10]:
# IFNa IRF1

columns = ["#node1", "node2"]
dfir1 = pd.DataFrame(columns = columns)
dfir1['node2'] = Integ_dict_a['ir1_ov_a'][0]
dfir1['#node1'] = "IRF1"

dfir1.to_csv(out_dir + "IFNa_IRF1_ov.tsv", sep="\t", index=False)

In [11]:
# IFNa IRF9

columns = ["#node1", "node2"]
dfir9 = pd.DataFrame(columns=columns)
dfir9["node2"] = Integ_dict_a['ir9_ov_a'][0]
dfir9["#node1"] = "IRF9"

dfir9.to_csv(out_dir + "IFNa_IRF9_ov.tsv", index= False, sep="\t")

In [12]:
# To confrim
x = [dfs1, dfs2, dfir1, dfir9]
for i in x:
    print(i.shape)

(182, 2)
(143, 2)
(123, 2)
(90, 2)


## Different combinations of TFS for IFNa

In [39]:
from functools import reduce

In [40]:
# Creating dataframes
st1_ov_a = pd.DataFrame(data=Integ_dict_a['st1_ov_a'][0].values, columns=['symbol'])
st2_ov_a = pd.DataFrame(data=Integ_dict_a['st2_ov_a'][0].values, columns=['symbol'])
ir1_ov_a = pd.DataFrame(data=Integ_dict_a['ir1_ov_a'][0].values, columns=['symbol'])
ir9_ov_a = pd.DataFrame(data=Integ_dict_a['ir9_ov_a'][0].values, columns=['symbol'])


In [41]:
# Checking
dfs = [st1_ov_a, st2_ov_a, ir1_ov_a, ir9_ov_a]
for i in dfs:
    print(i.shape)

(182, 1)
(143, 1)
(123, 1)
(90, 1)


In [42]:
# overlapping between ST1, ST2 and IRF9

dfs = [st1_ov_a, st2_ov_a, ir9_ov_a]
isgf3 = reduce(lambda left,right: pd.merge(left,right, on="symbol"), dfs)
isgf3_only = pd.merge(isgf3, ir1_ov_a, how="left", indicator=True).loc[lambda x:x["_merge"] == "left_only"].reset_index(drop=True).iloc[:,0]
isgf3_only.shape


(9,)

In [43]:
# overlapping between ST1, ST2 and IRF1

dfs = [st1_ov_a, st2_ov_a, ir1_ov_a]
s1_2_ir1 = reduce(lambda left,right: pd.merge(left,right, on="symbol"), dfs)
s1_2_ir1_only = pd.merge(s1_2_ir1, ir9_ov_a, how="left", indicator=True).loc[lambda x:x["_merge"] == "left_only"].reset_index(drop=True).iloc[:,0]
s1_2_ir1_only.shape


(14,)

In [44]:
s1_2_ir1.shape

(94, 1)

In [45]:
# overlapping between ISGF3 and IRF1

dfs = [isgf3, ir1_ov_a]
isgf3_irf1 = reduce(lambda left,right: pd.merge(left,right, on="symbol"),dfs)
isgf3_irf1.shape


(80, 1)

In [46]:
# overlapping between ST1 and IRF1

dfs = [st1_ov_a, ir1_ov_a]
st1_ir1 = reduce(lambda left,right: pd.merge(left, right, on="symbol"), dfs)
st1_ir1_only = pd.merge(st1_ir1, st2_ov_a, how="left", indicator=True).loc[lambda x:x["_merge"] == "left_only"].iloc[:,0]
st1_ir1_only.shape

(0,)

In [47]:
# overlapping between ST2 and IRF1

dfs = [st2_ov_a, ir1_ov_a]
st2_ir1 = reduce(lambda left,right: pd.merge(left, right, on="symbol"), dfs)
st2_ir1_only = pd.merge(st2_ir1, st1_ov_a, how="left", indicator=True).loc[lambda x:x["_merge"] == "left_only"].iloc[:,0]
st2_ir1_only.shape

(5,)

In [48]:
# overlapping between ST1 & ST2 

dfs = [st1_ov_a, st2_ov_a]
st1_2 = reduce(lambda left,right: pd.merge(left, right, on="symbol"),dfs)
print(st1_2.shape)
st1_2_only = pd.merge(st1_2, ir9_ov_a, how='left', indicator=True).loc[lambda x: x['_merge'] == 'left_only'].reset_index(drop=True).iloc[:,0]
st1_2_only_1 = pd.merge(st1_2_only, ir1_ov_a, how='left', indicator=True).loc[lambda x: x['_merge'] == 'left_only'].reset_index(drop=True).iloc[:,0]
print(st1_2_only_1.shape)

(137, 1)
(34,)


In [49]:
# Finding IRF1_only genes
dfs = [st1_ov_a, st2_ov_a, ir9_ov_a]
st1_2_ir9 = reduce(lambda left,right : pd.merge(left,right, how="outer"),dfs) # Note it is "outer"

# https://stackoverflow.com/questions/48647534/python-pandas-find-difference-between-two-data-frames
ir1_only=pd.merge(ir1_ov_a, st1_2_ir9, how="left", indicator=True).loc[lambda x: x['_merge'] != 'both'].reset_index(drop=True).iloc[:,0]
ir1_only.shape

(23,)

In [50]:
# Finding ST1_only genes
dfs = [st2_ov_a, ir1_ov_a, ir9_ov_a]

st2_ir1_9 = reduce(lambda left,right: pd.merge(left,right, how="outer"), dfs)

st1_only=pd.merge(st1_ov_a, st2_ir1_9, how="left", indicator=True).loc[lambda x: x['_merge'] == "left_only"].reset_index(drop=True).iloc[:,0]
st1_only.shape

(45,)

In [51]:
# Finding ST2_only genes
dfs = [st1_ov_a, ir1_ov_a, ir9_ov_a]

st1_ir1_9 = reduce(lambda left,right: pd.merge(left,right, how="outer"), dfs)

st2_only=pd.merge(st2_ov_a, st1_ir1_9, how="left", indicator=True).loc[lambda x: x['_merge'] == "left_only"].reset_index(drop=True).iloc[:,0]
st2_only.shape


(1,)

In [52]:
# overlapping between IRF1 and IRF9

dfs = [ir1_ov_a, ir9_ov_a]
ir1_9 = reduce(lambda left, right: pd.merge(left, right, on="symbol"), dfs)
ir1_9_only = pd.merge(ir1_9, st1_ov_a, how='left', indicator=True).loc[lambda x: x['_merge'] == 'left_only'].reset_index(drop=True).iloc[:,0]
ir1_9_only.shape

(1,)

In [None]:
######R code ###

# Create an empty plot
plot(NULL ,xaxt='n',yaxt='n',bty='n',ylab='',xlab='', xlim=0:1, ylim=0:1)
# xaxt="n" and yaxt="n" suppress the x and y axis respectively

# use legend as you would if there were a plot
legend("topleft", legend =c('ISGF3+IRF1', 'ISGF3', 'IRF1-only',
    'STAT1-STAT2', 'STAT1 homodimer', 'STAT1-IRF1'), pch=16, pt.cex=3, cex=1.5, bty='n',
    col = c('#FA8072', '#ADFF2F', '#40E0D0', '#FF0000', '#DAA520', '#FFD700'))
mtext("Gene targets", at=0.2, cex=2)

# Generating Style Table _ IFNa

In [54]:
# Preparing the colour table 
# Colours From: https://betterfigures.org/2015/06/23/picking-a-colour-scale-for-scientific-graphics/

#1
columns = ["symbol", "col", "Target"]
df_isgf3_irf1 = pd.DataFrame(columns=columns)
df_isgf3_irf1['symbol'] = isgf3_irf1
df_isgf3_irf1['col'] = "#ffadad"
df_isgf3_irf1['Target'] = "STAT1+STAT2+IRF9+IRF1"

#2
#columns = ["symbol", "col", "Target"]
#df_s1_2_ir1_only = pd.DataFrame(columns=columns)
#df_s1_2_ir1_only['symbol'] = s1_2_ir1_only
#df_s1_2_ir1_only['col'] = "#99d8c9"
#df_s1_2_ir1_only['Target'] = "STAT1 + STAT2 + IRF1"

#3
columns = ["symbol", "col", "Target"]
df_isgf3_only = pd.DataFrame(columns=columns)
df_isgf3_only['symbol'] = isgf3_only
df_isgf3_only['col'] = "#ffd6a5"
df_isgf3_only['Target'] = "STAT1 + STAT2 + IRF9"

#4
columns = ["symbol", "col", "Target"]
df_ir1_only = pd.DataFrame(columns=columns)
df_ir1_only['symbol'] = ir1_only
df_ir1_only['col'] = "#fdffb6"
df_ir1_only['Target'] = "IRF1"

#5
columns = ["symbol", "col", "Target"]
df_st1_2_only_1 = pd.DataFrame(columns=columns)
df_st1_2_only_1['symbol'] = st1_2_only_1
df_st1_2_only_1['col'] = "#caffbf"
df_st1_2_only_1['Target'] = "STAT1 + STAT2"

#6
columns = ["symbol", "col", "Target"]
df_st1_only = pd.DataFrame(columns=columns)
df_st1_only['symbol'] = st1_only
df_st1_only['col'] = "#9bf6ff"
df_st1_only['Target'] = "STAT1"

#7
#columns = ["symbol", "col", "Target"]
#df_ir1_9_only = pd.DataFrame(columns=columns)
#df_ir1_9_only['symbol'] = ir1_9_only
#df_ir1_9_only['col'] = "#e5f5f9"
#df_ir1_9_only['Target'] = "IRF1 + IRF9"

#8
#columns = ["symbol", "col", "Target"]
#df_st2_only = pd.DataFrame(columns=columns)
#df_st2_only['symbol'] = st2_only
#df_st2_only['col'] = "#4eb3d3"
#df_st2_only['Target'] = "STAT2"

#9
#columns = ["symbol", "col", "Target"]
#df_st2_ir1_only = pd.DataFrame(columns=columns)
#df_st2_ir1_only['symbol'] = st2_ir1_only
#df_st2_ir1_only['col'] = "#c994c7"
#df_st2_ir1_only['Target'] = 'STAT2 + IRF1'

# Collect all #s
col_tabl = pd.concat([df_isgf3_irf1, df_s1_2_ir1_only, df_isgf3_only, df_ir1_only, df_st1_2_only_1, df_st1_only,
                     df_ir1_9_only, df_st2_only, df_st2_ir1_only], ignore_index=True)



# col_tabl.to_csv(out_dir + "col_tabl_Modf.tsv", index=False, sep="\t")

In [55]:
# Calculating sum of the elements 

dfs = [df_isgf3_irf1, df_s1_2_ir1_only, df_isgf3_only, df_ir1_only, df_st1_2_only_1, df_st1_only,
                     df_ir1_9_only, df_st2_only, df_st2_ir1_only]

con=[]
for i in dfs:
    cn = i['col'].count()
    con.append(cn)
print('Sum of the elements: ', sum(con))

for i in dfs:
    print('the size: {}'.format(i.shape[0]))

Sum of the elements:  212
the size: 80
the size: 14
the size: 9
the size: 23
the size: 34
the size: 45
the size: 1
the size: 1
the size: 5


In [56]:
# Load color table
col_tabl['Symbol'] = col_tabl['symbol'].str.strip()

col_tabl = col_tabl[['Symbol', 'col', 'Target']]

print(col_tabl.shape)
col_tabl.head(2)

(212, 3)


Unnamed: 0,Symbol,col,Target
0,ACOT7,#ffadad,STAT1+STAT2+IRF9+IRF1
1,ACY3,#ffadad,STAT1+STAT2+IRF9+IRF1


In [57]:
# Load the latest version of motif file

dfm = pd.read_csv("motifs_IFNag_latest.tsv", sep="\t")
print(dfm.shape); print(dfm.columns)
dfm.rename(columns={'Gene':'Symbol'}, inplace=True)
dfm.head(3)

(220, 2)
Index(['Gene', 'Motif'], dtype='object')


Unnamed: 0,Symbol,Motif
0,APOL1,composite
1,ACOT7,ISRE
2,A2M,GAS


In [58]:
# Generate style table file
# Add colour
styl_tbl_a = pd.merge(dfm, col_tabl, on="Symbol", how="inner")
styl_tbl_a.shape

(212, 4)

In [59]:
styl_tbl_a[styl_tbl_a.duplicated(subset=['Symbol'])]

Unnamed: 0,Symbol,Motif,col,Target


In [60]:
styl_tbl_a['Symbol'].nunique()

212

In [61]:
styl_tbl_a['Motif'].unique()

array(['composite', 'ISRE', 'GAS'], dtype=object)

In [62]:
# Add shape

styl_tbl_a.loc[styl_tbl_a['Motif'] == 'GAS', 'Shape'] = 'diamond'
styl_tbl_a.loc[styl_tbl_a['Motif'] == 'ISRE', 'Shape'] = 'ellipse'
styl_tbl_a.loc[styl_tbl_a['Motif'] == 'composite', 'Shape'] = 'rectangle'

styl_tbl_a.head(2)

Unnamed: 0,Symbol,Motif,col,Target,Shape
0,APOL1,composite,#fdffb6,IRF1,rectangle
1,ACOT7,ISRE,#ffadad,STAT1+STAT2+IRF9+IRF1,ellipse


In [63]:
styl_tbl_a.to_csv(out_dir + "Style_table_IFNa.tsv", sep="\t", index=False)

## IFNg

In [4]:

in_dir = 'TFs_4_Networks/'
out_dir = 'TFs_4_Networks/Cytoscape_files/'

In [5]:
# Create a dictionary

path, dirs, files = next(os.walk(in_dir))
Integ_dict_y = {}

for i, df in enumerate(files):
    if fnmatch.fnmatch(files[i], '*IFNy*'):
        Integ_dict_y[files[i].split('.')[0]] = pd.read_csv(in_dir + files[i], sep='\t', header=None, skiprows=[0])
        print('File name is: {}'.format(files[i]))
        
for key, value in Integ_dict_y.items():
    print(key)
    print(value.head(1))
    print(value.shape)

File name is: Integ_Homr_hs_HuhWT_IFNy_IRF1_allPeaksSet_nR_topScore_10_genes.tsv
File name is: Integ_Homr_hs_HuhWT_IFNy_IRF9_allPeaksSet_nR_topScore_5_genes.tsv
File name is: Integ_Homr_hs_HuhWT_IFNy_pSTAT1_allPeaksSet_nR_topScore_20_genes.tsv
Integ_Homr_hs_HuhWT_IFNy_IRF1_allPeaksSet_nR_topScore_10_genes
       0     1
0  ACOT7  ISRE
(126, 2)
Integ_Homr_hs_HuhWT_IFNy_IRF9_allPeaksSet_nR_topScore_5_genes
      0     1
0  ACY3  ISRE
(43, 2)
Integ_Homr_hs_HuhWT_IFNy_pSTAT1_allPeaksSet_nR_topScore_20_genes
     0    1
0  A2M  GAS
(137, 2)


In [6]:
# Change the name of dictionary key
Integ_dict_y['ir1_ov_y'] = Integ_dict_y.pop('Integ_Homr_hs_HuhWT_IFNy_IRF1_allPeaksSet_nR_topScore_10_genes')
Integ_dict_y['ir9_ov_y'] = Integ_dict_y.pop('Integ_Homr_hs_HuhWT_IFNy_IRF9_allPeaksSet_nR_topScore_5_genes')
Integ_dict_y['st1_ov_y'] = Integ_dict_y.pop('Integ_Homr_hs_HuhWT_IFNy_pSTAT1_allPeaksSet_nR_topScore_20_genes')


for key, value in Integ_dict_y.items():
    print(key)
    print(value.head(1))
    print(value.shape)


ir1_ov_y
       0     1
0  ACOT7  ISRE
(126, 2)
ir9_ov_y
      0     1
0  ACY3  ISRE
(43, 2)
st1_ov_y
     0    1
0  A2M  GAS
(137, 2)


# Create node files for each Ab

In [7]:
#IFNy ST1

columns = ["#node1", "node2"]
dfs1 = pd.DataFrame(columns=columns)
dfs1['node2'] = Integ_dict_y['st1_ov_y'][0]
dfs1['#node1'] = "STAT1"

# Saving Network File
dfs1.to_csv(out_dir + "IFNy_ST1_ov.tsv", sep="\t", index=False)

In [8]:
# IFNy IRF1

columns = ["#node1", "node2"]
dfir1 = pd.DataFrame(columns = columns)
dfir1['node2'] = Integ_dict_y['ir1_ov_y'][0]
dfir1['#node1'] = "IRF1"

dfir1.to_csv(out_dir + "IFNy_IRF1_ov.tsv", sep="\t", index=False)

In [9]:
# IFNy IRF9

columns = ["#node1", "node2"]
dfir9 = pd.DataFrame(columns=columns)
dfir9["node2"] = Integ_dict_y['ir9_ov_y'][0]
dfir9["#node1"] = "IRF9"

dfir9.to_csv(out_dir + "IFNy_IRF9_ov.tsv", index= False, sep="\t")

In [10]:
# To confrim
x = [dfs1, dfir1, dfir9]
for i in x:
    print(i.shape)

(137, 2)
(126, 2)
(43, 2)


## Different combinations of TFS for IFNy

In [11]:
from functools import reduce

In [12]:
# Creating dataframes
st1_ov_y = pd.DataFrame(data=Integ_dict_y['st1_ov_y'][0].values, columns=['symbol'])
ir1_ov_y = pd.DataFrame(data=Integ_dict_y['ir1_ov_y'][0].values, columns=['symbol'])
ir9_ov_y = pd.DataFrame(data=Integ_dict_y['ir9_ov_y'][0].values, columns=['symbol'])


In [13]:
# Checking
dfs = [st1_ov_y, ir1_ov_y, ir9_ov_y]
for i in dfs:
    print(i.shape)

(137, 1)
(126, 1)
(43, 1)


In [14]:
# overlapping between ST1, IRF1 and IRF9

dfs = [st1_ov_y, ir1_ov_y, ir9_ov_y]
st1_ir1_9 = reduce(lambda left,right: pd.merge(left,right, on="symbol"), dfs).reset_index(drop=True).iloc[:,0]
st1_ir1_9.shape

#isgf3_only = pd.merge(isgf3, ir1_ov_a, how="left", indicator=True).loc[lambda x:x["_merge"] == "left_only"].reset_index(drop=True).iloc[:,0]
#isgf3_only.shape


(34,)

In [15]:
# overlapping between ST1 and IRF1

dfs = [st1_ov_y, ir1_ov_y]
st1_ir1_only = reduce(lambda left,right: pd.merge(left, right, on="symbol"), dfs).reset_index(drop=True).iloc[:,0]
st1_ir1_only.shape
st1_ir1_only_only = pd.merge(st1_ir1_only, st1_ir1_9, how='left', indicator=True).loc[lambda x: x['_merge'] == "left_only"].iloc[:,0]
#st1_ir1_only = pd.merge(st1_ir1, st2_ov_a, how="left", indicator=True).loc[lambda x:x["_merge"] == "left_only"].iloc[:,0]
#st1_ir1_only.shape
st1_ir1_only_only.shape

(18,)

In [16]:
# overlapping between ST1 and IRF9

dfs = [st1_ov_y, ir9_ov_y]
st1_ir9_only = reduce(lambda left,right: pd.merge(left, right, on="symbol"), dfs).reset_index(drop=True).iloc[:,0]
st1_ir9_only.shape
st1_ir9_only_only = pd.merge(st1_ir9_only, st1_ir1_9, how='left', indicator=True).loc[lambda x: x['_merge'] == "left_only"].iloc[:,0]
#st1_ir1_only = pd.merge(st1_ir1, st2_ov_a, how="left", indicator=True).loc[lambda x:x["_merge"] == "left_only"].iloc[:,0]
#st1_ir1_only.shape
st1_ir9_only_only.shape

(2,)

In [17]:
# overlapping between IRF1 and IRF9

dfs = [ir1_ov_y, ir9_ov_y]
ir1_9 = reduce(lambda left,right: pd.merge(left, right, on="symbol"), dfs).reset_index(drop=True).iloc[:,0]
ir1_9_only = pd.merge(ir1_9, st1_ov_y, how='left', indicator=True).loc[lambda x: x['_merge'] == "left_only"].iloc[:,0]
ir1_9_only.shape

(6,)

In [18]:
st1_ir9_only_only.values

array(['IRF9', 'SERPING1'], dtype=object)

In [19]:
# Finding ST1_only genes
dfs = [ir1_ov_y, ir9_ov_y]

ir1_9 = reduce(lambda left,right: pd.merge(left,right, how="outer"), dfs)

st1_only=pd.merge(st1_ov_y, ir1_9, how="left", indicator=True).loc[lambda x: x['_merge'] == "left_only"].reset_index(drop=True).iloc[:,0]
st1_only.shape

(83,)

In [20]:
# Finding IRF1_only genes
dfs = [st1_ov_y, ir9_ov_y]
st1_ir9 = reduce(lambda left,right : pd.merge(left,right, how="outer"),dfs)

# https://stackoverflow.com/questions/48647534/python-pandas-find-difference-between-two-data-frames
ir1_only=pd.merge(ir1_ov_y, st1_ir9, how="left", indicator=True).loc[lambda x: x['_merge'] != 'both'].reset_index(drop=True).iloc[:,0]
ir1_only.shape

(68,)

In [21]:
# Finding IRF9_only genes
dfs = [st1_ov_y, ir1_ov_y]
st1_ir1 = reduce(lambda left,right : pd.merge(left,right, how="outer"),dfs)

# https://stackoverflow.com/questions/48647534/python-pandas-find-difference-between-two-data-frames
ir9_only=pd.merge(ir9_ov_y, st1_ir1, how="left", indicator=True).loc[lambda x: x['_merge'] != 'both'].reset_index(drop=True).iloc[:,0]
ir9_only.shape

(1,)

# Generating Style Table _ IFNy

In [22]:
# Preparing the colour table ***Modified version***

#1
columns = ["symbol", "col", "Target"]
df_st1_ir1_9 = pd.DataFrame(columns=columns)
df_st1_ir1_9['symbol'] = st1_ir1_9
df_st1_ir1_9['col'] = "#ffadad"
df_st1_ir1_9['Target'] = "STAT1 + IRF1 + IRF9"

#2
columns = ["symbol", "col", "Target"]
df_ir1_only = pd.DataFrame(columns=columns)
df_ir1_only['symbol'] = ir1_only
df_ir1_only['col'] = "#ffd6a5"
df_ir1_only['Target'] = "IRF1"
#3
columns = ["symbol", "col", "Target"]
df_st1_only = pd.DataFrame(columns=columns)
df_st1_only['symbol'] = st1_only
df_st1_only['col'] = "#fdffb6"
df_st1_only['Target'] = "STAT1"

#4
columns = ["symbol", "col", "Target"]
df_st1_ir1_only = pd.DataFrame(columns=columns)
df_st1_ir1_only['symbol'] = st1_ir1_only_only
df_st1_ir1_only['col'] = "#caffbf"
df_st1_ir1_only['Target'] = "STAT1 + IRF1"

#5
columns = ["symbol", "col", "Target"]
df_st1_ir9_only = pd.DataFrame(columns=columns)
df_st1_ir9_only['symbol'] = st1_ir9_only_only
df_st1_ir9_only['col'] = "#9bf6ff"
df_st1_ir9_only['Target'] = "STAT1 + IRF9"

#6 We are Not interested in this group!!
#columns = ["symbol", "col", "Target"]
#df_ir9_only = pd.DataFrame(columns=columns)
#df_ir9_only['symbol'] = ir9_only
#df_ir9_only['col'] = "#e5f5f9"
#df_ir9_only['Target'] = "IRF9"

#7
columns = ["symbol", "col", "Target"]
df_ir1_9_only = pd.DataFrame(columns=columns)
df_ir1_9_only['symbol'] = ir1_9_only
df_ir1_9_only['col'] = "#66c2a4"
df_ir1_9_only['Target'] = "IRF1 + IRF9"


# Collect all #s
col_tabl = pd.concat([df_st1_ir1_9, df_ir1_only, df_st1_only, df_st1_ir1_only, df_st1_ir9_only, df_ir1_9_only ], ignore_index=True)



#col_tabl.to_csv("col_tabl_INFy_Modf.tsv", index=False, sep="\t")

In [23]:
col_tabl['col'].unique()

array(['#ffadad', '#ffd6a5', '#fdffb6', '#caffbf', '#9bf6ff', '#66c2a4'],
      dtype=object)

In [24]:
col_tabl.columns

Index(['symbol', 'col', 'Target'], dtype='object')

In [67]:
# Calculating sum of the elements 

df = [df_st1_ir1_9, df_ir1_only, df_st1_only, df_st1_ir1_only]

con=[]
for i in df:
    cn = i['col'].count()
    con.append(cn)
print('Sum of the elements: ', sum(con))

Sum of the elements:  203


In [25]:
# Load colur table
col_tabl['Symbol'] = col_tabl['symbol'].str.strip()

col_tabl = col_tabl[['Symbol', 'col', 'Target']]

print(col_tabl.shape)
col_tabl.head(2)

(211, 3)


Unnamed: 0,Symbol,col,Target
0,ADAR,#ffadad,STAT1 + IRF1 + IRF9
1,APOL2,#ffadad,STAT1 + IRF1 + IRF9


In [26]:
# Load the latest version of motif file

dfm = pd.read_csv("motifs_IFNag_latest.tsv", sep="\t")
print(dfm.shape); print(dfm.columns)
dfm.rename(columns={'Gene':'Symbol'}, inplace=True)
dfm.head(3)

(220, 2)
Index(['Gene', 'Motif'], dtype='object')


Unnamed: 0,Symbol,Motif
0,APOL1,composite
1,ACOT7,ISRE
2,A2M,GAS


In [28]:
# Generate style table file
# Add colour

styl_tbl_y = pd.merge(dfm, col_tabl, on="Symbol", how="inner")
styl_tbl_y.shape

(211, 4)

In [29]:
styl_tbl_y[styl_tbl_y.duplicated(subset=['Symbol'])]

Unnamed: 0,Symbol,Motif,col,Target


In [30]:
styl_tbl_y['Symbol'].nunique()

211

In [31]:
styl_tbl_y['Motif'].unique()

array(['composite', 'ISRE', 'GAS'], dtype=object)

In [32]:
# Add shape

styl_tbl_y.loc[styl_tbl_y['Motif'] == 'GAS', 'Shape'] = 'diamond'
styl_tbl_y.loc[styl_tbl_y['Motif'] == 'ISRE', 'Shape'] = 'ellipse'
styl_tbl_y.loc[styl_tbl_y['Motif'] == 'composite', 'Shape'] = 'rectangle'

styl_tbl_y.head(2)

Unnamed: 0,Symbol,Motif,col,Target,Shape
0,APOL1,composite,#caffbf,STAT1 + IRF1,rectangle
1,ACOT7,ISRE,#ffd6a5,IRF1,ellipse


In [33]:
styl_tbl_y.to_csv(out_dir + "Style_table_IFNy_rev1.tsv", sep="\t", index=False)

In [50]:
styl_tbl_a.shape

NameError: name 'styl_tbl_a' is not defined

In [51]:
styl_tbl_y.shape

(212, 4)

In [68]:
styl_tbl = pd.concat([styl_tbl_a, styl_tbl_y], ignore_index=True)
styl_tbl.shape

(394, 4)

In [69]:
styl_tbl[styl_tbl.duplicated(subset=['Symbol'])].count()

Symbol    181
Motif     181
col       181
Shape     181
dtype: int64

In [165]:
styl_tbl.to_csv(out_dir + 'Style_table.tsv', sep='\t', index=False)

# Task: Enrichment analysis

In [None]:
g = pd.read_csv("overlap_IFNa_y/jVenn_IFNa_IFNy_GAS.csv")
c = pd.read_csv("overlap_IFNa_y/jVenn_IFNa_IFNy_composite.csv")
i = pd.read_csv("overlap_IFNa_y/jVenn_IFNa_IFNy_ISRE.csv")
g.head(3)

In [None]:
# Finding overlapping elements between two columns
# https://stackoverflow.com/questions/18079563/finding-the-intersection-between-two-series-in-pandas

s1 = pd.Series(g['IFNa_GAS']).dropna()
s2 = pd.Series(g['IFNy_GAS']).dropna()
pd.Series(np.intersect1d(s1,s2))

In [None]:
# saving
g['IFNa_GAS|IFNy_GAS'].to_csv("overlap_IFNa_y/GAS.txt", sep="\t", index=False, header=None)
c['IFNa_comp|IFNy_comp'].to_csv("overlap_IFNa_y/Comp.txt", sep="\t", index=False, header=None)
i['IFNa_ISRE|IFNy_ISRE'].to_csv("overlap_IFNa_y/ISRE.txt", sep="\t", index=False, header=None)