# Aim: Preparing motif file and IFN-specific genes

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Loading overlapping genes: tab-separated table whose 'elements' column holds
# the genes shared by the IFNa and IFNy lists.
ovr = pd.read_csv('padj05/ovrlp/ovrlp_genes.txt', sep="\t")

# One-column frame ('symbol') of the shared genes with a fresh 0..n-1 index.
ovrlp = pd.DataFrame({'symbol': ovr['elements'].values})

# Optional export (disabled)
#ovrlp.to_csv("ovrlap_genes_a_g.tsv", sep="\t", index=False, header=None)

ovr['elements'].shape

(220,)

In [3]:
# Preview the first two rows to see how the columns of the file were parsed.
ovr.head(2)

Unnamed: 0,Names,total,elements,Unnamed: 3,IFNa,110,AASS,Unnamed: 7,IFNy,88,ANP32E
0,IFNa IFNy,220.0,A2M,,,,ANAPC4,,,,APOL3
1,,,ACOT7,,,,ANKFY1,,,,ARF6


In [22]:
# The IFNa-only and IFNy-only lists sit next to the overlap list in the same
# file, but only the overlap group has a real header (Names/total/elements).
# The first record of the other two groups ("IFNa, 110, AASS" and
# "IFNy, 88, ANP32E") is on the header line, so pandas used the first gene of
# each list as a column name. Put that gene back in front of the 109 / 87
# genes that remain in the column, otherwise it is silently lost.
only_IFa = pd.concat([pd.Series(['AASS']), ovr['AASS'].iloc[0:109]], ignore_index=True)
only_IFy = pd.concat([pd.Series(['ANP32E']), ovr['ANP32E'].iloc[:87]], ignore_index=True)

# The two interferon-specific lists are expected to share no gene ...
print(only_IFa[only_IFa.isin(only_IFy)].shape)
print(only_IFy[only_IFy.isin(only_IFa)].shape)

print('***********')
# ... and no gene of the overlap list should appear in either of them.
print(ovr[ovr['elements'].isin(only_IFa)].shape)
print(ovr[ovr['elements'].isin(only_IFy)].shape)

(0,)
(0,)
***********
(0, 11)
(0, 11)


In [23]:
# Write each interferon-specific gene list as a headerless, index-free TSV.
for genes, out_path in ((only_IFa, 'only_IFa.tsv'), (only_IFy, 'only_IFy.tsv')):
    genes.to_csv(out_path, sep='\t', index=False, header=None)


## IFNa

In [24]:
# Loading motifs: IFNa motif annotation table (tab-separated).
mtf = pd.read_table("padj05/motifs_IFNa.tsv")

# Column names, to see what the table provides.
mtf.columns

Index(['level_0', 'level_1', 'gene', 'motfcol'], dtype='object')

In [25]:
# Distinct colour codes present in the motif table.
mtf['motfcol'].unique()

array(['blue', 'orange', 'green'], dtype=object)

In [26]:
# Distinct motif classes recorded in the level_1 column.
mtf['level_1'].unique()

array(['composite', 'ISRE', 'GAS'], dtype=object)

In [27]:
# Gene column (third column) split by colour code:
# green -> GAS, orange -> ISRE, blue -> composite (COM).
gene_col = mtf.columns[2]
gas, isre, comp = (
    mtf.loc[mtf['motfcol'] == colour, gene_col]
    for colour in ('green', 'orange', 'blue')
)

for genes in (gas, isre, comp):
    print('Size: ', genes.shape)

Size:  (145,)
Size:  (130,)
Size:  (64,)


In [28]:
# Restrict the shared-gene list to each motif class in turn.
ovrlp_gas, ovrlp_isre, ovrlp_comp = (
    ovrlp.loc[ovrlp['symbol'].isin(genes)] for genes in (gas, isre, comp)
)

for subset in (ovrlp_gas, ovrlp_isre, ovrlp_comp):
    print('Size: ', subset['symbol'].shape)

Size:  (90,)
Size:  (82,)
Size:  (50,)


In [29]:
# Checking duplicates: print the repeated symbols of each subset
# (an empty frame means there are none).
for subset in (ovrlp_gas, ovrlp_isre, ovrlp_comp):
    print(subset[subset.duplicated(subset=['symbol'])])

Empty DataFrame
Columns: [symbol]
Index: []
Empty DataFrame
Columns: [symbol]
Index: []
Empty DataFrame
Columns: [symbol]
Index: []


In [30]:
# Finding overlaps: joining on 'symbol' lists the shared genes that are
# annotated with both a GAS and an ISRE motif.
pd.merge(ovrlp_gas, ovrlp_isre, on=['symbol'])

Unnamed: 0,symbol
0,NCOA7
1,NFKBIZ


In [31]:
# Show every motif row recorded for the four genes under review.
mtf[mtf['gene'].isin(['NCOA7', 'NFKBIZ', 'FGB', 'SHISA5'])]

Unnamed: 0,level_0,level_1,gene,motfcol
119,39,GAS,FGB,green
194,65,ISRE,NCOA7,orange
198,67,ISRE,NFKBIZ,orange
221,78,GAS,NCOA7,green
225,80,GAS,NFKBIZ,green
262,99,ISRE,SHISA5,orange


In [32]:
# Removing one of the two annotations of each doubly-annotated gene:
# row 221 (NCOA7, GAS) and row 198 (NFKBIZ, ISRE).
mtf = mtf.drop(index=[221, 198])

In [33]:
# Re-display the rows of the reviewed genes after the removal.
mtf[mtf['gene'].isin(['NCOA7', 'NFKBIZ', 'FGB', 'SHISA5'])]

Unnamed: 0,level_0,level_1,gene,motfcol
119,39,GAS,FGB,green
194,65,ISRE,NCOA7,orange
225,80,GAS,NFKBIZ,green
262,99,ISRE,SHISA5,orange


In [34]:
# Row 119 (FGB): relabel as ISRE / orange.
mtf.at[119, 'level_1'] = 'ISRE'
mtf.at[119, 'motfcol'] = 'orange'

In [35]:
# Row 262 (SHISA5): relabel as composite / blue.
mtf.at[262, 'level_1'] = 'composite'
mtf.at[262, 'motfcol'] = 'blue'


In [36]:
# Row 194 (NCOA7): relabel as composite / blue.
mtf.at[194, 'level_1'] = 'composite'
mtf.at[194, 'motfcol'] = 'blue'


In [37]:
# Confirm the relabelled rows of the reviewed genes.
mtf[mtf['gene'].isin(['NCOA7', 'NFKBIZ', 'FGB', 'SHISA5'])]

Unnamed: 0,level_0,level_1,gene,motfcol
119,39,ISRE,FGB,orange
194,65,composite,NCOA7,blue
225,80,GAS,NFKBIZ,green
262,99,composite,SHISA5,blue


In [38]:
# Genes annotated as both GAS and composite (empty result = none).
# NOTE(review): ovrlp_gas / ovrlp_comp were last built before rows were dropped
# and relabelled in mtf, so this checks the pre-correction annotation; rebuild
# them from the corrected table first if the corrected state is what is meant.
pd.merge(ovrlp_gas, ovrlp_comp, on=["symbol"])

Unnamed: 0,symbol


In [39]:
# Genes annotated as both ISRE and composite (empty result = none).
# NOTE(review): ovrlp_isre / ovrlp_comp were last built before rows were dropped
# and relabelled in mtf, so this checks the pre-correction annotation; rebuild
# them from the corrected table first if the corrected state is what is meant.
pd.merge(ovrlp_isre, ovrlp_comp, on=['symbol'])

Unnamed: 0,symbol


In [40]:
# Keep an independent copy of the corrected IFNa annotations; the name `mtf`
# is reused later for the IFNy table.
mtf_a = mtf.copy(deep=True)

# Saving motif annotations for IFNa (export disabled)
# mtf.to_csv("motifs_IFNa_latest.tsv", index=False, sep="\t")

In [41]:
# Rebuild the per-motif gene lists from the corrected IFNa table.
# Colour codes: green = GAS, orange = ISRE, blue = composite (COM).
by_colour = {
    colour: mtf_a.loc[mtf_a['motfcol'] == colour].iloc[:, 2]
    for colour in ('green', 'orange', 'blue')
}
gas = by_colour['green']
isre = by_colour['orange']
comp = by_colour['blue']

for genes in (gas, isre, comp):
    print('Size: ', genes.shape)

Size:  (143,)
Size:  (128,)
Size:  (66,)


In [42]:
# Shared genes per motif class, using the corrected IFNa gene lists.
shared_symbols = ovrlp['symbol']
ovrlp_gas = ovrlp.loc[shared_symbols.isin(gas)]
ovrlp_isre = ovrlp.loc[shared_symbols.isin(isre)]
ovrlp_comp = ovrlp.loc[shared_symbols.isin(comp)]

for subset in (ovrlp_gas, ovrlp_isre, ovrlp_comp):
    print('Size: ', subset['symbol'].shape)

Size:  (88,)
Size:  (80,)
Size:  (52,)


In [43]:
# Sanity check: the three motif subsets should add up to the 220 shared genes.
# The sum is computed from the frames themselves so it cannot drift from the
# sizes printed by the previous cell.
len(ovrlp_gas) + len(ovrlp_isre) + len(ovrlp_comp)

220

In [279]:
# Saving: one headerless, index-free gene list per motif class (IFNa annotation).
for subset, out_path in (
    (ovrlp_gas, "ovrlp_gas_a.tsv"),
    (ovrlp_isre, "ovrlp_isre_a.tsv"),
    (ovrlp_comp, "ovrlp_comp_a.tsv"),
):
    subset.to_csv(out_path, index=False, sep="\t", header=None)

## IFNg (IFN-gamma; the input files use the "IFNy" spelling)

In [23]:
# Loading motifs: IFNy motif annotation table (tab-separated).
# This rebinds `mtf`, which previously held the IFNa table.
mtf = pd.read_table("padj05/motifs_IFNy.tsv")

# Column names, to see what the table provides.
mtf.columns

Index(['level_0', 'level_1', 'symbol', 'motfcol'], dtype='object')

In [24]:
# Distinct colour codes present in the IFNy motif table.
mtf['motfcol'].unique()

array(['blue', 'green', 'orange'], dtype=object)

In [25]:
# Third column (gene symbols) split by colour code:
# green -> GAS, orange -> ISRE, blue -> composite (COM).
symbol_col = mtf.columns[2]
gas, isre, comp = (
    mtf.loc[mtf['motfcol'] == colour, symbol_col]
    for colour in ('green', 'orange', 'blue')
)

for genes in (gas, isre, comp):
    print('Size: ', genes.shape)

Size:  (141,)
Size:  (123,)
Size:  (55,)


In [26]:
################# Overlapping genes between IFa & IFg
# Restrict the shared-gene list to each IFNy motif class in turn.
ovrlp_gas, ovrlp_isre, ovrlp_comp = (
    ovrlp.loc[ovrlp['symbol'].isin(genes)] for genes in (gas, isre, comp)
)

for subset in (ovrlp_gas, ovrlp_isre, ovrlp_comp):
    print('Size: ', subset.shape)

Size:  (91, 1)
Size:  (89, 1)
Size:  (44, 1)


In [27]:
# Shared genes that carry both a GAS and an ISRE annotation in the IFNy table.
pd.merge(ovrlp_gas, ovrlp_isre, on=['symbol'])

Unnamed: 0,symbol
0,FGB
1,NCOA7
2,NFKBIZ
3,SHISA5


In [28]:
# Show every motif row recorded for the doubly-annotated genes.
mtf[mtf['symbol'].isin(['FGB', 'NCOA7', 'NFKBIZ', 'SHISA5'])]

Unnamed: 0,level_0,level_1,symbol,motfcol
101,33,ISRE,FGB,orange
118,39,GAS,FGB,green
176,60,ISRE,NCOA7,orange
182,63,ISRE,NFKBIZ,orange
207,76,GAS,NCOA7,green
215,80,GAS,NFKBIZ,green
244,94,ISRE,SHISA5,orange
283,114,GAS,SHISA5,green


In [29]:
# Keep one annotation per gene by removing rows 118 (FGB, GAS),
# 207 (NCOA7, GAS), 283 (SHISA5, GAS) and 182 (NFKBIZ, ISRE).
mtf = mtf.drop(index=[118, 207, 283, 182])

In [30]:
# Re-display the rows of the reviewed genes after the removal.
mtf[mtf['symbol'].isin(['FGB', 'NCOA7', 'NFKBIZ', 'SHISA5'])]

Unnamed: 0,level_0,level_1,symbol,motfcol
101,33,ISRE,FGB,orange
176,60,ISRE,NCOA7,orange
215,80,GAS,NFKBIZ,green
244,94,ISRE,SHISA5,orange


In [31]:
# Row 176 (NCOA7): relabel as composite / blue.
mtf.at[176, 'level_1'] = 'composite'
mtf.at[176, 'motfcol'] = 'blue'

In [32]:
# Row 244 (SHISA5): relabel as composite / blue.
mtf.at[244, 'level_1'] = 'composite'
mtf.at[244, 'motfcol'] = 'blue'

In [33]:
# Confirm the relabelled rows of the reviewed genes.
mtf[mtf['symbol'].isin(['FGB', 'NCOA7', 'NFKBIZ', 'SHISA5'])]

Unnamed: 0,level_0,level_1,symbol,motfcol
101,33,ISRE,FGB,orange
176,60,composite,NCOA7,blue
215,80,GAS,NFKBIZ,green
244,94,composite,SHISA5,blue


In [34]:
# Genes annotated as both GAS and composite (empty result = none).
# NOTE(review): ovrlp_gas / ovrlp_comp were last built before rows were dropped
# and relabelled in mtf, so this checks the pre-correction annotation; rebuild
# them from the corrected table first if the corrected state is what is meant.
pd.merge(ovrlp_gas, ovrlp_comp, on=['symbol'])

Unnamed: 0,symbol


In [35]:
# Genes annotated as both ISRE and composite (empty result = none).
# NOTE(review): ovrlp_isre / ovrlp_comp were last built before rows were dropped
# and relabelled in mtf, so this checks the pre-correction annotation; rebuild
# them from the corrected table first if the corrected state is what is meant.
pd.merge(ovrlp_isre, ovrlp_comp, on=['symbol'])

Unnamed: 0,symbol


In [36]:
# Keep an independent copy of the corrected IFNy annotations under its own name.
mtf_g = mtf.copy(deep=True)

# Saving motif annotations for IFNg (export disabled)
# mtf_g.to_csv("motifs_IFNg_latest.tsv", index=False, sep="\t")


In [37]:
# Rebuild the per-motif gene lists from the corrected IFNy table.
# Colour codes: green = GAS, orange = ISRE, blue = composite (COM).
by_colour = {
    colour: mtf_g.loc[mtf_g['motfcol'] == colour].iloc[:, 2]
    for colour in ('green', 'orange', 'blue')
}
gas = by_colour['green']
isre = by_colour['orange']
comp = by_colour['blue']

for genes in (gas, isre, comp):
    print('Size: ', genes.shape)

Size:  (138,)
Size:  (120,)
Size:  (57,)


In [38]:
################# Overlapping genes between IFa & IFg
# Shared genes per motif class, using the corrected IFNy gene lists.
shared_symbols = ovrlp['symbol']
ovrlp_gas = ovrlp.loc[shared_symbols.isin(gas)]
ovrlp_isre = ovrlp.loc[shared_symbols.isin(isre)]
ovrlp_comp = ovrlp.loc[shared_symbols.isin(comp)]

for subset in (ovrlp_gas, ovrlp_isre, ovrlp_comp):
    print('Size: ', subset.shape)

Size:  (88, 1)
Size:  (86, 1)
Size:  (46, 1)


In [39]:
# Sanity check: the three motif subsets should add up to the 220 shared genes.
# The sum is computed from the frames themselves so it cannot drift from the
# sizes printed by the previous cell.
len(ovrlp_gas) + len(ovrlp_isre) + len(ovrlp_comp)

220

In [40]:
########################### IFNg genes
IFNg = pd.read_csv("padj05/IFNy/IFNy_MRGgene.tsv", sep="\t")
print(IFNg.shape)

# For each motif class: rows whose symbol belongs to the class, first column
# only, with repeated values removed.
IFNg_gas, IFNg_isre, IFNg_comp = (
    IFNg.loc[IFNg['symbol'].isin(genes)].iloc[:, 0].drop_duplicates()
    for genes in (gas, isre, comp)
)

for genes in (IFNg_gas, IFNg_isre, IFNg_comp):
    print("Size: ", genes.shape)

(319, 4)
Size:  (134,)
Size:  (117,)
Size:  (57,)


In [298]:
# Saving (disabled): the IFNg per-motif exports below are commented out,
# so running this cell writes nothing.

#IFNg_gas.to_csv("IFNg_gas.tsv", index=False, sep="\t", header=None)
#IFNg_isre.to_csv("IFNg_isre.tsv", index=False, sep="\t", header=None)
#IFNg_comp.to_csv("IFNg_comp.tsv", index=False, sep="\t", header=None)

In [41]:
# Making a unified list for motif annotations
# First compare the sizes of the two corrected tables.
for frame in (mtf_a, mtf_g):
    print("size: ", frame.shape)

size:  (337, 4)
size:  (315, 4)


In [42]:
# The gene column is named differently in the two tables ('gene' vs 'symbol').
for frame in (mtf_a, mtf_g):
    print(frame.columns)

Index(['level_0', 'level_1', 'gene', 'motfcol'], dtype='object')
Index(['level_0', 'level_1', 'symbol', 'motfcol'], dtype='object')


In [43]:
# Join the IFNa table (columns suffixed _x) with the IFNy table (suffixed _y)
# on the gene name; only genes present in both tables are kept.
mtf_ag = mtf_a.merge(mtf_g, left_on='gene', right_on='symbol')
mtf_ag.shape

(236, 8)

In [44]:
# Preview the merged table.
mtf_ag.head(3)

Unnamed: 0,level_0_x,level_1_x,gene,motfcol_x,level_0_y,level_1_y,symbol,motfcol_y
0,0,composite,APOL1,blue,0,composite,APOL1,blue
1,0,ISRE,ACOT7,orange,0,ISRE,ACOT7,orange
2,0,GAS,A2M,green,0,GAS,A2M,green


In [45]:
# True only if the IFNa and IFNy colour columns agree on every row.
mtf_ag['motfcol_x'].equals(mtf_ag['motfcol_y'])

False

In [46]:
# Compare two columns
# https://stackoverflow.com/questions/27474921/compare-two-columns-using-pandas

# Flag each row: 'OK' when both tables give the same colour, 'NOT' otherwise.
same_colour = mtf_ag['motfcol_x'].eq(mtf_ag['motfcol_y'])
mtf_ag['RES'] = np.where(same_colour, 'OK', 'NOT')
mtf_ag.head(3)

Unnamed: 0,level_0_x,level_1_x,gene,motfcol_x,level_0_y,level_1_y,symbol,motfcol_y,RES
0,0,composite,APOL1,blue,0,composite,APOL1,blue,OK
1,0,ISRE,ACOT7,orange,0,ISRE,ACOT7,orange,OK
2,0,GAS,A2M,green,0,GAS,A2M,green,OK


In [50]:
# Count the rows whose gene already appeared earlier in the merged table.
mtf_ag[mtf_ag.duplicated(subset=['gene'])].shape

(16, 9)

In [53]:
# Keep the first row of every gene (original index labels are preserved),
# then confirm that no repeated gene is left.
mtf_ag = mtf_ag.drop_duplicates(subset=['gene'])
mtf_ag.loc[mtf_ag.duplicated(subset=['gene'])].shape

(0, 9)

In [54]:
# Applying corrections in mtf_ag dataset
# Start by listing the genes whose IFNa and IFNy motif colours disagree.

mtf_ag[mtf_ag['RES'] == 'NOT']

Unnamed: 0,level_0_x,level_1_x,gene,motfcol_x,level_0_y,level_1_y,symbol,motfcol_y,RES
31,15,composite,DDX58,blue,20,ISRE,DDX58,orange,NOT
39,18,GAS,BOD1,green,4,composite,BOD1,blue,NOT
57,26,composite,IFI6,blue,41,ISRE,IFI6,orange,NOT
59,27,composite,IFIT5,blue,46,ISRE,IFIT5,orange,NOT
63,30,composite,IL15,blue,47,ISRE,IL15,orange,NOT
107,51,composite,RTP4,blue,90,ISRE,RTP4,orange,NOT
109,52,composite,SAMD9,blue,91,ISRE,SAMD9,orange,NOT
134,62,composite,VTN,blue,136,GAS,VTN,green,NOT


In [55]:
# DDX58 (comp), BOD1 (GAS), IFI6 (ISRE), IFIT5 (ISRE), IL15 (comp), RTP4 (ISRE), SAMD9 (ISRE), VTN (GAS)
# DDX58, BOD1 and IL15 already carry the chosen class on the IFNa (_x) side;
# the remaining five rows are overwritten, addressed by index label:
# 57 = IFI6, 59 = IFIT5, 107 = RTP4, 109 = SAMD9, 134 = VTN.
for row_label, motif, colour in (
    (57, 'ISRE', 'orange'),
    (59, 'ISRE', 'orange'),
    (107, 'ISRE', 'orange'),
    (109, 'ISRE', 'orange'),
    (134, 'GAS', 'green'),
):
    mtf_ag.loc[row_label, ['level_1_x', 'motfcol_x']] = [motif, colour]


In [60]:
# Size of the de-duplicated, corrected table.
mtf_ag.shape

(220, 9)

In [61]:
# Column names available for the final export.
mtf_ag.columns

Index(['level_0_x', 'level_1_x', 'gene', 'motfcol_x', 'level_0_y', 'level_1_y',
       'symbol', 'motfcol_y', 'RES'],
      dtype='object')

In [64]:
# saving Final motif list
# The IFNa-side (_x) columns carry the corrected annotation; give them their
# final names and export just those three columns.
mtf_ag_finl = mtf_ag.rename(
    columns={'level_1_x': 'Motif', 'gene': 'Gene', 'motfcol_x': 'motifCOL'}
)

cols = ['Gene', 'Motif', 'motifCOL']

mtf_ag_finl[cols].to_csv("motifs_IFNag_latest.tsv", index=False, sep="\t")

In [47]:
# Loading motif File: read the exported table back. This rebinds `mtf_ag` to
# the three-column Gene / Motif / motifCOL layout.
mtf_ag = pd.read_csv("motifs_IFNag_latest.tsv", sep='\t')

# Number of exported genes that belong to the shared-gene list.
in_shared = mtf_ag['Gene'].isin(ovr['elements'])
mtf_ag.loc[in_shared].shape

(220, 3)

In [48]:
# Preview the reloaded final motif table.
mtf_ag.head(3)

Unnamed: 0,Gene,Motif,motifCOL
0,APOL1,composite,blue
1,ACOT7,ISRE,orange
2,A2M,GAS,green


In [52]:
# First column (gene names) of the final table, split by motif class:
# GAS, ISRE and composite (COM).
gas, isre, comp = (
    mtf_ag.loc[mtf_ag['Motif'] == motif].iloc[:, 0]
    for motif in ('GAS', 'ISRE', 'composite')
)

for genes in (gas, isre, comp):
    print('Size: ', genes.shape)

Size:  (89,)
Size:  (84,)
Size:  (47,)


In [53]:
# Sanity check: the three motif classes should add up to the 220 genes of the
# final table. The sum is computed from the gene lists themselves so it cannot
# drift from the sizes printed by the previous cell.
len(gas) + len(isre) + len(comp)

220

In [54]:
# Saving: one headerless, index-free gene list per motif class.
for genes, out_path in ((gas, "gas.tsv"), (isre, "isre.tsv"), (comp, "comp.tsv")):
    genes.to_csv(out_path, index=False, sep="\t", header=None)

In [62]:
# Size of the final motif table.
mtf_ag.shape

(220, 3)

## Assigning geometric shapes

In [60]:
# Work on a copy so that adding the Shape column leaves mtf_ag untouched.
mtf_shape = mtf_ag.copy()
mtf_shape.head(3)

Unnamed: 0,Gene,Motif,motifCOL
0,APOL1,composite,blue
1,ACOT7,ISRE,orange
2,A2M,GAS,green


In [61]:
# Geometric shape assigned to each motif class. The row masks are built from
# mtf_shape itself; the name used here before (mtf_test) is not defined
# anywhere in this notebook, so the cell failed on a fresh kernel.
motif_shapes = {'ISRE': 'rectangle', 'GAS': 'diamond', 'composite': 'ellipse'}
for motif, shape in motif_shapes.items():
    mtf_shape.loc[mtf_shape['Motif'] == motif, 'Shape'] = shape

mtf_shape.head(3)

Unnamed: 0,Gene,Motif,motifCOL,Shape
0,APOL1,composite,blue,ellipse
1,ACOT7,ISRE,orange,rectangle
2,A2M,GAS,green,diamond


In [63]:
# Export the motif table including the Shape column (tab-separated, header kept, no index).
mtf_shape.to_csv('motifs_IFNag_latestWshape.tsv', index=False, sep='\t')