In [1]:
import pandas as pd

In [2]:
# Read in the TRRUST data
raw_data = pd.read_csv('./trrust_rawdata.human.tsv', sep='\t', names=['tf1', 'tf2', 'regulation_type', 'pmid'])

where `tf1` is the TF, `tf2` is the target, `regulation_type` is the regulation type of the TF-target pair, and `pmid` is the PubMed ID(s) of the paper(s) from which TRRUST inferred the interaction.

# Bidirectional TF-Target Pairs

In [3]:
# Make a copy to avoid altering the original raw data DataFrame
df = raw_data.copy()

# Create a new DataFrame with reversed pairs
df_reversed = df[['tf2', 'tf1']].rename(columns={'tf2': 'tf1', 'tf1': 'tf2'})

# Merge the original DataFrame with the reversed pairs DataFrame to identify pairs and their reversed equivalents
merged = pd.merge(df, df_reversed, on=['tf1', 'tf2'], how='inner')

df = pd.DataFrame(merged)

# Create a new column containing tuples of (tf1, tf2) ensuring the pair and its reversed form are represented in the same order
df['pair_tuple'] = df.apply(lambda row: tuple(sorted([row['tf1'], row['tf2']])), axis=1)

# Sort the DataFrame based on the 'pair_tuple' column
df_sorted = df.sort_values(by='pair_tuple')

# Drop the 'pair_tuple' column
df_sorted = df_sorted.drop(columns=['pair_tuple'])

# Exclude self-regulation TFs (rows where tf1 == tf2)
different_directions = df_sorted[df_sorted.tf1 != df_sorted.tf2].reset_index(drop=True)

# Aggregate regulation types and PMIDs of any TF-target duplicates, joining them with a semicolon
processed_different_directions = different_directions.groupby(['tf1', 'tf2']).agg({'regulation_type': lambda x: ';'.join(set(sorted(list(x)))), 'pmid': lambda x: ';'.join(set(sorted(list(x))))}).reset_index()

In [4]:
processed_different_directions

Unnamed: 0,tf1,tf2,regulation_type,pmid
0,AHR,ARNT,Unknown,19255421;8631989
1,AR,CTNNB1,Activation,18535113
2,AR,ETV1,Activation,19789348
3,AR,JUN,Activation,20148354
4,AR,MED1,Unknown,17483314
...,...,...,...,...
201,WT1,CTCF,Unknown,24534946
202,WT1,ETS1,Activation,18212735;20842112
203,WT1,HOXA10,Unknown,23888944
204,WT1,MYB,Repression,7559553


# Self-Regulated TF-Target Pairs

In [5]:
# Create a new DataFrame from the TRRUST raw data where tf1 (TF) and tf2 (target) are the same
same_tfs = raw_data[raw_data.tf1 == raw_data.tf2].reset_index(drop=True)

# Aggregate regulation types and PMIDs of any TF-target duplicates, joining them with a semicolon
processed_same_tfs = same_tfs.groupby(['tf1', 'tf2']).agg({'regulation_type': lambda x: ';'.join(set(sorted(list(x)))), 'pmid': lambda x: ';'.join(set(sorted(list(x))))}).reset_index()

In [6]:
processed_same_tfs

Unnamed: 0,tf1,tf2,regulation_type,pmid
0,BCL6,BCL6,Repression,12407182
1,E2F1,E2F1,Unknown,7958836
2,ERG,ERG,Activation,21536859
3,ESR1,ESR1,Unknown,16636675
4,FLI1,FLI1,Activation,19829305
5,FOS,FOS,Repression,2498646
6,GATA1,GATA1,Unknown;Activation,17628529;20542454;16492768
7,GATA3,GATA3,Unknown,20484083
8,HIF1A,HIF1A,Repression,14744852
9,HOXD9,HOXD9,Activation,7926763


# Standard TF-Target Pairs

In [9]:
# Identify `tf1`-`tf2` duplicates in the raw data
duplicates = raw_data[raw_data.duplicated(subset=['tf1', 'tf2'], keep=False)].reset_index(drop=True)

# Create ambiguous (same directions) DataFrame to later exclude by filtering the previously identified duplicates from different_directions
same_directions = duplicates[~duplicates.isin(different_directions.to_dict(orient='list')).all(axis=1)].reset_index(drop=True)

# Create standard TFs DataFrame
## exclude self-regulated TFs
t = pd.merge(raw_data, same_tfs, how='outer', indicator=True)
t = t[t._merge=='left_only'].drop('_merge', axis=1)

## exclude bidirectional TFs
t = pd.merge(t, different_directions, how='outer', indicator=True)
t = t[t._merge=='left_only'].drop('_merge', axis=1)

## exclude ambiguous TFs
t = pd.merge(t, same_directions, how='outer', indicator=True)
t = t[t._merge=='left_only'].drop('_merge', axis=1)

## aggregate regulation types and PMIDs with ‘;’
processed_t = t.groupby(['tf1', 'tf2']).agg({'regulation_type': lambda x: ';'.join(x), 'pmid': lambda x: ';'.join(x)}).reset_index()

processed_t

Unnamed: 0,tf1,tf2,regulation_type,pmid
0,AATF,BAX,Repression,22909821
1,AATF,CDKN1A,Unknown,17157788
2,AATF,KLK3,Unknown,23146908
3,AATF,MYC,Activation,20549547
4,AATF,TP53,Unknown,17157788
...,...,...,...,...
7385,ZNF76,CDKN1A,Repression,15280358
7386,ZNF76,PCYT1A,Activation,14702349
7387,ZNF76,TALDO1,Unknown,14702349
7388,ZNRD1,ABCB1,Activation,16373708


## Selecting a random sample of 250

In [8]:
# selecting one random sample of 250 TF-target pairs
processed_standard_tfs = processed_t.sample(n=250, replace=False, random_state=42, ignore_index=True)
processed_standard_tfs

Unnamed: 0,tf1,tf2,regulation_type,pmid
0,YY1,VEGFB,Activation,20660740
1,NFIC,CYP1A1,Repression,10490621
2,ETV3,MMP1,Unknown,10913304
3,SPI1,CXCR1,Activation,9867862
4,NR4A1,RNF7,Repression,22159226
...,...,...,...,...
245,RBMX,FOS,Repression,16707624
246,CEBPA,IL5,Unknown,10453008
247,STAT3,NANOG,Activation,21689689
248,VDR,CALB1,Activation,1317496


When choosing 10 different random samples, we set random_state to the current iteration (i.e., 0 to 9).

# Ambiguous TF-Target Pairs

In [10]:
df = raw_data.copy()

# Group by tf and target and count the number of unique `regulation_type`s 
duplicate_pairs = df.groupby(['tf1', 'tf2'])['regulation_type'].nunique()

# Identify tf-target pairs with more than one regulation_type
valid_pairs = duplicate_pairs[duplicate_pairs > 1].index

# Filter the original DataFrame to include only valid pairs
filtered_df = df[df.set_index(['tf1', 'tf2']).index.isin(valid_pairs)]

# Drop duplicates (based on tf, target, and regulation type) from the filtered DataFrame while keeping at least one of each group
result = filtered_df.drop_duplicates(subset=['tf1', 'tf2', 'regulation_type'])

# Drop self-regulated pairs
same_directions = result.drop(index=result[result.tf1 == result.tf2].index)

# Aggregate regulation types and PMIDs of any TF-target duplicates, joining them with a semicolon
result = same_directions.groupby(['tf1', 'tf2']).agg({'regulation_type': lambda x: ';'.join(set(sorted(list(x)))), 'pmid': lambda x: ';'.join(set(sorted(list(x))))}).reset_index()

# Exclude rows in the bidirectional DataFrame (different_directions)
processed_same_directions = result[~result.isin(processed_different_directions.to_dict(orient='list')).all(axis=1)].reset_index(drop=True)

processed_same_directions

Unnamed: 0,tf1,tf2,regulation_type,pmid
0,ABL1,CSF1,Unknown;Activation,23418320;18619508
1,ABL1,JUN,Repression;Activation,19357699;15145216
2,AHR,CYP1A1,Unknown;Repression;Activation,10359656;12376470;15325265;19376845;21357676;2...
3,AHR,CYP1B1,Unknown;Activation,16115918;19287966;12376470;19376845;21742528;2...
4,AHR,IL6,Unknown;Activation,20511231;18483242;23349129
...,...,...,...,...
796,ZBTB16,CCNA2,Unknown;Repression,20731660;14527952
797,ZBTB17,CDKN1A,Unknown;Repression,22000024;16142238
798,ZEB1,CDH1,Unknown;Repression;Activation,22147512;12161443;15311212;23743934;19839049
799,ZFP36,HIF1A,Repression;Activation,19962963
