### Run definitions

In [None]:
import os
import pandas as pd
import sys

def extract_numeric_part(residue_code):
    """Return the residue number embedded in a residue label as an int.

    The number is read from the last ':'-separated field of the label
    (e.g. "ARG:123" -> 123), so digits that belong to the residue name
    itself (e.g. "NH2:38") are not glued onto the number. Labels without
    a ':' (e.g. "A123"), or whose last field has no digits, fall back to
    using every digit in the label.

    Args:
        residue_code: Residue label string.

    Returns:
        The residue number.

    Raises:
        ValueError: If the label contains no digits at all.
    """
    # Prefer the trailing field: that is where the number sits in
    # "NAME:NUMBER" labels, and it keeps name digits out of the result.
    last_field = residue_code.rsplit(':', 1)[-1]
    digits = ''.join(filter(str.isdigit, last_field))
    if not digits:
        # Fallback for labels that do not follow the "NAME:NUMBER" layout.
        digits = ''.join(filter(str.isdigit, residue_code))
    if not digits:
        raise ValueError(f"no residue number found in {residue_code!r}")
    return int(digits)

def process_tsv_file(file_path, freq):
    """Read one tab-separated contact-frequency file and list its frequent contacts.

    The interaction type is taken from the file name: it is the second
    '_'-separated field of the name without its extension. Files whose
    type is "all" are skipped.

    Args:
        file_path: Path to a three-column TSV file (residue 1, residue 2,
            contact frequency). Lines starting with '#' are ignored.
        freq: Only rows whose contact frequency is strictly greater than
            this value are kept.

    Returns:
        A list of ``[interaction_type, "<NN>%", residue_1, residue_2]``
        rows, ordered by the residue number of ``residue_1``. Empty for
        "all" files.

    Raises:
        IndexError: If the file name contains no underscore.
    """
    stem = os.path.splitext(os.path.basename(file_path))[0]
    interaction_type = stem.split('_')[1]
    if interaction_type == "all":
        return []

    table = pd.read_csv(file_path, sep='\t', comment='#', header=None,
                        names=['residue_1', 'residue_2', 'contact_frequency'])

    # Work on an explicit copy: assigning columns on a filtered slice
    # triggers pandas' SettingWithCopyWarning.
    frequent = table[table['contact_frequency'] > freq].copy()

    # Drop the first two characters of each label
    # (presumably a chain prefix such as "A:" -- TODO confirm).
    frequent['residue_1'] = frequent['residue_1'].str[2:]
    frequent['residue_2'] = frequent['residue_2'].str[2:]

    # Sort on a temporary numeric key; mergesort is stable, so rows that
    # share a residue number keep their file order (reproducible output).
    frequent = (
        frequent
        .assign(_residue_1_num=frequent['residue_1'].apply(extract_numeric_part))
        .sort_values(by='_residue_1_num', kind='mergesort')
        .drop(columns=['_residue_1_num'])
    )

    return [
        [interaction_type, f"{frequency * 100:.0f}%", residue_1, residue_2]
        for residue_1, residue_2, frequency in zip(
            frequent['residue_1'],
            frequent['residue_2'],
            frequent['contact_frequency'],
        )
    ]

In [None]:
# Directory containing .tsv files (from GetContacts)
directory = '/mnt/storage1/adam/CGRP/New/CGRP1-37/ligand_receptor_interactions/'

# Combined output table. Built with os.path.join so the path is still valid
# if `directory` is edited and loses its trailing slash.
output_csv_file = os.path.join(directory, 'interactions.csv')    #don't change

freq_threshold = 0.5        # Frequency threshold (contacts must be strictly above it)
range_start = 462           # residue number of the start of the peptide ligand
range_end = 499             # residue number of the end of the peptide ligand

all_interactions = []
# os.listdir() returns names in arbitrary order; sort them so the collected
# rows come out in the same order on every run.
for filename in sorted(os.listdir(directory)):
    # Change this suffix to analyse individual interaction types (e.g., hp_freq.tsv)
    if filename.endswith("freq.tsv"):
        file_path = os.path.join(directory, filename)
        all_interactions.extend(process_tsv_file(file_path, freq_threshold))

for interaction in all_interactions:
    print(interaction)

In [None]:
# Turn the collected interaction rows into a table and save it as the
# combined CSV (without the index column).
interaction_columns = ['Interaction Type', 'Frequency', 'Residue 1', 'Residue 2']
df_interactions = pd.DataFrame(data=all_interactions, columns=interaction_columns)
df_interactions.to_csv(output_csv_file, index=False)

### If the ligand is at the end of the protein sequence, use these:

In [None]:
# Load the combined interaction table.
df = pd.read_csv(output_csv_file)

# The residue number is the second ':'-separated field of each residue label.
residue1_fields = df['Residue 1'].str.split(':', expand=True)
residue2_fields = df['Residue 2'].str.split(':', expand=True)
residue1_numbers = residue1_fields[1].astype(int)
residue2_numbers = residue2_fields[1].astype(int)

# Rows whose Residue 1 number is below range_start get their two residue
# columns exchanged. to_numpy() strips the labels so values are assigned
# by position rather than re-aligned by column name.
swap_mask = residue1_numbers.lt(range_start)
swapped_pairs = df.loc[swap_mask, ['Residue 2', 'Residue 1']].to_numpy()
df.loc[swap_mask, ['Residue 1', 'Residue 2']] = swapped_pairs

df.to_csv(directory+'reordered.csv', index=False)

In [None]:
# Re-load the reordered table and sort it by the trailing number of Residue 1.
df = pd.read_csv(directory+'reordered.csv')

# Temporary integer sort key: the run of digits at the end of each Residue 1 label.
df['numbers'] = df['Residue 1'].str.extract(r'(\d+)$').astype(int)

# Sort on the key, then leave the helper column out of the saved table.
df_sorted = df.sort_values(by='numbers').drop(columns=['numbers'])

# Change the name of the output .csv file if needed.
df_sorted.to_csv(directory+'Interactions_ordered_by_ligand.csv', index=False)

### If the ligand is in the middle of the protein sequence, use this:

In [None]:
# Load the combined interaction table.
df = pd.read_csv(output_csv_file)

# The residue number is the second ':'-separated field of each residue label.
residue1_fields = df['Residue 1'].str.split(':', expand=True)
residue2_fields = df['Residue 2'].str.split(':', expand=True)
residue1_numbers = residue1_fields[1].astype(int)
residue2_numbers = residue2_fields[1].astype(int)

# Rows whose Residue 1 number lies inside [range_start, range_end] (both ends
# included) get their two residue columns exchanged. to_numpy() strips the
# labels so values are assigned by position.
swap_mask = residue1_numbers.between(range_start, range_end)
swapped_pairs = df.loc[swap_mask, ['Residue 2', 'Residue 1']].to_numpy()
df.loc[swap_mask, ['Residue 1', 'Residue 2']] = swapped_pairs

df.to_csv(directory+'reordered1.csv', index=False)

In [None]:
# Re-load the reordered table and sort it by the trailing number of Residue 2.
df = pd.read_csv(directory+'reordered1.csv')

# Temporary integer sort key: the run of digits at the end of each Residue 2 label.
df['numbers'] = df['Residue 2'].str.extract(r'(\d+)$').astype(int)

# Sort on the key, then leave the helper column out of the saved table.
df_sorted = df.sort_values(by='numbers').drop(columns=['numbers'])

# Change the name of the output .csv file if needed.
df_sorted.to_csv(directory+'all_Interactions_ordered_by_ligand.csv', index=False)