In [3]:
import pandas as pd
from pathlib import Path
import protclust

data = Path('../data')

In [3]:
dfs = [pd.read_csv(data / f'ec_{i + 1}_reviewed.tsv', sep='\t') for i in range(7)]
for idx, df in enumerate(dfs):
    df['label'] = idx + 1

In [None]:
df = pd.concat(dfs)
df

In [6]:
# filter by length
filtered_df = df[(df['Sequence'].apply(len) <= 1000) & (df['Sequence'].apply(len) >= 100)]

filtered_df

Unnamed: 0,Entry,Sequence,EC number,label
0,A0A059TC02,MRSVSGQVVCVTGAGGFIASWLVKILLEKGYTVRGTVRNPDDPKNG...,1.2.1.44,1
1,A0A072ULZ1,MEENKKTVDGSVDFTEEQEALVVKSWNAMKNNSCDLSLKFFTKILE...,1.7.2.-,1
2,A0A072VDF2,MPAATAAAAAESSSVSGETICVTGAGGFIASWMVKLLLEKGYTVRG...,1.2.1.-; 1.2.1.44,1
3,A0A075BSX9,MTEKIYDAIVVGAGFSGLVAARELSAQGRSVLIIEARHRLGGRTHV...,1.5.3.5,1
4,A0A075TMP0,MASTTPSTYKQAVFKEQGAGLTLEEVALTLPKRDEILVKVEACGVC...,1.1.1.-,1
...,...,...,...,...
14598,Q9ZYM7,MFINWTMMLISFSIVFLYMFFMTFYFNIFFIFEYNLMSILSFEYKF...,7.1.1.2,7
14599,Q9ZZ38,MYTLISIIGKYISCLPALLIVAFLTISERKTMASMQRRLGQNIVGY...,7.1.1.2,7
14600,Q9ZZ43,MVYFMFIMLVGLILGLMAVASNPSPYFAALGLVVAAGVGCGLLVGH...,7.1.1.2,7
14601,Q9ZZ47,MNLIMSSVAATALISLILAFVAFWLPSLNPDNEKLSPYECGFDPLG...,7.1.1.2,7


In [7]:
cleaned_df = protclust.clean(filtered_df, sequence_col='Sequence')

2025-04-16 01:07:29,827 - protein_clustering - INFO - Cleaning sequences in column 'Sequence' with valid amino acids: ACDEFGHIKLMNPQRSTVWY
2025-04-16 01:07:29,831 - protein_clustering - INFO - Input dataframe has 267755 sequences
2025-04-16 01:07:33,403 - protein_clustering - INFO - Removed 563 sequences with invalid amino acids
2025-04-16 01:07:33,403 - protein_clustering - INFO - Final dataframe has 267192 valid sequences


In [8]:
clustered_df = protclust.cluster(
    cleaned_df, 
    sequence_col='Sequence', 
    id_col='Entry', 
    min_seq_id=0.95, 
    coverage=0.8, 
    alignment_mode=3, 
    random_state=42
    )

2025-04-16 01:07:40,134 - protein_clustering - INFO - Starting sequence clustering with MMseqs2
2025-04-16 01:07:40,135 - protein_clustering - INFO - Parameters: min_seq_id=0.95, coverage=0.8, cov_mode=0, alignment_mode=3, cluster_mode=0, cluster_steps=1, random_state=42
2025-04-16 01:07:40,143 - protein_clustering - INFO - Clustering 267192 sequences
2025-04-16 01:17:20,486 - protein_clustering - INFO - Found 168044 clusters
2025-04-16 01:17:20,705 - protein_clustering - INFO - Clustering complete, added 'representative_sequence' column to DataFrame


In [10]:
clustered_df.to_csv(data / 'clustered_data.csv')

In [4]:
clustered_df = pd.read_csv(data / 'clustered_data.csv')

In [None]:
train_df, test_df = protclust.split(clustered_df, test_size=0.1, random_state=42)

2025-04-16 09:47:00,961 - protein_clustering - INFO - Splitting data by 'representative_sequence' with target test size 0.1
2025-04-16 09:47:00,963 - protein_clustering - INFO - Total sequence count: 267192
2025-04-16 09:47:00,964 - protein_clustering - INFO - Target test count: 26719


In [5]:
# get unique representative sequences
representative_entries = clustered_df['representative_sequence'].unique()
representative_df = clustered_df[clustered_df['Entry'].isin(representative_entries)].copy()

In [None]:
representative_df.reset_index(drop=True)
train_df, test_df = protclust.split(representative_df, test_size=0.1, random_state=42)

2025-04-16 11:36:22,382 - protein_clustering - INFO - Splitting data by 'representative_sequence' with target test size 0.1
2025-04-16 11:36:22,385 - protein_clustering - INFO - Total sequence count: 172134
2025-04-16 11:36:22,387 - protein_clustering - INFO - Target test count: 17213
Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7f2b7a963c70>>
Traceback (most recent call last):
  File "/hpc/group/singhlab/user/mas296/MICROMAMBA/envs/enzyme_diffusion/lib/python3.8/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


KeyboardInterrupt: 

In [7]:
train_df.head(3)

Unnamed: 0.1,Unnamed: 0,Entry,Sequence,EC number,label,representative_sequence
35900,194668,J7LMP2,MDLAKQISVVDSSLQDVTRNITRPLANFHPNVWGDRFLLNNSDQVQ...,4.2.3.100,4,J7LMP2
142168,257730,O21280,MNLTKIQNLTIHNITGIRSNKIIFQNINFSLEKGSLFIIQGSNGSG...,7.6.2.5,7,O21280
114743,37239,P39421,MKYSVMQLKDFKIKSMDASVRASIREELLSEGFNLSEIELLIHCIT...,2.4.2.31,2,P39421


In [None]:
train_df = train_df.reset_index(drop=True).drop(labels=['representative_sequence'], axis=1)
test_df = test_df.reset_index(drop=True).drop(labels=['representative_sequence'], axis=1)

In [9]:
train_df.to_csv(data / 'train_ec_all.csv', index=False)
test_df.to_csv(data / 'test_ec_all.csv', index=False)