In [3]:
import pandas as pd

# Path to the file
file_path = "../data/cath-domain-list.txt"

# Correct column names as per your description
columns = [
    "domain_id",     # 1: CATH domain name
    "class",         # 2: Class number (C-level)
    "architecture",  # 3: Architecture number (A-level)
    "topology",      # 4: Topology number (T-level)
    "homology",      # 5: Homologous superfamily number (H-level)
    "s35",           # 6: S35 sequence cluster number
    "s60",           # 7: S60 sequence cluster number
    "s95",           # 8: S95 sequence cluster number
    "s100",          # 9: S100 sequence cluster number
    "s100_count",    # 10: S100 sequence count
    "length",        # 11: Domain length
    "resolution"     # 12: Structure resolution (in Å)
]

# Read and filter file (skip comment lines)
with open(file_path, 'r') as f:
    lines = [line.strip() for line in f if not line.startswith('#') and line.strip()]

# Split each line into columns
data = [line.split() for line in lines]

# Create the DataFrame
df = pd.DataFrame(data, columns=columns)

# Convert numeric columns (everything except domain_id) to appropriate numeric types
for col in columns[1:]:  # Skip 'domain_id' which is a string
    df[col] = pd.to_numeric(df[col], errors='coerce')

df.head()

Unnamed: 0,domain_id,class,architecture,topology,homology,s35,s60,s95,s100,s100_count,length,resolution
0,1oaiA00,1,10,8,10,1,1,1,1,1,59,1.0
1,1go5A00,1,10,8,10,1,1,1,1,2,69,999.0
2,3frhA01,1,10,8,10,2,1,1,1,1,58,1.2
3,3friA01,1,10,8,10,2,1,1,1,2,54,1.8
4,3b89A01,1,10,8,10,2,1,1,2,1,54,2.6


In [7]:
df.describe()

Unnamed: 0,class,architecture,topology,homology,s35,s60,s95,s100,s100_count,length,resolution
count,601328.0,601328.0,601328.0,601328.0,601328.0,601328.0,601328.0,601328.0,601328.0,601328.0,601328.0
mean,2.350057,39.072376,302.498571,359.958106,46.563029,1.454586,2.189472,2.354243,39.943593,160.459671,67.138614
std,0.873659,25.869857,464.340722,1314.629248,102.067688,1.059137,6.971045,6.886318,135.702155,87.883097,245.751471
min,1.0,10.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,9.0,0.48
25%,2.0,20.0,40.0,10.0,2.0,1.0,1.0,1.0,2.0,99.0,1.95
50%,3.0,40.0,70.0,10.0,8.0,1.0,1.0,1.0,5.0,136.0,2.39
75%,3.0,60.0,420.0,140.0,40.0,2.0,1.0,2.0,21.0,204.0,2.9
max,6.0,180.0,4200.0,12820.0,1117.0,21.0,211.0,214.0,2278.0,1221.0,1000.0


In [36]:
# First, let's count domains in each homology group
homology_counts = df.groupby(['class', 'architecture', 'topology', 'homology'])['domain_id'].count()

# Filter for homology groups with more than 100 domains
large_homology_groups = homology_counts[homology_counts > 250]

# Get descriptive statistics for these large homology groups
large_homology_stats = large_homology_groups.describe()

large_homology_stats

count      407.000000
mean      1135.840295
std       2830.423429
min        254.000000
25%        360.000000
50%        549.000000
75%       1022.000000
max      49516.000000
Name: domain_id, dtype: float64

In [42]:
df.groupby(['class', 'architecture', 'topology', 'homology', "s100"])['domain_id'].count()


class  architecture  topology  homology  s100
1      10            8         10        1       270
                                         2        58
                                         3         2
                                         4         1
                                         5         1
                                                ... 
6      20            440       10        1        10
                     450       10        1         3
                                         2         1
                               20        1         1
                     460       10        1         1
Name: domain_id, Length: 22251, dtype: int64

In [25]:
from Bio import SeqIO

sequences = list(SeqIO.parse("../data/cath-domain-seqs.fa", "fasta"))

In [27]:
from Bio import SeqIO
import pandas as pd
import re

# Load sequences from the FASTA file
sequences = list(SeqIO.parse("../data/cath-domain-seqs.fa", "fasta"))

# Create a DataFrame from the sequences
seq_data = []
for record in sequences:
    # Extract the domain ID from the sequence ID
    # Format appears to be "cath|C_A_T|domainID/range"
    id_parts = record.id.split('|')
    if len(id_parts) >= 3:
        # Extract just the domain ID portion (removing any range information)
        domain_id = id_parts[2].split('/')[0]
        
        # Add to our data
        seq_data.append({
            'domain_id': domain_id,
            'sequence': str(record.seq),
            'length_from_seq': len(record.seq)
        })

# Create DataFrame from sequence data
seq_df = pd.DataFrame(seq_data)

# Now merge with the original DataFrame
merged_df = pd.merge(df, seq_df, on='domain_id', how='inner')

# Check if the lengths match
merged_df['length_matches'] = merged_df['length'] == merged_df['length_from_seq']

# Display the first few rows
print(f"Total records in original df: {len(df)}")
print(f"Total sequences: {len(seq_df)}")
print(f"Total merged records: {len(merged_df)}")
print(f"Records with matching lengths: {merged_df['length_matches'].sum()}")

# Display a sample of the merged data
merged_df.head()

Total records in original df: 601328
Total sequences: 601328
Total merged records: 601328
Records with matching lengths: 292448


Unnamed: 0,domain_id,class,architecture,topology,homology,s35,s60,s95,s100,s100_count,length,resolution,sequence,length_from_seq,length_matches
0,1oaiA00,1,10,8,10,1,1,1,1,1,59,1.0,PTLSPEQQEMLQAFSTQSGMNLEWSQKCLQDNNWDYTRSAQAFTHL...,59,True
1,1go5A00,1,10,8,10,1,1,1,1,2,69,999.0,PAPTPSSSPVPTLSPEQQEMLQAFSTQSGMNLEWSQKCLQDNNWDY...,69,True
2,3frhA01,1,10,8,10,2,1,1,1,1,58,1.2,YPMNINDALTSILASKKYRALCPDTVRRILTEEWGRHKSPKQTVEA...,58,True
3,3friA01,1,10,8,10,2,1,1,1,2,54,1.8,YPMNINDALTSILASKKYRALCPDTVRRILTEEWGRHKSPKQTVEA...,57,False
4,3b89A01,1,10,8,10,2,1,1,2,1,54,2.6,SLNINDALTSILASKKYRALCPDTVRRILTEEWGRHKSPKQTVEAA...,56,False


In [38]:
merged_df.to_csv("../data/domains-and-seqs.csv", index=False)

In [39]:
merged_df[merged_df["domain_id"] == "1oaiA00"]

Unnamed: 0,domain_id,class,architecture,topology,homology,s35,s60,s95,s100,s100_count,length,resolution,sequence,length_from_seq,length_matches,length_diff
0,1oaiA00,1,10,8,10,1,1,1,1,1,59,1.0,PTLSPEQQEMLQAFSTQSGMNLEWSQKCLQDNNWDYTRSAQAFTHL...,59,True,0


In [30]:
# Analyze the length differences
merged_df['length_diff'] = merged_df['length'] - merged_df['length_from_seq']

# Summary statistics of length differences
print("Length difference statistics:")
print(merged_df['length_diff'].describe())

# Count of exact matches vs. differences
print("\nCount of records by length difference:")
length_diff_counts = merged_df['length_diff'].value_counts().sort_index()
print(length_diff_counts)

# Percentage of records with small differences (e.g., ±5 residues)
small_diff_pct = ((merged_df['length_diff'].abs() <= 5).sum() / len(merged_df)) * 100
print(f"\nPercentage of records with length difference ≤ 5 residues: {small_diff_pct:.2f}%")

# Look at some examples of records with large differences
large_diff = merged_df[merged_df['length_diff'].abs() > 10].sort_values('length_diff', ascending=False).head()
print("\nExamples of records with large length differences:")
print(large_diff[['domain_id', 'class', 'architecture', 'topology', 'homology', 'length', 'length_from_seq', 'length_diff']])

Length difference statistics:
count    601328.000000
mean         -4.598803
std           7.617173
min        -194.000000
25%          -7.000000
50%          -1.000000
75%           0.000000
max          13.000000
Name: length_diff, dtype: float64

Count of records by length difference:
length_diff
-194    1
-185    1
-146    1
-125    1
-108    1
       ..
 2      8
 3      3
 5      8
 6      2
 13     3
Name: count, Length: 110, dtype: int64

Percentage of records with length difference ≤ 5 residues: 71.05%

Examples of records with large length differences:
       domain_id  class  architecture  topology  homology  length  \
544043   3sdiM00      3            60        20        10     233   
544042   3sdi100      3            60        20        10     233   
223175   4nj9H02      2            60        40        10      94   
530538   1pkmA01      3            40      1380        20     176   
530623   2vgbA01      3            40      1380        20     174   

        length_fr