In [10]:
import pandas as pd

# Path to the CATH domain list file (whitespace-separated fields, '#' comment lines)
file_path = "../data/cath-domain-list.txt"

# Column names, in the order the fields appear on each data line
columns = [
    "domain_id",     # 1: CATH domain name
    "class",         # 2: Class number (C-level)
    "architecture",  # 3: Architecture number (A-level)
    "topology",      # 4: Topology number (T-level)
    "homology",      # 5: Homologous superfamily number (H-level)
    "s35",           # 6: S35 sequence cluster number
    "s60",           # 7: S60 sequence cluster number
    "s95",           # 8: S95 sequence cluster number
    "s100",          # 9: S100 sequence cluster number
    "s100_count",    # 10: S100 sequence count
    "length",        # 11: Domain length
    "resolution"     # 12: Structure resolution (in Å)
]

# Read the file and keep only data lines. Each line is stripped *before* the
# comment test so that an indented '#' line is also recognised as a comment
# instead of leaking through as a bogus data row.
with open(file_path, 'r') as f:
    lines = [
        stripped
        for stripped in (raw.strip() for raw in f)
        if stripped and not stripped.startswith('#')
    ]

# Split each line into its whitespace-separated fields
data = [line.split() for line in lines]

# Fail loudly on rows with the wrong number of fields. Without this check a
# row with too few fields can be silently padded with missing values by the
# DataFrame constructor, and a row with too many fields only produces an
# opaque column-count error that does not say which line is at fault.
malformed_lines = [
    line for line, fields in zip(lines, data) if len(fields) != len(columns)
]
if malformed_lines:
    raise ValueError(
        f"{len(malformed_lines)} line(s) in {file_path} do not have "
        f"{len(columns)} fields; first offending line: {malformed_lines[0]!r}"
    )

# Create the DataFrame (all values are still strings at this point)
df = pd.DataFrame(data, columns=columns)

# Convert every column except the string 'domain_id' to a numeric dtype.
# errors='coerce' turns values that cannot be parsed as numbers into NaN.
for col in columns[1:]:
    df[col] = pd.to_numeric(df[col], errors='coerce')

In [11]:
from Bio import SeqIO
import pandas as pd

# Load all records from the FASTA file
sequences = list(SeqIO.parse("../data/cath-domain-seqs.fa.txt", "fasta"))

# Build one {'domain_id', 'sequence'} row per FASTA record.
# Record ids appear to have the form "cath|C_A_T|domainID/range"; the domain
# id is the third '|'-separated part with anything after '/' removed.
seq_data = []
skipped_ids = []  # ids that do not have at least three '|'-separated parts
for record in sequences:
    id_parts = record.id.split('|')
    if len(id_parts) < 3:
        # Keep track of records we cannot interpret instead of dropping
        # them without a trace.
        skipped_ids.append(record.id)
        continue

    # Extract just the domain ID portion (removing any range information)
    domain_id = id_parts[2].split('/')[0]
    seq_data.append({
        'domain_id': domain_id,
        'sequence': str(record.seq),
    })

if skipped_ids:
    print(
        f"Skipped {len(skipped_ids)} FASTA record(s) with an unexpected id "
        f"format, e.g. {skipped_ids[0]!r}"
    )

# Create the sequence DataFrame. The columns are named explicitly so that the
# frame still has a 'domain_id' column when no record was usable; otherwise
# the merge below would fail with a KeyError on an empty, column-less frame.
seq_df = pd.DataFrame(seq_data, columns=['domain_id', 'sequence'])

# Inner-join the domain table with the sequences on 'domain_id'; domains
# without a sequence (and sequences without a domain row) are not kept.
merged_df = pd.merge(df, seq_df, on='domain_id', how='inner')

In [12]:
# Write the merged domain/sequence table to CSV, leaving out the row index.
merged_df.to_csv(
    path_or_buf="../data/domains-and-seqs-merged.csv",
    index=False,
)