In [10]:
import requests
import pandas as pd
import io
import time
import random

# Original query was random but 90% was human taxonomy, let's try to diversify
def download_diverse_uniprot_enzymes(max_entries=30000, output_file="diverse_eukaryotic_enzymes.csv"):
    """
    Download enzyme data from a diverse set of eukaryotic organisms and save as CSV

    Parameters:
    max_entries (int): Maximum number of entries to download
    output_file (str): Output CSV filename
    """
    print("Downloading diverse eukaryotic enzyme data from UniProt...")

    # Define major eukaryotic taxonomic groups to ensure diversity
    eukaryotic_groups = [
        {"name": "Mammals", "id": 40674},
        {"name": "Birds", "id": 8782},
        {"name": "Reptiles", "id": 8504},
        {"name": "Amphibians", "id": 8292},
        {"name": "Fish", "id": 7898},
        {"name": "Insects", "id": 50557},
        {"name": "Arachnids", "id": 6854},
        {"name": "Crustaceans", "id": 6657},
        {"name": "Molluscs", "id": 6447},
        {"name": "Fungi", "id": 4751},
        {"name": "Plants", "id": 33090},
        {"name": "Algae", "id": 3041},
        {"name": "Protists", "id": 2759}  # This is general eukaryotes, will catch others
    ]

    # Entries per taxonomic group
    entries_per_group = max(1000, max_entries // len(eukaryotic_groups))

    all_data = []
    base_url = "https://rest.uniprot.org/uniprotkb/search"

    # Process each taxonomic group separately
    for group in eukaryotic_groups:
        print(f"\nFetching enzymes from {group['name']} (taxonomy_id:{group['id']})...")

        # UniProt allows max 500 entries per request
        batch_size = 500
        offset = 0
        group_entries = 0

        # Add a random sort parameter to get different results each time
        sort_options = ["length desc", "length asc", "accession asc", "accession desc"]
        sort_param = random.choice(sort_options)

        while offset < entries_per_group:
            remaining = min(batch_size, entries_per_group - offset)

            params = {
                'query': f'reviewed:true AND taxonomy_id:{group["id"]} AND ec:*',
                'format': 'tsv',
                'fields': 'accession,id,organism_name,ec,length,protein_name',
                'size': remaining,
                'offset': offset,
                'sort': sort_param
            }

            print(f"  Fetching batch {offset//batch_size + 1} (entries {offset+1}-{offset+remaining})...")
            response = requests.get(base_url, params=params)

            if response.status_code != 200:
                print(f"  Error fetching data: {response.status_code}")
                print(f"  Response content: {response.text[:500]}...")
                break

            # Process batch
            content = io.StringIO(response.text)
            batch_df = pd.read_csv(content, sep='\t')

            if len(batch_df) == 0:
                print("  No more data available for this group.")
                break

            # Add taxonomic group info
            batch_df['taxonomic_group'] = group['name']

            all_data.append(batch_df)
            offset += len(batch_df)
            group_entries += len(batch_df)

            # Sleep briefly to avoid overwhelming the API
            time.sleep(1)

        print(f"  Retrieved {group_entries} entries from {group['name']}")

    # Combine all groups
    if all_data:
        df = pd.concat(all_data, ignore_index=True)
        total_entries = len(df)
        print(f"\nRetrieved a total of {total_entries} diverse eukaryotic enzyme entries")

        # Limit entries per organism for more diversity
        df = limit_entries_per_organism(df)

        # Display organism distribution
        print("\nOrganism distribution (top 15):")
        org_column = 'Organism' if 'Organism' in df.columns else 'organism_name'
        org_counts = df[org_column].value_counts().head(15)
        for org, count in org_counts.items():
            print(f"  {org}: {count} entries")

        # Display taxonomic group distribution
        print("\nTaxonomic group distribution:")
        group_counts = df['taxonomic_group'].value_counts()
        for group, count in group_counts.items():
            print(f"  {group}: {count} entries")

        # Basic cleaning
        df = clean_enzyme_data(df)

        # Save to CSV
        df.to_csv(output_file, index=False)
        print(f"\nData saved to {output_file}")
        return df
    else:
        print("No data was retrieved.")
        return None

def limit_entries_per_organism(df, max_per_organism=200):
    """
    Limit the number of entries per organism to ensure diversity

    Parameters:
    df (pd.DataFrame): Original data
    max_per_organism (int): Maximum entries per organism

    Returns:
    pd.DataFrame: Data with limited entries per organism
    """
    org_column = 'Organism' if 'Organism' in df.columns else 'organism_name'

    # Get organism counts
    org_counts = df[org_column].value_counts()

    # Identify organisms with too many entries
    orgs_to_limit = org_counts[org_counts > max_per_organism].index.tolist()

    if not orgs_to_limit:
        return df

    print(f"\nLimiting {len(orgs_to_limit)} organisms to max {max_per_organism} entries each for better diversity")

    # Create a new dataframe with limited entries
    filtered_rows = []

    # Keep all entries for organisms under the limit
    under_limit = df[~df[org_column].isin(orgs_to_limit)]
    filtered_rows.append(under_limit)

    # Sample entries for organisms over the limit
    for org in orgs_to_limit:
        org_data = df[df[org_column] == org]
        sampled = org_data.sample(max_per_organism, random_state=42)
        filtered_rows.append(sampled)

    result = pd.concat(filtered_rows, ignore_index=True)

    print(f"Original entry count: {len(df)}")
    print(f"After limiting per-organism entries: {len(result)}")

    return result

def clean_enzyme_data(df):
    """
    Basic cleaning of enzyme data

    Parameters:
    df (pd.DataFrame): Raw UniProt data

    Returns:
    pd.DataFrame: Cleaned data
    """
    print("\nCleaning data...")

    # Rename columns for clarity
    rename_dict = {}

    # Handle different possible column names
    if 'Entry' in df.columns:
        rename_dict['Entry'] = 'accession'

    if 'Organism' in df.columns:
        rename_dict['Organism'] = 'organism'
    elif 'organism_name' in df.columns:
        rename_dict['organism_name'] = 'organism'

    if 'EC number' in df.columns:
        rename_dict['EC number'] = 'ec_number'
    elif 'ec' in df.columns:
        rename_dict['ec'] = 'ec_number'

    if 'Length' in df.columns:
        rename_dict['Length'] = 'length'

    if 'Protein names' in df.columns:
        rename_dict['Protein names'] = 'protein_name'
    elif 'protein_name' in df.columns:
        rename_dict['protein_name'] = 'protein_name'

    # Apply renaming
    df = df.rename(columns=rename_dict)

    # Convert length to numeric
    if 'length' in df.columns:
        df['length'] = pd.to_numeric(df['length'], errors='coerce')

        # Create length bins (10 amino acids per bin)
        bin_size = 10
        min_length = 50
        max_length = 600

        # Create bins
        bins = list(range(min_length, max_length + bin_size, bin_size))
        df['length_bin'] = pd.cut(df['length'], bins=bins)

        # Filter for lengths between 50 and 600 amino acids
        df_filtered = df[(df['length'] >= 50) & (df['length'] <= 600)]
        print(f"Filtered from {len(df)} to {len(df_filtered)} entries in the 50-600 aa range")
        df = df_filtered

    # Display length distribution statistics
    print(f"Length range: {df['length'].min()}-{df['length'].max()} amino acids")
    print(f"Mean length: {df['length'].mean():.2f} amino acids")
    print(f"Median length: {df['length'].median():.2f} amino acids")

    return df

# Run the function to download and save diverse data
download_diverse_uniprot_enzymes(max_entries=30000, output_file="diverse_eukaryotic_enzymes.csv")

Downloading diverse eukaryotic enzyme data from UniProt...

Fetching enzymes from Mammals (taxonomy_id:40674)...
  Fetching batch 1 (entries 1-500)...
  Fetching batch 2 (entries 501-1000)...
  Fetching batch 3 (entries 1001-1500)...
  Fetching batch 4 (entries 1501-2000)...
  Fetching batch 5 (entries 2001-2307)...
  Retrieved 2307 entries from Mammals

Fetching enzymes from Birds (taxonomy_id:8782)...
  Fetching batch 1 (entries 1-500)...
  Fetching batch 2 (entries 501-1000)...
  Fetching batch 3 (entries 1001-1500)...
  Fetching batch 4 (entries 1501-2000)...
  Fetching batch 5 (entries 2001-2307)...
  Retrieved 2307 entries from Birds

Fetching enzymes from Reptiles (taxonomy_id:8504)...
  Fetching batch 1 (entries 1-500)...
  Fetching batch 2 (entries 501-1000)...
  Fetching batch 3 (entries 1001-1500)...
  Fetching batch 4 (entries 1501-2000)...
  Fetching batch 5 (entries 2001-2307)...
  Retrieved 2307 entries from Reptiles

Fetching enzymes from Amphibians (taxonomy_id:8292)..

Unnamed: 0,accession,Entry Name,organism,ec_number,length,protein_name,taxonomic_group,length_bin
79,P86215,PRDX6_MESAU,Mesocricetus auratus (Golden hamster),1.11.1.27; 2.3.1.23; 3.1.1.4,50,Peroxiredoxin-6 (EC 1.11.1.27) (1-Cys peroxire...,Mammals,
80,Q9N117,RELN_BOVIN,Bos taurus (Bovine),3.4.21.-,52,Reelin (EC 3.4.21.-),Mammals,"(50.0, 60.0]"
81,P09873,CHYM_FELCA,Felis catus (Cat) (Felis silvestris catus),3.4.23.4,54,Chymosin (EC 3.4.23.4),Mammals,"(50.0, 60.0]"
82,P10626,ACRO_CAPHI,Capra hircus (Goat),3.4.21.10,60,Acrosin (EC 3.4.21.10) [Cleaved into: Acrosin ...,Mammals,"(50.0, 60.0]"
83,P13636,PEPA_URSTH,Ursus thibetanus (Asiatic black bear) (Selenar...,3.4.23.1,60,Pepsin A (EC 3.4.23.1),Mammals,"(50.0, 60.0]"
...,...,...,...,...,...,...,...,...
24124,O01391,UCHL_APLCA,Aplysia californica (California sea hare),3.4.19.12,214,Ubiquitin carboxyl-terminal hydrolase (EC 3.4....,Molluscs,"(210.0, 220.0]"
24125,P21901,KAPL_APLCA,Aplysia californica (California sea hare),2.7.11.1,351,Spermatozoon-associated protein kinase (SAK) (...,Molluscs,"(350.0, 360.0]"
24126,P21901,KAPL_APLCA,Aplysia californica (California sea hare),2.7.11.1,351,Spermatozoon-associated protein kinase (SAK) (...,Molluscs,"(350.0, 360.0]"
24127,P81124,GSTO_APLCA,Aplysia californica (California sea hare),1.20.4.2; 1.8.5.1; 2.5.1.18,92,Probable glutathione transferase (EC 2.5.1.18)...,Molluscs,"(90.0, 100.0]"
