<a href="https://colab.research.google.com/github/nibaskumar93n-debug/Morphoinformatics/blob/main/Eggnogg_mapper2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q biopython pandas requests
!mkdir -p /content/{proteome,non_paralogous,blast_results}
import requests, os, pandas as pd
from Bio import SeqIO

In [2]:
# --- STEP 1: Upload your proteome ---
species_name = "Bacteroides_ovatus"
uploaded_proteome = "/content/proteome/Bacteroides_ovatus.fasta"

if os.path.exists(uploaded_proteome):
    os.rename(uploaded_proteome, f"/content/proteome/{species_name}.fasta")
    proteome_path = f"/content/proteome/{species_name}.fasta"
    print(f"‚úÖ Proteome uploaded: {proteome_path}")
else:
    raise FileNotFoundError("‚ùå Please upload your FASTA file manually in Colab first!")

# --- Count total proteins ---
total_proteins = sum(1 for _ in SeqIO.parse(proteome_path, "fasta"))
print(f"üß© Total proteins in proteome: {total_proteins}")


‚úÖ Proteome uploaded: /content/proteome/Bacteroides_ovatus.fasta
üß© Total proteins in proteome: 6058


In [3]:
# --- STEP 2. Remove paralogous sequences using CD-HIT (60% identity)
!apt-get install -y cd-hit

# --- STEP 2: Remove paralogs using CD-HIT (60% identity) ---
non_paralog_path = f"/content/non_paralogous/{species_name}_nonparalog.fasta"
os.makedirs("/content/non_paralogous", exist_ok=True)
!cd-hit -i "$proteome_path" -o "$non_paralog_path" -c 0.6 -n 4 -d 0 > /dev/null

# --- Count after CD-HIT ---
non_paralog_count = sum(1 for _ in SeqIO.parse(non_paralog_path, "fasta"))
print(f"üß¨ Non-paralogous proteins retained: {non_paralog_count} ({(non_paralog_count/total_proteins)*100:.1f}% retained)")

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
cd-hit is already the newest version (4.8.1-4).
0 upgraded, 0 newly installed, 0 to remove and 41 not upgraded.
üß¨ Non-paralogous proteins retained: 5607 (92.6% retained)


In [4]:
# --- STEP 3: Remove human homologs ---

# 3a. Install BLAST+
!apt-get install -y ncbi-blast+ > /dev/null
# 3a. Download human reference proteome (UniProt)
!wget -q -O /content/human.fasta "https://rest.uniprot.org/uniprotkb/stream?query=proteome:UP000005640&format=fasta"

# 3b. Build human BLAST database
!makeblastdb -in /content/human.fasta -dbtype prot -out /content/human_db > /dev/null

# 3c. Run BLASTp vs human
blast_out = f"/content/blast_results/{species_name}_vs_human.tsv"
os.makedirs("/content/blast_results", exist_ok=True)
!blastp -query "$non_paralog_path" -db /content/human_db -outfmt "6 qseqid sseqid pident evalue qcovs" -evalue 1e-5 -num_threads 2 -out "$blast_out"

print("‚úÖ BLASTp vs Human completed.")

# --- 3d. Filter for non-homologous proteins (‚â§30% identity, ‚â•70% coverage) ---
df_human = pd.read_csv(blast_out, sep="\t", names=["qseqid","sseqid","pident","evalue","qcovs"])
human_homologs = set(df_human[(df_human["pident"] > 30) & (df_human["qcovs"] >= 70)]["qseqid"])
non_homologous_ids = []

for record in SeqIO.parse(non_paralog_path, "fasta"):
    if record.id not in human_homologs:
        non_homologous_ids.append(record.id)

print(f"üö´ Human-homologous proteins removed: {len(human_homologs)}")
print(f"‚úÖ Non-homologous proteins retained: {len(non_homologous_ids)} ({(len(non_homologous_ids)/non_paralog_count)*100:.1f}% retained)")

# --- Save non-homologous FASTA ---
non_hom_fasta = f"/content/{species_name}_nonhomolog.fasta"
with open(non_hom_fasta, "w") as out:
    for record in SeqIO.parse(non_paralog_path, "fasta"):
        if record.id in non_homologous_ids:
            SeqIO.write(record, out, "fasta")

‚úÖ BLASTp vs Human completed.
üö´ Human-homologous proteins removed: 367
‚úÖ Non-homologous proteins retained: 5240 (93.5% retained)


In [None]:
# Unzip DEG10
!gunzip -c /content/DEG10.aa.gz > /content/DEG10.aa.fasta


In [5]:
# --- STEP 4: Predict essential proteins using DEG10 ---
!makeblastdb -in /content/DEG10.aa.fasta -dbtype prot -out /content/deg10_db > /dev/null

blast_deg_out = f"/content/{species_name}_vs_deg10.tsv"
!blastp -query "$non_hom_fasta" -db /content/deg10_db -outfmt "6 qseqid sseqid pident evalue qcovs bitscore" -evalue 1e-5 -num_threads 2 -out "$blast_deg_out"

print("‚úÖ BLASTp vs DEG10 completed.")

FASTA-Reader: Ignoring invalid residues at position(s): On line 91713: 44
FASTA-Reader: Ignoring invalid residues at position(s): On line 102730: 48
FASTA-Reader: Ignoring invalid residues at position(s): On line 110967: 18
FASTA-Reader: Ignoring invalid residues at position(s): On line 112557: 18
FASTA-Reader: Ignoring invalid residues at position(s): On line 112604: 18
FASTA-Reader: Ignoring invalid residues at position(s): On line 112775: 18
FASTA-Reader: Ignoring invalid residues at position(s): On line 113161: 18
FASTA-Reader: Ignoring invalid residues at position(s): On line 113389: 18
FASTA-Reader: Ignoring invalid residues at position(s): On line 113405: 18
FASTA-Reader: Ignoring invalid residues at position(s): On line 113418: 18
FASTA-Reader: Ignoring invalid residues at position(s): On line 113681: 18
FASTA-Reader: Ignoring invalid residues at position(s): On line 113850: 18
FASTA-Reader: Ignoring invalid residues at position(s): On line 114182: 18
FASTA-Reader: Ignoring inv

In [6]:
 # --- STEP 5: Filter essential-like hits (stricter GEPTOP mimic logic) ---Getting too many essential proteins so using stricter threshold
df_deg = pd.read_csv(blast_deg_out, sep="\t", names=["qseqid","sseqid","pident","evalue","qcovs","bitscore"])

# ‚úÖ Tightened thresholds for higher confidence
filtered = df_deg[
    (df_deg["pident"] >= 40) &
    (df_deg["qcovs"] >= 80) &
    (df_deg["bitscore"] >= 100) &
    (df_deg["evalue"] <= 1e-10)
]

# Keep only the best hit per protein
best_hits = filtered.sort_values("evalue").drop_duplicates("qseqid", keep="first")

print(f"‚≠ê Total DEG10 hits passing strict threshold: {len(filtered)}")
print(f"üéØ Unique predicted essential proteins: {len(best_hits)} ({(len(best_hits)/len(non_homologous_ids))*100:.1f}% of non-homologous proteins)")

‚≠ê Total DEG10 hits passing strict threshold: 3485
üéØ Unique predicted essential proteins: 658 (12.6% of non-homologous proteins)


In [7]:
# --- STEP 6: Extract FASTA for essential proteins ---
ids_to_keep = set(best_hits["qseqid"])
output_fasta = f"/content/{species_name}_predicted_essential_revised_threshold.fasta"

with open(output_fasta, "w") as out:
    for record in SeqIO.parse(non_hom_fasta, "fasta"):
        if record.id in ids_to_keep:
            SeqIO.write(record, out, "fasta")

print(f"üíæ FASTA saved: {output_fasta}")

üíæ FASTA saved: /content/Bacteroides_ovatus_predicted_essential_revised_threshold.fasta


In [8]:
# --- STEP 7: Count number of sequences in the output FASTA ---
count = 0
with open(output_fasta, "r") as f:
    for line in f:
        if line.startswith(">"):
            count += 1

print(f"üß¨ Total essential proteins extracted: {count}")


üß¨ Total essential proteins extracted: 658


In [9]:
import pandas as pd
import requests
import time
from tqdm import tqdm
# Install Biopython
!pip install biopython -q

from Bio import SeqIO
import pandas as pd

# Install openpyxl for Excel support
!pip install openpyxl -q

# 1. Load eggnog-mapper output (Excel format)
print("üìÅ Loading eggnog-mapper Excel file...")
df = pd.read_excel('Eggnog_ovatus.xlsx')


üìÅ Loading eggnog-mapper Excel file...


In [10]:
# Check total proteins
total_proteins = len(df)
print(f"Total proteins in file: {total_proteins}")

# 2Ô∏è‚É£ Extract protein-KO mappings
print("\nüîç Extracting KO assignments...")
protein_ko_list = []
assigned_proteins = set()  # Track unique proteins with KOs
unassigned_proteins = set()  # Track unique proteins without KOs

for idx, row in df.iterrows():
    protein = row['query']  # Your column is named 'query'
    ko_field = row['KEGG_ko'] if 'KEGG_ko' in df.columns else ''

    if pd.notna(ko_field) and ko_field != '-' and str(ko_field).strip() != '':
        # Parse KO IDs (format: ko:K00001,ko:K00002 or just K00001,K00002)
        kos = [k.strip().replace('ko:', '') for k in str(ko_field).split(',')]
        kos = [k for k in kos if k.startswith('K')]

        if kos:
            assigned_proteins.add(protein)
            for ko in kos:
                protein_ko_list.append({'Protein': protein, 'KO': ko})
        else:
            unassigned_proteins.add(protein)
    else:
        unassigned_proteins.add(protein)

protein_ko_df = pd.DataFrame(protein_ko_list)

# Count statistics
assigned = len(assigned_proteins)
unassigned = len(unassigned_proteins)

print(f"Columns detected: ['Protein', 'KO']")
print(f"Assigned KO IDs: {assigned}")
print(f"Unassigned proteins: {unassigned}")

Total proteins in file: 656

üîç Extracting KO assignments...
Columns detected: ['Protein', 'KO']
Assigned KO IDs: 443
Unassigned proteins: 213


In [11]:
# 3Ô∏è‚É£ Remove NA and get unique KO IDs
ko_list = protein_ko_df['KO'].unique().tolist()

# 4Ô∏è‚É£ Map each KO to KEGG pathways via KEGG REST API
def get_pathways_for_ko(ko):
    url = f"https://rest.kegg.jp/link/pathway/ko:{ko}"
    try:
        res = requests.get(url)
        if res.status_code == 200 and res.text.strip():
            lines = res.text.strip().split("\n")
            pathways = []
            for l in lines:
                parts = l.split("\t")
                if len(parts) > 1:  # only if both columns exist
                    pathways.append(parts[1].replace("path:", ""))
            return pathways
    except Exception as e:
        pass
    return []

ko_to_path = {}
for ko in tqdm(ko_list, desc="Mapping KO ‚Üí Pathway"):
    ko_to_path[ko] = get_pathways_for_ko(ko)
    time.sleep(0.1)  # Rate limiting

# 5Ô∏è‚É£ Create DataFrame of KO ‚Üí Pathway
path_df = (
    pd.DataFrame([(ko, p) for ko, plist in ko_to_path.items() for p in plist],
                 columns=["KO", "Pathway"])
)

# 6Ô∏è‚É£ Identify KO IDs with no pathway mapping
mapped_kos = set(path_df["KO"])
unmapped_kos = [ko for ko in ko_list if ko not in mapped_kos]
print(f"\nKO-assigned proteins with NO pathway mapping: {len(unmapped_kos)}")

# 7Ô∏è‚É£ Download human pathway list - CORRECTED
print("\nüß¨ Fetching human pathways...")
human_pathways = requests.get("http://rest.kegg.jp/list/pathway/hsa").text

# FIX: Properly extract pathway numbers (remove "path:" and "hsa" prefix)
human_path_numbers = [
    line.split("\t")[0].replace("path:", "").replace("hsa", "")
    for line in human_pathways.strip().split("\n")
]

# 8Ô∏è‚É£ Extract pathway numbers from bacterial pathways and check if shared
path_df["Pathway_Number"] = path_df["Pathway"].str.extract(r'(\d{5})')
path_df["Shared_with_Human"] = path_df["Pathway_Number"].isin(human_path_numbers)

shared = path_df[path_df["Shared_with_Human"]].Pathway.nunique()
unique = path_df[~path_df["Shared_with_Human"]].Pathway.nunique()

print(f"\nüß≠ Pathway summary:")
print(f"Total distinct pathways: {path_df.Pathway.nunique()}")
print(f"Shared with Human: {shared}")
print(f"Unique bacterial: {unique}")

# 9Ô∏è‚É£ Save all results
path_df.to_csv("/content/eggNOG_pathway_analysis.csv", index=False)
print("\n‚úÖ All results saved to: /content/eggNOG_pathway_analysis.csv")

# üîü Save shared pathways
shared_df = path_df[path_df["Shared_with_Human"]]
shared_df.to_csv("/content/shared_with_human_pathways.csv", index=False)
print(f"üß¨ Shared pathways saved: /content/shared_with_human_pathways.csv ({shared} pathways)")

# 1Ô∏è‚É£1Ô∏è‚É£ Save unique bacterial pathways
unique_df = path_df[~path_df["Shared_with_Human"]]
unique_df.to_csv("/content/unique_bacterial_pathways_eggnog.csv", index=False)
print(f"ü¶† Unique bacterial pathways saved: /content/unique_bacterial_pathways_eggnog.csv ({unique} pathways)")


Mapping KO ‚Üí Pathway: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 431/431 [02:19<00:00,  3.08it/s]



KO-assigned proteins with NO pathway mapping: 137

üß¨ Fetching human pathways...

üß≠ Pathway summary:
Total distinct pathways: 214
Shared with Human: 152
Unique bacterial: 62

‚úÖ All results saved to: /content/eggNOG_pathway_analysis.csv
üß¨ Shared pathways saved: /content/shared_with_human_pathways.csv (152 pathways)
ü¶† Unique bacterial pathways saved: /content/unique_bacterial_pathways_eggnog.csv (62 pathways)


In [12]:
import pandas as pd

# --- Input files ---
eggnog_file = "/content/Eggnog_ovatus.xlsx"                      # eggNOG-mapper output (Excel format)
pathway_file = "/content/eggNOG_pathway_analysis.csv"      # KO ‚Üí Pathway mapping (comma-delimited)

# --- Load data ---
print("üìÅ Loading eggnog-mapper Excel file...")
df = pd.read_excel(eggnog_file)

# Extract protein-KO mappings
print("\nüîç Extracting KO assignments...")
protein_ko_list = []
assigned_proteins = set()
unassigned_proteins = set()

for idx, row in df.iterrows():
    protein = row['query']
    ko_field = row['KEGG_ko'] if 'KEGG_ko' in df.columns else ''

    if pd.notna(ko_field) and ko_field != '-' and str(ko_field).strip() != '':
        # Parse KO IDs (format: ko:K00001,ko:K00002 or just K00001,K00002)
        kos = [k.strip().replace('ko:', '') for k in str(ko_field).split(',')]
        kos = [k for k in kos if k.startswith('K')]

        if kos:
            assigned_proteins.add(protein)
            for ko in kos:
                protein_ko_list.append({'Protein': protein, 'KO': ko})
        else:
            unassigned_proteins.add(protein)
    else:
        unassigned_proteins.add(protein)

# Create mapping dataframe
mapping_df = pd.DataFrame(protein_ko_list)

# Load pathway analysis
path_df = pd.read_csv(pathway_file)

# --- Summary: Assigned vs Unassigned KOs ---
assigned = len(assigned_proteins)
unassigned = len(unassigned_proteins)

# --- KO ‚Üí pathway mapping ---
ko_list = mapping_df["KO"].unique().tolist()
ko_with_no_pathway = [ko for ko in ko_list if ko not in path_df["KO"].unique()]
num_no_pathway = len(ko_with_no_pathway)

# --- Filter only unique bacterial pathways ---
unique_pathways_df = path_df[path_df["Shared_with_Human"] == False]

# --- Merge protein ‚Üí KO with KO ‚Üí pathway ---
merged_df = pd.merge(mapping_df, unique_pathways_df, on="KO", how="inner")
merged_df = merged_df.drop_duplicates(subset=["Protein", "KO", "Pathway"])

# --- Save merged protein ‚Üí KO ‚Üí pathway CSV ---
output_file = "/content/Bacteroides_ovatus_eggNOG_merged_information.csv"
merged_df.to_csv(output_file, index=False)

# --- Save summary info ---
summary_file = "/content/Bacteroides_ovatus_eggNOG_KO_summary.csv"
summary_df = pd.DataFrame({
    "Metric": ["Assigned KO IDs", "Unassigned proteins", "KO-assigned proteins with NO pathway mapping",
               "Total distinct pathways", "Shared with Human", "Unique bacterial pathways"],
    "Count": [assigned, unassigned, num_no_pathway,
              path_df["Pathway"].nunique(),
              path_df[path_df["Shared_with_Human"]].Pathway.nunique(),
              unique_pathways_df.Pathway.nunique()]
})
summary_df.to_csv(summary_file, index=False)

# --- Print info ---
print(f"\n‚úÖ Merged protein ‚Üí KO ‚Üí pathway file saved: {output_file}")
print(f"‚úÖ Summary file saved: {summary_file}")
print("\nüìä Summary:")
print(summary_df)

üìÅ Loading eggnog-mapper Excel file...

üîç Extracting KO assignments...

‚úÖ Merged protein ‚Üí KO ‚Üí pathway file saved: /content/Bacteroides_ovatus_eggNOG_merged_information.csv
‚úÖ Summary file saved: /content/Bacteroides_ovatus_eggNOG_KO_summary.csv

üìä Summary:
                                         Metric  Count
0                               Assigned KO IDs    443
1                           Unassigned proteins    213
2  KO-assigned proteins with NO pathway mapping    137
3                       Total distinct pathways    214
4                             Shared with Human    152
5                     Unique bacterial pathways     62


In [14]:
import pandas as pd

# --- Load eggNOG-mapper results ---
print("üìÅ Loading eggnog-mapper Excel file...")
df = pd.read_excel('/content/Eggnog_ovatus.xlsx')

# Extract protein-KO mappings
print("\nüîç Extracting KO assignments...")
protein_ko_list = []

for idx, row in df.iterrows():
    protein = row['query']
    ko_field = row['KEGG_ko'] if 'KEGG_ko' in df.columns else ''

    if pd.notna(ko_field) and ko_field != '-' and str(ko_field).strip() != '':
        # Parse KO IDs (format: ko:K00001,ko:K00002 or just K00001,K00002)
        kos = [k.strip().replace('ko:', '') for k in str(ko_field).split(',')]
        kos = [k for k in kos if k.startswith('K')]

        if kos:
            for ko in kos:
                protein_ko_list.append({'Protein': protein, 'KO': ko})

# Create mapping dataframe (protein ‚Üî KO)
eggnog_df = pd.DataFrame(protein_ko_list)

# --- Load KO ‚Üî Pathway data (from your previous analysis) ---
path_df = pd.read_csv("/content/eggNOG_pathway_analysis.csv")

# --- Filter for unique bacterial pathways ---
unique_df = path_df[path_df["Shared_with_Human"] == False]

# --- Get list of unique KO IDs ---
unique_kos = unique_df["KO"].unique()

# --- Subset proteins belonging to those KOs ---
unique_proteins = eggnog_df[eggnog_df["KO"].isin(unique_kos)]

# --- Remove duplicates (in case same protein has multiple unique KOs) ---
unique_proteins = unique_proteins.drop_duplicates()

# --- Save list of unique proteins ---
unique_proteins.to_csv("/content/Bacteroides_ovatus_eggNOG_unique_pathway_proteins.csv", index=False)
print(f"‚úÖ Unique proteins saved: {unique_proteins.shape[0]} protein-KO pairs")
print(f"üìä Unique protein IDs: {unique_proteins['Protein'].nunique()}")

üìÅ Loading eggnog-mapper Excel file...

üîç Extracting KO assignments...
‚úÖ Unique proteins saved: 166 protein-KO pairs
üìä Unique protein IDs: 155


In [15]:
from Bio import SeqIO
import pandas as pd

# --- INPUT FILES ---
fasta_file = "/content/Bacteroides_ovatus_predicted_essential_revised_threshold.fasta"
pathway_file = "/content/eggNOG_pathway_analysis.csv"
eggnog_file = "/content/Eggnog_ovatus.xlsx"

# --- LOAD DATA ---
print("üìÅ Loading files...")
path_df = pd.read_csv(pathway_file)

# Load eggNOG-mapper results and extract protein-KO mappings
df = pd.read_excel(eggnog_file)
protein_ko_list = []

for idx, row in df.iterrows():
    protein = row['query']
    ko_field = row['KEGG_ko'] if 'KEGG_ko' in df.columns else ''

    if pd.notna(ko_field) and ko_field != '-' and str(ko_field).strip() != '':
        # Parse KO IDs
        kos = [k.strip().replace('ko:', '') for k in str(ko_field).split(',')]
        kos = [k for k in kos if k.startswith('K')]

        if kos:
            for ko in kos:
                protein_ko_list.append({'protein': protein, 'KO': ko})

mapping_df = pd.DataFrame(protein_ko_list)

# Get KOs that are NOT shared with human (unique bacterial)
unique_kos = path_df.loc[path_df["Shared_with_Human"] == False, "KO"].unique().tolist()

# Get protein IDs associated with those unique KOs
unique_proteins = mapping_df[mapping_df["KO"].isin(unique_kos)]["protein"].unique().tolist()

print(f"‚úÖ Unique bacterial KOs: {len(unique_kos)}")
print(f"‚úÖ Corresponding protein IDs: {len(unique_proteins)}")

# --- FILTER FASTA ---
output_fasta = "/content/Bacteroides_ovatus_eggNOG_unique_pathway_proteins.fasta"
count = 0

with open(output_fasta, "w") as out_f:
    for record in SeqIO.parse(fasta_file, "fasta"):
        if any(pid in record.id for pid in unique_proteins):
            SeqIO.write(record, out_f, "fasta")
            count += 1

print(f"\nüéØ Unique-pathway protein sequences saved: {output_fasta}")
print(f"Total sequences written: {count}")

üìÅ Loading files...
‚úÖ Unique bacterial KOs: 154
‚úÖ Corresponding protein IDs: 155

üéØ Unique-pathway protein sequences saved: /content/Bacteroides_ovatus_eggNOG_unique_pathway_proteins.fasta
Total sequences written: 155


In [16]:
# Install Biopython
!pip install biopython -q

from Bio import SeqIO
import pandas as pd

# --- INPUT FILES ---
fasta_file = "/content/Bacteroides_ovatus_predicted_essential_revised_threshold.fasta"
pathway_file = "/content/eggNOG_pathway_analysis.csv"
eggnog_file = "/content/Eggnog_ovatus.xlsx"

# --- LOAD DATA ---
print("üìÅ Loading files...")
path_df = pd.read_csv(pathway_file)

# Load eggNOG-mapper results (keep ALL columns)
df = pd.read_excel(eggnog_file)
print(f"üìä Columns in eggNOG file: {df.columns.tolist()}")

# Extract protein-KO mappings while keeping full rows
protein_ko_list = []

for idx, row in df.iterrows():
    protein = row['query']
    ko_field = row['KEGG_ko'] if 'KEGG_ko' in df.columns else ''

    if pd.notna(ko_field) and ko_field != '-' and str(ko_field).strip() != '':
        # Parse KO IDs
        kos = [k.strip().replace('ko:', '') for k in str(ko_field).split(',')]
        kos = [k for k in kos if k.startswith('K')]

        if kos:
            for ko in kos:
                # Create a dict with ALL original columns plus the individual KO
                row_dict = row.to_dict()
                row_dict['KO_individual'] = ko  # Add parsed KO as separate column
                protein_ko_list.append(row_dict)

# Create dataframe with all columns
mapping_df = pd.DataFrame(protein_ko_list)

# Get KOs that are NOT shared with human (unique bacterial)
unique_kos = path_df.loc[path_df["Shared_with_Human"] == False, "KO"].unique().tolist()

# Filter for unique bacterial pathways
unique_proteins_df = mapping_df[mapping_df["KO_individual"].isin(unique_kos)].copy()
unique_proteins_list = unique_proteins_df["query"].unique().tolist()

print(f"\n‚úÖ Unique bacterial KOs: {len(unique_kos)}")
print(f"‚úÖ Corresponding protein IDs: {len(unique_proteins_list)}")
print(f"‚úÖ Total protein-KO associations: {len(unique_proteins_df)}")


# Also save as CSV for easier viewing
output_csv = "/content/Bacteroides_ovatus_eggNOG_unique_pathway_proteins_full_annotation.csv"
unique_proteins_df.to_csv(output_csv, index=False)
print(f"üìä CSV version saved: {output_csv}")

# --- FILTER FASTA ---
output_fasta = "/content/Bacteroides_ovatus_eggNOG_unique_pathway_proteins.fasta"
count = 0

with open(output_fasta, "w") as out_f:
    for record in SeqIO.parse(fasta_file, "fasta"):
        if any(pid in record.id for pid in unique_proteins_list):
            SeqIO.write(record, out_f, "fasta")
            count += 1

print(f"\nüéØ Unique-pathway protein sequences saved: {output_fasta}")
print(f"Total sequences written: {count}")

# --- SUMMARY ---
print(f"\n{'='*60}")
print("üìã SUMMARY OF OUTPUT FILES:")
print(f"{'='*60}")
print(f"2. {output_csv}")
print(f"   ‚Üí Same data in CSV format")
print(f"3. {output_fasta}")
print(f"   ‚Üí FASTA sequences of unique pathway proteins")
print(f"{'='*60}")

üìÅ Loading files...
üìä Columns in eggNOG file: ['query', 'seed_ortholog', 'evalue', 'score', 'eggNOG_OGs', 'max_annot_lvl', 'COG_category', 'Description', 'Preferred_name', 'GOs', 'EC', 'KEGG_ko', 'KEGG_Pathway', 'KEGG_Module', 'KEGG_Reaction', 'KEGG_rclass', 'BRITE', 'KEGG_TC', 'CAZy', 'BiGG_Reaction', 'PFAMs']

‚úÖ Unique bacterial KOs: 154
‚úÖ Corresponding protein IDs: 155
‚úÖ Total protein-KO associations: 166
üìä CSV version saved: /content/Bacteroides_ovatus_eggNOG_unique_pathway_proteins_full_annotation.csv

üéØ Unique-pathway protein sequences saved: /content/Bacteroides_ovatus_eggNOG_unique_pathway_proteins.fasta
Total sequences written: 155

üìã SUMMARY OF OUTPUT FILES:
2. /content/Bacteroides_ovatus_eggNOG_unique_pathway_proteins_full_annotation.csv
   ‚Üí Same data in CSV format
3. /content/Bacteroides_ovatus_eggNOG_unique_pathway_proteins.fasta
   ‚Üí FASTA sequences of unique pathway proteins


In [17]:
import pandas as pd

# --- LOAD DATA ---
print("üìÅ Loading files...")
pathway_file = "/content/eggNOG_pathway_analysis.csv"
eggnog_file = "/content/Eggnog_ovatus.xlsx"

path_df = pd.read_csv(pathway_file)
df = pd.read_excel(eggnog_file)

# Extract protein-KO mappings
protein_ko_list = []

for idx, row in df.iterrows():
    protein = row['query']
    ko_field = row['KEGG_ko'] if 'KEGG_ko' in df.columns else ''

    if pd.notna(ko_field) and ko_field != '-' and str(ko_field).strip() != '':
        # Parse KO IDs
        kos = [k.strip().replace('ko:', '') for k in str(ko_field).split(',')]
        kos = [k for k in kos if k.startswith('K')]

        if kos:
            protein_ko_list.append({'protein': protein, 'KOs': kos})

# Get unique bacterial KOs
unique_kos = path_df.loc[path_df["Shared_with_Human"] == False, "KO"].unique().tolist()
print(f"‚úÖ Unique bacterial KOs from pathway analysis: {len(unique_kos)}")

# Find proteins that have at least ONE unique bacterial KO
proteins_with_unique_ko = []
for item in protein_ko_list:
    protein = item['protein']
    kos = item['KOs']
    # Check if this protein has ANY unique bacterial KO
    if any(ko in unique_kos for ko in kos):
        proteins_with_unique_ko.append(protein)

# Get unique protein list
unique_protein_list = list(set(proteins_with_unique_ko))
print(f"‚úÖ Proteins with at least one unique bacterial KO: {len(unique_protein_list)}")

# Now filter the ORIGINAL eggNOG dataframe to keep only these proteins
# This preserves ONE row per protein with ALL original annotations
filtered_df = df[df['query'].isin(unique_protein_list)].copy()

# Add a column showing which of their KOs are unique to bacteria
def get_unique_kos_for_protein(ko_field):
    if pd.notna(ko_field) and ko_field != '-' and str(ko_field).strip() != '':
        kos = [k.strip().replace('ko:', '') for k in str(ko_field).split(',')]
        kos = [k for k in kos if k.startswith('K')]
        unique_only = [ko for ko in kos if ko in unique_kos]
        return ','.join(unique_only) if unique_only else ''
    return ''

filtered_df['Unique_Bacterial_KOs'] = filtered_df['KEGG_ko'].apply(get_unique_kos_for_protein)

print(f"\n‚úÖ Final filtered proteins: {len(filtered_df)}")
print(f"‚úÖ This matches our unique protein count: {len(unique_protein_list)}")

# --- SAVE RESULTS ---
output_excel = "/content/Bacteroides_ovatus_eggNOG_unique_pathway_proteins_ONE_ROW_PER_PROTEIN.xlsx"
filtered_df.to_excel(output_excel, index=False)

output_csv = "/content/Bacteroides_ovatus_eggNOG_unique_pathway_proteins_ONE_ROW_PER_PROTEIN.csv"
filtered_df.to_csv(output_csv, index=False)

print(f"\nüìä Files saved:")
print(f"  - {output_excel}")
print(f"  - {output_csv}")

# Show sample
print(f"\n{'='*60}")
print("üìã SAMPLE DATA (ONE ROW PER PROTEIN):")
print(f"{'='*60}")
important_cols = ['query', 'Preferred_name', 'KEGG_ko', 'Unique_Bacterial_KOs', 'Description']
available_cols = [col for col in important_cols if col in filtered_df.columns]
print(filtered_df[available_cols].head(10))

# Verification
print(f"\n{'='*60}")
print("‚úÖ VERIFICATION:")
print(f"{'='*60}")
print(f"Expected: 152 unique proteins")
print(f"Got: {len(filtered_df)} proteins")
print(f"Match: {'YES ‚úì' if len(filtered_df) == 152 else 'NO - needs investigation'}")

üìÅ Loading files...
‚úÖ Unique bacterial KOs from pathway analysis: 154
‚úÖ Proteins with at least one unique bacterial KO: 155

‚úÖ Final filtered proteins: 155
‚úÖ This matches our unique protein count: 155

üìä Files saved:
  - /content/Bacteroides_ovatus_eggNOG_unique_pathway_proteins_ONE_ROW_PER_PROTEIN.xlsx
  - /content/Bacteroides_ovatus_eggNOG_unique_pathway_proteins_ONE_ROW_PER_PROTEIN.csv

üìã SAMPLE DATA (ONE ROW PER PROTEIN):
                             query Preferred_name              KEGG_ko  \
0   tr|A0A139KRT6|A0A139KRT6_BACOV            pfp  ko:K00895,ko:K21071   
1   tr|A0A139L0C5|A0A139L0C5_BACOV          ribBA            ko:K14652   
2   tr|A0A139L8I8|A0A139L8I8_BACOV            asd            ko:K00133   
3   tr|A0A1G6GBI4|A0A1G6GBI4_BACOV           fabZ            ko:K16363   
4   tr|A0A5M5D3S3|A0A5M5D3S3_BACOV           murE            ko:K01928   
6   tr|A0A139KLL4|A0A139KLL4_BACOV           hisH            ko:K02501   
7   tr|A0A139KLM4|A0A139KLM4_BACOV  

In [18]:
import pandas as pd

# Load your filtered eggNOG file
df = pd.read_excel("/content/Bacteroides_ovatus_eggNOG_unique_pathway_proteins_ONE_ROW_PER_PROTEIN.xlsx")

# Show all column names
print("Available columns:")
print(df.columns.tolist())

# Show first few rows to see gene names
print("\nFirst 5 rows:")
print(df[['query', 'Preferred_name', 'Description']].head())  # Adjust column names as needed

Available columns:
['query', 'seed_ortholog', 'evalue', 'score', 'eggNOG_OGs', 'max_annot_lvl', 'COG_category', 'Description', 'Preferred_name', 'GOs', 'EC', 'KEGG_ko', 'KEGG_Pathway', 'KEGG_Module', 'KEGG_Reaction', 'KEGG_rclass', 'BRITE', 'KEGG_TC', 'CAZy', 'BiGG_Reaction', 'PFAMs', 'Unique_Bacterial_KOs']

First 5 rows:
                            query Preferred_name  \
0  tr|A0A139KRT6|A0A139KRT6_BACOV            pfp   
1  tr|A0A139L0C5|A0A139L0C5_BACOV          ribBA   
2  tr|A0A139L8I8|A0A139L8I8_BACOV            asd   
3  tr|A0A1G6GBI4|A0A1G6GBI4_BACOV           fabZ   
4  tr|A0A5M5D3S3|A0A5M5D3S3_BACOV           murE   

                                         Description  
0  Catalyzes the phosphorylation of D-fructose 6-...  
1  Catalyzes the conversion of D-ribulose 5-phosp...  
2  Catalyzes the NADPH-dependent formation of L-a...  
3  Catalyzes the hydrolysis of UDP-3-O-myristoyl-...  
4  Catalyzes the addition of meso-diaminopimelic ...  


In [19]:
import pandas as pd
import re

# Path to your PSORTb CSV
psortb_csv = "/content/PSORTb_results.csv"

# Read as plain text (since all info is in one column)
df_raw = pd.read_csv(psortb_csv, header=None, names=["Text"], dtype=str)
print(f"‚úÖ Loaded {len(df_raw)} rows from PSORTb result")

‚úÖ Loaded 3565 rows from PSORTb result


In [20]:
import pandas as pd
import re

# Load your raw PSORTb results
df_raw = pd.read_csv("/content/PSORTb_results.csv")

# ‚úÖ ADD THIS LINE - removes NaN rows before processing
df_raw = df_raw.dropna(subset=[df_raw.columns[0]])

# Join all rows into one large text block
content = "\n".join(df_raw.iloc[:,0].tolist())

# Rest of your code stays the same...
entries = re.split(r"SeqID:", content)
records = []

for entry in entries:
    entry = entry.strip()
    if not entry:
        continue

    seq_match = re.search(r"^\s*(\S+)", entry)
    loc_match = re.search(r"Final Prediction:\s*(\w+)", entry)

    if seq_match and loc_match:
        seqid = seq_match.group(1).strip()
        loc = loc_match.group(1).strip()
        records.append((seqid, loc))

df = pd.DataFrame(records, columns=["SeqID", "Localization"])
df.to_csv("/content/psortb_cleaned.csv", index=False)

print(f"‚úÖ Extracted {len(df)} protein predictions")
print("üíæ Saved as: /content/psortb_cleaned.csv")
df.head()

‚úÖ Extracted 155 protein predictions
üíæ Saved as: /content/psortb_cleaned.csv


Unnamed: 0,SeqID,Localization
0,Analysis,Cytoplasmic
1,tr|A0A139L0C5|A0A139L0C5_BACOV,Cytoplasmic
2,tr|A0A139L8I8|A0A139L8I8_BACOV,Cytoplasmic
3,tr|A0A1G6GBI4|A0A1G6GBI4_BACOV,Cytoplasmic
4,tr|A0A5M5D3S3|A0A5M5D3S3_BACOV,Cytoplasmic


In [21]:
import pandas as pd

# Load both files
print("üìÅ Loading files...")
eggnog_df = pd.read_excel("/content/Bacteroides_ovatus_eggNOG_unique_pathway_proteins_ONE_ROW_PER_PROTEIN.xlsx")
psortb_df = pd.read_csv("/content/psortb_cleaned.csv", sep=",")

# Check columns
print(f"\nüìä eggNOG file: {eggnog_df.shape[0]} rows, {eggnog_df.shape[1]} columns")
print(f"üìä PSORTb file: {psortb_df.shape[0]} rows, {psortb_df.shape[1]} columns")

# Find the protein ID column in eggNOG (usually 'query' or '#query')
possible_id_cols = [col for col in eggnog_df.columns if 'query' in col.lower()]
if possible_id_cols:
    eggnog_id_col = possible_id_cols[0]
else:
    eggnog_id_col = eggnog_df.columns[0]  # Use first column as fallback

print(f"\nüîç Using eggNOG ID column: '{eggnog_id_col}'")
print(f"üîç Using PSORTb ID column: 'SeqID'")

# Show sample IDs to verify they match
print(f"\nüìù Sample eggNOG IDs:")
print(eggnog_df[eggnog_id_col].head(3).tolist())
print(f"\nüìù Sample PSORTb IDs:")
print(psortb_df['SeqID'].head(3).tolist())

# Merge the dataframes
merged_df = pd.merge(
    eggnog_df,
    psortb_df,
    left_on=eggnog_id_col,
    right_on='SeqID',
    how='left'  # Keep all eggNOG proteins even if no PSORTb result
)

# Check merge success
print(f"\n‚úÖ Merged file: {len(merged_df)} proteins")
print(f"‚úÖ Proteins with localization data: {merged_df['Localization'].notna().sum()}")
print(f"‚ö†Ô∏è Proteins without localization: {merged_df['Localization'].isna().sum()}")

# Show localization distribution
print(f"\nüìä Localization distribution:")
localization_counts = merged_df['Localization'].value_counts(dropna=False)
print(localization_counts)

# Save complete annotation + localization (CSV only)
output_csv = "/content/Final_annotated_with_localization.csv"
merged_df.to_csv(output_csv, index=False)

print(f"\nüíæ Complete file saved:")
print(f"  - {output_csv}")

# ========================================
# CREATE SEPARATE CSV FILES FOR EACH LOCALIZATION
# ========================================

print(f"\n{'='*60}")
print("üìÇ Creating separate CSV files for each localization...")
print(f"{'='*60}")

# Get unique localizations (excluding NaN)
localizations = merged_df['Localization'].dropna().unique()

for loc in localizations:
    # Filter for this localization
    loc_df = merged_df[merged_df['Localization'] == loc]

    # Create safe filename (replace spaces/special chars)
    safe_loc_name = loc.replace(" ", "_").replace("/", "_")

    # Save as CSV only
    csv_file = f"/content/Localization_{safe_loc_name}.csv"
    loc_df.to_csv(csv_file, index=False)

    print(f"‚úÖ {loc}: {len(loc_df)} proteins ‚Üí {csv_file}")

# Also create file for proteins WITHOUT localization
unknown_df = merged_df[merged_df['Localization'].isna()]
if len(unknown_df) > 0:
    unknown_df.to_csv("/content/Localization_Unknown.csv", index=False)
    print(f"\n‚ö†Ô∏è Unknown localization: {len(unknown_df)} proteins ‚Üí /content/Localization_Unknown.csv")

# Create summary table
print(f"\n{'='*60}")
print("üìä SUMMARY TABLE:")
print(f"{'='*60}")
summary_df = pd.DataFrame({
    'Localization': localization_counts.index,
    'Count': localization_counts.values,
    'Percentage': (localization_counts.values / len(merged_df) * 100).round(2)
})
summary_df.to_csv("/content/Localization_Summary.csv", index=False)
print(summary_df.to_string(index=False))
print(f"\nüíæ Summary saved: /content/Localization_Summary.csv")

# Show sample of merged data
print(f"\n{'='*60}")
print("üìã SAMPLE MERGED DATA:")
print(f"{'='*60}")
important_cols = [eggnog_id_col, 'Preferred_name', 'Localization', 'Score',
                  'Unique_Bacterial_KOs', 'Description']
available_cols = [col for col in important_cols if col in merged_df.columns]
print(merged_df[available_cols].head(10).to_string(index=False))

üìÅ Loading files...

üìä eggNOG file: 155 rows, 22 columns
üìä PSORTb file: 155 rows, 2 columns

üîç Using eggNOG ID column: 'query'
üîç Using PSORTb ID column: 'SeqID'

üìù Sample eggNOG IDs:
['tr|A0A139KRT6|A0A139KRT6_BACOV', 'tr|A0A139L0C5|A0A139L0C5_BACOV', 'tr|A0A139L8I8|A0A139L8I8_BACOV']

üìù Sample PSORTb IDs:
['tr|A0A139KRT6|A0A139KRT6_BACOV', 'tr|A0A139L0C5|A0A139L0C5_BACOV', 'tr|A0A139L8I8|A0A139L8I8_BACOV']

‚úÖ Merged file: 155 proteins
‚úÖ Proteins with localization data: 155
‚ö†Ô∏è Proteins without localization: 0

üìä Localization distribution:
Localization
Cytoplasmic            119
CytoplasmicMembrane     22
Unknown                 11
Periplasmic              3
Name: count, dtype: int64

üíæ Complete file saved:
  - /content/Final_annotated_with_localization.csv

üìÇ Creating separate CSV files for each localization...
‚úÖ Cytoplasmic: 119 proteins ‚Üí /content/Localization_Cytoplasmic.csv
‚úÖ CytoplasmicMembrane: 22 proteins ‚Üí /content/Localization_Cytop