In [5]:
import csv
import pandas as pd
import re

Read in Excel File with Protein Abundances

In [None]:
# Location of the Proteins.xlsx workbook
excel_file_path = "C:/Users/JoshK/OneDrive/Desktop/MMP/deepGOplus/Proteins.xlsx"

# Load both sheets in one call; passing a list of sheet names makes
# read_excel return a dict of DataFrames keyed by sheet name.
sheets = pd.read_excel(excel_file_path, sheet_name=["Increase", "Decrease"], engine="openpyxl")
increased_proteins = sheets["Increase"]
decreased_proteins = sheets["Decrease"]

# Export each sheet to its own CSV (without the index column)
for frame, csv_name in (
    (increased_proteins, "increased_proteins.csv"),
    (decreased_proteins, "decreased_proteins.csv"),
):
    frame.to_csv(csv_name, index=False)

Get all Protein IDs and Sequences from the S. mutans Proteome

In [None]:
# Path to FASTA Sequence File
fasta_file = "C:/Users/JoshK/OneDrive/Desktop/MMP/deepGOplus/Streptococcus_mutans_proteome_UP000002512_2025_01_16.fasta"


def _accession_from_header(header):
    """Return the accession found in a FASTA header line, or None if absent.

    UniProt-style headers have the form ``>db|ACCESSION|ENTRY_NAME ...``, so
    the accession is the second ``|``-separated field. Reading that field
    (instead of slicing fixed character positions) keeps 10-character
    accessions intact. Headers without that layout fall back to the first
    whitespace-delimited token after ``>``.
    """
    body = header[1:].strip()
    fields = body.split("|")
    if len(fields) >= 3 and fields[1].strip():
        return fields[1].strip()
    tokens = body.split()
    return tokens[0] if tokens else None


# Parser state
protein_data = []  # Rows of [accession, sequence]
current_id = None  # Accession of the record currently being read
current_sequence = ""  # Sequence accumulated so far for that record

# Open and read the FASTA file
with open(fasta_file, "r") as file:
    for line in file:
        # Remove surrounding whitespace (including the newline)
        line = line.strip()

        if line.startswith(">"):
            # A new header closes the previous record; records with an empty
            # ID or an empty sequence are not stored.
            if current_id and current_sequence:
                protein_data.append([current_id, current_sequence])
            current_id = _accession_from_header(line)
            current_sequence = ""  # Reset the sequence for the new protein
        elif current_id:
            # Sequence lines may wrap; concatenate them into one string
            current_sequence += line

    # The last record has no following header to trigger the append
    if current_id and current_sequence:
        protein_data.append([current_id, current_sequence])

# Write the data to a CSV file
output_file = "proteome_proteins_sequences.csv"
with open(output_file, "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    # Header row, then one row per protein
    writer.writerow(["Accession", "Sequence"])
    writer.writerows(protein_data)

print(f"Protein data has been saved to {output_file}.")

Protein data has been saved to proteome_proteins_sequences.csv.


Match Desired Proteins to Sequences from Proteome

In [None]:
# Load the proteome accession/sequence table and the two protein lists
proteome_proteins_sequences = pd.read_csv("proteome_proteins_sequences.csv")
increased_proteins = pd.read_csv("increased_proteins.csv")
decreased_proteins = pd.read_csv("decreased_proteins.csv")

# Inner-join each list to the proteome on "Accession" (rows without a match
# in both tables are dropped) and keep only the accession and sequence.
wanted_columns = ["Accession", "Sequence"]
increased_sequences = increased_proteins.merge(
    proteome_proteins_sequences, on="Accession", how="inner"
)[wanted_columns]
decreased_sequences = decreased_proteins.merge(
    proteome_proteins_sequences, on="Accession", how="inner"
)[wanted_columns]

# Persist both results as CSV
increased_sequences.to_csv("increased_sequences.csv", index=False)
decreased_sequences.to_csv("decreased_sequences.csv", index=False)

Create a FASTA file for each protein set

In [42]:
# Accession and sequence columns of the decreased-protein table
protein_ids, sequences = (decreased_sequences[column] for column in ("Accession", "Sequence"))
# Destination of the FASTA file
fasta_file_path = "output.fasta"

# Emit one two-line record per protein: a ">" header line, then the sequence.
with open(fasta_file_path, "w") as fasta_file:
    for protein_id, sequence in zip(protein_ids, sequences):
        # print() supplies the trailing newline after the sequence
        print(f">{protein_id}\n{sequence}", file=fasta_file)

print(f"FASTA file created at {fasta_file_path}")

FASTA file created at output.fasta


Reformat the deepGOplus CSV output

In [1]:
# Variable for current file
number = "decreased_31_32"
# File path
file_path = f"C:/Users/JoshK/OneDrive/Desktop/MMP/deepGOplus/predictions_{number}.csv"

# Parser state
data = []  # One dict per GO term row
current_protein = None  # Protein ID from the most recent identifier line
current_section = None  # Ontology section from the most recent section line

# Read the file row by row. newline="" lets the csv module handle line
# endings itself, as the csv documentation recommends.
with open(file_path, "r", newline="") as file:
    reader = csv.reader(file, quotechar='"', delimiter=",")
    for line in reader:
        # csv.reader yields [] for blank lines; indexing those would raise
        if not line:
            continue
        if line[0].startswith(("I", "Q")):  # Protein identifier line
            # First six characters are taken as the protein ID.
            # NOTE(review): only IDs starting with "I" or "Q" are recognised
            # and longer IDs are truncated to six characters - confirm this
            # matches every ID present in the predictions file.
            current_protein = line[0][0:6]
        elif line[0] in ["Cellular Component", "Molecular Function", "Biological Process"]:
            current_section = line[0]
        elif line[0].startswith("GO:"):  # GO term line: id, description, score
            go_id, description, score = line[0], line[1], line[2]
            data.append({
                "Protein": current_protein,
                "Section": current_section,
                "GO_ID": go_id,
                "Description": description,
                "Score": float(score)
            })

# Create a DataFrame; naming the columns explicitly keeps them present even
# when no GO rows were parsed.
df = pd.DataFrame(data, columns=["Protein", "Section", "GO_ID", "Description", "Score"])

# Save to a CSV file
df.to_csv(f"parsed_predictions_{number}.csv", index=False)

# Group by Protein and Section, and take the top GO term by Score.
# An empty table has nothing to rank, so it is passed through unchanged.
if df.empty:
    top_go_terms = df.copy()
else:
    top_go_terms = df.loc[df.groupby(["Protein", "Section"])["Score"].idxmax()]

# Save to a CSV file if needed
top_go_terms.to_csv(f"top_go_terms_{number}.csv", index=False)

FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/JoshK/OneDrive/Desktop/MMP/deepGOplus/predictions_decreased_31_32.csv'

Combine CSVs

In [None]:
import pandas as pd
# CSV files to stack. The active list is the per-batch top GO terms for the
# increased proteins; the commented-out lists are alternative inputs.
# files = [
#     "parsed_predictions_increased_1_10.csv",
#     "parsed_predictions_increased_11_20.csv",
#     "parsed_predictions_increased_21_30.csv",
#     "parsed_predictions_increased_31_39.csv"
# ]
# files = [
#     "parsed_predictions_decreased_1_10.csv",
#     "parsed_predictions_decreased_11_20.csv",
#     "parsed_predictions_decreased_21_30.csv",
#     "parsed_predictions_decreased_31_32.csv"
# ]
files = [
    "top_go_terms_increased_1_10.csv",
    "top_go_terms_increased_11_20.csv",
    "top_go_terms_increased_21_30.csv",
    "top_go_terms_increased_31_39.csv"
]

# Load every file into its own DataFrame, in list order
dataframes = [pd.read_csv(csv_path) for csv_path in files]

# Stack the frames vertically and renumber the rows from zero
combined_df = pd.concat(dataframes, ignore_index=True)

# Write the stacked table out
output_file = "combined_top_predictions_increased.csv"
combined_df.to_csv(output_file, index=False)

print(f"All CSV files have been combined and saved as {output_file}.")

All CSV files have been combined and saved as combined_top_predictions_increased.csv.
