# Shorten the function descriptions for proteins
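In this notebook, we use the OpenAI API to condense each protein's function description into a single sentence. The input is read from `data_output/processed_protein_data.csv`, and the result is saved to `data_output/prot_data_shortened.csv`.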

In [7]:
import pandas as pd
from dotenv import load_dotenv
import openai as OpenAI
import os

In [17]:
# Load the processed protein table and keep only the rows that have both
# a 'function' and a 'name' value.
protein_data = (
    pd.read_csv('data_output/processed_protein_data.csv')
    .dropna(subset=['function', 'name'])
)

In [18]:
def generate_new_description(description):
    """
    Use the OpenAI API to condense a protein's function description into one sentence.

    Parameters
    ----------
    description : str
        The original function description of the protein. It is sent
        unchanged as the user message of the chat completion request.

    Returns
    -------
    str or None
        The message content of the first completion choice, or None when the
        request fails. Failures are printed rather than raised, so a single
        bad request does not abort a loop over many proteins.
    """
    # Load variables from a .env file (if present) so the API key can be read
    # from the environment below.
    load_dotenv()
    # connect to openai
    client = OpenAI.Client(api_key=os.getenv('OPENAI_API_KEY'))

    # declare system instructions
    system_instructions = ("Given the description of a protein's function, condense its key actions and roles into a one-sentence description. "
                           "Always start with action verbs and use the same language used in the original description.")

    # Fallback returned when the request fails. Without this assignment the
    # `return` below would raise UnboundLocalError after the except branch.
    new_description = None

    # generate the new description
    try:
        completion = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": system_instructions},
                {"role": "user", "content": description},
            ]
        )
        new_description = completion.choices[0].message.content

    except Exception as e:
        # Best effort: report the problem and fall through to return None.
        print("Failed to generate new description.")
        print(e)

    return new_description

In [19]:
# Request a shortened description for every protein (one API call per row)
# and store the results in a new column next to the original text.
protein_data['new_func_descr'] = [
    generate_new_description(original_text)
    for original_text in protein_data['function']
]

# Replace the original 'function' column with the shortened one: drop the old
# column, then give the new column the old name.
new_df = (
    protein_data
    .drop(columns=['function'])
    .rename(columns={'new_func_descr': 'function'})
)

# Write the result to the data_output folder without the index column.
new_df.to_csv('data_output/prot_data_shortened.csv', index=False)

print("New descriptions generated and saved to 'data_output/prot_data_shortened.csv'")

New descriptions generated and saved to 'data_output/prot_data_shortened.csv'
