In [None]:
import openai
import pandas as pd
import time
import os  # To access environment variables

In [None]:
# Verify OpenAI library version.
# The cells below call openai.ChatCompletion and openai.error, which belong to
# the legacy (pre-1.0) interface, so flag any other major version up front.
print(f"OpenAI library version: {openai.__version__}")  # Should print 0.28.0
if openai.__version__.split(".")[0] != "0":
    print("Warning: this notebook expects openai 0.28.x; "
          "openai.ChatCompletion and openai.error are not available in openai>=1.0.")

In [None]:
# Prompt prefixes for the classifier. The article text is concatenated directly
# onto each prompt, so every entry must end with ": " to keep the instruction
# and the article separated.
prompts = [
    "Discard all the previous instructions. Behave like you are an expert sentence classifier. Classify the following statement from an ECB member into ‘HAWKISH’, ‘DOVISH’, or ‘NEUTRAL’. Label it ‘HAWKISH’ if it suggests tightening monetary policy, ‘DOVISH’ if it suggests easing monetary policy, or ‘NEUTRAL’ if the stance is ambiguous or not directly related to monetary policy: ",
    "Discard all the previous instructions. Behave like you are an expert sentence classifier. Assess the following statement from an ECB member in the context of maintaining price stability and promoting maximum employment. Determine whether it reflects a HAWKISH (tightening), DOVISH (easing), or NEUTRAL stance on monetary policy or is ambiguous: ",
    "Discard all the previous instructions. Behave like you are an expert sentence classifier. Analyze the following statement from an ECB member and identify the likely impact on interest rates, economic growth, and inflation. Based on your analysis, classify the sentence as HAWKISH, DOVISH, or NEUTRAL: ",
    "Discard all the previous instructions. Behave like you are an expert economist. Evaluate the following statement from an ECB member from the perspective of how the media reader might interpret it. Would it likely be perceived as HAWKISH (signaling higher rates), DOVISH (signaling lower rates), or NEUTRAL (ambiguous): ",
    # Trailing ": " added so this prompt matches the other four.
    "Discard all the previous instructions. Behave like you are an expert economist. Given current economic conditions, determine whether the following statement from an ECB member suggests a HAWKISH (concerned about inflation), DOVISH (concerned about growth), or NEUTRAL stance on monetary policy or ambiguous: "
]

In [None]:
# Read the articles CSV into a DataFrame, reporting progress before and after,
# then preview the first rows as the cell output.
print("Loading dataset...")
data = pd.read_csv(filepath_or_buffer="Random_Articles_to_Score_Complete_Converted.csv")
print("Dataset loaded successfully.")
data.head()

In [None]:
# Drop the listed columns and rename the averaged score column.
# The result is assigned back instead of mutating in place, and columns that
# are already gone are ignored, so re-running this cell does not raise KeyError.
data = (
    data
    .drop(
        columns=['Column1', 'Classification Joaquin', 'Classification Rui', 'Classification Ed'],
        errors='ignore',
    )
    .rename(columns={'Average_Classification': 'Manual_Classification_Score'})
)
print("Columns dropped and renamed.")
data.head()

In [None]:
# Set your OpenAI API key.
# The key is read from the OPENAI_API_KEY environment variable rather than
# being pasted into the notebook, so the secret is never saved with the file.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")
if not openai.api_key:
    print("Warning: OPENAI_API_KEY is not set; API calls will fail until a key is provided.")

In [None]:
def _score_from_labels(text):
    """Return the score of the label that appears first in ``text``, or None.

    Choosing the earliest label (rather than always testing HAWKISH first) keeps
    an answer such as "NEUTRAL. Neither hawkish nor dovish." from being scored
    as hawkish just because the explanation mentions the other labels.
    """
    upper_text = text.upper()
    label_scores = {"HAWKISH": 1, "DOVISH": -1, "NEUTRAL": 0}
    found = [
        (upper_text.find(label), score)
        for label, score in label_scores.items()
        if label in upper_text
    ]
    if not found:
        return None
    # Positions of different labels never coincide, so min() picks by position.
    return min(found)[1]


def classify_and_score_article(article, prompt, max_retries=5, retry_delay=10):
    """Classify one article with the chat model and map the answer to a score.

    Args:
        article: Text of the article; appended directly to ``prompt``.
        prompt: Instruction prefix sent ahead of the article text.
        max_retries: Maximum number of API attempts when rate-limit or API
            errors occur (the previous implementation retried forever).
        retry_delay: Seconds to wait between attempts.

    Returns:
        1 for HAWKISH, -1 for DOVISH, 0 for NEUTRAL, or the string "Error" when
        the article is missing/empty, the response contains no label, an
        unexpected exception occurs, or all retries are exhausted.
    """
    # Missing values (e.g. NaN from pandas) or blank text cannot be classified;
    # skip them without spending an API call.
    if not isinstance(article, str) or not article.strip():
        print("Unexpected error: article text is missing or empty. Skipping this article.")
        return "Error"

    for attempt in range(1, max_retries + 1):
        try:
            response = openai.ChatCompletion.create(
                model="gpt-4",
                messages=[
                    {"role": "system", "content": "You are a financial expert."},
                    {"role": "user", "content": prompt + article}
                ],
                max_tokens=150  # Adjust max_tokens if needed
            )
            result = response['choices'][0]['message']['content'].strip()

            # Assign scores based on textual labels
            score = _score_from_labels(result)
            if score is None:
                raise ValueError("Incomplete response from API")
            return score

        except openai.error.RateLimitError:
            failure = "Rate limit exceeded"
        except openai.error.APIError as e:
            failure = f"API error: {e}"
        except Exception as e:
            print(f"Unexpected error: {e}. Skipping this article.")
            return "Error"

        # Only retryable failures reach this point; wait unless this was the last attempt.
        if attempt < max_retries:
            print(f"{failure}. Retrying in {retry_delay} seconds...")
            time.sleep(retry_delay)

    print(f"Giving up after {max_retries} attempts. Skipping this article.")
    return "Error"

# Initialize one result column per prompt. The columns start empty (object
# dtype) rather than 0, so unprocessed rows are not mistaken for NEUTRAL scores
# and the "Error" marker can be stored without clashing with an integer dtype.
for prompt_number in range(1, len(prompts) + 1):
    data[f'OpenAI_Prompt_{prompt_number}'] = None

# Classify and score each article in the dataset.
# Rows are addressed by their actual index label (not by loop position), so the
# writes land on the right row even if the frame's index is not 0..n-1.
print("Classifying and scoring articles...")
total_articles = len(data)
for processed, (row_label, article) in enumerate(data['Manual.summary'].items(), start=1):
    for prompt_number, prompt in enumerate(prompts, start=1):
        classification_score = classify_and_score_article(article, prompt)
        data.at[row_label, f'OpenAI_Prompt_{prompt_number}'] = classification_score
    if processed % 10 == 0 or processed == total_articles:
        print(f"Processed {processed} of {total_articles} articles.")

# Display the first few rows of the updated dataframe
print("Classification and scoring completed. Here are the first few results:")
print(data.head())

# Save the results to a new CSV file
output_file = 'classified_articles_open_ai.csv'
data.to_csv(output_file, index=False)
print(f"Results saved to {output_file}")

# Print the DataFrame head of the results
print("Here is the head of the resulting DataFrame:")
data.head()

In [None]:
# Build a per-prompt summary table: label counts plus standard deviation,
# median, and mean of the numeric scores.
prompt_columns = [f'OpenAI_Prompt_{number}' for number in range(1, 6)]

# Column-oriented accumulator; each key becomes one column of the summary frame
results = {
    column_name: []
    for column_name in (
        'Prompt',
        'Counts_Dovish',
        'Counts_Neutral',
        'Counts_Hawkish',
        'Standard Deviation',
        'Median',
        'Average',
    )
}

for prompt in prompt_columns:
    # Non-numeric entries are coerced to NaN by to_numeric
    scores = pd.to_numeric(data[prompt], errors='coerce')

    # Frequency of each score value (-1 dovish, 0 neutral, 1 hawkish)
    counts = scores.value_counts().to_dict()

    summary_row = {
        'Prompt': prompt,
        'Counts_Dovish': counts.get(-1, 0),
        'Counts_Neutral': counts.get(0, 0),
        'Counts_Hawkish': counts.get(1, 0),
        'Standard Deviation': scores.std(),
        'Median': scores.median(),
        'Average': scores.mean(),
    }
    for column_name, value in summary_row.items():
        results[column_name].append(value)

# Turn the accumulated columns into a DataFrame
results_df = pd.DataFrame(results)

# Show the summary table as the cell output
print("Classification and scoring summary:")
results_df



In [None]:
# Write the per-prompt summary table to CSV (row index omitted) and report the path
output_file = 'classified_articles_open_ai_with_stats.csv'
results_df.to_csv(path_or_buf=output_file, index=False)
print(f"Results saved to {output_file}")

Read in `Manual_Classifier_Statistics.csv` and left-join it to `results_df`, then identify the prompt whose average is closest to the manual classifiers' average.

In [None]:

# Load Manual_Classifier_Statistics.csv into a DataFrame and display it
manual_articles_df = pd.read_csv(filepath_or_buffer='Manual_Classifier_Statistics.csv')
manual_articles_df



In [None]:
# Load the per-prompt statistics CSV into a DataFrame and display it
open_ai_articles_df = pd.read_csv(filepath_or_buffer='classified_articles_open_ai_with_stats.csv')
open_ai_articles_df

In [None]:
# Step 1: Reduce the 'Average' column of manual_articles_df to a single mean value
manual_average = manual_articles_df.loc[:, 'Average'].mean()
print(f"Average of the 'Average' column in manual_articles_df: {manual_average}")



In [None]:
# Step 2: Signed gap between each prompt's average and the manual average
difference_from_manual = open_ai_articles_df['Average'].sub(manual_average)
open_ai_articles_df['Difference from Manual Average'] = difference_from_manual

# Show the frame with the new difference column
print("Updated open_ai_articles_df with Difference from Manual Average:")
print(open_ai_articles_df)

# The prompt with the smallest absolute gap is treated as the closest match
open_ai_articles_df['Absolute Difference'] = difference_from_manual.abs()
closest_row_label = open_ai_articles_df['Absolute Difference'].idxmin()
most_similar_prompt = open_ai_articles_df.loc[closest_row_label]

print("Prompt most similar to manual classifiers:")
print(most_similar_prompt)