In [45]:
import openai
import pandas as pd
import time
import os  # To access environment variables

In [46]:
# Report the installed OpenAI client release. Later cells call
# openai.ChatCompletion.create and openai.error.*, so the expected release is noted inline.
installed_openai_version = openai.__version__
print(f"OpenAI library version: {installed_openai_version}")  # Should print 0.28.0

OpenAI library version: 0.28.0


In [47]:
# Instruction prefixes sent to the model, one per classification strategy.
# The article text is appended directly to the prompt (prompt + article), so
# every entry must end with ": " to keep the instruction and the article apart.
prompts = [
    "Discard all the previous instructions. Behave like you are an expert sentence classifier. Classify the following statement from an ECB member into ‘HAWKISH’, ‘DOVISH’, or ‘NEUTRAL’. Label it ‘HAWKISH’ if it suggests tightening monetary policy, ‘DOVISH’ if it suggests easing monetary policy, or ‘NEUTRAL’ if the stance is ambiguous or not directly related to monetary policy: ",
    "Discard all the previous instructions. Behave like you are an expert sentence classifier. Assess the following statement from an ECB member in the context of maintaining price stability and promoting maximum employment. Determine whether it reflects a HAWKISH (tightening), DOVISH (easing), or NEUTRAL stance on monetary policy or is ambiguous: ",
    "Discard all the previous instructions. Behave like you are an expert sentence classifier. Analyze the following statement from an ECB member and identify the likely impact on interest rates, economic growth, and inflation. Based on your analysis, classify the sentence as HAWKISH, DOVISH, or NEUTRAL: ",
    "Discard all the previous instructions. Behave like you are an expert economist. Evaluate the following statement from an ECB member from the perspective of how the media reader might interpret it. Would it likely be perceived as HAWKISH (signaling higher rates), DOVISH (signaling lower rates), or NEUTRAL (ambiguous): ",
    # Fixed: this prompt previously ended without ": ", so the article was glued onto "ambiguous".
    "Discard all the previous instructions. Behave like you are an expert economist. Given current economic conditions, determine whether the following statement from an ECB member suggests a HAWKISH (concerned about inflation), DOVISH (concerned about growth), or NEUTRAL stance on monetary policy or ambiguous: "
]

In [48]:
# Read the article summaries and their manual classifications from CSV
dataset_path = 'Random_Articles_to_Score_Complete_Converted.csv'
print("Loading dataset...")
data = pd.read_csv(dataset_path)
print("Dataset loaded successfully.")
data.head()

Loading dataset...
Dataset loaded successfully.


Unnamed: 0,Column1,Manual.summary,Classification Joaquin,Classification Rui,Classification Ed,Average_Classification
0,0,"Speaking at the IMF in Washington, ECB Presi...",1,1,1,1.0
1,1,"Starting in October, the ECB wants to use a ...",0,0,0,0.0
2,2,ECB Chief Economist Philip Lane has proposed ...,1,0,1,0.666667
3,3,Former German Chancellor Angela Merkel has rec...,0,0,0,0.0
4,4,The ECB has raised its interest rates again ...,1,1,1,1.0


In [49]:
# Drop the per-rater columns and rename the averaged score column.
# Written without inplace=True and with errors='ignore' so that re-running this
# cell after the columns are already gone is a no-op instead of a KeyError.
data = (
    data
    .drop(
        columns=['Column1', 'Classification Joaquin', 'Classification Rui', 'Classification Ed'],
        errors='ignore',
    )
    .rename(columns={'Average_Classification': 'Manual_Classification_Score'})
)
print("Columns dropped and renamed.")
data.head()

Columns dropped and renamed.


Unnamed: 0,Manual.summary,Manual_Classification_Score
0,"Speaking at the IMF in Washington, ECB Presi...",1.0
1,"Starting in October, the ECB wants to use a ...",0.0
2,ECB Chief Economist Philip Lane has proposed ...,0.666667
3,Former German Chancellor Angela Merkel has rec...,0.0
4,The ECB has raised its interest rates again ...,1.0


In [50]:
# Set your OpenAI API key from the OPENAI_API_KEY environment variable.
# SECURITY: a literal secret key used to be hardcoded in this cell. Treat that
# key as compromised and revoke it; never commit credentials to a notebook.
_openai_api_key = os.environ.get("OPENAI_API_KEY")
if not _openai_api_key:
    # Fail early with a clear message instead of failing on the first API call.
    raise RuntimeError("OPENAI_API_KEY environment variable is not set.")
openai.api_key = _openai_api_key

In [51]:
def classify_and_score_article(article, prompt, max_retries=5, retry_delay=10):
    """Ask GPT-4 to classify one article and convert the reply to a score.

    The user message sent to the model is ``prompt + article``. The reply is
    upper-cased and searched for the labels as substrings, in the order
    HAWKISH, DOVISH, NEUTRAL; the first label found in that order decides the
    score, so a reply mentioning several labels is scored by that priority.

    Args:
        article: Article text appended to the prompt.
        prompt: Instruction prefix placed before the article text.
        max_retries: How many times to retry after a rate-limit or API error.
            The request is attempted at most ``max_retries + 1`` times.
        retry_delay: Seconds to sleep between retries.

    Returns:
        1 for HAWKISH, -1 for DOVISH, 0 for NEUTRAL, or the string "Error"
        when the reply contains none of the labels, an unexpected exception
        occurs, or every attempt failed with a retryable error.
    """
    total_attempts = max_retries + 1
    failure = "No request was attempted."

    # Bounded retry loop: a persistent rate limit or API outage must not hang forever.
    for attempt in range(1, total_attempts + 1):
        try:
            response = openai.ChatCompletion.create(
                model="gpt-4",
                messages=[
                    {"role": "system", "content": "You are a financial expert."},
                    {"role": "user", "content": prompt + article}
                ],
                max_tokens=150  # Adjust max_tokens if needed
            )
            result = response['choices'][0]['message']['content'].strip()

            # Assign scores based on textual labels (case-insensitive substring match)
            label_text = result.upper()
            if "HAWKISH" in label_text:
                score = 1
            elif "DOVISH" in label_text:
                score = -1
            elif "NEUTRAL" in label_text:
                score = 0
            else:
                # Caught by the generic handler below, which reports it and returns "Error".
                raise ValueError("Incomplete response from API")

            return score

        except openai.error.RateLimitError:
            failure = "Rate limit exceeded."
        except openai.error.APIError as e:
            failure = f"API error: {e}."
        except Exception as e:
            # Non-retryable problem (including an unparseable reply): give up on this article.
            print(f"Unexpected error: {e}. Skipping this article.")
            return "Error"

        # Only retryable failures reach this point.
        if attempt < total_attempts:
            print(f"{failure} Retrying in {retry_delay} seconds...")
            time.sleep(retry_delay)

    print(f"{failure} Giving up after {total_attempts} attempts. Skipping this article.")
    return "Error"

# Initialize one result column per prompt.
# The columns start empty (None, object dtype) rather than 0, so an unprocessed
# row is not mistaken for a NEUTRAL score and the "Error" marker returned by
# classify_and_score_article can be stored without a dtype conflict.
prompt_columns = [f'OpenAI_Prompt_{k}' for k in range(1, len(prompts) + 1)]
for column in prompt_columns:
    data[column] = pd.Series([None] * len(data), index=data.index, dtype=object)

# Classify and score each article in the dataset
print("Classifying and scoring articles...")
total_articles = len(data)
# Iterate over (index label, text) pairs so the write below targets the row the
# article came from, whatever the frame's index looks like.
for position, (row_label, article) in enumerate(data['Manual.summary'].items(), start=1):
    for column, prompt in zip(prompt_columns, prompts):
        classification_score = classify_and_score_article(article, prompt)
        data.at[row_label, column] = classification_score
    if position % 10 == 0 or position == total_articles:
        print(f"Processed {position} of {total_articles} articles.")

# Display the first few rows of the updated dataframe
print("Classification and scoring completed. Here are the first few results:")
print(data.head())

# Save the results to a new CSV file
output_file = 'classified_articles_open_ai.csv'
data.to_csv(output_file, index=False)
print(f"Results saved to {output_file}")

# Print the DataFrame head of the results
print("Here is the head of the resulting DataFrame:")
data.head()

Classifying and scoring articles...
Unexpected error: Incomplete response from API. Skipping this article.
Processed 10 of 100 articles.
Unexpected error: Incomplete response from API. Skipping this article.
Processed 20 of 100 articles.
Unexpected error: Incomplete response from API. Skipping this article.
Processed 30 of 100 articles.
Processed 40 of 100 articles.
Unexpected error: Incomplete response from API. Skipping this article.
Processed 50 of 100 articles.
Processed 60 of 100 articles.
Unexpected error: Incomplete response from API. Skipping this article.
Processed 70 of 100 articles.
Unexpected error: Incomplete response from API. Skipping this article.
Processed 80 of 100 articles.
Processed 90 of 100 articles.
Unexpected error: Incomplete response from API. Skipping this article.
Unexpected error: Incomplete response from API. Skipping this article.
Unexpected error: Incomplete response from API. Skipping this article.
Processed 100 of 100 articles.
Classification and scori

Unnamed: 0,Manual.summary,Manual_Classification_Score,OpenAI_Prompt_1,OpenAI_Prompt_2,OpenAI_Prompt_3,OpenAI_Prompt_4,OpenAI_Prompt_5
0,"Speaking at the IMF in Washington, ECB Presi...",1.0,1,1,1,1,1
1,"Starting in October, the ECB wants to use a ...",0.0,0,1,0,0,0
2,ECB Chief Economist Philip Lane has proposed ...,0.666667,0,1,1,-1,-1
3,Former German Chancellor Angela Merkel has rec...,0.0,0,0,0,0,1
4,The ECB has raised its interest rates again ...,1.0,1,1,Error,1,1


In [52]:
# Build one summary record per prompt column: label counts plus the standard
# deviation, median and mean of the numeric scores.
score_columns = [f'OpenAI_Prompt_{n}' for n in range(1, 6)]
summary_rows = []

for column_name in score_columns:
    # Non-numeric entries (such as "Error") become NaN and are left out of the statistics
    numeric_scores = pd.to_numeric(data[column_name], errors='coerce')
    label_counts = numeric_scores.value_counts().to_dict()

    summary_rows.append({
        'Prompt': column_name,
        'Counts_Dovish': label_counts.get(-1, 0),
        'Counts_Neutral': label_counts.get(0, 0),
        'Counts_Hawkish': label_counts.get(1, 0),
        'Standard Deviation': numeric_scores.std(),
        'Median': numeric_scores.median(),
        'Average': numeric_scores.mean(),
    })

# Assemble the records into a DataFrame with a fixed column order
results_df = pd.DataFrame(
    summary_rows,
    columns=[
        'Prompt',
        'Counts_Dovish',
        'Counts_Neutral',
        'Counts_Hawkish',
        'Standard Deviation',
        'Median',
        'Average',
    ],
)

# Display the results DataFrame
print("Classification and scoring summary:")
results_df



Classification and scoring summary:


Unnamed: 0,Prompt,Counts_Dovish,Counts_Neutral,Counts_Hawkish,Standard Deviation,Median,Average
0,OpenAI_Prompt_1,17,24,58,0.769443,1.0,0.414141
1,OpenAI_Prompt_2,13,5,81,0.69468,1.0,0.686869
2,OpenAI_Prompt_3,13,17,65,0.725939,1.0,0.547368
3,OpenAI_Prompt_4,21,9,69,0.825161,1.0,0.484848
4,OpenAI_Prompt_5,16,2,81,0.744526,1.0,0.656566


In [67]:
# Write the per-prompt summary statistics to their own CSV file (without the index)
output_file = 'classified_articles_open_ai_with_stats.csv'
results_df.to_csv(path_or_buf=output_file, index=False)
print("Results saved to " + output_file)

Results saved to classified_articles_open_ai_with_stats.csv


Read in Manual_Classifier_Statistics.csv and left join it to results_df, then find the prompt whose average is closest to the manual classifiers' average.

In [70]:

# Load the summary statistics of the manual classifiers
manual_stats_path = 'Manual_Classifier_Statistics.csv'
manual_articles_df = pd.read_csv(manual_stats_path)

manual_articles_df



Unnamed: 0,Classifier,Standard Deviation,Median,Average,Counts_Dovish,Counts_Neutral,Counts_Hawkish
0,Joaquin,0.658971,1.0,0.49,9,33,58
1,Rui,0.658741,1.0,0.52,9,30,61
2,Ed,0.666667,0.5,0.4,10,40,50


In [72]:
# Reload the per-prompt statistics from the CSV file.
# NOTE(review): the recorded output of this cell shows lower-case prompt names,
# while the statistics cell writes 'OpenAI_Prompt_N' — confirm the file was not
# changed outside this notebook.
openai_stats_path = 'classified_articles_open_ai_with_stats.csv'
open_ai_articles_df = pd.read_csv(openai_stats_path)
open_ai_articles_df

Unnamed: 0,Prompt,Counts_Dovish,Counts_Neutral,Counts_Hawkish,Standard Deviation,Median,Average
0,openai_prompt_1,17,24,58,0.769443,1.0,0.414141
1,openai_prompt_2,13,5,81,0.69468,1.0,0.686869
2,openai_prompt_3,13,17,65,0.725939,1.0,0.547368
3,openai_prompt_4,21,9,69,0.825161,1.0,0.484848
4,openai_prompt_5,16,2,81,0.744526,1.0,0.656566


In [73]:
# Step 1: Take the mean of the manual classifiers' 'Average' values
manual_average_values = manual_articles_df.loc[:, 'Average']
manual_average = manual_average_values.mean()
print(f"Average of the 'Average' column in manual_articles_df: {manual_average}")



Average of the 'Average' column in manual_articles_df: 0.47000000000000003


In [77]:
# Step 2: Store each prompt's signed distance from the manual average
difference_column = 'Difference from Manual Average'
open_ai_articles_df[difference_column] = open_ai_articles_df['Average'].sub(manual_average)

# Show the frame with the new difference column
print("Updated open_ai_articles_df with Difference from Manual Average:")
print(open_ai_articles_df)

# The prompt closest to the manual classifiers is the row with the smallest absolute difference
open_ai_articles_df['Absolute Difference'] = open_ai_articles_df[difference_column].abs()
closest_row_label = open_ai_articles_df['Absolute Difference'].idxmin()
most_similar_prompt = open_ai_articles_df.loc[closest_row_label]

print("Prompt most similar to manual classifiers:")
print(most_similar_prompt)

Updated open_ai_articles_df with Difference from Manual Average:
            Prompt  Counts_Dovish  Counts_Neutral  Counts_Hawkish  \
0  openai_prompt_1             17              24              58   
1  openai_prompt_2             13               5              81   
2  openai_prompt_3             13              17              65   
3  openai_prompt_4             21               9              69   
4  openai_prompt_5             16               2              81   

   Standard Deviation  Median   Average  Difference from Manual Average  \
0            0.769443     1.0  0.414141                       -0.055859   
1            0.694680     1.0  0.686869                        0.216869   
2            0.725939     1.0  0.547368                        0.077368   
3            0.825161     1.0  0.484848                        0.014848   
4            0.744526     1.0  0.656566                        0.186566   

   Absolute Difference  
0             0.055859  
1             0.216

: 