In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Load the model-scored articles and the monetary policy statements

In [None]:
# Read the articles table from CSV.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
articles_csv_path = '/Users/ruimaciel/Desktop/Local_ECB_Cacophony_Master_Thesis/df_final_with_bert_predictions.csv'
df_articles = pd.read_csv(articles_csv_path)

In [None]:
# Read the manually classified monetary policy decisions from the Excel workbook.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
monetary_policy_xlsx_path = '/Users/ruimaciel/Desktop/Local_ECB_Cacophony_Master_Thesis/ecb_monetary_policy_decisions_classified.xlsx'
df_monetary_policy = pd.read_excel(monetary_policy_xlsx_path)

In [None]:
# Inspect the column names of the articles frame.
df_articles.columns

In [None]:
# Inspect the column names of the monetary policy statements frame.
df_monetary_policy.columns

# Transforming the Monetary Policy Statements

In [None]:
# Numeric code assigned to each stance label
sentiment_mapping = dict(dovish=-1, neutral=0, hawkish=1)

# The three annotator columns get identical treatment: normalise the label text
# (lower-case, surrounding whitespace removed), then translate it to its numeric code.
# Labels that are not keys of sentiment_mapping come out of .map() as NaN.
annotator_columns = ['Classification Joaquin', 'Classification Ed', 'Classification Rui']
for annotator_column in annotator_columns:
    normalised_labels = df_monetary_policy[annotator_column].str.lower().str.strip()
    df_monetary_policy[annotator_column] = normalised_labels.map(sentiment_mapping)

# Preview the encoded columns
print(df_monetary_policy[annotator_columns].head())


In [None]:
# Row-wise mean of the three annotator codes (DataFrame.mean skips NaN by default)
classification_columns = ['Classification Joaquin', 'Classification Ed', 'Classification Rui']
annotator_scores = df_monetary_policy[classification_columns]
df_monetary_policy['Average Classification'] = annotator_scores.mean(axis=1)

# Preview the three inputs next to their average
print(df_monetary_policy[classification_columns + ['Average Classification']].head())


In [None]:
# Parse 'Date' as datetimes and keep only the calendar date, discarding the time of day
df_monetary_policy['Date'] = pd.to_datetime(df_monetary_policy['Date']).dt.date

# Preview the converted column
df_monetary_policy['Date'].head()


In [None]:
# Reshape the statements frame in one pass:
#   1. tag every row with the fixed speaker name "ECB_MONETARY_STATEMENT",
#   2. drop the listed columns, including the three per-annotator codes,
#   3. rename the averaged code to 'Sentiment' and 'Article' to 'Statement'.
columns_to_drop = ['Title', 'Link', 'Classification Joaquin', 'Classification Ed', 'Classification Rui']
df_monetary_policy = (
    df_monetary_policy
    .assign(Name_of_Speaker='ECB_MONETARY_STATEMENT')
    .drop(columns=columns_to_drop)
    .rename(columns={'Average Classification': 'Sentiment', 'Article': 'Statement'})
)

In [None]:
# Display the monetary policy statements frame.
df_monetary_policy

# Comparing the OpenAI and BERT model predictions

In [None]:
# Keep only the columns needed downstream, rename the text column to 'Statement',
# and reduce 'Date' to a calendar date.
# The result is built as a single chain so that df_articles is a fresh frame:
# assigning a column directly on a df[[...]] selection raises SettingWithCopyWarning.
df_articles = (
    df_articles[['Date', 'Manual.summary', 'Name_of_Speaker', 'OpenAI_Score', 'bert_predictions_everything']]
    .rename(columns={'Manual.summary': 'Statement'})
    # assign() overwrites the existing 'Date' column in place, so column order is unchanged
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']).dt.date)
)

In [None]:
# Numeric code for each raw BERT label
label_mapping = dict(LABEL_2=0, LABEL_1=1, LABEL_0=-1)

# Swap the raw labels for their codes; values that are not keys of the mapping are left untouched
raw_bert_labels = df_articles['bert_predictions_everything']
df_articles['bert_predictions_everything'] = raw_bert_labels.replace(label_mapping)


In [None]:
# Display the current articles frame.
df_articles

In [None]:
# Keep only the rows that have an OpenAI score, i.e. drop rows where 'OpenAI_Score' is NaN.
# Only missing values are removed here; any non-missing placeholder value stays in the frame.
filtered_df = df_articles.dropna(subset=['OpenAI_Score'])

# Display the filtered frame
filtered_df

In [None]:
# Print the OpenAI score and the BERT prediction side by side for every row of filtered_df
score_pairs = zip(
    filtered_df.index,
    filtered_df['OpenAI_Score'],
    filtered_df['bert_predictions_everything'],
)
for row_label, openai_score, bert_prediction in score_pairs:
    print(f"Row {row_label}: OpenAI_Score = {openai_score}, bert_predictions_everything = {bert_prediction}")

In [None]:
# Turn every 'Error' placeholder into NaN, then cast both score columns to float.
# The result is rebound to filtered_df rather than mutated in place: in-place
# replace() and column writes on a frame derived from df_articles can raise
# chained-assignment warnings and make the cell harder to re-run safely.
filtered_df = (
    filtered_df
    .replace('Error', np.nan)
    .astype({'OpenAI_Score': float, 'bert_predictions_everything': float})
)

# Count the rows where both models agree. NaN never compares equal, so a row
# with a missing score on either side is not counted as a match.
matches = (filtered_df['OpenAI_Score'] == filtered_df['bert_predictions_everything']).sum()

# Display the count of matches
print(f"Number of matches: {matches}")

In [None]:
# Count rows for every (OpenAI score, BERT prediction) pair, then pivot so that
# rows are OpenAI scores and columns are BERT predictions; absent pairs become 0.
pair_sizes = filtered_df.groupby(['OpenAI_Score', 'bert_predictions_everything']).size()
bert_counts = pair_sizes.unstack(fill_value=0)

# Show the resulting table
print("Counts of bert_predictions_everything values for each OpenAI_Score:")
print(bert_counts)


In [None]:
# Build the confusion matrix over the three sentiment classes.
# Rows are OpenAI scores (used as the reference), columns are BERT predictions.
# Both axes are reindexed with fill_value=0 so that a class that never occurs
# contributes zero counts; without this, a missing row produced NaN counts and a
# missing column raised KeyError in the loop below. Columns already present in
# bert_counts are kept, so row totals are unchanged.
class_labels = [-1.0, 0.0, 1.0]
confusion_matrix = bert_counts.reindex(
    index=class_labels,
    columns=bert_counts.columns.union(class_labels),
    fill_value=0,
)
confusion_matrix.columns = pd.MultiIndex.from_tuples(
    [("bert_predictions_everything", col) for col in confusion_matrix.columns]
)

# Per-class metrics, keyed by class label
precision = {}
recall = {}
f1_scores = {}

for label in confusion_matrix.index:
    # tp: both models say `label`; fp: BERT says `label` but OpenAI does not;
    # fn: OpenAI says `label` but BERT does not.
    tp = confusion_matrix.loc[label, ("bert_predictions_everything", label)]
    fp = confusion_matrix[("bert_predictions_everything", label)].sum() - tp
    fn = confusion_matrix.loc[label].sum() - tp
    # Each ratio falls back to 0 when its denominator is 0
    precision[label] = tp / (tp + fp) if (tp + fp) != 0 else 0
    recall[label] = tp / (tp + fn) if (tp + fn) != 0 else 0
    f1_scores[label] = 2 * (precision[label] * recall[label]) / (precision[label] + recall[label]) if (precision[label] + recall[label]) != 0 else 0

# Display the results
print("Precision per class:", precision)
print("Recall per class:", recall)
print("F1 Score per class:", f1_scores)


In [None]:
# Display the current articles frame.
df_articles

# Cleaning the articles to build the index

In [None]:
# Step 1: tally the 'Error' placeholders in each score column
error_count = df_articles['bert_predictions_everything'].eq('Error').sum()
print("Count of 'Error' in bert_predictions_everything:", error_count)

error_count = df_articles['OpenAI_Score'].eq('Error').sum()
print("Count of 'Error' in OpenAI_Score:", error_count)

# Step 2: remove the 'Error' rows one column at a time, printing the row count
# before the first filter and after each filter
print(len(df_articles))
df_articles = df_articles[df_articles['bert_predictions_everything'].ne('Error')]
print(len(df_articles))
df_articles = df_articles[df_articles['OpenAI_Score'].ne('Error')]
print(len(df_articles))

In [None]:
# Cast both score columns to float in a single astype() call and rebind the
# result. Writing the columns back one by one onto a frame obtained through
# boolean filtering raises SettingWithCopyWarning; astype() returns a new frame.
df_articles = df_articles.astype({'OpenAI_Score': float, 'bert_predictions_everything': float})

In [None]:
# 'Sentiment' takes the OpenAI score where one exists and falls back to the BERT prediction otherwise
openai_scores = df_articles['OpenAI_Score']
bert_scores = df_articles['bert_predictions_everything']
df_articles['Sentiment'] = openai_scores.combine_first(bert_scores)

# Frequency of each distinct value in the new column
unique_values_counts = df_articles['Sentiment'].value_counts()
print("Unique values and their counts in the 'Sentiment' column:")
print(unique_values_counts)

In [None]:
# Remove the two model score columns.
# Rebinding the result instead of using inplace=True avoids mutating a frame
# that was produced by filtering (which can raise SettingWithCopyWarning).
df_articles = df_articles.drop(columns=['OpenAI_Score', 'bert_predictions_everything'])

In [None]:
# Display the current articles frame.
df_articles

# Extracting speaker information

In [None]:
# Number of rows per speaker in df_articles (value_counts sorts by frequency, highest first)
name_counts = df_articles['Name_of_Speaker'].value_counts()
print(name_counts)


# Bar chart of the same counts, drawn on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 8))
name_counts.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title('Count of Each Speaker in Data')
ax.set_xlabel('Name of Speaker')
ax.set_ylabel('Counts')
# Slanted, right-aligned tick labels keep the speaker names legible
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [None]:
# Speakers in this list are labelled 'Executive Council'; every other speaker is labelled 'Governor'.
# NOTE(review): the ECB body is presumably the "Executive Board", and Joachim Nagel
# may belong in the 'Governor' group rather than in this list — confirm the
# membership against the ECB's published composition before relying on the split.
executive_council_members = [
    'Christine Lagarde',
    'Luis de Guindos',
    'Joachim Nagel',
    'Isabel Schnabel',
    'Philip Lane',
    'Piero Cipollone',
]

is_executive_council = df_articles['Name_of_Speaker'].isin(executive_council_members)
df_articles['Position'] = np.where(is_executive_council, 'Executive Council', 'Governor')


In [None]:
# Number of rows per value of 'Position'
unique_position_counts = df_articles['Position'].value_counts()
print(unique_position_counts)

# Number of rows whose speaker is in executive_council_members
is_council_speaker = df_articles['Name_of_Speaker'].isin(executive_council_members)
executive_council_count = is_council_speaker.sum()
print("Count of executive council members in 'Name_of_Speaker':", executive_council_count)


# Merging with Monetary Policy Statements

In [None]:
# Inspect the articles columns before combining the two frames.
df_articles.columns

In [None]:
# Inspect the monetary policy statement columns before combining the two frames.
df_monetary_policy.columns

In [None]:
# Every monetary policy row gets the fixed position label "Monetary Policy Statement"
df_monetary_policy = df_monetary_policy.assign(Position='Monetary Policy Statement')

# Shapes of the two inputs, statements first
for input_frame in (df_monetary_policy, df_articles):
    print(input_frame.shape)

# Stack articles on top of statements and renumber the rows from 0
frames_to_stack = [df_articles, df_monetary_policy]
df_combined = pd.concat(frames_to_stack, ignore_index=True)
print(df_combined.shape)

# Show the stacked frame
df_combined

In [None]:
# Write the combined frame to disk as CSV, without the row index.
# NOTE(review): absolute, machine-specific path, and the file name has no ".csv"
# extension — confirm what downstream readers expect before changing either.
combined_output_path = '/Users/ruimaciel/Desktop/Barcelona/Master_Thesis/ECB_Perceived_Cacophony/Rui_final_notebooks/df_ready_for_index'
df_combined.to_csv(combined_output_path, index=False)

# Exploratory analysis (trying things out)

In [None]:
# Mean sentiment per speaker, ordered from lowest to highest mean
speaker_means = df_combined.groupby('Name_of_Speaker')['Sentiment'].mean()
grouped_df = speaker_means.reset_index().sort_values(by='Sentiment')

# One "speaker, mean" line per speaker
for speaker, mean_sentiment in zip(grouped_df['Name_of_Speaker'], grouped_df['Sentiment']):
    print(f"{speaker}, {mean_sentiment}")


In [None]:
def _count_sentiment_classes(speaker_sentiments):
    """Return how often -1, 0 and 1 occur in one speaker's sentiments (0 when absent).

    Sentiment values other than -1, 0 and 1 are left out by the reindex.
    """
    return speaker_sentiments.value_counts().reindex([-1, 0, 1], fill_value=0)


# One row per speaker, one column per sentiment class
sentiment_counts = (
    df_combined
    .groupby('Name_of_Speaker')['Sentiment']
    .apply(_count_sentiment_classes)
    .unstack()
)

# Share of each class within a speaker's counted rows, in percent
rows_per_speaker = sentiment_counts.sum(axis=1)
sentiment_percentages = sentiment_counts.div(rows_per_speaker, axis=0) * 100

# Keep the per-speaker total alongside the raw counts
sentiment_counts['Total'] = rows_per_speaker

# Order speakers by their share of class 1, highest first, and attach the totals
sentiment_percentages_sorted = sentiment_percentages.sort_values(by=1, ascending=False)
sentiment_percentages_sorted['Total'] = sentiment_counts['Total']

# Show the final table
print("\nSentiment percentages for each speaker sorted by 1 percentage:")
print(sentiment_percentages_sorted)

In [None]:
# Mean sentiment per position, ordered from lowest to highest mean
position_means = df_combined.groupby('Position')['Sentiment'].mean()
grouped_df = position_means.reset_index().sort_values(by='Sentiment')

# One "position, mean" line per position
for position, mean_sentiment in zip(grouped_df['Position'], grouped_df['Sentiment']):
    print(f"{position}, {mean_sentiment}")

# Cluster

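For each group (e.g. speaker or cluster): the number of dovish messages per month, the number of hawkish messages per month, and the drivers of the difference between the two.

Also add a lag: if there were many hawkish messages last month, are there more this month?

The lag makes it possible to observe the persistence of hawkish messaging.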

In [None]:
# Cluster
# Candidate target variables:
#   - hawkish
#   - dovish
#   - difference between the two
#   - absolute value of the difference