In [None]:
import matplotlib.pyplot as plt

# Scatter of the 'Cosine Similarity' column against 'Return' for every row of
# final_cleaned_. The data plotted here is a similarity (not a distance), so the
# title, x-axis label and printed message are labelled accordingly.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(final_cleaned_['Cosine Similarity'], final_cleaned_['Return'], alpha=0.6)
ax.set_title('Cosine Similarity vs Return')
ax.set_xlabel('Cosine Similarity (10-K)')
ax.set_ylabel('Stock Return (%)')
ax.grid(True)
plt.show()

# Series.corr uses its default method here; the result is printed to 4 decimals.
correlation = final_cleaned_['Cosine Similarity'].corr(final_cleaned_['Return'])
print(f"Correlation between Cosine Similarity and Return: {correlation:.4f}")




In [None]:
import matplotlib.pyplot as plt

# Keep only the rows whose Year is 2014 or earlier.
is_up_to_2014 = final_cleaned['Year'] <= 2014
filtered_df = final_cleaned[is_up_to_2014]

# Scatter the filtered Cosine Distance values against Return.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(filtered_df['Cosine Distance'], filtered_df['Return'], alpha=0.6)
ax.set_title('Cosine Distance vs Return (Up to 2014)')
ax.set_xlabel('Cosine Distance (Change in 10-K)')
ax.set_ylabel('Stock Return (%)')
ax.grid(True)
plt.show()

# Correlation of the same two filtered columns (Series.corr default method).
distance_up_to_2014 = filtered_df['Cosine Distance']
correlation = distance_up_to_2014.corr(filtered_df['Return'])
print(f"Correlation between Cosine Distance and Return (Up to 2014): {correlation:.4f}")

In [None]:
# Split Cosine Distance into 5 quantile bins computed over the whole frame
# (labels=False yields integer codes 0..4) and store them on final_cleaned.
distance_quintile = pd.qcut(final_cleaned['Cosine Distance'], q=5, labels=False)
final_cleaned['Distance Bin'] = distance_quintile

# Mean Return for every (Year, Distance Bin) pair, reshaped so that each bin
# becomes a column and each year a row.
mean_return_by_year_bin = final_cleaned.groupby(['Year', 'Distance Bin'])['Return'].mean()
pivot = mean_return_by_year_bin.unstack()

# One line per bin, plotted against Year.
ax = pivot.plot(figsize=(10, 6), marker='o')
ax.set_title('Average Return by Cosine Distance Bin Over Time')
ax.set_xlabel('Year')
ax.set_ylabel('Average Return (%)')
ax.legend(title='Cosine Distance Bin (0 = lowest, 4 = highest)')
ax.grid(True)
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Mean Cosine Distance for each Year.
avg_cosine_by_year = final_cleaned['Cosine Distance'].groupby(final_cleaned['Year']).mean()

# Line chart of the yearly means.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(avg_cosine_by_year.index, avg_cosine_by_year.values, marker='o', linestyle='-')
ax.set_title('Average Cosine Distance by Year')
ax.set_xlabel('Year')
ax.set_ylabel('Average Cosine Distance')
ax.grid(True)
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Filter to just Apple (assuming Apple's ticker is 'AAPL')
apple_data = final_cleaned_[final_cleaned_['Symbol'] == 'AAPL']

# Group by Year and take the average Cosine Similarity
apple_cosine_by_year = apple_data.groupby('Year')['Cosine Similarity'].mean()

# Plot. The series is a similarity, so the title and y-axis both say so.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(apple_cosine_by_year.index, apple_cosine_by_year.values, marker='o', linestyle='-')
ax.set_title('Apple (AAPL) Cosine Similarity by Year')
ax.set_xlabel('Year')
ax.set_ylabel('Cosine Similarity')
ax.grid(True)
plt.show()


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Step 1: Assign each observation to a within-year quintile of Cosine Similarity
# (1 = lowest similarity that year, 5 = highest).
#
# The values are ranked before binning. Calling pd.qcut on the raw values with
# duplicates='drop' together with five fixed labels raises ValueError as soon as
# tied values collapse two quantile edges, because fewer than five bins remain.
# rank(method='first') gives every non-null value a distinct rank, so the edges
# stay distinct for any year with at least two non-null values. Trade-off: tied
# values are split across neighbouring bins by order of appearance.
final_cleaned_['Similarity Bin'] = (
    final_cleaned_
    .groupby('Year')['Cosine Similarity']
    .transform(lambda x: pd.qcut(x.rank(method='first'), q=5, labels=False) + 1)
    .astype('Int64')  # nullable integers: labels stay 1..5 even when some rows are NaN
)

# Step 2: Mean Return for every (Year, Similarity Bin) pair; unstack turns the
# bins into columns so each one can be drawn as its own line.
bin_ret_by_year = (
    final_cleaned_
    .groupby(['Year', 'Similarity Bin'])['Return']
    .mean()
    .unstack()
)

# Step 3: One line per bin, plotted against Year.
fig, ax = plt.subplots(figsize=(10, 6))
for bin_label in bin_ret_by_year.columns:
    ax.plot(bin_ret_by_year.index, bin_ret_by_year[bin_label], marker='o', label=f'Bin {bin_label}')

ax.set_title('Average Return by Cosine Similarity Bin (Yearly Re-weighted)')
ax.set_xlabel('Year')
ax.set_ylabel('Average Return (%)')
ax.legend(title='Similarity Bin (per year)')
ax.grid(True)
plt.show()

In [None]:
# Turn each yearly average return (in percent) into a growth factor: 1 + r/100.
ret_factors = bin_ret_by_year.div(100).radd(1)

# Compound the factors down the Year index and scale by 100, i.e. the value of
# $100 placed in each bin.
compounded_growth = ret_factors.cumprod()
cumulative_returns = 100 * compounded_growth

# Draw one line per bin column.
fig, ax = plt.subplots(figsize=(10, 6))
for bin_label, portfolio_value in cumulative_returns.items():
    ax.plot(cumulative_returns.index, portfolio_value, marker='o', label=f'Bin {bin_label}')

ax.set_title('Portfolio Value by Cosine Similarity Bin (Starting with $100)')
ax.set_xlabel('Year')
ax.set_ylabel('Portfolio Value ($)')
ax.legend(title='Bin')
ax.grid(True)
plt.show()

In [None]:
# Step 1: Re-bin Cosine Similarity into within-year quintiles
# (1 = lowest similarity that year, 5 = highest).
#
# Binning is done on ranks rather than raw values. With raw values,
# duplicates='drop' plus five fixed labels makes pd.qcut raise ValueError whenever
# tied values collapse two quantile edges (fewer than five bins are left for five
# labels). rank(method='first') gives every non-null value a distinct rank, so the
# edges stay distinct for any year with at least two non-null values. Trade-off:
# tied values are split across neighbouring bins by order of appearance.
final_cleaned_['Similarity Bin'] = (
    final_cleaned_
    .groupby('Year')['Cosine Similarity']
    .transform(lambda x: pd.qcut(x.rank(method='first'), q=5, labels=False) + 1)
    .astype('Int64')  # nullable integers: labels stay 1..5 even when some rows are NaN
)

# Step 2: Mean Cosine Similarity for every (Year, Similarity Bin) pair, with the
# bins unstacked into columns.
cos_sim_by_bin = (
    final_cleaned_
    .groupby(['Year', 'Similarity Bin'])['Cosine Similarity']
    .mean()
    .unstack()
)

# Step 3: One line per bin, plotted against Year.
fig, ax = plt.subplots(figsize=(10, 6))
for bin_label in cos_sim_by_bin.columns:
    ax.plot(cos_sim_by_bin.index, cos_sim_by_bin[bin_label], marker='o', label=f'Bin {bin_label}')

ax.set_title('Average Cosine Similarity by Bin Over Time (Reweighted Each Year)')
ax.set_xlabel('Year')
ax.set_ylabel('Average Cosine Similarity')
ax.legend(title='Similarity Bin')
ax.grid(True)
plt.show()

In [None]:
import plotly.express as px

# Filter to one company, e.g., AAPL
firm = 'AAPL'

# Reuse the per-year 'Similarity Bin' already stored on final_cleaned_ instead of
# re-binning the firm's own rows. Grouping a single firm by Year and calling
# pd.qcut(q=5) bins that firm's filings against themselves; a year holding only
# one row (presumably the usual case for annual 10-Ks; verify) cannot form five
# bins and makes qcut raise.
if 'Similarity Bin' not in final_cleaned_.columns:
    raise KeyError(
        "'Similarity Bin' is missing from final_cleaned_; run the yearly binning cell first."
    )

firm_df = final_cleaned_[final_cleaned_['Symbol'] == firm].copy()

# Rows without a bin cannot be coloured, so drop them. Storing the bin as a string
# makes Plotly Express treat it as a discrete category (legend entries) rather
# than a continuous colour scale, whatever dtype the stored column has.
firm_df = firm_df.dropna(subset=['Similarity Bin'])
firm_df['Similarity Bin'] = firm_df['Similarity Bin'].astype(int).astype(str)

# Plot: Year vs Return, colored by Similarity Bin, with hover showing cosine similarity
fig = px.scatter(
    firm_df,
    x='Year',
    y='Return',
    color='Similarity Bin',
    size='Cosine Similarity',  # optional: size the dots by similarity
    hover_data=['Filing Date', 'Cosine Similarity'],
    title=f'{firm}: Return by Year with Cosine Similarity Bins',
    labels={'Return': 'Return (%)'},
    category_orders={'Similarity Bin': ['1', '2', '3', '4', '5']},  # legend in bin order
)

# Thin dark outline on each marker.
fig.update_traces(marker=dict(line=dict(width=1, color='DarkSlateGrey')))
fig.show()