
Environment Setup

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from pathlib import Path
import plotly.graph_objects as go

In [2]:
# Plot settings
# Notebook-wide defaults: seaborn theme first, then the default matplotlib
# figure size (set afterwards so the theme does not override it).

sns.set(palette="muted", style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

In [3]:
# Load data
# Read the CSV (path is relative to the notebook's working directory) and
# parse the "Date" column as datetimes so it can drive the time-series plots.
data_path = Path("../data/final/ibm_df.csv")
df = pd.read_csv(filepath_or_buffer=data_path, parse_dates=["Date"])

# Quick look at the first two rows.
print(df.iloc[:2])

        Date      Close   Volume    Return  Interest  Sentiment  Article_Count
0 2020-11-23  93.343201  5910318       NaN -0.154639        NaN            NaN
1 2020-11-24  96.708809  8109115  0.036056 -0.010309        NaN            NaN



Exploratory Data Analysis (EDA)

In [5]:
# Shape
# Report how many rows and columns were loaded.

rows = len(df)
cols = len(df.columns)
print(f"Dataset Shape: \n Rows={rows:,}, Columns={cols}")

Dataset Shape: 
 Rows=429, Columns=7


In [None]:
# Columns
# Print the column labels as a plain Python list.

print(list(df.columns))

In [4]:
# Missing values per column.
# NOTE: the recorded output is NOT all zeros -- Return has 1 missing value and
# Sentiment / Article_Count have 290 each -- so downstream steps must handle NaN.
print("\nMissing values:\n", df.isnull().sum())


Missing values:
 Date               0
Close              0
Volume             0
Return             1
Interest           0
Sentiment        290
Article_Count    290
dtype: int64


In [None]:
# Summary Stats
# Descriptive statistics as produced by DataFrame.describe() with its defaults.

summary_stats = df.describe()
print(summary_stats)

In [None]:
# Time Series
#
# Overlay stock return, Google search interest and headline sentiment on a
# shared date axis, giving each series its own y-axis.

# Create figure
fig = go.Figure()

# Stock return (primary y-axis, left)
fig.add_trace(go.Scatter(
    x=df["Date"], y=df["Return"],
    mode="lines",
    name="IBM Stock Return",
    line=dict(color="blue")
))

# Google Interest (second y-axis, right edge of the plot area)
fig.add_trace(go.Scatter(
    x=df["Date"], y=df["Interest"],
    mode="lines",
    name="IBM Google Interest",
    yaxis="y2",
    line=dict(color="green")
))

# Sentiment (third y-axis, free-floating to the right of the second one)
fig.add_trace(go.Scatter(
    x=df["Date"], y=df["Sentiment"],
    mode="lines+markers",
    name="IBM Headline Sentiment",
    yaxis="y3",
    line=dict(color="red")
))

# Layout with multiple y-axes
fig.update_layout(
    title="IBM Stock Return vs Google Interest vs Headline Sentiment",
    # Shrink the plot area so the third axis has room on the right-hand side.
    xaxis=dict(title="Date", domain=[0, 0.88]),
    yaxis=dict(title="Return", side="left"),
    yaxis2=dict(title="Public Interest", anchor="x", overlaying="y", side="right"),
    # `position` only takes effect when anchor="free"; without it this axis is
    # drawn on top of yaxis2 and the two sets of tick labels overlap.
    yaxis3=dict(title="Public Sentiment", anchor="free", overlaying="y",
                side="right", position=0.95),
    legend=dict(orientation="h", y=-0.2),
    template="plotly_white",
    width=1000, height=500
)

fig.show()


In [None]:
# Correlation heatmap
#
# Correlations between return, search interest and headline sentiment.
# `plt` and `sns` come from the import cell at the top of the notebook, so
# they are not re-imported here.
# NOTE: DataFrame.corr() excludes missing values pairwise, so every coefficient
# involving Sentiment is computed only on rows where Sentiment is present.

plt.figure(figsize=(8, 6))
sns.heatmap(df[['Return', 'Interest', 'Sentiment']].corr(),
            annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title("Correlation Heatmap")

# To export the figure, save it BEFORE plt.show(): in a notebook the figure is
# closed once shown, and a later plt.savefig would write an empty image.
# plt.savefig("results/correlation_heatmap.png")

plt.show()

In [None]:
# Distribution of sentiment
# Histogram (30 bins) of the Sentiment column with a KDE curve overlaid.

hist_fig, hist_ax = plt.subplots(figsize=(8, 4))
sns.histplot(df["Sentiment"], kde=True, bins=30, ax=hist_ax)
hist_ax.set_title("Distribution of NYT Sentiment Scores")
hist_ax.set_xlabel("Sentiment")
hist_ax.set_ylabel("Frequency")
plt.show()

# NOTE(review): `col` is not defined anywhere in the visible cells; give it a
# value (or hard-code the file name) before enabling this line.
# plt.savefig(f"results/{col}_distribution.png")

In [None]:
# Rolling averages
#
# Smooth the series with trailing-window means and compare the raw return
# against its 7- and 30-row averages.

# new column -> (source column, window length in rows)
rolling_specs = {
    'Return_7d': ('Return', 7),
    'Return_30d': ('Return', 30),
    'Interest_7d': ('Interest', 7),
    'Sentiment_7d': ('Sentiment', 7),
}
# NOTE(review): with the default min_periods (equal to the window), any window
# containing a NaN yields NaN. Sentiment has many missing values, so
# Sentiment_7d may be mostly NaN -- confirm this is intended.
for smoothed_col, (source_col, window) in rolling_specs.items():
    df[smoothed_col] = df[source_col].rolling(window).mean()

roll_fig, roll_ax = plt.subplots(figsize=(10, 5))
roll_ax.plot(df['Date'], df['Return'], label='Daily Return', alpha=0.4)
roll_ax.plot(df['Date'], df['Return_7d'], label='7-Day Avg')
roll_ax.plot(df['Date'], df['Return_30d'], label='30-Day Avg')
roll_ax.set_title("Return Rolling Averages")
roll_ax.set_xlabel("Date")
roll_ax.set_ylabel("Return")
roll_ax.legend()
roll_fig.tight_layout()
plt.show()


# plt.savefig("results/close_rolling_avg.png")


Feature Engineering


Modeling