
Environment Setup

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from pathlib import Path

In [6]:
# Plot settings
# Apply the seaborn theme first, then set the default Matplotlib figure size.

sns.set(palette="muted", style="whitegrid")
plt.rcParams.update({"figure.figsize": (12, 6)})

In [17]:
# Load data
# Read the IBM CSV, parsing the "Date" column as datetimes, and preview the
# first two rows.
data_path = Path("../data/final/ibm_df.csv")
df = pd.read_csv(filepath_or_buffer=data_path, parse_dates=["Date"])
print(df.iloc[:2])

        Date       Open       High        Low      Close   Volume  Interest  \
0 2020-11-25  95.550677  96.638865  94.913306  96.537819  4326151        42   
1 2020-11-27  96.537813  97.400586  96.312398  96.654404  2187395        27   

   Sentiment  
0     0.0000  
1     0.4588  



EDA

In [18]:
# General
# Quick overview of the frame: dimensions, column names, per-column null
# counts and descriptive statistics, printed in that order.

overview = (
    ("Dataset Shape:", df.shape),
    ("\nColumns:", df.columns.tolist()),
    ("\nMissing values:\n", df.isna().sum()),
    ("\nSummary stats:\n", df.describe()),
)
for heading, value in overview:
    print(heading, value)

Dataset Shape: (427, 8)

Columns: ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Interest', 'Sentiment']

Missing values:
 Date         0
Open         0
High         0
Low          0
Close        0
Volume       0
Interest     0
Sentiment    0
dtype: int64

Summary stats:
                                 Date        Open        High         Low  \
count                            427  427.000000  427.000000  427.000000   
mean   2023-03-02 02:08:08.992974336  149.867366  151.318989  148.519007   
min              2020-11-25 00:00:00   92.309425   92.915700   91.221220   
25%              2021-12-27 12:00:00  107.346881  108.085377  105.733657   
50%              2023-01-26 00:00:00  127.667408  129.289653  126.708353   
75%              2024-02-26 12:00:00  176.662645  179.029427  175.284103   
max              2025-11-20 00:00:00  319.890015  324.899994  314.529999   
std                              NaN   54.974637   55.700140   54.219858   

            Close        Volume    In

In [19]:
import plotly.graph_objects as go

# Choose sentiment column to plot
sentiment_col = "Sentiment"

# One interactive figure with three y-axes, because the three series
# (close price, Google interest, sentiment) are plotted against separate scales.
fig = go.Figure()

# Stock Close price -> primary (left) y-axis
fig.add_trace(go.Scatter(
    x=df["Date"], y=df["Close"],
    mode="lines",
    name="IBM Close",
    line=dict(color="blue")
))

# Google Interest -> second y-axis, drawn at the right edge of the plot area
fig.add_trace(go.Scatter(
    x=df["Date"], y=df["Interest"],
    mode="lines",
    name="Google Interest",
    yaxis="y2",
    line=dict(color="green")
))

# Sentiment -> third y-axis, detached from the plot area (see layout below)
fig.add_trace(go.Scatter(
    x=df["Date"], y=df[sentiment_col],
    mode="lines+markers",
    name="Headline Sentiment",
    yaxis="y3",
    line=dict(color="red")
))

# Layout with multiple y-axes
fig.update_layout(
    title="IBM Stock vs Google Interest vs Headline Sentiment",
    # Shrink the x domain so there is room to the right of the plot area
    # for the free-floating sentiment axis.
    xaxis=dict(title="Date", domain=[0, 0.85]),
    yaxis=dict(title="Close Price", side="left"),
    yaxis2=dict(title="Google Interest", overlaying="y", side="right"),
    # `position` is only honoured when the axis is anchored "free"; without
    # it this axis would be drawn on top of yaxis2.
    yaxis3=dict(
        title="Sentiment",
        overlaying="y",
        side="right",
        anchor="free",
        position=0.95,
    ),
    legend=dict(orientation="h", y=-0.2),
    template="plotly_white",
    width=1000, height=500
)

fig.show()


In [None]:
# Time series

# Sentiment columns used by this and the following cells.
# Prefer aggregated "*_mean" columns; derived "*_roll" columns are skipped so
# that re-running this cell after the rolling-average cell yields the same list.
sent_cols_mean = [
    c for c in df.columns if "_mean" in c and not c.endswith("_roll")
]
# Fall back to the plain "Sentiment" column when no "*_mean" columns exist;
# otherwise the list is empty and sentiment is silently left out of every
# downstream plot and table.
if not sent_cols_mean and "Sentiment" in df.columns:
    sent_cols_mean = ["Sentiment"]

plt.figure()
plt.plot(df["Date"], df["Close"], label="Close Price")
plt.plot(df["Date"], df["Interest"], label="Google Interest")
# Plot first sentiment column for comparison
if sent_cols_mean:
    plt.plot(df["Date"], df[sent_cols_mean[0]], label=sent_cols_mean[0])
plt.title("Time Series: Close Price, Google Interest, Sentiment")
plt.xlabel("Date")
plt.ylabel("Value")
plt.legend()
plt.show()
# plt.savefig("results/time_series_stock_google_sentiment.png")

In [None]:
# Correlation heatmap
# Pairwise correlations between close price, Google interest and every
# selected sentiment column, drawn as an annotated heatmap.

corr_cols = ["Close", "Interest", *sent_cols_mean]

plt.figure()
sns.heatmap(data=df.loc[:, corr_cols].corr(), cmap="coolwarm", annot=True)
plt.title("Correlation Heatmap")
plt.show()
# plt.savefig("results/correlation_heatmap.png")

In [None]:
# Distribution of sentiment
# One histogram (50 bins, with a KDE overlay) per selected sentiment column;
# missing values are dropped before plotting.

for col in sent_cols_mean:
    plt.figure()
    sns.histplot(data=df.loc[:, col].dropna(), kde=True, bins=50)
    plt.title(f"Distribution of {col}")
    plt.show()
    # plt.savefig(f"results/{col}_distribution.png")

In [None]:
# Rolling averages (7-day and 30-day) for Close and Sentiment
# Adds "<col>_7d_roll" and "<col>_30d_roll" columns to df for the close price,
# Google interest and every selected sentiment column, then plots the two
# smoothed close-price series.
# NOTE(review): the integer windows count rows, not calendar days; if the
# dates are irregularly spaced, "7-day"/"30-day" is only approximate. Confirm
# that this is intended.

rolling_cols = ["Close", "Interest", *sent_cols_mean]
for col in rolling_cols:
    series = df[col]
    # min_periods=1 keeps the leading rows instead of producing NaN there.
    df[f"{col}_7d_roll"] = series.rolling(window=7, min_periods=1).mean()
    df[f"{col}_30d_roll"] = series.rolling(window=30, min_periods=1).mean()

plt.figure()
for column_name, legend_text in (
    ("Close_7d_roll", "Close 7-day MA"),
    ("Close_30d_roll", "Close 30-day MA"),
):
    plt.plot(df["Date"], df[column_name], label=legend_text)
plt.title("IBM Close Price Rolling Averages")
plt.legend()
plt.show()
# plt.savefig("results/close_rolling_avg.png")

In [None]:
# Daily article volume
# Line plot of the per-row article count. The plot is only drawn when the
# frame actually has an "Article_Count" column; otherwise the cell reports
# that it was skipped instead of raising on the missing column.

if "Article_Count" in df.columns:
    plt.figure()
    sns.lineplot(data=df, x="Date", y="Article_Count")
    plt.title("Daily Number of NYT Articles Mentioning IBM")
    plt.show()
    # plt.savefig("results/daily_article_volume.png")
else:
    print("Skipping article-volume plot: 'Article_Count' column not found in df.")

In [None]:
# Correlation table
# Same correlation matrix as the heatmap, printed as text.

correlations = df.loc[:, corr_cols].corr()
print("\nCorrelations:\n", correlations, sep=" ")
# correlations.to_csv("results/correlations.csv")

In [None]:
# Summary stats
# Descriptive statistics for df as it stands at this point in the notebook
# (this includes any columns added to df by earlier cells).

summary_stats = df.describe()
print("\nSummary Stats:\n", summary_stats, sep=" ")
# summary_stats.to_csv("results/summary_stats.csv")


Feature Engineering


Modeling