
Environment Setup

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [2]:
# Plot settings
# Apply the seaborn theme first, then override the default figure size so the
# size setting is the last thing written to matplotlib's rcParams.

sns.set(palette="muted", style="whitegrid")
plt.rcParams.update({"figure.figsize": (12, 6)})

In [4]:
# Load the IBM dataset: parse "Date" as datetime and order rows chronologically.

df = (
    pd.read_csv("ibm_df.csv", parse_dates=["Date"])
    .sort_values("Date")
)
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Interest,Headline_Sentiment,Abstract_Sentiment,Snippet_Sentiment
0,2020-11-23,91.275649,93.677434,91.15128,93.343201,5910318,44,,,
1,2020-11-24,93.941702,96.949768,93.902841,96.708809,8109115,51,,,
2,2020-11-25,95.550677,96.638865,94.913306,96.537819,4326151,42,0.0,,
3,2020-11-27,96.537813,97.400586,96.312398,96.654404,2187395,27,0.4588,0.4939,0.4939
4,2020-11-30,96.460086,97.159635,95.675035,96.009262,6263448,38,0.4588,0.4939,0.4939



EDA

In [None]:
# General overview: dimensions, column names, per-column null counts and
# descriptive statistics of the numeric columns.

column_names = list(df.columns)
null_counts = df.isnull().sum()
numeric_summary = df.describe()

print("Dataset Shape:", df.shape)
print("\nColumns:", column_names)
print("\nMissing values:\n", null_counts)
print("\nSummary stats:\n", numeric_summary)

In [None]:
# Time series: Close price and Google interest, with the first sentiment
# column overlaid on a secondary axis.

# The loaded frame names its sentiment columns "<Field>_Sentiment"
# (Headline_Sentiment, Abstract_Sentiment, Snippet_Sentiment), so filtering on
# "_mean" alone always produced an empty list and sentiment was silently left
# out of every downstream cell. "_mean" is still accepted for aggregated
# variants; derived "*_roll" columns are skipped so re-running this cell after
# the rolling-average cell does not pick them up.
sent_cols_mean = [
    c for c in df.columns
    if (c.endswith("_Sentiment") or "_mean" in c) and not c.endswith("_roll")
]

fig, ax = plt.subplots()
ax.plot(df["Date"], df["Close"], label="Close Price")
ax.plot(df["Date"], df["Interest"], label="Google Interest")
handles, labels = ax.get_legend_handles_labels()

# Plot first sentiment column for comparison
if sent_cols_mean:
    first_sent = sent_cols_mean[0]
    # Sentiment values are far smaller in magnitude than price/interest, so
    # they get their own y-axis; on a shared axis the line would look flat.
    ax_sent = ax.twinx()
    # Explicit colour: a twin axis restarts the colour cycle and would
    # otherwise reuse the Close line's colour.
    ax_sent.plot(df["Date"], df[first_sent], color="C2", alpha=0.7, label=first_sent)
    ax_sent.set_ylabel(first_sent)
    ax_sent.grid(False)  # avoid a second, misaligned grid over the first
    sent_handles, sent_labels = ax_sent.get_legend_handles_labels()
    handles += sent_handles
    labels += sent_labels

ax.set_title("Time Series: Close Price, Google Interest, Sentiment")
ax.set_xlabel("Date")
ax.set_ylabel("Value")
ax.legend(handles, labels)
# Save before plt.show(): once the figure has been shown, saving the "current"
# figure can produce a blank image.
# fig.savefig("results/time_series_stock_google_sentiment.png")
plt.show()

In [None]:
# Correlation heatmap of Close, Google interest and the sentiment columns.

corr_cols = ["Close", "Interest", *sent_cols_mean]
corr_matrix = df[corr_cols].corr()

fig, ax = plt.subplots()
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", ax=ax)
ax.set_title("Correlation Heatmap")
# To persist the figure, save it before plt.show():
# fig.savefig("results/correlation_heatmap.png")
plt.show()

In [None]:
# Distribution of sentiment: one histogram (with KDE overlay) per sentiment
# column, computed over the non-null values only.

for sent_col in sent_cols_mean:
    observed = df[sent_col].dropna()
    fig, ax = plt.subplots()
    sns.histplot(observed, bins=50, kde=True, ax=ax)
    ax.set_title(f"Distribution of {sent_col}")
    # To persist the figure, save it before plt.show():
    # fig.savefig(f"results/{sent_col}_distribution.png")
    plt.show()

In [None]:
# Rolling averages (7- and 30-row windows) for Close, Interest and Sentiment.
# Note: the windows count rows, not calendar days; min_periods=1 means the
# first rows average over however many observations are available so far.

rolling_cols = ["Close", "Interest", *sent_cols_mean]
for col in rolling_cols:
    series = df[col]
    for window in (7, 30):
        df[f"{col}_{window}d_roll"] = series.rolling(window=window, min_periods=1).mean()

fig, ax = plt.subplots()
ax.plot(df["Date"], df["Close_7d_roll"], label="Close 7-day MA")
ax.plot(df["Date"], df["Close_30d_roll"], label="Close 30-day MA")
ax.set_title("IBM Close Price Rolling Averages")
ax.legend()
# To persist the figure, save it before plt.show():
# fig.savefig("results/close_rolling_avg.png")
plt.show()

In [None]:
# Daily article volume

# The frame loaded above has no "Article_Count" column (its columns are the
# OHLCV fields, Interest and the three *_Sentiment scores), so plotting it
# unconditionally raises and stops a Restart & Run All. Plot only when the
# column is actually present and say so explicitly otherwise.
article_col = "Article_Count"
if article_col in df.columns:
    fig, ax = plt.subplots()
    sns.lineplot(data=df, x="Date", y=article_col, ax=ax)
    ax.set_title("Daily Number of NYT Articles Mentioning IBM")
    # To persist the figure, save it before plt.show():
    # fig.savefig("results/daily_article_volume.png")
    plt.show()
else:
    print(f"Skipping daily article volume plot: column '{article_col}' not found in df.")

In [None]:
# Correlation table: the pairwise correlation matrix for the columns in
# corr_cols, printed as text.

correlations = df.loc[:, corr_cols].corr()
print("\nCorrelations:\n", correlations)
# correlations.to_csv("results/correlations.csv")

In [None]:
# Summary stats: describe() over the frame as it stands at this point, which
# includes any columns added by earlier cells.

summary_stats = df.describe()
stats_header = "\nSummary Stats:\n"
print(stats_header, summary_stats)
# summary_stats.to_csv("results/summary_stats.csv")


Feature Engineering


Modeling