In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Optional (for topic modeling)
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer


In [None]:
# Read the raw news/stock CSV into the working DataFrame and preview its first rows.
raw_data_path = '../data/raw/news_stock_data.csv'
df = pd.read_csv(raw_data_path)
df.head()


In [None]:
# Report missing values per column *before* dropping anything. Only the final
# expression of a notebook cell is auto-displayed, so the summary is printed
# explicitly; as a bare mid-cell expression it was computed and then discarded.
missing_per_column = df.isnull().sum()
print(missing_per_column)

# Keep only the rows that have both a headline and a date. The result is
# reassigned instead of mutating the loaded frame in place.
df = df.dropna(subset=['headline', 'date'])


In [None]:
# Character count of every headline. The vectorized `.str.len()` accessor replaces
# a per-row Python `len` call and yields NaN (instead of raising) for a missing value.
df['headline_length'] = df['headline'].str.len()

# Only a cell's final expression is auto-displayed, so the summary statistics are
# printed explicitly; as a bare mid-cell expression they were silently discarded.
print(df['headline_length'].describe())

# Histogram of headline lengths drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(df['headline_length'], bins=30, ax=ax)
ax.set_title("Distribution of Headline Lengths")
ax.set_xlabel("Length")
ax.set_ylabel("Count")
plt.show()


In [None]:
# Bar chart of the ten publishers with the most rows in the frame.
top_publishers = df['publisher'].value_counts().head(10)

fig, ax = plt.subplots(figsize=(10, 5))
top_publishers.plot(kind='bar', ax=ax)
ax.set_title("Top 10 Publishers by Article Count")
ax.set_xlabel("Publisher")
ax.set_ylabel("Number of Articles")
# Tilt the publisher names so long labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()


In [None]:
# Parse the date column so the `.dt` accessor can be used on it.
df['date'] = pd.to_datetime(df['date'])

# Count rows per calendar day (time-of-day is dropped by `.dt.date`).
publication_day = df['date'].dt.date
articles_per_day = df.groupby(publication_day).size()

# Line plot of the daily counts on an explicit Axes.
fig, ax = plt.subplots(figsize=(12, 4))
articles_per_day.plot(ax=ax)
ax.set_title("Articles Published Over Time")
ax.set_xlabel("Date")
ax.set_ylabel("Number of Articles")
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
# Bag-of-words counts for the headlines: English stop words removed and the
# vocabulary limited to at most 1000 terms.
vectorizer = CountVectorizer(max_features=1000, stop_words='english')
X = vectorizer.fit_transform(df['headline'])

# Five-topic LDA with a fixed seed; `fit` hands back the fitted estimator,
# so construction and fitting are chained.
lda = LatentDirichletAllocation(random_state=42, n_components=5).fit(X)

# Display topics
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic in ``model``.

    For each row of ``model.components_`` a ``Topic #k:`` header (k starts
    at 1) is printed, followed by one line with the ``no_top_words`` terms
    that carry the largest weights, in descending weight order, separated
    by single spaces.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight arrays (each must support ``argsort()``).
        feature_names: Indexable that maps a column position to its term.
        no_top_words: Number of terms to print per topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        print(f"Topic #{topic_idx+1}:")
        # Ascending argsort, reversed -> positions from largest weight down.
        ranked_positions = weights.argsort()[::-1]
        top_terms = [feature_names[pos] for pos in ranked_positions[:no_top_words]]
        print(" ".join(top_terms))

# Print the ten strongest terms of each fitted topic.
topic_vocabulary = vectorizer.get_feature_names_out()
display_topics(lda, topic_vocabulary, 10)


In [None]:
# Write the processed frame to disk without the row index. The target directory
# is created first because `to_csv` does not create missing parent directories
# and would otherwise fail when `../data/processed/` is absent.
processed_path = Path('../data/processed/cleaned_data.csv')
processed_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(processed_path, index=False)
