In [None]:
# Financial News Analysis - Exploratory Data Analysis

# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Import custom analysis modules
import sys
from pathlib import Path
sys.path.append(str(Path.cwd().parent))  # Add project root to sys.path

# You can import full modules if the script contains functions
import src.text_processing as text_processing
import src.temporal_analysis as temporal_analysis
import src.market_analysis as market_analysis

# Display configuration: never truncate columns, cap printed rows at 50.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)

# Plot configuration: apply the seaborn theme, then override the figure
# size and resolution on top of it.
sns.set_theme()
plt.rcParams.update({'figure.figsize': [12, 8], 'figure.dpi': 100})

# Read the raw analyst-ratings CSV into the notebook-wide frame `df`.
print("Loading data...")
df = pd.read_csv('../data/raw/raw_analyst_ratings.csv')

# Overall shape of the dataset.
n_records, n_columns = df.shape
print("\nDataset Overview:", "-" * 50, sep="\n")
print(f"Number of records: {n_records:,}")
print(f"Number of columns: {n_columns}")
print("\nColumns:", list(df.columns))

# Per-column dtype plus absolute and relative missing-value counts.
print("\nData Types and Missing Values:", "-" * 50, sep="\n")
missing_counts = df.isna().sum()
info_df = pd.DataFrame({
    'dtype': df.dtypes,
    'missing': missing_counts,
    'missing_pct': (missing_counts / len(df) * 100).round(2),
})
print(info_df)

# Peek at the first rows.
print("\nSample Data:", "-" * 50, sep="\n")
print(df.head())


In [None]:

# NOTE(review): this cell repeats the dataset-overview output of the loading
# cell (minus its heading); it relies on `df` already being loaded.
n_records, n_columns = df.shape
print("-" * 50)
print(f"Number of records: {n_records:,}")
print(f"Number of columns: {n_columns}")
print("\nColumns:", list(df.columns))

# Per-column dtype plus absolute and relative missing-value counts.
print("\nData Types and Missing Values:", "-" * 50, sep="\n")
missing_counts = df.isna().sum()
info_df = pd.DataFrame({
    'dtype': df.dtypes,
    'missing': missing_counts,
    'missing_pct': (missing_counts / len(df) * 100).round(2),
})
print(info_df)

# Peek at the first rows.
print("\nSample Data:", "-" * 50, sep="\n")
print(df.head())


In [None]:
# 1. Descriptive Statistics - Headlines
print("Headline Analysis", "-" * 50, sep="\n")

# Size features per headline: character count and whitespace-delimited
# word count. Both are stored as new columns on `df`.
df['headline_length'] = df['headline'].str.len()
df['word_count'] = df['headline'].str.split().str.len()

size_columns = ['headline_length', 'word_count']
print("\nHeadline Statistics:")
print(df[size_columns].describe())

# Side-by-side histograms of the two size features.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

sns.histplot(data=df, x='headline_length', bins=50, ax=ax1)
ax1.set(title='Distribution of Headline Lengths',
        xlabel='Number of Characters',
        ylabel='Count')

sns.histplot(data=df, x='word_count', bins=30, ax=ax2)
ax2.set(title='Distribution of Words per Headline',
        xlabel='Number of Words',
        ylabel='Count')

plt.tight_layout()
plt.show()

# The three shortest and three longest headlines, by character count.
example_columns = ['headline'] + size_columns
print("\nExample Headlines:", "-" * 50, sep="\n")
print("\nShortest Headlines:")
print(df.nsmallest(3, 'headline_length')[example_columns])
print("\nLongest Headlines:")
print(df.nlargest(3, 'headline_length')[example_columns])


In [None]:
# 2. Publisher Analysis
print("Publisher Analysis", "-" * 50, sep="\n")

# Article count per publisher and each publisher's share of all rows (in %).
publisher_stats = df['publisher'].value_counts()
publisher_pct = (publisher_stats / len(df) * 100).round(2)
publisher_df = pd.DataFrame({'Article_Count': publisher_stats, 'Percentage': publisher_pct})

print("\nTop 10 Publishers:")
print(publisher_df.head(10))

# Bar chart of the 15 most frequent publishers.
plt.figure(figsize=(15, 6))
publisher_stats.head(15).plot.bar()
plt.title('Top 15 Publishers by Number of Articles')
plt.xlabel('Publisher')
plt.ylabel('Number of Articles')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# Summary of how articles are spread across publishers.
print("\nPublisher Statistics:", "-" * 50, sep="\n")
print(f"Total unique publishers: {len(publisher_stats):,}")
print(f"Average articles per publisher: {publisher_stats.mean():.2f}")
print(f"Median articles per publisher: {publisher_stats.median():.2f}")

# Concentration: combined share of the ten largest publishers.
top_10_pct = publisher_pct.iloc[:10].sum()
print(f"\nTop 10 publishers account for {top_10_pct:.2f}% of all articles")

# Publisher names that contain '@' (missing names count as "no").
email_publishers = df['publisher'].str.contains('@', na=False)
if email_publishers.any():
    print("\nPublishers using email addresses:")
    print(df.loc[email_publishers, 'publisher'].unique())


In [None]:
# Text Analysis: Keyword Frequency from Headlines

print("Text Analysis - Keyword Frequency")
print("-" * 50)

import nltk
from nltk.corpus import stopwords
from collections import Counter
import string

# Ensure the NLTK resources are available. Each resource is downloaded only
# when it cannot be found locally, so re-running the cell with the data
# already installed makes no download attempt.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Define stop words and punctuation
stop_words = set(stopwords.words('english'))
punctuation = set(string.punctuation)

# Tokenize and clean headlines
def clean_and_tokenize(text, stopword_set=None):
    """Lower-case a headline and return its informative word tokens.

    The text is split on whitespace. Leading and trailing punctuation is
    stripped from every token, so "rally," is counted as "rally" rather than
    being thrown away. A token is kept only if what remains is purely
    alphanumeric and is not a stop word.

    Args:
        text: Headline string to tokenize.
        stopword_set: Optional collection of lower-case words to exclude.
            When omitted, the notebook-level `stop_words` set is used.

    Returns:
        List of cleaned tokens in their original order.
    """
    excluded = stop_words if stopword_set is None else stopword_set
    tokens = []
    for raw_token in text.lower().split():
        # Strip surrounding punctuation only; tokens with punctuation inside
        # (e.g. "fed's", "u.s") still fail the isalnum() check below.
        word = raw_token.strip(string.punctuation)
        if word.isalnum() and word not in excluded:
            tokens.append(word)
    return tokens

# Tokenize every headline (non-string values are stringified first) and keep
# the per-row token lists on `df`.
df['tokens'] = df['headline'].astype(str).apply(clean_and_tokenize)

# Concatenate the per-headline token lists into one flat list.
all_tokens = []
for headline_tokens in df['tokens']:
    all_tokens.extend(headline_tokens)

# Frequency table of the 20 most common tokens.
word_freq = Counter(all_tokens)
top_keywords = pd.DataFrame(
    word_freq.most_common(20),
    columns=['Keyword', 'Frequency'],
)

# Bar chart of the top keywords.
plt.figure(figsize=(12, 6))
sns.barplot(data=top_keywords, x='Keyword', y='Frequency')
plt.title('Top 20 Keywords in Headlines')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# Tabular view of the same keywords.
print("\nMost Common Keywords:")
display(top_keywords)


In [None]:
# Publisher Analysis - Frequency Count and Domain Analysis

print("Publisher Analysis - Frequency and Email Domains", "-" * 50, sep="\n")

# Article count per publisher; value_counts() orders by count, largest first.
publisher_counts = df['publisher'].value_counts()
print("\nTop 10 Publishers by Article Count:")
display(publisher_counts.iloc[:10])

# Extract domain from email-like publishers
def extract_domain(publisher):
    """Return the lower-cased email domain of a publisher name, if any.

    Args:
        publisher: Publisher value from the dataset. Non-string values such
            as NaN or None (missing publishers) are accepted.

    Returns:
        The text after the last '@', stripped of surrounding whitespace and
        lower-cased, or None when the value is not a string, contains no
        '@', or has nothing after the '@'.
    """
    # Missing publishers arrive as float NaN / None; `'@' in value` would
    # raise TypeError on them.
    if not isinstance(publisher, str) or '@' not in publisher:
        return None
    domain = publisher.rsplit('@', 1)[-1].strip().lower()
    # A trailing '@' leaves an empty string, which is not a usable domain.
    return domain or None

# Derive the email domain (or None) for every publisher.
df['publisher_domain'] = df['publisher'].apply(extract_domain)

# Frequency of each domain, ignoring publishers without one.
domain_counts = df['publisher_domain'].dropna().value_counts()
top_domains = domain_counts.head(10)

print("\nTop Email Domains (from publishers):")
display(top_domains)

# Optional: bar chart of the ten most frequent domains.
plt.figure(figsize=(10, 5))
domain_ax = sns.barplot(x=top_domains.index, y=top_domains.values)
domain_ax.set(title='Top 10 Publisher Email Domains', ylabel='Frequency', xlabel='Domain')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# 3. Temporal Analysis
# Parses the `date` column, drops rows whose date cannot be parsed, derives
# calendar features on `df`, and builds the count series used by the plots
# that follow in this cell.
print("Temporal Analysis")
print("-" * 50)

# Parse dates robustly
# errors='coerce' converts values that cannot be parsed into NaT instead of
# raising. The column is overwritten in place, so the original strings are
# no longer available after this line.
# NOTE(review): if the column mixes formats or UTC offsets, the parsed dtype
# and the number of NaT values depend on the pandas version - confirm
# against the data.
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Check for unparseable dates
# This count also includes values that were already missing before parsing.
num_invalid = df['date'].isna().sum()
if num_invalid > 0:
    print(f"⚠️ Warning: {num_invalid} unparseable date values found and set as NaT.")

# Drop rows with invalid dates if necessary
# Rebinds `df`: every later use of `df` sees only rows with a valid date.
df = df.dropna(subset=['date'])

# Extract temporal components
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df['hour'] = df['date'].dt.hour
df['day_of_week'] = df['date'].dt.day_name()
# Weekend flag derived from the day name computed on the previous line.
df['is_weekend'] = df['day_of_week'].isin(['Saturday', 'Sunday'])

# Calculate various time-based aggregations
# Each series maps a time bucket to its row count:
#   daily_counts   - keyed by calendar date
#   hourly_counts  - keyed by hour of day
#   dow_counts     - keyed by day name (reordered Monday..Sunday before plotting)
#   monthly_counts - keyed by (year, month)
daily_counts = df.groupby(df['date'].dt.date).size()
hourly_counts = df.groupby('hour').size()
dow_counts = df.groupby('day_of_week').size()
monthly_counts = df.groupby([df['year'], df['month']]).size()

# Print temporal statistics
print("\nTemporal Coverage:")
print(f"Date range: {df['date'].min()} to {df['date'].max()}")
# Counts only dates that have at least one article, not the full calendar span.
print(f"Total days covered: {len(daily_counts)}")
print(f"Average daily articles: {daily_counts.mean():.2f}")
print(f"Peak hour for publications: {hourly_counts.idxmax()}:00")
# The mean of a boolean column is the fraction of True values.
print(f"Weekend publication ratio: {df['is_weekend'].mean():.2%}")

# 2x2 grid: daily volume, hourly pattern, weekday pattern, monthly trend.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))

# Top-left: articles per calendar day.
daily_counts.plot(ax=ax1)
ax1.set(title='Daily News Volume', xlabel='Date', ylabel='Number of Articles')

# Top-right: articles per hour of day.
hourly_counts.plot.bar(ax=ax2)
ax2.set(title='Articles by Hour of Day',
        xlabel='Hour (24-hour format)',
        ylabel='Number of Articles')

# Bottom-left: articles per weekday, forced into calendar order
# (groupby sorts day names alphabetically).
dow_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
dow_counts = dow_counts.reindex(dow_order)
dow_counts.plot.bar(ax=ax3)
ax3.set(title='Articles by Day of Week',
        xlabel='Day of Week',
        ylabel='Number of Articles')
ax3.tick_params(axis='x', rotation=45)

# Bottom-right: articles per (year, month).
monthly_counts.plot(ax=ax4)
ax4.set(title='Monthly Article Volume',
        xlabel='Year-Month',
        ylabel='Number of Articles')

plt.tight_layout()
plt.show()

# The five calendar days with the most articles.
print("\nTop 5 Days by Article Volume:")
print(daily_counts.nlargest(5))


In [189]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import sys
from pathlib import Path

# Add src to path
src_path = str(Path.cwd().parent / 'src')
if src_path not in sys.path:
    sys.path.append(src_path)

# Import our utility modules
import text_processing
import temporal_analysis
import market_analysis

# Set plotting style
# Order matters: style.use('default') resets matplotlib's rcParams, so it
# must run first. Called after sns.set_theme() it would undo the seaborn
# theme that was just applied.
# NOTE(review): the reset also discards any figure size / dpi overrides set
# in an earlier cell.
plt.style.use('default')
sns.set_theme()

# Display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)


In [190]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Set plotting style
# The matplotlib default style is the base, so it is applied first; applying
# it after sns.set_theme() would reset the rcParams seaborn had just set.
plt.style.use('default')  # Use default matplotlib style as base
sns.set_theme()  # This sets both the style and palette

# Display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)


In [None]:
# (Re)load the raw CSV. This rebinds `df`, discarding any columns that
# earlier cells added to the previous frame.
df = pd.read_csv('../data/raw/raw_analyst_ratings.csv')

# Shape, column names, dtypes and missing-value counts.
print("Dataset Shape:", df.shape)
print("\nColumns:", list(df.columns))
print("\nData Types:\n", df.dtypes)
missing_per_column = df.isna().sum()
print("\nMissing Values:\n", missing_per_column)

# Rich display of the first rows.
print("\nFirst few rows of the dataset:")
display(df.head())


In [None]:
# Size features per headline: character count and whitespace-delimited
# word count, stored as new columns on `df`.
df['headline_length'] = df['headline'].str.len()
df['word_count'] = df['headline'].str.split().str.len()

size_columns = ['headline_length', 'word_count']
print("Headline Statistics:")
print(df[size_columns].describe())

# Side-by-side histograms of the two size features.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

sns.histplot(data=df, x='headline_length', bins=50, ax=ax1)
ax1.set(title='Distribution of Headline Lengths',
        xlabel='Number of Characters',
        ylabel='Count')

sns.histplot(data=df, x='word_count', bins=30, ax=ax2)
ax2.set(title='Distribution of Word Counts in Headlines',
        xlabel='Number of Words',
        ylabel='Count')

plt.tight_layout()
plt.show()

# The three shortest and three longest headlines, by character count.
example_columns = ['headline'] + size_columns
print("\nExample Headlines:")
print("\nShortest Headlines:")
display(df.nsmallest(3, 'headline_length')[example_columns])
print("\nLongest Headlines:")
display(df.nlargest(3, 'headline_length')[example_columns])


In [None]:
# Import required libraries for text processing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter

# Download all required NLTK data
# Each package is fetched only when it is not already installed locally, so
# re-running the cell with the data present makes no download attempt.
print("Downloading required NLTK data...")
for nltk_resource_path, nltk_package in (
    ('tokenizers/punkt', 'punkt'),
    ('corpora/stopwords', 'stopwords'),
    ('taggers/averaged_perceptron_tagger', 'averaged_perceptron_tagger'),
):
    try:
        nltk.data.find(nltk_resource_path)
    except LookupError:
        nltk.download(nltk_package)

import string

# Cache for the English stop-word set, so it is built once rather than once
# per headline.
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them on first use."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


# Function to process text
def process_text(text, stop_words=None):
    """Convert a headline into a list of cleaned, lower-case word tokens.

    The value is stringified, lower-cased and split on whitespace. Leading
    and trailing punctuation is stripped from each token, so "jump!" is
    counted as "jump" rather than discarded. A token is kept only if what
    remains is purely alphanumeric and is not a stop word.

    Args:
        text: Headline to process; any value is accepted and passed through
            str() first.
        stop_words: Optional collection of lower-case words to exclude.
            When omitted, the NLTK English stop words are used (loaded once
            and cached).

    Returns:
        List of tokens in their original order. On any error the message is
        printed and an empty list is returned (best-effort, never raises).
    """
    try:
        excluded = _english_stop_words() if stop_words is None else stop_words
        tokens = []
        # Simple whitespace split instead of word_tokenize, for robustness.
        for raw_token in str(text).lower().split():
            word = raw_token.strip(string.punctuation)
            if word.isalnum() and word not in excluded:
                tokens.append(word)
        return tokens
    except Exception as e:
        print(f"Error processing text: {e}")
        return []

# Tokenize every headline and pool the tokens into one flat list.
all_tokens = [
    token
    for headline in df['headline']
    for token in process_text(headline)
]

# Frequency table of the 20 most common tokens.
word_freq = Counter(all_tokens)
most_common_words = pd.DataFrame(
    word_freq.most_common(20),
    columns=['Word', 'Frequency'],
)

# Bar chart of the most common words.
plt.figure(figsize=(12, 6))
sns.barplot(data=most_common_words, x='Word', y='Frequency')
plt.title('20 Most Common Words in Headlines')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# Tabular view of the same words.
print("\nMost Common Words:")
display(most_common_words)


In [None]:
# Let's examine our date data
print("Data type of date column:", df['date'].dtype)
print("\nSample dates from the dataset:")
print(df['date'].head())
print("\nUnique date formats (first 5):")
print(df['date'].drop_duplicates().head())

# Strict parse first; fall back to coercion only when the strict parse fails.
# NOTE(review): if the column mixes formats or UTC offsets, the resulting
# dtype and number of NaT values depend on the pandas version - confirm
# against the data.
try:
    parsed_dates = pd.to_datetime(df['date'])
except Exception as parse_error:
    print("\nError in default parsing:", str(parse_error))
    # A failure of this second attempt is deliberately not caught: carrying
    # on with an unparsed column would only crash later on the `.dt`
    # accessor with a less helpful message.
    parsed_dates = pd.to_datetime(df['date'], errors='coerce')
    print("\nConverted dates with 'coerce' option:")
    print(parsed_dates.head())

    # Check for any NaT (Not a Time) values
    nat_count = parsed_dates.isna().sum()
    if nat_count > 0:
        print(f"\nWarning: {nat_count} dates could not be parsed")

# Overwrite the column only once a parse has succeeded.
df['date'] = parsed_dates
print("\nFinal data type of date column:", df['date'].dtype)

# Extract various time components
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day_of_week'] = df['date'].dt.day_name()
df['hour'] = df['date'].dt.hour

# Create subplots for different temporal patterns
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))

# 1. Articles by month (calendar month number, pooled across years)
monthly_counts = df['month'].value_counts().sort_index()
sns.barplot(x=monthly_counts.index, y=monthly_counts.values, ax=ax1)
ax1.set_title('Articles by Month')
ax1.set_xlabel('Month')
ax1.set_ylabel('Number of Articles')

# 2. Articles by day of week
# Note: in this cell `daily_counts` holds per-weekday totals, reordered
# Monday..Sunday; per-date totals are in `daily_articles` below.
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
daily_counts = df['day_of_week'].value_counts()
daily_counts = daily_counts.reindex(day_order)
sns.barplot(x=daily_counts.index, y=daily_counts.values, ax=ax2)
ax2.set_title('Articles by Day of Week')
# Rotate via tick_params rather than re-assigning the tick labels: it only
# changes the rotation and leaves the labels themselves untouched.
ax2.tick_params(axis='x', rotation=45)
ax2.set_ylabel('Number of Articles')

# 3. Articles by hour
hourly_counts = df['hour'].value_counts().sort_index()
sns.barplot(x=hourly_counts.index, y=hourly_counts.values, ax=ax3)
ax3.set_title('Articles by Hour of Day')
ax3.set_xlabel('Hour')
ax3.set_ylabel('Number of Articles')

# 4. Articles over time (daily)
daily_articles = df.groupby(df['date'].dt.date).size()
daily_articles.plot(ax=ax4)
ax4.set_title('Articles Over Time')
ax4.set_xlabel('Date')
ax4.set_ylabel('Number of Articles')

plt.tight_layout()
plt.show()

# Display summary statistics
print("\nTemporal Distribution Summary:")
print("\nTop 5 Most Active Days:")
display(daily_articles.nlargest(5))

# Means of the three count series built above: per calendar date, per
# weekday bucket, and per hour-of-day bucket.
print("\nAverage Articles per:")
print(f"Day: {daily_articles.mean():.2f}")
print(f"Week Day: {daily_counts.mean():.2f}")
print(f"Hour: {hourly_counts.mean():.2f}")


In [None]:
# Article count per publisher, largest first.
publisher_counts = df['publisher'].value_counts()

# Headline figures about the publisher distribution.
print("Publisher Statistics:")
print(f"Total number of unique publishers: {len(publisher_counts)}")
print(f"Average articles per publisher: {publisher_counts.mean():.2f}")
print(f"Median articles per publisher: {publisher_counts.median():.2f}")

# The 15 most frequent publishers, reused by the chart and the table below.
top_publishers = publisher_counts.head(15)

plt.figure(figsize=(12, 6))
top_publishers.plot.bar()
plt.title('Top 15 Publishers by Number of Articles')
plt.xlabel('Publisher')
plt.ylabel('Number of Articles')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# Table of the same publishers with their share of all rows in `df` (in %).
print("\nTop 15 Publishers:")
top_publisher_table = pd.DataFrame({
    'Publisher': top_publishers.index,
    'Number of Articles': top_publishers.values,
    'Percentage of Total': (top_publishers.values / len(df) * 100).round(2)
})
display(top_publisher_table)


In [None]:
# Analyze stock symbols
# Article count per stock symbol, largest first.
stock_counts = df['stock'].value_counts()

# Display basic stock statistics
print("Stock Statistics:")
print(f"Total number of unique stocks: {len(stock_counts)}")
print(f"Average articles per stock: {stock_counts.mean():.2f}")
print(f"Median articles per stock: {stock_counts.median():.2f}")

# Plot top stocks
plt.figure(figsize=(12, 6))
stock_counts.head(15).plot(kind='bar')
plt.title('Top 15 Most Mentioned Stocks')
plt.xlabel('Stock Symbol')
plt.ylabel('Number of Articles')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# Display top stocks and their article counts
print("\nTop 15 Most Covered Stocks:")
display(pd.DataFrame({
    'Stock Symbol': stock_counts.head(15).index,
    'Number of Articles': stock_counts.head(15).values,
    'Percentage of Total': (stock_counts.head(15).values / len(df) * 100).round(2)
}))

# Analyze coverage distribution
# value_counts() of the per-stock article counts is indexed by "number of
# articles", and its values are "how many stocks have exactly that many".
# The columns are labelled accordingly: 'Stocks' holds the values and
# 'Articles' holds the index.
stocks_per_article_count = stock_counts.value_counts().sort_index()
coverage_stats = pd.DataFrame({
    'Stocks': stocks_per_article_count
})
coverage_stats['Articles'] = coverage_stats.index
# Running total of stocks with at most this many articles, and its share of
# all stocks (in %).
coverage_stats['Cumulative Stocks'] = coverage_stats['Stocks'].cumsum()
coverage_stats['Percentage of Stocks'] = (coverage_stats['Cumulative Stocks'] / len(stock_counts) * 100).round(2)

print("\nCoverage Distribution:")
print("Number of stocks with:")
# For each threshold, count the stocks with at least that many articles.
for articles in [1, 5, 10, 50, 100]:
    stocks_above = len(stock_counts[stock_counts >= articles])
    percentage = (stocks_above / len(stock_counts) * 100)
    print(f"{articles}+ articles: {stocks_above} stocks ({percentage:.2f}%)")
