In [32]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import os
import sys

In [33]:
print("Current working directory:", os.getcwd())


Current working directory: c:\Users\mintesinot\financial-news-analysis\notebooks


In [2]:
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

True

In [25]:
def load_data(file_path=r'C:\Users\mintesinot\financial-news-analysis\data\raw_analyst_ratings.csv'):

    """Load the analyst rating dataset from a CSV file.
    
    Args:
        file_path (str): Path to the CSV file (default: 'data/raw_analyst_rating.csv').
    
    Returns:
        pd.DataFrame: Loaded DataFrame.
    
    Raises:
        FileNotFoundError: If the file or directory is not found.
        ValueError: If data loading fails.
    """
    data_dir = os.path.dirname(file_path)
    if not os.path.exists(data_dir):
        raise FileNotFoundError(f"Data directory '{data_dir}' not found. Please create it and place 'raw_analyst_rating.csv' there.")
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File '{file_path}' not found in '{data_dir}'. Please check the path.")
    
    try:
        df = pd.read_csv(file_path)
        expected_columns = ['headline', 'publisher', 'date', 'stock']
        missing_columns = [col for col in expected_columns if col not in df.columns]
        if missing_columns:
            print(f"Warning: Missing expected columns {missing_columns}. Available columns: {list(df.columns)}")
        return df
    except Exception as e:
        raise ValueError(f"Error loading data from '{file_path}': {str(e)}")

In [27]:
def descriptive_stats(df):
    """Compute descriptive statistics for headline lengths and article counts.
    
    Args:
        df (pd.DataFrame): Input DataFrame with 'headline' and 'publisher' columns.
    
    Returns:
        tuple: Descriptive statistics of headline lengths and publisher article counts.
    """
    if 'headline' not in df.columns:
        raise ValueError(f"Column 'headline' not found in dataset. Available columns: {list(df.columns)}")
    
    # Compute headline length, handling non-string values
    df['headline_length'] = df['headline'].apply(lambda x: len(str(x)))
    stats = df['headline_length'].describe()
    print("Headline Length Statistics:")
    print(stats)
    
    # Count articles per publisher
    publisher_counts = None
    if 'publisher' in df.columns:
        publisher_counts = df['publisher'].value_counts()
        print("\nArticles per Publisher (Top 10):")
        print(publisher_counts.head(10))
    else:
        print("Warning: 'publisher' column not found. Skipping publisher counts.")
    
    return stats, publisher_counts

In [28]:

def time_series_analysis(df):
    """Analyze publication frequency over time.
    
    Args:
        df (pd.DataFrame): Input DataFrame with 'date' column.
    
    Returns:
        pd.Series: Daily article counts.
    """
    if 'date' not in df.columns:
        raise ValueError(f"Column 'date' not found in dataset. Available columns: {list(df.columns)}")
    
    try:
        df['date'] = pd.to_datetime(df['date'], utc=True, errors='coerce')
        if df['date'].isna().all():
            raise ValueError("All 'date' values are invalid or could not be parsed.")
        df['date_only'] = df['date'].dt.date
    except Exception as e:
        raise ValueError(f"Error converting 'date' column to datetime: {str(e)}")
    
    daily_counts = df.groupby('date_only').size()
    
    plt.figure(figsize=(12, 6))
    daily_counts.plot(color='#1f77b4')
    plt.title('Article Publication Frequency Over Time')
    plt.xlabel('Date')
    plt.ylabel('Number of Articles')
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.savefig('outputs/publication_frequency.png', dpi=300)
    plt.close()
    
    return daily_counts

In [29]:
def text_analysis(df):
    """Perform text analysis on headlines to identify common words.
    
    Args:
        df (pd.DataFrame): Input DataFrame with 'headline' column.
    
    Returns:
        list: Top 20 most common words and their counts.
    """
    if 'headline' not in df.columns:
        raise ValueError(f"Column 'headline' not found in dataset. Available columns: {list(df.columns)}")
    
    stop_words = set(stopwords.words('english'))
    all_words = []
    
    for headline in df['headline']:
        if not isinstance(headline, str):
            continue
        words = word_tokenize(str(headline).lower())
        words = [w for w in words if w.isalpha() and w not in stop_words]
        all_words.extend(words)
    
    word_freq = Counter(all_words)
    common_words = word_freq.most_common(20)
    
    plt.figure(figsize=(12, 6))
    sns.barplot(x=[count for word, count in common_words], 
                y=[word for word, count in common_words], 
                hue=[word for word, count in common_words], 
                palette='viridis', 
                legend=False)
    plt.title('Top 20 Common Words in Headlines')
    plt.xlabel('Frequency')
    plt.ylabel('Word')
    plt.savefig('outputs/common_words.png', dpi=300)
    plt.close()
    
    return common_words

In [30]:
def publisher_domain_analysis(df):
    """Analyze unique domains from publisher emails.
    
    Args:
        df (pd.DataFrame): Input DataFrame with 'publisher' column.
    
    Returns:
        pd.Series: Counts of articles per publisher domain.
    """
    if 'publisher' not in df.columns:
        print(f"Warning: 'publisher' column not found. Skipping domain analysis. Available columns: {list(df.columns)}")
        return None
    
    df['domain'] = df['publisher'].str.extract(r'@([\w\.-]+)')
    domain_counts = df['domain'].value_counts()
    
    if domain_counts.empty:
        print("No valid domains extracted from publisher column.")
        return domain_counts
    
    plt.figure(figsize=(12, 6))
    domain_counts.head(10).plot(kind='bar', color='#2ca02c')
    plt.title('Top 10 Publisher Domains')
    plt.xlabel('Domain')
    plt.ylabel('Number of Articles')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig('outputs/publisher_domains.png', dpi=300)
    plt.close()
    
    return domain_counts

In [31]:
if __name__ == "__main__":
    # Create outputs directory if it doesn't exist
    os.makedirs('outputs', exist_ok=True)
    
    try:
        df = load_data()
        stats, publisher_counts = descriptive_stats(df)
        daily_counts = time_series_analysis(df)
        common_words = text_analysis(df)
        domain_counts = publisher_domain_analysis(df)
        print("\nEDA Completed. Visualizations saved as PNG files in the 'outputs' directory.")
    except Exception as e:
        print(f"Error during EDA: {str(e)}")

Headline Length Statistics:
count    1.407328e+06
mean     7.312051e+01
std      4.073531e+01
min      3.000000e+00
25%      4.700000e+01
50%      6.400000e+01
75%      8.700000e+01
max      5.120000e+02
Name: headline_length, dtype: float64

Articles per Publisher (Top 10):
publisher
Paul Quintaro        228373
Lisa Levin           186979
Benzinga Newsdesk    150484
Charles Gross         96732
Monica Gerson         82380
Eddie Staley          57254
Hal Lindon            49047
ETF Professor         28489
Juan Lopez            28438
Benzinga Staff        28114
Name: count, dtype: int64

EDA Completed. Visualizations saved as PNG files in the 'outputs' directory.


In [23]:
import os
print(os.path.exists(r'C:\Users\mintesinot\financial-news-analysis\data\raw_analyst_rating.csv'))


False
