In [1]:
import nltk
from nltk.tokenize import word_tokenize
from urllib.parse import quote
import feedparser
import pandas as pd
from datetime import datetime, timezone
import os
import requests
import gdelt
import time
import random
from bs4 import BeautifulSoup
from textblob import TextBlob
from sklearn.preprocessing import MinMaxScaler

here


In [2]:
# Define the download directory
download_dir = os.path.join('../nltk')

# Download the 'punkt' package to the specified directory
nltk.download('punkt', download_dir=download_dir)

# Add the download directory to the NLTK data path
nltk.data.path.append(download_dir)
# Load the Loughran-McDonald word lists
lm_dict = pd.read_csv('../nltk/Loughran-McDonald_MasterDictionary_1993-2023.csv')

# Extract sentiment word lists
negative_words = lm_dict[lm_dict['Negative'] > 0]['Word'].tolist()
negative_words = [w.lower() for w in negative_words]
positive_words = lm_dict[lm_dict['Positive'] > 0]['Word'].tolist()
postiive_words = [w.lower() for w in positive_words]
uncertainty_words = lm_dict[lm_dict['Uncertainty'] > 0]['Word'].tolist()
uncertainty_words = [w.lower() for w in uncertainty_words]
litigious_words = lm_dict[lm_dict['Litigious'] > 0]['Word'].tolist()
litigious_words = [w.lower() for w in litigious_words]
strong_modal_words = lm_dict[lm_dict['Strong_Modal'] > 0]['Word'].tolist()
strong_modal_words = [w.lower() for w in strong_modal_words]
weak_modal_words = lm_dict[lm_dict['Weak_Modal'] > 0]['Word'].tolist()
weak_modal_words = [w.lower() for w in weak_modal_words]
constraining_words = lm_dict[lm_dict['Constraining'] > 0]['Word'].tolist()
constraining_words = [w.lower() for w in constraining_words]

[nltk_data] Downloading package punkt to ../nltk...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
def get_news_sentiment_score(df):
    # Create an empty sentiment dataframe with the necessary columns
    sentiment_df = pd.DataFrame({
        'Date': df['published_date'],
        'SENTIMENT_POSITIVE': 0.0,
        'SENTIMENT_NEGATIVE': 0.0,
        'SENTIMENT_UNCERTAINTY': 0.0,
        'SENTIMENT_LITIGIOUS': 0.0,
        'SENTIMENT_STRONG_MODAL': 0.0,
        'SENTIMENT_WEAK_MODAL': 0.0,
        'SENTIMENT_CONSTRAINING': 0.0
    })

    # Check for any NaT (invalid date) values
    print(sentiment_df[sentiment_df['Date'].isna()])

    # Iterate over each title and analyze the sentiment
    for i, row in df.iterrows():
        words = row['title'].lower().split()  # Split the title into words
        for word in words:
            if word in negative_words:
                sentiment_df.loc[i, 'SENTIMENT_NEGATIVE'] += 1.0
            if word in positive_words:
                sentiment_df.loc[i, 'SENTIMENT_POSITIVE'] += 1.0
            if word in uncertainty_words:
                sentiment_df.loc[i, 'SENTIMENT_UNCERTAINTY'] += 1.0
            if word in litigious_words:
                sentiment_df.loc[i, 'SENTIMENT_LITIGIOUS'] += 1.0
            if word in strong_modal_words:
                sentiment_df.loc[i, 'SENTIMENT_STRONG_MODAL'] += 1.0
            if word in weak_modal_words:
                sentiment_df.loc[i, 'SENTIMENT_WEAK_MODAL'] += 1.0
            if word in constraining_words:
                sentiment_df.loc[i, 'SENTIMENT_CONSTRAINING'] += 1.0

    # Add the composite sentiment features
    sentiment_df["NET_SENTIMENT"] = sentiment_df["SENTIMENT_POSITIVE"] - sentiment_df["SENTIMENT_NEGATIVE"]
    sentiment_df["UNCERTAINTY_MODAL"] = sentiment_df["SENTIMENT_UNCERTAINTY"] + sentiment_df["SENTIMENT_STRONG_MODAL"] + sentiment_df["SENTIMENT_WEAK_MODAL"]
    sentiment_df["REGULATORY_PRESSURE"] = sentiment_df["SENTIMENT_LITIGIOUS"] + sentiment_df["SENTIMENT_CONSTRAINING"]

    sentiment_df = sentiment_df.drop([
        'SENTIMENT_POSITIVE',
        'SENTIMENT_NEGATIVE',
        'SENTIMENT_UNCERTAINTY',
        'SENTIMENT_LITIGIOUS',
        'SENTIMENT_STRONG_MODAL',
        'SENTIMENT_WEAK_MODAL',
        'SENTIMENT_CONSTRAINING'], axis=1)

    # Group by 'Date' and sum the sentiment scores for rows with the same Date
    aggregated_sentiment_df = sentiment_df.groupby('Date').sum().reset_index()

    # Calculate and print the percentage of zero values in each column
    zero_percentage = (aggregated_sentiment_df == 0).mean() * 100
    print("\nPercentage of zeros in each column:")
    print(zero_percentage)

    # Print the max values for each column
    max_values = aggregated_sentiment_df.max()
    print("\nMax values in each column:")
    print(max_values)

    # Print the min values for each column
    min_values = aggregated_sentiment_df.min()
    print("\nMin values in each column:")
    print(min_values)

    # List of columns to apply scaling to
    columns_to_scale = ['NET_SENTIMENT', 'UNCERTAINTY_MODAL', 'REGULATORY_PRESSURE']

    # Apply the scaling function
    scaled_sentiment_df = scale_sentiments(aggregated_sentiment_df, columns_to_scale)

    return scaled_sentiment_df


In [4]:
def get_commod_data_from_csv(file_path):
  df = pd.read_csv(file_path)
  df.rename(columns={'Settle': 'Close'}, inplace=True)
  return df

In [5]:
def scale_negative(value, min_negative):
    return 1 + ((value - min_negative) / (0 - min_negative)) * (4 - 1)  # Scaling between 1 and 4

def scale_positive(value, max_positive):
    return 6 + (value / max_positive) * (10 - 6)  # Scaling between 6 and 10

# Apply scaling
def custom_scale(value, min_negative, max_positive):
    if value < 0:
        return scale_negative(value, min_negative)
    elif value > 0:
        return scale_positive(value, max_positive)
    else:
        return 5  # Zero mapped to 5

In [6]:
# Function to scale sentiment columns
def scale_sentiments(sentiment_df, columns):
    for col in columns:
        # Get min and max values for negative and positive sentiments for each column
        min_negative = sentiment_df[sentiment_df[col] < 0][col].min()
        max_positive = sentiment_df[sentiment_df[col] > 0][col].max()
        
        # Apply the custom scaling for each column
        sentiment_df[f'scaled_{col}'] = sentiment_df[col].apply(
            lambda x: custom_scale(x, min_negative, max_positive)
        )
    return sentiment_df

In [7]:
news_folder_path = '../data/news'
news_sentiment_scores_folder_path = '../data/news_sentiments_scores'
for filename in os.listdir(news_folder_path):
    if filename.endswith('.csv'):
        news_file_path = os.path.join(f'{news_folder_path}/', filename)
        ticker_name = filename.split('.')[0]
        # Load CSV into a pandas DataFrame and parse dates
        news_df = pd.read_csv(news_file_path, parse_dates=['published_date'], dayfirst=True)
        # Check if the date parsing was successful
        if news_df['published_date'].isnull().any():
            print(f"Warning: Some dates in '{filename}' could not be parsed.")
        news_sentiment_df = get_news_sentiment_score(news_df)
        # Save the sentiment DataFrame to a CSV file
        sentiment_scores_file_path = os.path.join(f'{news_sentiment_scores_folder_path}/', f'{ticker_name}_sentiment.csv')
        news_sentiment_df.to_csv(sentiment_scores_file_path, index=False)  # Save without index
        print(f'Sentiment scores saved for {ticker_name} at {sentiment_scores_file_path}')

Empty DataFrame
Columns: [Date, SENTIMENT_POSITIVE, SENTIMENT_NEGATIVE, SENTIMENT_UNCERTAINTY, SENTIMENT_LITIGIOUS, SENTIMENT_STRONG_MODAL, SENTIMENT_WEAK_MODAL, SENTIMENT_CONSTRAINING]
Index: []

Percentage of zeros in each column:
Date                    0.000000
NET_SENTIMENT          87.224670
UNCERTAINTY_MODAL      85.903084
REGULATORY_PRESSURE    96.475771
dtype: float64

Max values in each column:
Date                   2021-06-27
NET_SENTIMENT                 0.0
UNCERTAINTY_MODAL             4.0
REGULATORY_PRESSURE           2.0
dtype: object

Min values in each column:
Date                   2014-01-03
NET_SENTIMENT                -4.0
UNCERTAINTY_MODAL             0.0
REGULATORY_PRESSURE           0.0
dtype: object
Sentiment scores saved for Cocoa at ../data/news_sentiments_scores/Cocoa_sentiment.csv
Empty DataFrame
Columns: [Date, SENTIMENT_POSITIVE, SENTIMENT_NEGATIVE, SENTIMENT_UNCERTAINTY, SENTIMENT_LITIGIOUS, SENTIMENT_STRONG_MODAL, SENTIMENT_WEAK_MODAL, SENTIMENT_CONST

In [15]:
output_folder_path = '../data/test/'

import os
import pandas as pd

# Loop through each sentiment score CSV file in the folder
for filename in os.listdir(news_sentiment_scores_folder_path):
    if filename.endswith('.csv'):
        news_sentiment_file_path = os.path.join(news_sentiment_scores_folder_path, filename)
        ticker_name = filename.split('_')[0]

        # Load news sentiment CSV
        news_sentiment_df = pd.read_csv(news_sentiment_file_path)

        # Define output file path
        output_file_path = os.path.join(output_folder_path, f'{ticker_name}.csv')

        # Check if output file exists
        if not os.path.isfile(output_file_path):
            print('File does not exist in output folder')
            break
        else:
            # Load output CSV
            output_df = pd.read_csv(output_file_path, parse_dates=['Date'], dayfirst=True)
            # Convert the 'Date' column to the desired format
            output_df['Date'] = output_df['Date'].dt.strftime('%d/%m/%Y')

        print(ticker_name)

        # Define columns to drop from output_df
        output_columns = ['scaled_NET_SENTIMENT', 'scaled_UNCERTAINTY_MODAL', 'scaled_REGULATORY_PRESSURE']

        # Check for columns to drop
        columns_to_drop = [col for col in output_columns if col in output_df.columns]
        if columns_to_drop:
            output_df.drop(columns=columns_to_drop, inplace=True)
            print(f"Dropped columns: {columns_to_drop}")
        else:
            print("No matching columns to drop.")

        # Define unscaled sentiment columns to drop from news_sentiment_df
        unscaled_news_sentiment_columns = ['NET_SENTIMENT', 'UNCERTAINTY_MODAL', 'REGULATORY_PRESSURE']
        news_sentiment_df.drop(columns=unscaled_news_sentiment_columns, inplace=True)

        # Step 1: Convert 'Date' to datetime format
        news_sentiment_df['Date'] = pd.to_datetime(news_sentiment_df['Date'], dayfirst=True, errors='coerce')

        # Step 2: Format the 'Date' back to the original format
        news_sentiment_df['Date'] = news_sentiment_df['Date'].dt.strftime('%d/%m/%Y')

        # Perform a left merge on the Date columns to ensure all rows from output_df are kept
        merged_df = pd.merge(output_df, news_sentiment_df, on='Date', how='left')

        # Fill missing sentiment values in the sentiment columns with NaN
        merged_df[['scaled_NET_SENTIMENT', 'scaled_UNCERTAINTY_MODAL', 'scaled_REGULATORY_PRESSURE']] = merged_df[['scaled_NET_SENTIMENT', 'scaled_UNCERTAINTY_MODAL', 'scaled_REGULATORY_PRESSURE']].fillna(value=5)
                
        # Save the merged DataFrame back to the output file
        merged_df.to_csv(output_file_path, index=False)

        print(f"Merged DataFrame saved successfully for {ticker_name}, with unmatched sentiment values filled with NaN.")



Cocoa
Dropped columns: ['scaled_NET_SENTIMENT', 'scaled_UNCERTAINTY_MODAL', 'scaled_REGULATORY_PRESSURE']
Merged DataFrame saved successfully for Cocoa, with unmatched sentiment values filled with NaN.
Coffee
Dropped columns: ['scaled_NET_SENTIMENT', 'scaled_UNCERTAINTY_MODAL', 'scaled_REGULATORY_PRESSURE']
Merged DataFrame saved successfully for Coffee, with unmatched sentiment values filled with NaN.
Corn
Dropped columns: ['scaled_NET_SENTIMENT', 'scaled_UNCERTAINTY_MODAL', 'scaled_REGULATORY_PRESSURE']
Merged DataFrame saved successfully for Corn, with unmatched sentiment values filled with NaN.
Cotton
Dropped columns: ['scaled_NET_SENTIMENT', 'scaled_UNCERTAINTY_MODAL', 'scaled_REGULATORY_PRESSURE']
Merged DataFrame saved successfully for Cotton, with unmatched sentiment values filled with NaN.
Soya Bean
Dropped columns: ['scaled_NET_SENTIMENT', 'scaled_UNCERTAINTY_MODAL', 'scaled_REGULATORY_PRESSURE']
Merged DataFrame saved successfully for Soya Bean, with unmatched sentiment valu