In [1]:
#import all libraries
import pandas as pd
import duckdb
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from xgboost import XGBRegressor
import pandas_market_calendars as mcal
from sklearn.metrics import mean_absolute_error, r2_score, accuracy_score
import numpy as np
import spacy
import time

In [2]:
# Ensure stopwords and tokenizer are available
nltk.download('stopwords')
nltk.download('punkt')
# Define stopwords
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hilun\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hilun\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
#This will extract all ticker names and save it in a list 

# Database connection
db_file_path = r"C:\Users\hilun\OneDrive\Desktop\OMS\Practitam\financial_news.db"
conn = duckdb.connect(database=db_file_path, read_only=False)    

query = """
    SELECT DISTINCT ticker FROM "Headlines"."Daily_Price_Movement";
"""

# Execute query and fetch results into a DataFrame
ticker_a = conn.execute(query).fetchdf()

# Convert to a unique ticker list
# unique_tickers = ticker_a["ticker"].tolist()
# ticker_all=unique_tickers
# ticker_file=pd.read_csv('ticker with over1000 AC.csv')
# ticker_all = ticker_file["symbol"]  # Extract column

#ticker_all=['AAPL'] #uncomment this for troubleshooting with one stock

# **Close Database Connection**
conn.close()

In [4]:
# Load NYSE trading calendar
nyse = mcal.get_calendar('NYSE')
nlp = spacy.load("en_core_web_sm")

start_time = time.time()  # Start time measurement

In [5]:
# Function to find the next trading day
def next_trading_day(date):
    date = pd.Timestamp(date)
    while len(nyse.valid_days(start_date=date.strftime('%Y-%m-%d'), end_date=date.strftime('%Y-%m-%d'))) == 0:
        date += pd.Timedelta(days=1)
    return date

In [6]:
#text tokenization including NER removal
def tokenize_text(text):
    # Process the text using spaCy
    doc = nlp(text.lower())

    # Extract named entities
    named_entities = {ent.text.lower() for ent in doc.ents}

    # Tokenize and filter words
    words = word_tokenize(text.lower())
    words = [
        word for word in words 
        if word.isalpha() and len(word) > 2 and 
        word not in stop_words and 
        word not in named_entities  # Remove named entities
    ]
    
    return words

In [7]:
#function to calculate bollinger band
def bb(price, lookback):
    std=price.rolling(window=lookback, min_periods=lookback).std()
    sma=price.rolling(window=lookback, min_periods=lookback).mean()
    bottom=sma-(2*std)
    top=sma+(2*std)
    return sma, top, bottom

#function to calculate relative strength index
def get_rsi(price, lookback):
    daily_ret=price.diff()
    up = daily_ret.clip(lower=0)
    down = -1 * daily_ret.clip(upper=0)
    sma_up = up.rolling(window = lookback).mean()
    sma_down = down.rolling(window = lookback).mean()  
    rs=sma_up/sma_down
    rsi=100-(100/(1+rs))
    return rsi        	   		  		 			  		 			     			  	 

#function to calculate stochastic osillator
def get_so(price, lookback):
    high = price.rolling(window=lookback).max()
    low = price.rolling(window=lookback).min()
    K = 100*(price-low)/(high-low)
    D = K.rolling(window=3).mean()

    return(K,D)	

In [8]:
# This the main function to process a single ticker
def process_ticker(ticker, nyse, stop_words, result_df):
    print(f"Processing {ticker}...")
    # Database connection
    # I chose to open and close a db connection inside this function to avoid issues
    # I encountered inaccurate results when I opened and close a connect outside of loop

    #*****************************************************************************
    #change this to your local path
    db_file_path = r"C:\Users\hilun\OneDrive\Desktop\OMS\Practitam\financial_news.db" 
    #*****************************************************************************
    conn = duckdb.connect(database=db_file_path, read_only=False)    
    query = """
        SELECT 
            a.mapped_trading_date AS publish_date,
            a.description, 
            a.article_pubDate,
            dpm.close_price
        FROM "Headlines"."Articles_Trading_Day" a
        INNER JOIN "Headlines"."Daily_Price_Movement" dpm
        ON a.mapped_trading_date = dpm.trading_date  
        WHERE a.ticker = ?
        AND dpm.ticker = ?;
    """
    
    news_df = conn.execute(query, [ticker, ticker]).fetchdf()
    article_count=len(news_df)
    if news_df.empty:
        print(f"Skipping {ticker}: No data found.")
        return

    news_df["publish_date"] = pd.to_datetime(news_df["publish_date"]).dt.date
    news_df["description"] = news_df["description"].fillna("")

    # Group descriptions by date
    news_df = news_df.groupby("publish_date", as_index=False).agg({
        "description": lambda x: " ".join(x),
        "close_price": "first",
    })

    # Adjust for non-trading days
    news_df["adjusted_date"] = news_df["publish_date"].apply(next_trading_day)

    # Re-group after adjusting trading days
    news_df = news_df.groupby("adjusted_date", as_index=False).agg({
        "description": lambda x: " ".join(x),
        "close_price": "last",
    })
    
    news_df["price_change_percentage"] = ((news_df["close_price"].shift(-1) - news_df["close_price"]) / news_df["close_price"]) * 100
    news_df["tokenized_words"] = news_df["description"].astype(str).apply(tokenize_text)

    # Calculate token scores
    unique_words = set(word for words_list in news_df["tokenized_words"] for word in words_list)
    w_count=len(unique_words)
    word_scores = {word: [] for word in unique_words}

    for _, row in news_df.iterrows():
        words_list = row["tokenized_words"]
        price_change = row["price_change_percentage"]
        total_words = len(words_list)

        if total_words > 0:
            word_counts = {word: words_list.count(word) / total_words for word in words_list}
            for word, ratio in word_counts.items():
                word_scores[word].append(ratio * price_change)

    tk_info = pd.DataFrame({
        "word": list(word_scores.keys()),
        "score": [np.mean(scores) if scores else 0 for scores in word_scores.values()]
    }).dropna()
    token_scores_dict = dict(zip(tk_info["word"], tk_info["score"]))
    
    def calculate_token_score(tokens):
        return sum(token_scores_dict.get(token, 0) for token in tokens)

    news_df["token_score"] = news_df["tokenized_words"].apply(calculate_token_score)

    # Ensure no missing values in price change
    news_df = news_df.dropna()
    
    # Define the SQL query
    query = """
        SELECT trading_day_date, price 
        FROM Headlines.Pricing_News
        WHERE ticker = ?
    """
    
    # Execute the query and load the data into a pandas DataFrame
    df_p=conn.execute(query,[ticker]).fetchdf()

    # Adjust for non-trading days
    df_p = df_p.rename(columns={"trading_day_date": "trading_date"})
    df_p = df_p.rename(columns={"price": "close_price"})

    df_p["adjusted_trading_date"] =df_p["trading_date"].apply(next_trading_day)
    df_p = df_p.drop_duplicates(subset=['adjusted_trading_date'], keep='last')

    # Ensure 'trading_date' is in datetime format
    df_p['trading_date'] = pd.to_datetime(df_p['trading_date'])
    news_df['adjusted_date'] = pd.to_datetime(news_df['adjusted_date'])  # Ensure news_df dates are also datetime

    if news_df['adjusted_date'].empty:
        print(f"Skipping {ticker}: No data found.")
        return
    
    # Get start_date and end_date
    start_date = news_df['adjusted_date'].iloc[0]  # First news date
    end_date = news_df['adjusted_date'].iloc[-1]   # Last news date
    
    # Compute the date range
    date_lower_bound = start_date - pd.Timedelta(days=30)
    date_upper_bound = end_date + pd.Timedelta(days=30)
    
    #print(date_lower_bound, date_upper_bound )
    
    # Filter df_p to include only trading dates within this range
    df_p = df_p[(df_p['trading_date'] >= date_lower_bound) & (df_p['trading_date'] <= date_upper_bound)]
    # Convert to a pandas Series
    price = df_p['close_price']
    
    #add BB columns: SMA, Bottom, Top
    sma, bot, top=bb(price,14)
    rsi=get_rsi(price,4)
    #Stochastic Oscillator
    K, D=get_so(price,14)
    
    df_p['bb_sma']=sma
    df_p['bb_bottom']=bot
    df_p['bb_top']=top
    df_p['rsi']=rsi
    df_p['K']=K
    df_p['D']=D

    df_p['sma_future_7'] = df_p['close_price'].rolling(window=7, min_periods=1).mean().shift(-7)
    
    df_p = df_p.reset_index(drop=True)  # Reset index and remove old index
    #handling edge cases for the last 7 days
    t_len=len(df_p)
    for i in range(6):
        future_values = df_p['close_price'].iloc[t_len-1-i:t_len]  # Get up to 6 future values
        df_p.at[t_len-i-2, 'sma_future_7'] = future_values.mean()  # Assign safely  # Compute mean of available values

    news_df = news_df.merge(
    df_p[['adjusted_trading_date', 'bb_sma', 'bb_bottom', 'bb_top', 'rsi', 'K','D', 'sma_future_7']],
    left_on='adjusted_date',
    right_on='adjusted_trading_date',
    how='left'
    )
    
    news_df = news_df.drop(columns=['adjusted_trading_date'])
    
    #calculate the trigger column
    news_df["bb_trigger"] = 0  # Default value
    news_df.loc[(news_df["close_price"].shift(1) > news_df["bb_top"].shift(1)) & (news_df["close_price"] < news_df["bb_top"]), "bb_trigger"] = -1
    news_df.loc[(news_df["close_price"].shift(1) < news_df["bb_bottom"].shift(1)) & (news_df["close_price"] > news_df["bb_bottom"]), "bb_trigger"] = 1
    # Calculate the RSI trigger column
    news_df["rsi_trigger"] = 0  # Default value
    news_df.loc[news_df["rsi"] > 70, "rsi_trigger"] = -1
    news_df.loc[news_df["rsi"] < 30, "rsi_trigger"] = 1 
    
    news_df['bb_sma']=sma
    news_df['bb_bottom']=bot
    news_df['bb_top']=top
    news_df['rsi']=rsi
    news_df['K']=K
    news_df['D']=D

    #************************************************************
    #Make sure you add this file to your local path
    #add vix column
    vix_df = pd.read_csv("vixGaTechSP25.csv")
    #************************************************************
    
    # Rename columns (assumes first column is date and second is VIX value)
    vix_df = vix_df.rename(columns={vix_df.columns[0]: "adjusted_date", vix_df.columns[1]: "vix"})
    
    # Convert both date columns to datetime
    news_df["adjusted_date"] = pd.to_datetime(news_df["adjusted_date"])
    vix_df["adjusted_date"] = pd.to_datetime(vix_df["adjusted_date"])

    # Now merge
    news_df = pd.merge(news_df, vix_df, on="adjusted_date", how="left")
   
    #select single variable
    #X_combined = news_df["token_score"].values.reshape(-1, 1)  # Use token score as feature
    
    # Select multiple feature columns
    X_combined = news_df[["token_score", "vix"]].values 
    X_combined = np.array(X_combined).reshape(-1, 2)  # 4 features   
    # X_combined = news_df[["rsi","K","D",'bb_sma','bb_bottom','bb_top']].values 
    # X_combined = np.array(X_combined).reshape(-1, 6)  # 4 features
    y = news_df["price_change_percentage"].values

    # Train-Test Split
    split_index = int(len(news_df) * 0.8)
    X_train, X_test = X_combined[:split_index], X_combined[split_index:]
    y_train, y_test = y[:split_index], y[split_index:]
    
    # Train XGBoost Model
    xgb_model = XGBRegressor(objective="reg:squarederror", n_estimators=100, learning_rate=0.1)
    xgb_model.fit(X_train, y_train)

    # Predict on test data
    y_pred = xgb_model.predict(X_test)

    # Evaluate model performance
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    #4 average groups with 6 categories
    ave = news_df["price_change_percentage"].abs().mean()
    if ave > 3:
        l = [7, 3.5, 0, -3.5, -7]
    elif ave >= 2:
        l = [4, 2, 0, -2, -4]
    elif ave >= 1:
        l = [3, 1.5, 0, -1.5, -3]
    else:
        l = [2.5, 1.25, 0, -1.25, -2.5]

    # Categorization/Classification function
    def categorize_value(x):
        if x > l[0]:
            return 0
        elif l[1] <= x <= l[0]:
            return 1
        elif l[2] <= x < l[1]:
            return 2
        elif l[3] <= x < l[2]:
            return 3
        elif l[4] <= x < l[3]:
            return 4
        else:
            return 5
        
    start_row = split_index - 1

    # Apply categorization
    news_df.loc[news_df.index > start_row, "actual category"] = news_df.loc[news_df.index > start_row, "price_change_percentage"].apply(categorize_value)
    news_df["predicted_price_change"] = np.concatenate([np.full(split_index, np.nan), y_pred])
    news_df.loc[news_df.index > start_row, "predicted category"] = news_df.loc[news_df.index > start_row, "predicted_price_change"].apply(categorize_value)

    # Compute classification accuracy
    df_filtered = news_df.iloc[split_index:].reset_index(drop=True)[["actual category", "predicted category"]]
    accuracy = accuracy_score(df_filtered["actual category"], df_filtered["predicted category"])
    
    true_p = ((news_df["actual category"] > 3) & (news_df["predicted category"] > 3)).sum()
    false_neg=((news_df["actual category"] > 3) & (news_df["predicted category"] < 4)).sum()
    false_p=((news_df["actual category"] < 4) & (news_df["predicted category"] > 3)).sum()
    true_neg=((news_df["actual category"] <4) & (news_df["predicted category"] <4)).sum()
    precision=true_p/(true_p+false_p)
    recall=true_p/(true_p+false_neg)
    f1=2*precision*recall/(precision+recall)
    # Store results
    result_df.loc[len(result_df)] = [ticker, mae, r2, accuracy, article_count,w_count, len(news_df),precision,recall, f1,true_p,false_neg,false_p,true_neg]
    
    # **Close Database Connection**
    conn.close()

In [9]:
# Define column names
columns = ["symbol", "MAE", "r-square", "all categories classification accuracy","article count","word count", "total days used for analysis",'Price drop precision','price drop recall','price drop f1 score','true positive','false negative','false positive','true negative']
result_df = pd.DataFrame(columns=columns)

# **Main Execution Loop**
for ticker in ticker_all:
    process_ticker(ticker, nyse, stop_words, result_df)

end_time = time.time()  # End time measurement
elapsed_time = end_time - start_time  # Calculate elapsed time

print(f"Execution Time: {elapsed_time:.5f} seconds")  # Print execution time

print(result_df)
# **Save Final Results**
result_df.to_csv("result_data_words_stock_score.csv", index=False)

Processing AAPL...
Execution Time: 92.62148 seconds
  symbol       MAE  r-square  all categories classification accuracy  \
0   AAPL  0.410427  0.762253                                0.759259   

   article count  word count  total days used for analysis  \
0           5547       12082                           267   

   Price drop precision  price drop recall  price drop f1 score  \
0                   1.0                1.0                  1.0   

   true positive  false negative  false positive  true negative  
0              5               0               0             49  
