In [1]:
import pandas as pd
import duckdb
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
#from sklearn.feature_extraction.text import TfidfVectorizer
#from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
import pandas_market_calendars as mcal
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, confusion_matrix, classification_report
import numpy as np
import spacy

In [2]:
# Ensure stopwords and tokenizer are available
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hilun\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hilun\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Database connection
db_file_path = r"C:\Users\hilun\OneDrive\Desktop\OMS\Practitam\financial_news.db"
conn = duckdb.connect(database=db_file_path, read_only=False)

# Define stopwords
stop_words = set(stopwords.words('english'))

# Load ticker data
ticker_file = pd.read_csv(r"C:\Users\hilun\OneDrive\Desktop\OMS\Practitam\Index tickers\nasdaq_screener_us_tech_mid.csv")
ticker_all = ticker_file["Symbol"]  # Extract column

# Define column names
columns = ["symbol", "MAE", "r-square", "classification accuracy"]
result_df = pd.DataFrame(columns=columns)

# Load NYSE trading calendar
nyse = mcal.get_calendar('NYSE')

# Load negative words
negative_words_df = pd.read_csv(r"C:\Users\hilun\OneDrive\Desktop\OMS\Practitam\Negative_words.csv", header=None, names=["word"])
negative_words_set = set(negative_words_df["word"].str.lower())

In [4]:
# Function to find the next trading day
def next_trading_day(date):
    date = pd.Timestamp(date)
    while len(nyse.valid_days(start_date=date.strftime('%Y-%m-%d'), end_date=date.strftime('%Y-%m-%d'))) == 0:
        date += pd.Timedelta(days=1)
    return date

# Tokenization function
def tokenize_text(text):
    words = word_tokenize(text.lower())
    words = [word for word in words if word.isalpha() and len(word) > 2 and word not in stop_words]
    return words

In [5]:
# Function to process a single ticker
def process_ticker(ticker, conn, nyse, stop_words, negative_words_set, result_df):
    print(f"Processing {ticker}...")
    
    query = """
        SELECT 
            a.mapped_trading_date AS publish_date,
            a.description, 
            dpm.price_change_percentage
        FROM "Headlines"."Articles_Trading_Day" a
        INNER JOIN "Headlines"."Daily_Price_Movement" dpm
        ON a.mapped_trading_date = dpm.trading_date  
        WHERE a.ticker = ?
        AND dpm.ticker = ?;
    """
    
    news_df = conn.execute(query, [ticker, ticker]).fetchdf()

    if news_df.empty:
        print(f"Skipping {ticker}: No data found.")
        return

    news_df["publish_date"] = pd.to_datetime(news_df["publish_date"]).dt.date
    news_df["description"] = news_df["description"].fillna("")

    # Group descriptions by date
    news_df = news_df.groupby("publish_date", as_index=False).agg({
        "description": lambda x: " ".join(x),
        "price_change_percentage": "first"
    })

    # Adjust for non-trading days
    news_df["adjusted_date"] = news_df["publish_date"].apply(next_trading_day)

    # Re-group after adjusting trading days
    news_df = news_df.groupby("adjusted_date", as_index=False).agg({
        "description": lambda x: " ".join(x),
        "price_change_percentage": "first"
    })

    news_df["tokenized_words"] = news_df["description"].astype(str).apply(tokenize_text)

    # Calculate token scores
    unique_words = set(word for words_list in news_df["tokenized_words"] for word in words_list)
    word_scores = {word: [] for word in unique_words}

    for _, row in news_df.iterrows():
        words_list = row["tokenized_words"]
        price_change = row["price_change_percentage"]
        total_words = len(words_list)

        if total_words > 0:
            word_counts = {word: words_list.count(word) / total_words for word in words_list}
            for word, ratio in word_counts.items():
                word_scores[word].append(ratio * price_change)

    tk_info = pd.DataFrame({
        "word": list(word_scores.keys()),
        "score": [np.mean(scores) if scores else 0 for scores in word_scores.values()]
    }).dropna()

    token_scores_dict = dict(zip(tk_info["word"], tk_info["score"]))

    def calculate_token_score(tokens):
        return sum(token_scores_dict.get(token, 0) for token in tokens)

    news_df["token_score"] = news_df["tokenized_words"].apply(calculate_token_score)

    # Ensure no missing values in price change
    news_df = news_df.dropna()


    X_combined = news_df["token_score"].values.reshape(-1, 1)  # Use token score as feature
    y = news_df["price_change_percentage"].values

    # Train-Test Split
    split_index = int(len(news_df) * 0.8)
    X_train, X_test = X_combined[:split_index], X_combined[split_index:]
    y_train, y_test = y[:split_index], y[split_index:]

    # Train XGBoost Model
    xgb_model = XGBRegressor(objective="reg:squarederror", n_estimators=100, learning_rate=0.1)
    xgb_model.fit(X_train, y_train)

    # Predict on test data
    y_pred = xgb_model.predict(X_test)

    # Evaluate model performance
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    ave = news_df["price_change_percentage"].abs().mean()
    l = [7, 3.5, 0, -3.5, -7] if ave >= 1.5 else [5, 2.5, 0, -2.5, -5]

    # Categorization/Classification function
    def categorize_value(x):
        if x > l[0]:
            return 0
        elif l[1] <= x <= l[0]:
            return 1
        elif l[2] <= x < l[1]:
            return 2
        elif l[3] <= x < l[2]:
            return 3
        elif l[4] <= x < l[3]:
            return 4
        else:
            return 5

    start_row = split_index - 1

    # Apply categorization
    news_df.loc[news_df.index > start_row, "actual category"] = news_df.loc[news_df.index > start_row, "price_change_percentage"].apply(categorize_value)
    news_df["predicted_price_change"] = np.concatenate([np.full(split_index, np.nan), y_pred])
    news_df.loc[news_df.index > start_row, "predicted category"] = news_df.loc[news_df.index > start_row, "predicted_price_change"].apply(categorize_value)

    # Compute classification accuracy
    df_filtered = news_df.iloc[split_index:].reset_index(drop=True)[["actual category", "predicted category"]]
    accuracy = accuracy_score(df_filtered["actual category"], df_filtered["predicted category"])

    # Store results
    result_df.loc[len(result_df)] = [ticker, mae, r2, accuracy]

In [6]:
# **Main Execution Loop**
for ticker in ticker_all:
    process_ticker(ticker, conn, nyse, stop_words, negative_words_set, result_df)

# **Close Database Connection**
conn.close()

print(result_df)
# **Save Final Results**
result_df.to_csv("result_data_words_stock_score.csv", index=False)

Processing PTC...
Processing ON...
Processing MDB...
Processing SSNC...
Processing VRSN...
Processing WDC...
Processing GFS...
Processing NTAP...
Processing SMCI...
Processing ZM...
Processing MCHP...
Processing ANSS...
Processing ZS...
Processing EA...
Processing MPWR...
Processing TTWO...
Processing TTD...
Processing VRSK...
Processing CTSH...
Processing DDOG...
Processing ADSK...
Processing WDAY...
Processing CDNS...
Processing SNPS...
Processing MSTR...
Processing FTNT...
Processing MRVL...
   symbol       MAE  r-square  classification accuracy
0     PTC  0.605694  0.575707                 0.666667
1      ON  1.112728  0.395235                 0.606061
2     MDB  1.163653  0.653458                 0.458333
3    SSNC  0.398650  0.533353                 0.611111
4    VRSN  0.460197 -0.120305                 0.750000
5     WDC  2.026242 -0.794968                 0.208333
6     GFS  1.200959  0.832961                 0.571429
7    NTAP  1.021625 -0.281023                 0.760000
8    