In [1]:
from DataLoader import DataLoader
import pandas as pd 
import re
import spacy

In [None]:
import os 
from dotenv import load_dotenv
from pathlib import Path
# Pull settings from a local .env file (if any) into the process environment.
load_dotenv()


def _require_env(name):
    """Return the value of environment variable `name`.

    Raises:
        RuntimeError: if the variable is not defined. Without this check a
            missing variable comes back from os.getenv as None, which either
            blows up later with an opaque TypeError inside pathlib or is
            silently formatted into a path component as the text "None".
    """
    value = os.getenv(name)
    if value is None:
        raise RuntimeError(
            f"Environment variable {name!r} is not set; "
            "define it in the environment or in the .env file."
        )
    return value


# Location of the DuckDB database file: directory and file name are configured separately.
DB_PATH = Path(_require_env("DB_PATH"))
DB_FILE = _require_env("DB_FILE")
duckdb_path = DB_PATH / DB_FILE

# Destination directory and file name for the cleaned-articles CSV export.
CSV_OUTPUT_PATH = Path(_require_env("CSV_OUTPUT_PATH"))
CLEANED_ARTICLES_FILENAME = _require_env("CLEANED_ARTICLES_FILENAME")

In [None]:
# Load the spaCy "en_core_web_sm" pipeline once at module level;
# preprocess_articles() below reads this global when tokenising the articles.
nlp = spacy.load("en_core_web_sm")

def _clean_text(text):
    """Normalise one article string before tokenisation.

    Lower-cases the text, removes every character that is not a word
    character, whitespace, ``$`` or ``%`` (digits count as word characters),
    then collapses whitespace runs to a single space and trims both ends.
    """
    text = text.lower()
    # Punctuation goes first: deleting a free-standing symbol ("a - b") leaves
    # two adjacent spaces, so whitespace must be normalised afterwards.
    text = re.sub(r'[^\w\s$%0-9]', '', text)
    text = re.sub(r'\s+', ' ', text)
    # title + " " + description yields a leading/trailing space when one side is empty.
    return text.strip()


def preprocess_articles(df, nlp_model=None, batch_size=10):
    """Build cleaned text and lemmatised tokens for each article row.

    Args:
        df: DataFrame containing at least the columns ``ticker``,
            ``mapped_trading_date``, ``article_title`` and ``description``.
            It is not modified; the function works on a copy.
        nlp_model: object exposing ``pipe(texts, batch_size=...)`` that yields
            iterables of tokens with ``lemma_`` and ``is_stop`` attributes.
            Defaults to the module-level spaCy pipeline ``nlp``.
        batch_size: number of texts handed to ``nlp_model.pipe`` per batch.

    Returns:
        A new DataFrame with the four input columns plus:
        ``full_text``  - title and description joined by one space (missing
                         values treated as empty strings),
        ``clean_text`` - ``full_text`` normalised by ``_clean_text``,
        ``tokens``     - list of lemmas of the non-stop-word tokens.

    Raises:
        KeyError: if any of the four required columns is missing from ``df``.
    """
    if nlp_model is None:
        nlp_model = nlp  # shared pipeline loaded at module level

    # Keep relevant columns; .copy() so the caller's frame is left untouched.
    df = df[['ticker', 'mapped_trading_date', 'article_title', 'description']].copy()

    # Combine title and description.
    df['full_text'] = df['article_title'].fillna('') + " " + df['description'].fillna('')

    df['clean_text'] = df['full_text'].apply(_clean_text)

    # Lemmatise in a single batched pass over the pipeline. Only stop words
    # are filtered here; punctuation was already stripped by _clean_text,
    # and symbols such as $ and % are intentionally kept as tokens.
    docs = nlp_model.pipe(df['clean_text'], batch_size=batch_size)
    df['tokens'] = [[token.lemma_ for token in doc if not token.is_stop] for doc in docs]

    return df


In [None]:
# Name of the source table, passed as-is to DataLoader.load_data().
table = "headlines.Articles_Trading_Day"
# DataLoader is a project-local class; it is constructed with the DuckDB file path.
dataLoader = DataLoader(duckdb_path)
# NOTE(review): presumably returns a pandas DataFrame (preprocess_articles()
# selects columns from it by name) — confirm against DataLoader.
df = dataLoader.load_data(table)

In [5]:
# Run the cleaning + lemmatisation step on the loaded articles; the result is a
# new frame (df itself is left unchanged because preprocess_articles copies it).
df_cleaned = preprocess_articles(df)

# Preview the first rows: identifying columns plus the generated token lists.
print(df_cleaned[['ticker', 'mapped_trading_date', 'tokens']].head())


  ticker mapped_trading_date  \
0    DXC          2024-07-11   
1    DRI          2024-07-17   
2    DVA          2024-07-19   
3    DRI          2024-07-22   
4    DVA          2024-07-23   

                                              tokens  
0  [dxc, technology, report, fiscal, quarter, 202...  
1  [darden, buy, chuyâs, $, 600, m, allcash, tran...  
2  [update, davita, pay, $, 345, million, settlem...  
3  [analyst, financial, statement, darden, restau...  
4  [davita, healthcare, dva, lap, stock, market, ...  


In [None]:
# Build the destination with pathlib rather than string formatting, and make
# sure the target folder exists so the write cannot fail on a missing directory
# after the long preprocessing run.
cleaned_csv_path = Path(CSV_OUTPUT_PATH) / CLEANED_ARTICLES_FILENAME
cleaned_csv_path.parent.mkdir(parents=True, exist_ok=True)
# NOTE: the list-valued `tokens` column is written as its string representation.
df_cleaned.to_csv(cleaned_csv_path, index=False)

In [7]:
# Non-null count per column: a quick check that every column is fully populated.
df_cleaned.count()

ticker                 363820
mapped_trading_date    363820
article_title          363820
description            363820
full_text              363820
clean_text             363820
tokens                 363820
dtype: int64