# Processing Text Data for Logistic Regression with Traditional Methods

In [3]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [5]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [6]:
nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/nikitapiko/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nikitapiko/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nikitapiko/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nikitapiko/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [29]:
stop_words = set(stopwords.words('english'))

In [30]:
custom_stop_words = {"would", "could", "should", "might", "also", "today", "tomorrow", "week", "year"}  # Add more if necessary
stop_words.update(custom_stop_words)

In [31]:
# Financial terms and symbol replacement
symbol_map = {
    '%': 'percent',
    '$': 'dollars',
    '€': 'euros',
    '£': 'pounds',
    '¥': 'yen',
    '&': 'and',
    '#': 'number',
    'Q1': 'quarter 1',
    'Q2': 'quarter 2',
    'Q3': 'quarter 3',
    'Q4': 'quarter 4',
    'H1': 'half 1',
    'H2': 'half 2',
    'EPS': 'earnings per share',
    'EBITDA': 'earnings before interest taxes depreciation and amortization',
    'YoY': 'year over year',
    'QoQ': 'quarter over quarter',
    'up': 'increase',
    'down': 'decrease',
    'bullish': 'positive',
    'bearish': 'negative',
    'rally': 'increase',
    'plunge': 'decrease',
    'soar': 'increase',
    'crash': 'decrease',
    'rebound': 'recovery',
    'dip': 'decrease',
    'surge': 'increase'
}

In [35]:
# Important financial verbs and adjectives to retain
important_terms = {'gain', 'rise', 'increase', 'fall', 'drop', 'decrease', 'improve', 'decline',
                   'bullish', 'bearish', 'soar', 'plunge', 'rally', 'rebound', 'crash', 'dip'}

In [44]:
# Example text
text = "The company's stock price rose by 5% today."


def preprocess_text(text):
    

    # symbols handling 
    for symbol, replacement in symbol_map.items():
        text = text.replace(symbol, replacement)

    # lowercase
    text = text.lower()

    # remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # print(text)

    # Tokenization
    tokens = word_tokenize(text)

    # print(tokens)

    # Stop Words Removal
    filtered_tokens = [word for word in tokens if word not in stop_words]

    # print(filtered_tokens)

    # Stemming (or use Lemmatization)
    ps = PorterStemmer()
    stemmed_tokens = [ps.stem(word) for word in filtered_tokens]

    # print(stemmed_tokens)

    # Lemmatization (use this over stemming for better handling of financial terms)
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word, 'v') if word in important_terms else lemmatizer.lemmatize(word) for word in filtered_tokens]

    # Combine tokens back into a single string
    preprocessed_text = ' '.join(lemmatized_tokens)

    # preprocessed_text = ' '.join(stemmed_tokens)  # or for stemmed tokens
    # print(preprocessed_text)
    
    
    return preprocessed_text

In [37]:
preprocess_text(text)

the companys stock price rose by 5percent today
['the', 'companys', 'stock', 'price', 'rose', 'by', '5percent', 'today']
['companys', 'stock', 'price', 'rose', '5percent']
['compani', 'stock', 'price', 'rose', '5percent']
company stock price rose 5percent


'company stock price rose 5percent'

In [39]:
preprocess_text("$1000 and 5% up")

dollars1000 and 5percent increase
['dollars1000', 'and', '5percent', 'increase']
['dollars1000', '5percent', 'increase']
['dollars1000', '5percent', 'increas']
dollars1000 5percent increase


'dollars1000 5percent increase'

In [38]:
example_text = "Q1 earnings up 5% with EBITDA increasing, stock price bullish."
preprocess_text(example_text)

quarter 1 earnings increase 5percent with earnings before interest taxes depreciation and amortization increasing stock price positive
['quarter', '1', 'earnings', 'increase', '5percent', 'with', 'earnings', 'before', 'interest', 'taxes', 'depreciation', 'and', 'amortization', 'increasing', 'stock', 'price', 'positive']
['quarter', '1', 'earnings', 'increase', '5percent', 'earnings', 'interest', 'taxes', 'depreciation', 'amortization', 'increasing', 'stock', 'price', 'positive']
['quarter', '1', 'earn', 'increas', '5percent', 'earn', 'interest', 'tax', 'depreci', 'amort', 'increas', 'stock', 'price', 'posit']
quarter 1 earnings increase 5percent earnings interest tax depreciation amortization increasing stock price positive


'quarter 1 earnings increase 5percent earnings interest tax depreciation amortization increasing stock price positive'

In [16]:
import pandas as pd

### Labeled Data Preprocessing

In [40]:
filepath_labeled = "../data/labeled_data/merged_labeled_data.csv"

In [41]:
df_labeled = pd.read_csv(filepath_labeled)

In [42]:
df_labeled.head()

Unnamed: 0,newsHeadline,sentiment
0,"According to Gran , the company has no plans t...",neutral
1,Technopolis plans to develop in stages an area...,neutral
2,The international electronic industry company ...,negative
3,With the new production plant the company woul...,positive
4,According to the company 's updated strategy f...,positive


In [45]:
df_labeled['preprocessed_txt'] = df_labeled['newsHeadline'].apply(preprocess_text)

In [46]:
df_labeled.head()

Unnamed: 0,newsHeadline,sentiment,preprocessed_txt
0,"According to Gran , the company has no plans t...",neutral,according gran company plan move production ru...
1,Technopolis plans to develop in stages an area...,neutral,technopolis plan develop stage area less 10000...
2,The international electronic industry company ...,negative,international electronic industry company elco...
3,With the new production plant the company woul...,positive,new production plant company increase capacity...
4,According to the company 's updated strategy f...,positive,according company increasedated strategy year ...


In [47]:
df_labeled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7110 entries, 0 to 7109
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   newsHeadline      7110 non-null   object
 1   sentiment         7110 non-null   object
 2   preprocessed_txt  7110 non-null   object
dtypes: object(3)
memory usage: 166.8+ KB


### Unlabeled Data Preprocessing

In [48]:
df_unlabeled = pd.read_csv("../data/sp500_headlines_yahoo.csv")

In [49]:
df_unlabeled.head()

Unnamed: 0,Headlines
0,"Inflation data, retail sales, Walmart earnings..."
1,Whipsaw week for stocks leaves markets 'on edg...
2,Down Between 12% and 24% From Their 52-Week Hi...
3,Traders Bet on Wild Swings With CPI Print Set ...
4,"Down 60% This Year, Is Intel Stock a Bargain Buy?"


In [50]:
df_unlabeled['preprocessed_txt'] = df_unlabeled['Headlines'].apply(preprocess_text)

In [51]:
df_unlabeled.head()

Unnamed: 0,Headlines,preprocessed_txt
0,"Inflation data, retail sales, Walmart earnings...",inflation data retail sale walmart earnings aw...
1,Whipsaw week for stocks leaves markets 'on edg...,whipsaw stock leaf market edge ahead busy econ...
2,Down Between 12% and 24% From Their 52-Week Hi...,12percent 24percent 52week high 3 magnificent ...
3,Traders Bet on Wild Swings With CPI Print Set ...,trader bet wild swing cpi print set test market
4,"Down 60% This Year, Is Intel Stock a Bargain Buy?",60percent intel stock bargain buy


#### Saving to CSV

In [53]:
df_labeled.to_csv("../data/processed/processed_labeled_data.csv", index=False)

In [54]:
df_unlabeled.to_csv("../data/processed/processed_spx500_unlabeled_data.csv", index=False)