## Import functionality

In [13]:
import pandas as pd
import numpy as np
from os.path import join

## Load and merge sentiment data

* https://www.kaggle.com/datasets/ankurzing/aspect-based-sentiment-analysis-for-financial-news
* https://www.kaggle.com/datasets/ankurzing/sentiment-analysis-for-financial-news
* https://www.kaggle.com/datasets/sbhatti/financial-sentiment-analysis

In [14]:
SOURCE_DATA_FOLDER = ['..','datasets']
SOURCE_DATA = {
    'ankurzing_asp':[['kaggle','ankurzing','SEntFiN-v1.1.csv'],'utf-8', None],
    'ankurzing_sent':[['kaggle','ankurzing','all-data.csv'],'ansi', None],
    'sbhatti':[['kaggle','sbhatti','data.csv'],'utf-8', None]
}
# Filtered words
WORDS:dict = {} # word->WStat
TEXTS:list = [] # [ (sentiment, [(word, count))]) ]

In [15]:
class WStat:
    def __init__(self, score, count) -> None:
        self.score = score
        self.count = count
    def add(self, point, count):
        self.score += point
        self.count += count
    def get_score(self): return self.score
    def get_count(self): return self.count
# Add words
def add_words(point, words:list[str], dst:dict[WStat], txt:list):
    wmap = {}
    for w in words:
        if w in wmap: wmap[w] += 1
        else: wmap[w] = 1
    for word,count in wmap.items():
        if word in dst:
            dst[word].add(point,count)
        else:
            dst[word] = WStat(point,count)
    txt.append((point, wmap.items()))
# Process 'ankurzing__aspect-based...
def prep_ankurzing_asp(df_src:pd.DataFrame, dst:dict, txt:dict):
    for _,row in df_src.iterrows():
        # get sentiment classification
        sentiment = row['Decisions']
        sentiment = sentiment[sentiment.find(':')+3]
        # score
        if sentiment=='p': point = 1
        elif sentiment=='n': point = -1
        else: point = 0
        # get text words
        add_words(point, row['Title'].lower().split(), dst, txt)
# Process 'ankurzing__sentiment...'
def prep_ankurzing_sent(df_src:pd.DataFrame, dst:dict, txt:dict):
    for _,row in df_src.iterrows():
        # get sentiment classification
        sentiment = row[0]
        # score
        if sentiment=='positive': point = 1
        elif sentiment=='negative': point = -1
        else: point = 0
        # get text words
        add_words(point, row[1].lower().split(), dst, txt)
# Process 'sbhatti__financial...'
def prep_sbhatti(df_src:pd.DataFrame, dst:dict, txt:dict):
    for _,row in df_src.iterrows():
        # get sentiment classification
        sentiment = row['Sentiment']
        # score
        if sentiment=='positive': point = 1
        elif sentiment=='negative': point = -1
        else: point = 0
        # get text words
        add_words(point, row['Sentence'].lower().split(), dst, txt)

In [16]:
# Load data
for k in SOURCE_DATA:
    item = SOURCE_DATA[k]
    if isinstance(item[2],type(None)):
        item[2] = pd.read_csv(join(*SOURCE_DATA_FOLDER, *item[0]), encoding=item[1])

prep_ankurzing_sent(SOURCE_DATA['ankurzing_sent'][2], WORDS, TEXTS)
prep_ankurzing_asp(SOURCE_DATA['ankurzing_asp'][2], WORDS, TEXTS)
prep_sbhatti(SOURCE_DATA['sbhatti'][2], WORDS, TEXTS)

_words = []
_counts = []
_scores = []
for k in WORDS:
    _words.append(k)
    _scores.append(WORDS[k].get_score())
    _counts.append(WORDS[k].get_count())
df_words = pd.DataFrame({
    'Word':pd.Series(_words, dtype='str'),
    'Count':pd.Series(_counts, dtype='int16'),
    'Score':pd.Series(_scores, dtype='float32')
})

print(df_words.head())
print(df_words.shape)
print(df_words.dtypes)

          Word  Count  Score
0  technopolis     29    3.0
1        plans    202   -1.0
2           to   8192  247.0
3      develop     30   15.0
4           in   8143  330.0
(25780, 3)
Word      object
Count      int16
Score    float32
dtype: object


## Pre-process data

In [22]:
# Extract features
words = list(df_words.query('Score < -15 or Score > 15')['Word'])
rows = []
count = len(TEXTS)
current = 0
print('Processing')
for sentiment,word_tuples in TEXTS:
    row = np.zeros(len(words)+1,dtype=np.float32)
    row[0] = sentiment
    for w,c in word_tuples:
        try:
            idx = words.index(w)+1
            row[idx] = c
        except: pass
    rows.append(row)
    current += 1
    if current % 2100 == 0:
        print(f'Row {current} of {count}.')

# Create feature dataframe
print('Creating DataFrame.')
df_features = pd.DataFrame(rows, columns=['SCORE']+words)
print('Writing DataFrame to file.')
df_features.to_csv(join(*SOURCE_DATA_FOLDER,'sentiment.csv'))
print('Bye.')

Processing
Row 2100 of 21440.
Row 4200 of 21440.
Row 6300 of 21440.
Row 8400 of 21440.
Row 10500 of 21440.
Row 12600 of 21440.
Row 14700 of 21440.
Row 16800 of 21440.
Row 18900 of 21440.
Row 21000 of 21440.
Creating DataFrame.
Writing DataFrame to file.
