In [1]:
import nltk
import pandas as pd
import random
from typing import List, Mapping, Optional, Sequence

# import gensim
import numpy as np
from numpy.typing import NDArray
from download_hf import download_parquet

FloatArray = NDArray[np.float64]

"""
This script was specifically written to prepare the financial_phrasebank dataset
    financial_phrasebank has a single sentence per row
    labels range from 0 (negative), 1(neutral), 2 (positive)
    there are 4 datasets based on % agreement between annotators
"""
import gensim.downloader as api

google_news = api.load("word2vec-google-news-300")
# google_news.save("word2vec-google-news-300.model")


# import sentiment_economy_news dataset
def import_data(split):
    """
    Import financial_phrasebank dataset
    splits available:
        sentences_50agree,
        sentences_66agree,
        sentences_75agree,
        sentences_allagree
    """
    return download_parquet("financial_phrasebank", split)


from nltk.corpus import wordnet

In [2]:
def Negation(sentence):
    """
    Input: Tokenized sentence (List of words)
    Output: Tokenized sentence with negation handled (List of words)
    """
    temp = int(0)
    for i in range(len(sentence)):
        if sentence[i - 1] in ["not", "n't"]:
            antonyms = []
            for syn in wordnet.synsets(sentence[i]):
                syns = wordnet.synsets(sentence[i])
                w1 = syns[0].name()
                temp = 0
                for l in syn.lemmas():
                    if l.antonyms():
                        antonyms.append(l.antonyms()[0].name())
                max_dissimilarity = 0
                for ant in antonyms:
                    syns = wordnet.synsets(ant)
                    w2 = syns[0].name()
                    syns = wordnet.synsets(sentence[i])
                    w1 = syns[0].name()
                    word1 = wordnet.synset(w1)
                    word2 = wordnet.synset(w2)
                    if isinstance(word1.wup_similarity(word2), float) or isinstance(
                        word1.wup_similarity(word2), int
                    ):
                        temp = 1 - word1.wup_similarity(word2)
                    if temp > max_dissimilarity:
                        max_dissimilarity = temp
                        antonym_max = ant
                        sentence[i] = antonym_max
                        sentence[i - 1] = ""
    while "" in sentence:
        sentence.remove("")
    return sentence


def clean_text(text):
    """Clean text incorporating Negation handling and stopwords."""
    # turn to lowercase
    text = text.lower()
    # word tokenization
    text = nltk.word_tokenize(text)
    # negation handling
    text = Negation(text)
    # remove punctuation
    text = [word for word in text if word.isalnum()]
    # remove stopwords
    stopwords = nltk.corpus.stopwords.words("english")
    text = [word for word in text if word not in stopwords]
    if text == "":
        pass
    else:
        return " ".join(text)


def tokenize_financial_phrasebank(df):
    """
    Tokenize sentiment economy news.
    """
    # tokenize sentences
    df["tokenized_sentences"] = df["sentence"].apply(clean_text)
    df = df.loc[df.tokenized_sentences != ""]
    return df


def sum_token_embeddings(
    token_embeddings: Sequence[FloatArray],
) -> FloatArray:
    """Sum the token embeddings."""
    total: FloatArray = np.array(token_embeddings).sum(axis=0)
    return total


def map_labels(df):
    """Map labels to integers."""
    label_map = {0: -1, 1: 0, 2: 1}
    return df["label"].map(label_map)


def split_train_test(
    X: FloatArray, y: FloatArray, test_percent: float = 20
) -> tuple[FloatArray, FloatArray, FloatArray, FloatArray]:
    """Split data into training and testing sets."""
    N = len(y)
    data_idx = list(range(N))
    random.shuffle(data_idx)
    break_idx = round(test_percent / 100 * N)
    training_idx = data_idx[break_idx:]
    testing_idx = data_idx[:break_idx]
    X_train = X[training_idx, :]
    y_train = y[training_idx]
    X_test = X[testing_idx, :]
    y_test = y[testing_idx]
    return X_train, y_train, X_test, y_test

In [3]:
def generate_data_word2vec(df: pd.DataFrame) -> tuple[FloatArray, FloatArray]:
    """Generate training and testing data with word2vec."""
    # load pre-trained word2vec model
    # google_news = gensim.models.KeyedVectors.load("word2vec-google-news-300.model")
    X: FloatArray = np.array(
        [
            # sum the token embeddings for each sentence. If word is not in the model, return embedding of ['UNK']
            sum_token_embeddings(
                [
                    google_news[word] if word in google_news else google_news["UNK"]
                    for _, word in enumerate(sentence)
                ]
            )
            for _, sentence in enumerate(df.tokenized_sentences)
        ]
    )
    # labels = [-1, 0, 1] seems to be causing an error
    # y: FloatArray = np.array(map_labels(df))
    y: FloatArray = np.array(df.label)
    return split_train_test(X, y)

In [4]:
def generate_observation_word2vec(sentence):
    X: FloatArray = np.array(
        [
            sum_token_embeddings(
                [
                    google_news[word] if word in google_news else google_news["UNK"]
                    for _, word in enumerate(sentence)
                ]
            )
        ]
    )
    return X


def etl(split):
    """
    Extract, transform, and load financial_phrasebank
    """
    df = import_data(split)
    df = tokenize_financial_phrasebank(df)
    X_train, y_train, X_test, y_test = generate_data_word2vec(df)
    return X_train, y_train, X_test, y_test


def aggregate_all_splits():
    """
    Aggregate all splits of financial_phrasebank
    """
    df = pd.DataFrame()
    for split in [
        "sentences_50agree",
        "sentences_66agree",
        "sentences_75agree",
        "sentences_allagree",
    ]:
        df = pd.concat([df, import_data(split)])
    df = tokenize_financial_phrasebank(df)
    X_train, y_train, X_test, y_test = generate_data_word2vec(df)
    return X_train, y_train, X_test, y_test

In [6]:
X_train, y_train, X_test, y_test = aggregate_all_splits()

In [9]:
def aggregate_fin_data():
    """
    Aggregate all splits of financial_phrasebank
    """
    df = pd.DataFrame()
    for split in [
        "sentences_50agree",
        "sentences_66agree",
        "sentences_75agree",
        "sentences_allagree",
    ]:
        df = pd.concat([df, import_data(split)])
    df = tokenize_financial_phrasebank(df)
    return df

In [11]:
df = aggregate_fin_data()
negative_df = df[df["label"] == 0]
neutral_df = df[df["label"] == 1]
positive_df = df[df["label"] == 2]

In [12]:
positive_df

Unnamed: 0,sentence,label,tokenized_sentences
3,With the new production plant the company woul...,2,new production plant company would increase ca...
4,According to the company 's updated strategy f...,2,according company updated strategy years baswa...
5,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...,2,financing aspocomp growth aspocomp aggressivel...
6,"For the last quarter of 2010 , Componenta 's n...",2,last quarter 2010 componenta net sales doubled...
7,"In the third quarter of 2010 , net sales incre...",2,third quarter 2010 net sales increased eur mn ...
...,...,...,...
2064,"29 September , 2010 Finnish waste management a...",2,29 september 2010 finnish waste management rec...
2074,Key shareholders of Finnish IT services provid...,2,key shareholders finnish services provider tie...
2091,"As part of the transaction , M-real and Sappi ...",2,part transaction sappi also signed agreement s...
2225,`` I am extremely delighted with this project ...,2,extremely delighted project continuation coope...


In [16]:
positive_string = " ".join(positive_df["tokenized_sentences"])
positive_words = positive_string.split()

neutral_string = " ".join(neutral_df["tokenized_sentences"])
neutral_words = neutral_string.split()

negative_string = " ".join(negative_df["tokenized_sentences"])
negative_words = negative_string.split()

In [17]:
import prep_financial_phrasebank as prep

In [24]:
columns = ["text", "label"]
synthetic_df = pd.DataFrame(columns=columns)
for i in range(5000):
    max_length = 15
    length = int(np.round(np.random.uniform(0.15, 1) * max_length))
    types = 2
    sent = int(np.round(np.random.uniform(0, 1) * types))
    if sent == 1:
        rand_text = [random.choice(neutral_words) for _ in range(length)]
    elif sent == 2:
        rand_text = [random.choice(positive_words) for _ in range(length)]
    elif sent == 0:
        rand_text = [random.choice(negative_words) for _ in range(length)]
    random_text = " ".join(rand_text)
    synthetic_df.loc[i] = [random_text, sent]  # Add rand_text to synthetic_df

synthetic_df["text"] = synthetic_df["text"].apply(prep.clean_text)
# remove empty strings
synthetic_df = synthetic_df[synthetic_df["text"] != ""]

In [20]:
from prep_financial_phrasebank import (
    tokenize_financial_phrasebank,
    generate_data_word2vec,
    sum_token_embeddings,
)

In [21]:
def aggregate_fake_splits(fakedata):
    """
    Aggregate all splits of financial_phrasebank
    """
    df = fakedata
    df = tokenize_financial_phrasebank(df)
    X_train, y_train, X_test, y_test = generate_data_word2vec(df)
    return X_train, y_train, X_test, y_test


def run_experiment1(synthetic_df) -> None:
    from sklearn.linear_model import LogisticRegression

    # prepare training and testing data
    X_train, y_train, X_test, y_test = aggregate_fake_splits(synthetic_df)

    clf = LogisticRegression(random_state=0, max_iter=1000).fit(X_train, y_train)
    print("word2vec (train):", clf.score(X_train, y_train))
    print("word2vec (test):", clf.score(X_test, y_test))

    return clf

In [25]:
columns = ["sentence", "label"]
New_Synthetic_Data = pd.DataFrame(columns=columns)
New_Synthetic_Data["sentence"] = synthetic_df["text"]
New_Synthetic_Data["label"] = synthetic_df["label"]
New_Synthetic_Data

Unnamed: 0,sentence,label
0,flights passenger helsinki expected eur 500 co...,0
1,make excluding fall totaled 22 published staff...,0
2,second arrive rival 2009 third impacted 2010 e...,0
3,area eur13 first volume march insurer eur732m ...,2
4,2010 oil projects houses 5 pm held thursday ra...,1
...,...,...
4995,price earliest decided,1
4996,service refurbishment 2007 helsinki manufactur...,1
4997,glaston registered works determining november ...,1
4998,business outotec financial nordea approximatel...,1


In [95]:
def RNN_experiment_torch_synth(synthetic_df):
    import torch
    import torch.nn as nn
    import torch.nn.functional as F
    import torch.optim as optim
    from torch.utils.data import Dataset, DataLoader
    from torch.nn.utils.rnn import pad_sequence
    from sklearn import metrics

    # prepare training and testing data
    # X_train, y_train, X_test, y_test = aggregate_all_splits()
    X_train, y_train, X_test, y_test = aggregate_fake_splits(synthetic_df)

    # convert to torch tensors
    X_train = torch.from_numpy(X_train)
    y_train = torch.from_numpy(y_train)
    X_test = torch.from_numpy(X_test)
    y_test = torch.from_numpy(y_test)

    # create dataset
    class FinancialPhraseBankDataset(Dataset):
        def __init__(self, X, y):
            self.X = X
            self.y = y

        def __len__(self):
            return len(self.y)

        def __getitem__(self, idx):
            return self.X[idx], self.y[idx]

    # create dataloader
    train_dataset = FinancialPhraseBankDataset(X_train, y_train)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

    # define model
    class Net(nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            self.fc1 = nn.Linear(300, 300)
            self.fc2 = nn.Linear(300, 3)

        def forward(self, x):
            x = F.relu(self.fc1(x))
            x = self.fc2(x)
            return x

    net = Net()

    # define loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

    # train
    for epoch in range(20):  # loop over the dataset multiple times
        running_loss = 0.0
        for i, data in enumerate(train_loader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs.float())
            # print(outputs.shape)
            # print(labels.shape)
            loss = criterion(outputs, labels.long())
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            if i % 100 == 99:  # print every 100 mini-batches
                print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1, running_loss / 100))

    print("Finished Training")
    # After training, generate predictions on test data
    X_test = X_test.float()
    outputs_test = net(X_test)
    _, predicted_test = torch.max(outputs_test, 1)
    outputs_train = net(X_train)
    _, predicted_train = torch.max(outputs_train, 1)
    # Convert tensors to numpy arrays for comparison with sklearn metrics
    y_test_np = y_test.numpy()
    predicted_test_np = predicted_test.numpy()

    # Convert tensors to numpy arrays for comparison with sklearn metrics
    y_train_np = y_train.numpy()
    predicted_train_np = predicted_train.numpy()

    # Now you can use sklearn's metrics to compare y_test_np and predicted_np
    # For example, to calculate accuracy:
    accuracy1 = metrics.accuracy_score(y_train_np, predicted_train_np)
    print("Train Accuracy: ", accuracy1)
    accuracy2 = metrics.accuracy_score(y_test_np, predicted_test_np)
    print("Test Accuracy: ", accuracy2)

In [63]:
clf = run_experiment1(New_Synthetic_Data)
rnn = RNN_experiment_torch_synth(New_Synthetic_Data)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


word2vec (train): 0.5895
word2vec (test): 0.549
[1,   100] loss: 1.093
[2,   100] loss: 0.986
[3,   100] loss: 0.954
[4,   100] loss: 0.946
[5,   100] loss: 0.943
[6,   100] loss: 0.972
[7,   100] loss: 0.937
[8,   100] loss: 0.927
[9,   100] loss: 0.952
[10,   100] loss: 0.937
Finished Training
Train Accuracy:  0.55625
Test Accuracy:  0.548


In [96]:
def RNN_experiment_torch():
    import torch
    import torch.nn as nn
    import torch.nn.functional as F
    import torch.optim as optim
    from torch.utils.data import Dataset, DataLoader
    from torch.nn.utils.rnn import pad_sequence
    from sklearn import metrics

    # prepare training and testing data
    X_train, y_train, X_test, y_test = aggregate_all_splits()
    # X_train, y_train, X_test, y_test = etl("sentences_allagree")

    # convert to torch tensors
    X_train = torch.from_numpy(X_train)
    y_train = torch.from_numpy(y_train)
    X_test = torch.from_numpy(X_test)
    y_test = torch.from_numpy(y_test)

    # create dataset
    class FinancialPhraseBankDataset(Dataset):
        def __init__(self, X, y):
            self.X = X
            self.y = y

        def __len__(self):
            return len(self.y)

        def __getitem__(self, idx):
            return self.X[idx], self.y[idx]

    # create dataloader
    train_dataset = FinancialPhraseBankDataset(X_train, y_train)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

    # define model
    class Net(nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            self.fc1 = nn.Linear(300, 350)
            self.fc2 = nn.Linear(350, 3)

        def forward(self, x):
            x = F.relu(self.fc1(x))
            x = self.fc2(x)
            return x

    net = Net()

    # define loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

    # train
    for epoch in range(20):  # loop over the dataset multiple times
        running_loss = 0.0
        for i, data in enumerate(train_loader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs.float())
            # print(outputs.shape)
            # print(labels.shape)
            loss = criterion(outputs, labels.long())
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            if i % 100 == 99:  # print every 100 mini-batches
                print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1, running_loss / 100))

    print("Finished Training")
    # After training, generate predictions on test data
    X_test = X_test.float()
    outputs_test = net(X_test)
    _, predicted_test = torch.max(outputs_test, 1)
    outputs_train = net(X_train)
    _, predicted_train = torch.max(outputs_train, 1)
    # Convert tensors to numpy arrays for comparison with sklearn metrics
    y_test_np = y_test.numpy()
    predicted_test_np = predicted_test.numpy()

    # Convert tensors to numpy arrays for comparison with sklearn metrics
    y_train_np = y_train.numpy()
    predicted_train_np = predicted_train.numpy()

    # Now you can use sklearn's metrics to compare y_test_np and predicted_np
    # For example, to calculate accuracy:
    accuracy1 = metrics.accuracy_score(y_train_np, predicted_train_np)
    print("Train Accuracy: ", accuracy1)
    accuracy2 = metrics.accuracy_score(y_test_np, predicted_test_np)
    print("Test Accuracy: ", accuracy2)

In [89]:
from prep_financial_phrasebank import run_experiment

In [90]:
random.seed(2)

In [97]:
random.seed(2)
clf = run_experiment1(New_Synthetic_Data)
rnn = RNN_experiment_torch_synth(New_Synthetic_Data)
print("")
print("")
print("")
logreg = run_experiment()
rnn2 = RNN_experiment_torch()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


word2vec (train): 0.573
word2vec (test): 0.605
[1,   100] loss: 1.174
[2,   100] loss: 0.974
[3,   100] loss: 0.960
[4,   100] loss: 0.972
[5,   100] loss: 0.973
[6,   100] loss: 0.945
[7,   100] loss: 0.950
[8,   100] loss: 0.953
[9,   100] loss: 0.954
[10,   100] loss: 0.945
[11,   100] loss: 0.934
[12,   100] loss: 0.929
[13,   100] loss: 0.938
[14,   100] loss: 0.932
[15,   100] loss: 0.932
[16,   100] loss: 0.945
[17,   100] loss: 0.933
[18,   100] loss: 0.918
[19,   100] loss: 0.928
[20,   100] loss: 0.943
Finished Training
Train Accuracy:  0.56975
Test Accuracy:  0.585





STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


word2vec (train): 0.6425852296760003
word2vec (test): 0.6554991539763113
[1,   100] loss: 1.168
[1,   200] loss: 2.087
[1,   300] loss: 3.004
[2,   100] loss: 0.926
[2,   200] loss: 1.823
[2,   300] loss: 2.708
[3,   100] loss: 0.907
[3,   200] loss: 1.800
[3,   300] loss: 2.691
[4,   100] loss: 0.874
[4,   200] loss: 1.760
[4,   300] loss: 2.654
[5,   100] loss: 0.890
[5,   200] loss: 1.756
[5,   300] loss: 2.634
[6,   100] loss: 0.880
[6,   200] loss: 1.748
[6,   300] loss: 2.619
[7,   100] loss: 0.878
[7,   200] loss: 1.734
[7,   300] loss: 2.601
[8,   100] loss: 0.870
[8,   200] loss: 1.724
[8,   300] loss: 2.605
[9,   100] loss: 0.865
[9,   200] loss: 1.746
[9,   300] loss: 2.615
[10,   100] loss: 0.863
[10,   200] loss: 1.764
[10,   300] loss: 2.636
[11,   100] loss: 0.881
[11,   200] loss: 1.742
[11,   300] loss: 2.619
[12,   100] loss: 0.854
[12,   200] loss: 1.715
[12,   300] loss: 2.581
[13,   100] loss: 0.854
[13,   200] loss: 1.722
[13,   300] loss: 2.569
[14,   100] loss: 