<a href="https://colab.research.google.com/github/nlnlvlc/financial-lstm-data/blob/main/preprocessing_financial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

The following code prepares the data to run through two sentiment analysis models:

*   an **AT-LSTM** (Attention-based Long Short-Term Memory model)
*   a **Bi-LSTM-AN** (Bidirectional Long Short-Term Memory & Adversarial Neural Network hybrid model)

Before running the data through these models, the data previously cleaned [here](https://colab.research.google.com/drive/1IjLgsdhgxp0GHsvgoPNi5vyd7gLawzwt#scrollTo=OWXNCQeJu68o) must be pre-processed to identify words with significant semantic meaning and their **hypernyms**: words that categorize or generalize another word.

*(Ex. "Color" or "Fruit" are both hypernyms for "Orange")*

The full project can be found [here](https://github.com/Alex-Gideon/635Group3Project/tree/main).

In [None]:
import nltk
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet as wn
from os import walk

from nltk.corpus import stopwords

import csv

import pandas as pd

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
#load the English stop words into a set; process_text tests every word
#of every sentence for membership in it
stop_words = set(stopwords.words('english'))

In [None]:
#check if word has associated hypernyms
def check_for_hypernim(token, depth=15):
    """Walk up the WordNet hypernym chain of *token*.

    At every level the first lemma name of each hypernym of each synset of
    the current token is collected; the first collected name then becomes
    the token for the next level.

    Args:
        token: word to look up in WordNet.
        depth: number of levels to return (defaults to 15, the chain
            length the rest of the notebook expects).

    Returns:
        A list of exactly ``depth`` lists. Each inner list holds the
        hypernym names found at that level. Once a level yields no
        hypernyms, every remaining level is ``[token]`` for the last token
        reached, so a word unknown to WordNet comes back as ``depth``
        copies of ``[word]``.
    """
    hypernims = []
    for _ in range(depth):
        level = [
            hypernym.lemma_names()[0]
            for synset in wn.synsets(token)
            for hypernym in synset.hypernyms()
        ]
        if not level:
            # Dead end: the token no longer changes, so further lookups
            # would return the same empty result. Pad instead of re-querying.
            hypernims.extend([token] for _ in range(depth - len(hypernims)))
            break
        token = level[0]
        hypernims.append(level)

    return hypernims

In [None]:
#part-of-speech tags (as returned by nltk.pos_tag) whose words are dropped
#before the hypernym lookup
_SKIPPED_POS_TAGS = frozenset({
    'IN', 'DT', 'CD', 'CC', 'EX', 'MD', 'WDT', 'WP', 'UH', 'TO', 'RP',
    'PDT', 'PRP', 'PRP$',
})


def _filter_words(line):
    """Return the words of *line* that should be tagged with hypernyms."""
    candidates = [word for word in line.split() if word not in stop_words]

    kept = []
    for word, tag in nltk.pos_tag(candidates):
        # drop empty and single-character tokens (stray punctuation etc.)
        if len(word) <= 1:
            continue
        # drop undesired parts of speech and the literal token 'co'
        if tag in _SKIPPED_POS_TAGS or word == 'co':
            continue
        kept.append(word.rstrip(".,?!"))
    return kept


#applies hypernym
def process_text(fileName):
    """Tag every meaningful word of a cleaned dataset with its hypernyms.

    Reads the CSV at *fileName* (columns ``label`` and ``text``), filters
    each sentence down to its meaningful words, and builds one output row
    per remaining word: the word followed by the first hypernym found at
    each level returned by ``check_for_hypernim``. If no hypernym exists,
    the base word is used as its own hypernym.

    For an input named ``clean_<name>`` two files are written next to it,
    both without header or index:

    * ``hypernyms_<name>``: each word and its hypernyms
    * ``train_<name>``: the same rows prefixed with the sentence label

    Inputs whose file name does not start with ``clean_`` are processed
    but not saved; a notice is printed instead.

    Args:
        fileName: path of the cleaned CSV file.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    # function-scope import: the notebook only imports `walk` from os
    import os

    df = pd.read_csv(fileName)

    #holds collection of each word and their hypernyms
    processed = []
    #holds label for each word
    labels = []

    for label, line in zip(df['label'], df['text']):
        # an empty text cell is read back as NaN and contains no words
        if not isinstance(line, str):
            continue
        for word in _filter_words(line):
            chain = check_for_hypernim(word)
            processed.append([word] + [level[0] for level in chain])
            labels.append(label)

    print(len(processed))
    print(len(labels))

    #test that word and label order is correct
    if processed:
        print(f'Label 0: {labels[0]}')
        print(f'Processed 0: {processed[0]}')

    #dataframe holding each word and their hypernyms
    processed_df = pd.DataFrame(processed)
    #same rows prefixed with their label; both frames share the default
    #index, so the join keeps label/word order aligned
    joined_df = pd.DataFrame({"label": labels}).join(processed_df)

    #derive the output names from the input name: clean_X -> hypernyms_X, train_X
    directory, base_name = os.path.split(os.fspath(fileName))
    stem = os.path.splitext(base_name)[0]
    prefix = 'clean_'
    if not base_name.startswith(prefix):
        print(f"{stem} was processed but not saved: "
              f"file name does not start with '{prefix}'")
        return
    suffix = base_name[len(prefix):]

    processed_df.to_csv(
        os.path.join(directory, 'hypernyms_' + suffix),
        header=False,
        index=False,
    )
    joined_df.to_csv(
        os.path.join(directory, 'train_' + suffix),
        header=False,
        index=False,
    )
    print(f"{stem} has been processed")


In [None]:
#paths of the cleaned CSV files that still need hypernym pre-processing
datasets = [
    '/experiment-1/financial/datasets/clean_financialpc.csv',
    '/experiment-1/financial/datasets/clean_financialfull.csv',
]

In [None]:
#run the hypernym pre-processing on every dataset; process_text prints the
#number of processed items and labels plus the first label/entry, so the
#counts and the labelling can be verified from the cell output
for dataset in datasets:
  process_text(dataset)

82078
82078
Label 0: 1
Processed 0: ['ReutersPayments', 'ReutersPayments', 'ReutersPayments', 'ReutersPayments', 'ReutersPayments', 'ReutersPayments', 'ReutersPayments', 'ReutersPayments', 'ReutersPayments', 'ReutersPayments', 'ReutersPayments', 'ReutersPayments', 'ReutersPayments', 'ReutersPayments', 'ReutersPayments', 'ReutersPayments']
clean_financialpc has been processed
56663
56663
Label 0: 7
Processed 0: ['According', 'match', 'lighter', 'fuel', 'substance', 'matter', 'concern', 'interest', 'curiosity', 'cognitive_state', 'psychological_state', 'condition', 'state', 'administrative_district', 'district', 'region']
clean_financialfull has been processed
