In [4]:
import pandas as pd

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

import numpy as np
np.random.seed(22)

[nltk_data] Downloading package wordnet to /Users/jrevier/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
fname = '../Data/Combined_News_DJIA.csv'


In [6]:

import pandas as pd
from sklearn.metrics import f1_score, accuracy_score


class Base:
    """Base class that houses common utilities for reading in test data
    and calculating model accuracy and F1 scores.
    """
    def __init__(self) -> None:
        pass

    def read_data(self) -> pd.DataFrame:
        "Read in test data into a Pandas DataFrame"
        df = pd.read_csv(fname, sep='\t', header=None, names=colnames)
        df['truth'] = df['truth'].str.replace('__label__', '')
        # Categorical data type for truth labels
        df['truth'] = df['truth'].astype(int).astype('category')
        # Optional lowercase for test data (if model was trained on lowercased text)
        if lower_case:
            df['text'] = df['text'].str.lower()
        return df

    def accuracy(self, df: pd.DataFrame) -> None:
        "Prediction accuracy (percentage) and F1 score"
        acc = accuracy_score(df['truth'], df['pred'])*100
        f1 = f1_score(df['truth'], df['pred'], average='macro')
        print("Accuracy: {}\nMacro F1-score: {}".format(acc, f1))

In [7]:

from flair.embeddings import FlairEmbeddings, BertEmbeddings, WordEmbeddings, DocumentRNNEmbeddings

# init Flair embeddings
flair_forward_embedding = FlairEmbeddings('multi-forward')
flair_backward_embedding = FlairEmbeddings('multi-backward')

# init BERT base (cases)
optional_embedding = BertEmbeddings('bert-base-cased')
# OR init ELMo (original)
# optional_embedding = ELMoEmbeddings('original')

word_embeddings = list(filter(None, [
    optional_embedding,
    FlairEmbeddings('news-forward'),
    FlairEmbeddings('news-backward'),
]))

# Initialize document embedding by passing list of word embeddings
document_embeddings = DocumentRNNEmbeddings(
    word_embeddings,
    hidden_size=512,
    reproject_words=True,
    reproject_words_dimension=256,
)

In [8]:
class FlairSentiment(Base):
    """Predict fine-grained sentiment scores using Flair."""
    def __init__(self, model_file: str=None) -> None:
        super().__init__()
        from flair.models import TextClassifier
        self.model = TextClassifier.load(model_file)

    def score(self, text: str) -> int:
        from flair.data import Sentence
        doc = Sentence(text)
        self.model.predict(doc)
        pred = int(doc.labels[0].value)
        return pred

    def predict(self, train_file: None, test_file: str, lower_case: bool) -> pd.DataFrame:
        "Use tqdm to display model prediction status bar"
        from tqdm import tqdm
        tqdm.pandas()
        df = self.read_data(test_file, lower_case)
        df['pred'] = df['text'].progress_apply(self.score)
        return df

In [10]:
# Loop through the columns containing headlines and apply `topic_getter` to each headline, to retrieve topic
# Assign that data to a new column

for x in range(1,26): # for each column
    col = "Top" + str(x)
    new_col = col + "_topic_score"
    
    # for each row in column
    # get the headline, feed it into the lda_model_bow to get the topic and score
    df[new_col] = df[col].apply(FlairSentiment)

NameError: name 'df' is not defined