In [17]:
import pandas
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

In [18]:
def get_headlines_df(filename):
    """Read the headlines CSV into a DataFrame.

    Args:
        filename: Passed straight to ``pandas.read_csv`` (a path or
            file-like object).

    Returns:
        The ``pandas.DataFrame`` parsed with pandas' default options.
    """
    headlines_df = pandas.read_csv(filename)
    return headlines_df

def preprocess_headlines_df(headlines_df):
    """Add the combined headline text and keep only the selected columns.

    The combined text is ``Title``, a single space, then ``Subtitle``.
    Missing values in either column propagate as missing in the combined
    column (pandas string concatenation semantics).

    Args:
        headlines_df: DataFrame containing at least the columns ``Date``,
            ``Link``, ``Title``, ``Subtitle`` and ``Authors``.

    Returns:
        A new DataFrame with the columns ``Date``, ``Link``,
        ``Title + Subtitle``, ``Title``, ``Subtitle``, ``Authors`` in that
        order. The input frame is left untouched.

    Raises:
        KeyError: If any of the required columns is absent.
    """
    combined_text = headlines_df['Title'] + ' ' + headlines_df['Subtitle']
    # assign() builds a new frame, so the caller's DataFrame does not
    # silently gain a 'Title + Subtitle' column as a side effect.
    with_text = headlines_df.assign(**{'Title + Subtitle': combined_text})
    return with_text[[
        'Date',
        'Link',
        'Title + Subtitle',
        'Title',
        'Subtitle',
        'Authors'
    ]]


def get_headlines_list(headlines_df):
    """Return the values of the third column (position 2) as a list.

    The column is picked by position, not by name, so the result depends
    on the column order of ``headlines_df``.
    """
    third_column = np.asarray(headlines_df)[:, 2]
    return [value for value in third_column]

def get_stocks_list(headlines_df):
    """Return the values of the last column as a list.

    The column is picked by position. NOTE(review): when called on the
    frame produced by ``preprocess_headlines_df`` the last column is
    'Authors', not a stock column - confirm this is what is intended.
    """
    last_column = np.asarray(headlines_df)[:, -1]
    return list(last_column)

def get_tokenizer():
    """Load the tokenizer for the "ProsusAI/finbert" checkpoint."""
    checkpoint = "ProsusAI/finbert"
    return AutoTokenizer.from_pretrained(checkpoint)

def get_model():
    """Load the sequence-classification model for "ProsusAI/finbert"."""
    checkpoint = "ProsusAI/finbert"
    return AutoModelForSequenceClassification.from_pretrained(checkpoint)

def chunk_list(lst, n):
    """Lazily split ``lst`` into consecutive slices of at most ``n`` items.

    The final slice is shorter when ``len(lst)`` is not a multiple of ``n``.
    """
    starts = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in starts)

def get_headlines_table():
    """Create the empty results table that sentiment predictions are added to.

    Returns:
        A ``pandas.DataFrame`` with zero rows and the columns ``Headline``,
        ``Author``, ``Pos``, ``Neg`` and ``Neutr``.
    """
    # Start with no rows: seeding the table with a row of empty strings left
    # a blank record in the results and forced the probability columns to
    # hold a mix of strings and floats.
    columns = ['Headline', 'Author', 'Pos', 'Neg', 'Neutr']
    return pandas.DataFrame(columns=columns)

def predict_sentiment(model, tokenizer, headlines_list, stocks_list, headlines_table, stride=100):
    """Score headlines in batches and add one result row per headline.

    Args:
        model: Callable model with an ``eval()`` method; called with the
            tokenizer output as keyword arguments and expected to return an
            object exposing ``logits`` with three classes per row.
        tokenizer: Callable taking a list of strings plus ``padding``,
            ``truncation`` and ``return_tensors`` keyword arguments.
        headlines_list: Texts to classify.
        stocks_list: Values stored in the ``Author`` column, paired
            positionally with ``headlines_list``.
        headlines_table: Existing results DataFrame; it is not modified.
        stride: Number of headlines per batch.

    Returns:
        A DataFrame holding the rows of ``headlines_table`` followed by one
        row per scored headline with the keys ``Headline``, ``Author``,
        ``Pos``, ``Neg`` and ``Neutr``. If nothing was scored,
        ``headlines_table`` itself is returned.

    Note:
        Logit columns 0, 1, 2 are stored as Pos, Neg, Neutr. This assumes the
        model's classes are ordered positive, negative, neutral - verify
        against ``model.config.id2label``.
    """
    model.eval()
    # Ceiling division so a trailing partial batch is counted in the progress
    # total. A non-positive stride is left for chunk_list to handle as before.
    total_batches = -(-len(headlines_list) // stride) if stride > 0 else 0
    records = []
    batches = zip(chunk_list(headlines_list, stride), chunk_list(stocks_list, stride))
    for batch_number, (lines, stocks) in enumerate(batches, start=1):
        encoded = tokenizer(lines, padding=True, truncation=True, return_tensors='pt')
        # Inference only: skip autograd bookkeeping to save memory and time.
        with torch.no_grad():
            outputs = model(**encoded)
            probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
        print(f"{batch_number}/{total_batches}")
        for headline, stock, scores in zip(lines, stocks, probabilities.tolist()):
            records.append({
                'Headline': headline,
                'Author': stock,
                'Pos': scores[0],
                'Neg': scores[1],
                'Neutr': scores[2],
            })

    if not records:
        return headlines_table

    # Build the new rows once instead of growing the frame row by row:
    # DataFrame.append was removed in pandas 2.0 and copied the whole table
    # on every call.
    new_rows = pandas.DataFrame(records)
    if len(headlines_table) == 0:
        # Nothing to prepend; keep the table's declared column order and avoid
        # concatenating with an empty frame.
        extra_columns = [c for c in new_rows.columns if c not in headlines_table.columns]
        return new_rows.reindex(columns=list(headlines_table.columns) + extra_columns)
    return pandas.concat([headlines_table, new_rows], ignore_index=True)



In [19]:
# Load the scraped headlines CSV.
# NOTE(review): absolute, machine-specific path - the notebook cannot run on
# another machine without editing it.
headlines_df = get_headlines_df('/Users/benschlagman/Desktop/UCL Year 3/Final Year Project/snp_ft.csv')
# Add the combined 'Title + Subtitle' text and keep only the selected columns.
headlines_df = preprocess_headlines_df(headlines_df)
# Third column of the preprocessed frame, i.e. 'Title + Subtitle'.
headlines_list = get_headlines_list(headlines_df)
# Last column of the preprocessed frame, which is 'Authors' despite the name.
stocks_list = get_stocks_list(headlines_df)
# Load the "ProsusAI/finbert" tokenizer and classification model.
tokenizer = get_tokenizer()
model = get_model()
# Results table with the columns Headline, Author, Pos, Neg, Neutr.
headlines_table = get_headlines_table()
# Score every headline (default batch size 100) and collect the rows.
headlines_table = predict_sentiment(model, tokenizer, headlines_list, stocks_list, headlines_table)


1/1


In [22]:
# Keep only the headline text and its three sentiment probabilities.
classification = headlines_table[[
    'Headline',
    'Pos',
    'Neg',
    'Neutr'
]]

# Re-read the scraped data so the 'Stock' column is available for the export.
# NOTE(review): absolute, machine-specific path - consider a configurable
# data directory.
snp_web_scapping = pandas.read_csv('/Users/benschlagman/Desktop/UCL Year 3/Final Year Project/snp_ft.csv')
snp_web_scapping['Title + Subtitle'] = snp_web_scapping['Title'] + ' ' + snp_web_scapping['Subtitle']
snp_web_scapping = snp_web_scapping[[
    'Date',
    'Link',
    'Title + Subtitle',
    'Title',
    'Subtitle',
    'Stock',
    'Authors'
]]
# The join key is the headline text. If the same headline occurs k times it
# appears k times on both sides and a plain merge would emit k * k rows, so
# the scores are reduced to one row per headline first. validate= makes pandas
# raise if the right-hand key is ever not unique.
snp_table = snp_web_scapping.merge(
    classification.drop_duplicates(subset='Headline'),
    left_on='Title + Subtitle',
    right_on='Headline',
    validate='many_to_one',
)
snp_table = snp_table.drop(columns=['Headline'])
# Written with the default index column, as before.
snp_table.to_csv('snp_nlp.csv')

                           Date  \
0      Monday, 7 November, 2022   
1      Monday, 7 November, 2022   
2    Thursday, 3 November, 2022   
3      Sunday, 30 October, 2022   
4      Friday, 28 October, 2022   
..                          ...   
95   Thursday, 27 January, 2022   
96   Thursday, 27 January, 2022   
97  Wednesday, 26 January, 2022   
98     Monday, 24 January, 2022   
99   Thursday, 20 January, 2022   

                                                 Link  \
0   https://www.ft.com/content/e9098953-e031-4bc5-...   
1   https://www.ft.com/content/2e9cd061-99c3-4638-...   
2   https://www.ft.com/content/cc3b4a5a-af35-41d5-...   
3   https://www.ft.com/content/7df7443c-226b-455a-...   
4   https://www.ft.com/content/fa6bec83-058f-4991-...   
..                                                ...   
95  https://www.ft.com/content/f87b1918-f194-44ac-...   
96  https://www.ft.com/content/5a2c18b9-4c37-470a-...   
97  https://www.ft.com/content/d500de8d-0d7c-4b2d-...   
98  https