In [46]:
import pandas
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch



In [47]:
# Load the FT headline dataset from 'snp_ft.csv' and display it for a first
# look at the available columns.
headlines_df = pandas.read_csv('snp_ft.csv')
headlines_df

Unnamed: 0.1,Unnamed: 0,Date,Link,Title,Subtitle,Stock,Authors
0,2,"Monday, 7 November, 2022",https://www.ft.com/content/e9098953-e031-4bc5-...,Were we wrong about big tech?,Taking growth for granted,Apple Inc.,['Robert Armstrong']
1,3,"Monday, 7 November, 2022",https://www.ft.com/content/2e9cd061-99c3-4638-...,Apple warns of iPhone shipment delays in wake ...,Situation worsens in Foxconn factory where wor...,Apple Inc.,"['Ryan McMorrow', 'Nian Liu', 'Patrick McGee',..."
2,4,"Thursday, 3 November, 2022",https://www.ft.com/content/cc3b4a5a-af35-41d5-...,"China’s closed-loop crisis: ‘I’m human, not a ...",The sustainability of a system meant to keep f...,Apple Inc.,"['Edward White', 'Qianer Liu']"
3,7,"Sunday, 30 October, 2022",https://www.ft.com/content/7df7443c-226b-455a-...,Workers flee Covid restrictions at China’s lar...,Authorities say company will go ‘all out’ to h...,Apple Inc.,"['Gloria Li', 'Ryan McMorrow']"
4,8,"Friday, 28 October, 2022",https://www.ft.com/content/fa6bec83-058f-4991-...,Apple says it is facing ‘significant’ headwind...,Company also flags supply constraints for late...,Apple Inc.,['Patrick McGee']
...,...,...,...,...,...,...,...
6888,21640,"Monday, 18 July, 2016",https://www.ft.com/content/b7296c10-4ac0-11e6-...,"From rock stars to roadies, all change for Wal...",Fund managers head to the next big Fang — util...,Netflix Inc.,['Nicole Bullock']
6889,21643,"Friday, 1 July, 2016",https://www.ft.com/content/bf22baf6-3fa4-11e6-...,Wall Street enjoys best week since late-2015 a...,"Consumer groups, carmakers and transport secto...",Netflix Inc.,['Gregory Meyer']
6890,21645,"Wednesday, 29 June, 2016",https://www.ft.com/content/bc52a558-36de-11e6-...,The office is dead! Long live the office!,"Technology is banishing old, static ways of wo...",Netflix Inc.,['Alison Maitland']
6891,21646,"Monday, 20 June, 2016",https://www.ft.com/content/17856f62-360f-11e6-...,Pay transparency is the last taboo in business,The fear of telling all on salaries infects bo...,Netflix Inc.,['Andrew Hill']


In [48]:
# Re-read the raw CSV (so this cell can be re-run on its own), add a combined
# "Title + Subtitle" text column, and keep only the columns used downstream.
# Note that 'Stock' is not part of this selection.
headlines_df = (
    pandas.read_csv('snp_ft.csv')
    .assign(**{
        'Title + Subtitle': lambda frame: frame['Title'] + ' ' + frame['Subtitle'],
    })
    [['Date', 'Link', 'Title + Subtitle', 'Title', 'Subtitle', 'Authors']]
)
headlines_df.head(5)

Unnamed: 0,Date,Link,Title + Subtitle,Title,Subtitle,Authors
0,"Monday, 7 November, 2022",https://www.ft.com/content/e9098953-e031-4bc5-...,Were we wrong about big tech? Taking growth fo...,Were we wrong about big tech?,Taking growth for granted,['Robert Armstrong']
1,"Monday, 7 November, 2022",https://www.ft.com/content/2e9cd061-99c3-4638-...,Apple warns of iPhone shipment delays in wake ...,Apple warns of iPhone shipment delays in wake ...,Situation worsens in Foxconn factory where wor...,"['Ryan McMorrow', 'Nian Liu', 'Patrick McGee',..."
2,"Thursday, 3 November, 2022",https://www.ft.com/content/cc3b4a5a-af35-41d5-...,"China’s closed-loop crisis: ‘I’m human, not a ...","China’s closed-loop crisis: ‘I’m human, not a ...",The sustainability of a system meant to keep f...,"['Edward White', 'Qianer Liu']"
3,"Sunday, 30 October, 2022",https://www.ft.com/content/7df7443c-226b-455a-...,Workers flee Covid restrictions at China’s lar...,Workers flee Covid restrictions at China’s lar...,Authorities say company will go ‘all out’ to h...,"['Gloria Li', 'Ryan McMorrow']"
4,"Friday, 28 October, 2022",https://www.ft.com/content/fa6bec83-058f-4991-...,Apple says it is facing ‘significant’ headwind...,Apple says it is facing ‘significant’ headwind...,Company also flags supply constraints for late...,['Patrick McGee']


In [49]:
# Flatten the frame into a 2-D array and pull two columns out by position.
headlines_array = np.array(headlines_df)

# Column 2 is the text that gets scored.
headlines_list = headlines_array[:, 2].tolist()

# NOTE(review): this is the LAST column of headlines_df. With the column
# selection made in the previous cell that is 'Authors', not 'Stock', so the
# name is misleading -- confirm which one was intended.
stocks_list = headlines_array[:, -1].tolist()

# Quick sanity check of both lists.
print(headlines_list[:3], stocks_list[:5], sep='\n')

['Were we wrong about big tech? Taking growth for granted', 'Apple warns of iPhone shipment delays in wake of China Covid lockdowns Situation worsens in Foxconn factory where workers have been leaving to escape Covid outbreak', 'China’s closed-loop crisis: ‘I’m human, not a machine’ The sustainability of a system meant to keep factories operating is under intense strain']
["['Robert Armstrong']", "['Ryan McMorrow', 'Nian Liu', 'Patrick McGee', 'Tabby Kinder']", "['Edward White', 'Qianer Liu']", "['Gloria Li', 'Ryan McMorrow']", "['Patrick McGee']"]


In [50]:
# Tokenizer and classifier must come from the same checkpoint, so the id is
# written once and shared by both loaders.
FINBERT_CHECKPOINT = "ProsusAI/finbert"

tokenizer = AutoTokenizer.from_pretrained(FINBERT_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(FINBERT_CHECKPOINT)

In [51]:
# Empty result table with the columns the scoring loop fills in.
# It is created without any rows: seeding it with a row of empty strings would
# leave a bogus blank record at index 0 and mix '' into the score columns.
headlines_table = pandas.DataFrame(columns=['Headline', 'Author', 'Pos', 'Neg', 'Neutr'])

In [52]:
"""
Using a FinBERT (Financial BERT) NLP model implemented with HuggingFace.
The model outputs activations for three classes: positive, negative or neutral.
These indicate how a given headline is likely to affect its subject, according to the FinBERT model.
"""

def chunk_list(lst, n):
    """Lazily split ``lst`` into consecutive slices of at most ``n`` items.

    Slices are produced in order; the final one is shorter when ``len(lst)``
    is not a multiple of ``n``. An empty ``lst`` produces nothing.
    """
    offsets = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in offsets)


# Number of headlines sent through the model per forward pass.
STRIDE = 100

# Ceiling division: a trailing partial batch still counts as a batch, so the
# progress counter ends at "k/k" instead of overshooting (e.g. "69/68").
total_batches = (len(headlines_list) + STRIDE - 1) // STRIDE

# Put the model in evaluation mode before running inference.
model.eval()

# Rows are collected in a plain list and turned into a DataFrame once at the
# end. Growing a DataFrame row by row copies it on every step, and
# DataFrame.append no longer exists in pandas >= 2.0.
scored_rows = []

# NOTE(review): stocks_list is built from the last column of headlines_df,
# which appears to be 'Authors'; that is why it is stored under 'Author'.
batches = zip(chunk_list(headlines_list, STRIDE), chunk_list(stocks_list, STRIDE))
for n, (lines, stocks) in enumerate(batches):
    # `encoded` rather than `input`, to avoid shadowing the builtin.
    encoded = tokenizer(lines, padding=True, truncation=True, return_tensors='pt')

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        outputs = model(**encoded)
    prediction = torch.nn.functional.softmax(outputs.logits, dim=-1)

    print(f"{n+1}/{total_batches}")

    # Assumes class order 0=positive, 1=negative, 2=neutral --
    # TODO confirm against model.config.id2label.
    for headline, stock, scores in zip(lines, stocks, prediction.tolist()):
        scored_rows.append({
            'Headline': headline,
            'Author': stock,
            'Pos': scores[0],
            'Neg': scores[1],
            'Neutr': scores[2],
        })

# Rebuilt from scratch on every run, so re-executing this cell does not
# accumulate duplicate rows in headlines_table.
headlines_table = pandas.DataFrame(
    scored_rows,
    columns=['Headline', 'Author', 'Pos', 'Neg', 'Neutr'],
)


1/68
2/68
3/68
4/68
5/68
6/68
7/68
8/68
9/68
10/68
11/68
12/68
13/68
14/68
15/68
16/68
17/68
18/68
19/68
20/68
21/68
22/68
23/68
24/68
25/68
26/68
27/68
28/68
29/68
30/68
31/68
32/68
33/68
34/68
35/68
36/68
37/68
38/68
39/68
40/68
41/68
42/68
43/68
44/68
45/68
46/68
47/68
48/68
49/68
50/68
51/68
52/68
53/68
54/68
55/68
56/68
57/68
58/68
59/68
60/68
61/68
62/68
63/68
64/68
65/68
66/68
67/68
68/68
69/68


In [53]:
# Keep exactly one score row per distinct headline text. The merge below joins
# on free text; if the same text occurs k times on both sides (for example one
# article listed under several stocks, or the scoring cell having been run
# twice), an unguarded merge would emit k*k rows for it.
classification = headlines_table[[
    'Headline',
    'Pos',
    'Neg',
    'Neutr'
]].drop_duplicates(subset='Headline')

# Re-read the raw file because 'Stock' is needed here and rebuild the same
# "Title + Subtitle" key that was used as model input.
snp_web_scapping = pandas.read_csv('snp_ft.csv')
snp_web_scapping['Title + Subtitle'] = snp_web_scapping['Title'] + ' ' + snp_web_scapping['Subtitle']
snp_web_scapping = snp_web_scapping[[
    'Date',
    'Link',
    'Title + Subtitle',
    'Title',
    'Subtitle',
    'Stock',
    'Authors'
]]

# validate='many_to_one' makes pandas raise if the score table ever stops
# being unique per headline, instead of silently multiplying rows.
snp_table = snp_web_scapping.merge(
    classification,
    left_on='Title + Subtitle',
    right_on='Headline',
    validate='many_to_one',
)
snp_table = snp_table.drop(columns=['Headline'])

# NOTE(review): the row index is written as an unnamed first column; this is
# left as-is in case readers of snp_nlp.csv rely on it.
snp_table.to_csv('snp_nlp.csv')