In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import requests
from bs4 import BeautifulSoup

from transformers import BertTokenizer, BertForMaskedLM, BertForSequenceClassification
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
from transformers import MarianMTModel, MarianTokenizer

import torch
import json
import textwrap

from fuzzywuzzy import fuzz




# Obtaining the correct dataframe from whole Sigwatch one (only banks' related scandals)

In [2]:
df = pd.read_csv("sigwatch_merged.csv")
del df['Unnamed: 0']
df.head()

  df = pd.read_csv("sigwatch_merged.csv")


Unnamed: 0,uid_archive,date,company_code,company,company_type,company_parent_code,company_parent,company_parent_country,sentiment,prominence,...,industry_sector_code3,industry_sector3,industry_sector_code4,industry_sector4,report,link1,link2,link3,link4,link5
0,40934,2011-12-21,18,AREVA,Parent,18,AREVA,France,-2,4,...,,,,,"In France, France Nature Environnement (FNE) a...",http://www.fne.asso.fr/fr/pollution-de-cours-...,,,,
1,40934,2011-12-21,3849,Comurhex,Subsidiary,18,AREVA,France,-2,4,...,,,,,"In France, France Nature Environnement (FNE) a...",http://www.fne.asso.fr/fr/pollution-de-cours-...,,,,
2,40931,2011-12-21,1,Monsanto,Parent,1,Monsanto,US,-1,2,...,1279.0,Agriculture,,,"In Bulgaria, Public Environmental Center for S...",http://novinite.com/view_news.php?id=134782,,,,
3,40927,2011-12-21,227,Nestle,Parent,227,Nestle,Switzerland,-2,4,...,,,,,of the baby food companies based on the scale...,http://info.babymilkaction.org/pressrelease/pr...,,,,
4,40924,2011-12-21,3847,Kazmunaigas Exploration and Production,Parent,3847,Kazmunaigas Exploration and Production,Kazakhstan,-2,4,...,,,,,"In the UK, Platform is mobilising supporters t...",http://blog.platformlondon.org/2011/12/20/demo...,,,,


In [3]:
#Filtering the data using sectors' fields at our disposal
df = df[df['industry_sector1'].str.contains("Finance") & df['corp_industry_sector1'].str.contains("Finance")]
df.reset_index(drop=True, inplace=True)

In [4]:
#Deleting rows having nan values in both company and company_parent field
df = df.dropna(subset=['company', 'company_parent'], how='all')
df.reset_index(drop=True, inplace=True)

In [5]:
#Deleting rows having nan values in link1 field
df = df.dropna(subset=['link1'], how='all')
df.reset_index(drop=True, inplace=True)

In [6]:
df.shape

(4344, 94)

In [7]:
def load_dict_from_file(file_path):
    with open(file_path, 'r') as f:
        dictionary = json.load(f)
    return dictionary

In [8]:
banks_events = load_dict_from_file('dictionary_to_event.json')
coutries = banks_events.keys()

In [9]:
#Filtering the dataframe to have only companies in the countries of which we have data
df = df[df['company_parent_country'].isin(coutries)]
df.reset_index(drop=True, inplace=True)

In [10]:
df.shape

(3177, 94)

# Scraping the text of the news from the link

Since we got many different websites from which we have to scrape the text, it was impossible to design a specific code to scrape the text.
However the text of an article found online is most of the times included in p paragrafs, which is the specific section of a webpage dedicated to the main text of the page.

We therefore use this aspect to extract all the p paragrafs in the webpages we load; assuming that the text of the article, if it's present on the page and the page is still online, will be in this HTML tag

In [11]:
def extract_text(url):
    # Add agent information in your header before retrieve the page
    user_agents_list = [
        'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.83 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
    ]
    # Retrieve the HTML page of the URL with a 30-second timeout
    try:
        page = requests.get(url, headers={"User-agent": random.choice(user_agents_list)}, timeout=15)
    except requests.Timeout:
        print(f"Timed out while accessing {url}")
        return None

    # page.content contains the HTML
    soup = BeautifulSoup(page.content, 'html.parser')
    paragraphs = soup.find_all('p')
    article_text = ' '.join(p.get_text() for p in paragraphs)
    return article_text


In [None]:
df['news_text'] = ''
for i in range(len(df)):
    if i%100 == 0: print(i)
    
    try:
        df.loc[i,'news_text'] = extract_text(df.loc[i,'link1'])
        if df.loc[i,'news_text'] is None:
            df.loc[i,'news_text'] = np.nan
    except Exception as e:
        print(f"Error for index {i}: {e}")
        df.loc[i,'news_text'] = np.nan


0
Error for index 4: HTTPConnectionPool(host='www.wwfrsapartners.com', port=80): Max retries exceeded with url: /static/uploads/page_files/Chinese_version.pdf%20 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f308b1334c0>: Failed to establish a new connection: [Errno -2] Name or service not known'))
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Error for index 63: HTTPConnectionPool(host='adicae.org', port=4040): Max retries exceeded with url: /nota-de-prensa/533.html (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f308acdb1c0>: Failed to establish a new connection: [Errno 111] Connection refused'))
Error for index 64: HTTPConnectionPool(host='adicae.org', port=4040): Max retries exceeded with url: /nota-de-prensa/533.html (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f308abdc910>: Failed to establish a new connection: [Errno 111] Connection refused'))
E

In [None]:
df.news_text.isna().sum()

745

Out of 3177 rows, we're able to extract some text from all but 745 websites.
Considering that the dataset presents also quite old events, this is quite a satisfactory result.


nb: a future possible direction to make this analysis even better is to use the information at our disposal in the columns 'link2', 'link3', 'link4', 'link5' where present.
However, this would require a modification also to the pipeline (anticipating the detection of an article phase, later this will be more clear) we use to obtain the final result, for this reason we consider out of the scope of this project this possible modification.

In [None]:
df.to_csv('news_text.csv')

# Detect language of the article/string

Since we are scraping from multiple websites, it is possible, and indeed occurs in this case, that the text scraped is not in uniform language.
With this model we're able to detect the language of the portion of the text extracted online, this informaion will be extremely useful further on this work.

In [None]:
df = pd.read_csv('news_text.csv')

In [None]:
model_name = "papluca/xlm-roberta-base-language-detection"
model = XLMRobertaForSequenceClassification.from_pretrained(model_name)
tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)

Downloading (…)lve/main/config.json: 100%|██████████| 1.42k/1.42k [00:00<00:00, 181kB/s]
Downloading pytorch_model.bin: 100%|██████████| 1.11G/1.11G [00:03<00:00, 343MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 502/502 [00:00<00:00, 297kB/s]
Downloading (…)tencepiece.bpe.model: 100%|██████████| 5.07M/5.07M [00:00<00:00, 134MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 239/239 [00:00<00:00, 140kB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 9.08M/9.08M [00:00<00:00, 31.7MB/s]


In [None]:
def detect_language(text):
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)

    # Predict
    with torch.no_grad():
        logits = model(**inputs).logits

    # Get predicted class (language)
    predicted_class_idx = torch.argmax(logits, dim=1).item()
    # Map the predicted class index to its respective language
    language = model.config.id2label[predicted_class_idx]

    return language

In [None]:
df['text_lang'] = ''

for i in range(len(df)):
    if i % 20 == 0:
        print(i)
    #print(i)
    try: df.loc[i,'text_lang'] = detect_language(df.loc[i,'news_text'])
    except: df.loc[i,'text_lang'] = np.nan

0
20
40
60
80
100
120
140
160
180
200
220
240
260
280
300
320
340
360
380
400
420
440
460
480
500
520
540
560
580
600
620
640
660
680
700
720
740
760
780
800
820
840
860
880
900
920
940
960
980
1000
1020
1040
1060
1080
1100
1120
1140
1160
1180
1200
1220
1240
1260
1280
1300
1320
1340
1360
1380
1400
1420
1440
1460
1480
1500
1520
1540
1560
1580
1600
1620
1640
1660
1680
1700
1720
1740
1760
1780
1800
1820
1840
1860
1880
1900
1920
1940
1960
1980
2000
2020
2040
2060
2080
2100
2120
2140
2160
2180
2200
2220
2240
2260
2280
2300
2320
2340
2360
2380
2400
2420
2440
2460
2480
2500
2520
2540
2560
2580
2600
2620
2640
2660
2680
2700
2720
2740
2760
2780
2800
2820
2840
2860
2880
2900
2920
2940
2960
2980
3000
3020
3040
3060
3080
3100
3120
3140
3160


In [None]:
df.text_lang.value_counts()

en    1277
nl     294
de     206
es     181
fr     153
bg     107
pt     107
ru      36
tr      20
it      19
pl      11
ur       8
th       6
el       3
ja       3
hi       1
Name: text_lang, dtype: int64

Remembering that we're able to extract text from 2,432 websites in total, we were quite lucky in the sense that more than half of them were written in english, semplifying the whole pipeline.


nb: another future improvement of this work may consist in using also the information scraped online in languages different than english by translating them (we consider this aspect out of the scope of this project once again; however, in the end of this notebook we suggest the code to perform this task).

In [None]:
df.to_csv('news_text_COMPLETE.csv')

# Determining whether the text is the news or an error message

When opening a specific link like the ones provided in the Sigwatch dataframe, it may happen that the page still loads up but with an error message in it (below it's possible to see an example).

In those cases, the code's able to collect the text but clearly this is not an article, for this reason running the sentiment analysis without adding a 'check' layer before would simply be misleading in term of results achieved.

With this layer, we're using a pre-trained model available on huggingface, to get a probability of how likely it is that the text scraped is an article.

We then use a different threshold to finally establish whether that string is an article or not, based on the whole lenght of the string.
This is quite intuitvely when looking at numerous examples like the one shown here, most of the times when the webpage returns an error message the lenght of the string does not exceed 400/500 characters, we therefore decided to select an higher threshold for string in this region (0.95) compared to the threshold used with "standard" strings (0.90).


nb: another possible problem is when we try to load a specific webpage and instead of returning an error message, we're immediately redirected to the homepage (see example 2 below).
This specific case was not treated differently from the others, so in those cases it's very likely that we'll classify those strings as articles. 

![Alt text](example.png)

This is an example of the second problematic found.

Searching for this page 'http://www.globallabourrights.org/reports?id=0642' will lead to this page instead 'https://www.globallabourrights.org/'

![Alt text](example2.png)

In [None]:
df = pd.read_csv('news_text_COMPLETE.csv')

In [None]:
# Load pre-trained model and tokenizer
model_name = "bert-base-uncased"
model = BertForMaskedLM.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

Downloading (…)lve/main/config.json: 100%|██████████| 570/570 [00:00<00:00, 188kB/s]
Downloading model.safetensors: 100%|██████████| 440M/440M [00:00<00:00, 523MB/s]
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'bert.pooler.dense.bias', 'cls.seq_relationship.weight', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Downloading (…)okenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 6.60kB/s]
Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/

In [None]:
def is_article(text):
    # Tokenize input text and get output from the model
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs).logits

    # Calculate the average probability of the predicted tokens
    probs = torch.nn.functional.softmax(outputs, dim=-1)
    predicted_indexes = torch.argmax(probs, dim=-1)
    predicted_probs = torch.gather(probs, -1, predicted_indexes.unsqueeze(-1)).mean().item()

    # If the average predicted probability is above a threshold, classify as an article
    return predicted_probs

In [None]:
df['is_article'] = ''

for i in range(len(df)):
    if i % 20 == 0: print(i)
    if df.loc[i, 'text_lang'] == 'en':
        try: df.loc[i,'is_article'] = is_article(df.loc[i,'news_text'])
        except: df.loc[i,'is_article'] = np.nan

0
20
40
60
80
100
120
140
160
180
200
220
240
260
280
300
320
340
360
380
400
420
440
460
480
500
520
540
560
580
600
620
640
660
680
700
720
740
760
780
800
820
840
860
880
900
920
940
960
980
1000
1020
1040
1060
1080
1100
1120
1140
1160
1180
1200
1220
1240
1260
1280
1300
1320
1340
1360
1380
1400
1420
1440
1460
1480
1500
1520
1540
1560
1580
1600
1620
1640
1660
1680
1700
1720
1740
1760
1780
1800
1820
1840
1860
1880
1900
1920
1940
1960
1980
2000
2020
2040
2060
2080
2100
2120
2140
2160
2180
2200
2220
2240
2260
2280
2300
2320
2340
2360
2380
2400
2420
2440
2460
2480
2500
2520
2540
2560
2580
2600
2620
2640
2660
2680
2700
2720
2740
2760
2780
2800
2820
2840
2860
2880
2900
2920
2940
2960
2980
3000
3020
3040
3060
3080
3100
3120
3140
3160


In [None]:
#Obtaining a dummy variable for artile or not based on the strategy defined above
df['is_article_dummy'] = 0

for i in range(len(df)):
    
    if len(str(df.loc[i,'is_article'])) > 400:
        if df.loc[i,'is_article'] > 0.95:
            df.loc[i,'is_article_dummy'] = 1
    else:
        if df.loc[i,'is_article'] != '' and df.loc[i,'is_article'] > 0.90:
            df.loc[i,'is_article_dummy'] = 1

In [None]:
df.is_article_dummy.value_counts()

0    2257
1     920
Name: is_article_dummy, dtype: int64

We're able, based on this method, to retrieve the articles' texts of aproximately 1k articles.

In [None]:
df.to_csv('news_text_COMPLETE.csv')

# Obtaining the sentiment based on the text of the campaign

Finally, we use again a pre-trained model found on huggingface, to get a sentiment value over the texts which we consider coming from an article online (922 rows).

Since the original range for the sentiment values was [-2,2], we modify the values coming from this model to obtain results in the same range, not to alter the analyses.

In [None]:
df = pd.read_csv('news_text_COMPLETE.csv')

In [None]:
# Create the pipeline outside the function
classifier = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')

def sentiment_analysis(text):
    max_len = 512  # Maximum sequence length for DistilBERT
    chunks = [text[i:i + max_len] for i in range(0, len(text), max_len)]  # Split text into chunks
    
    overall_sentiment_score = 0
    for chunk in chunks:
        result = classifier(chunk)[0]
        sentiment_score = result['score']
        label = result['label']
        
        if label == 'POSITIVE':
            overall_sentiment_score += sentiment_score
        else:
            overall_sentiment_score -= sentiment_score
            
    # Normalize the overall sentiment score
    overall_sentiment_score /= len(chunks)
    
    return overall_sentiment_score * 2


Downloading (…)lve/main/config.json: 100%|██████████| 629/629 [00:00<00:00, 82.8kB/s]
Downloading model.safetensors: 100%|██████████| 268M/268M [00:00<00:00, 304MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 48.0/48.0 [00:00<00:00, 8.20kB/s]
Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 43.9MB/s]


In [None]:
df['sentiment_computed'] = ''

for i in range(len(df)):
    if i % 20 == 0: print(i)
    
    if df.loc[i, 'is_article_dummy'] == 1 and df.loc[i,'news_text'] != '':
        df.loc[i, 'sentiment_computed'] = sentiment_analysis(df.loc[i,'news_text'])

0
20
40
60
80
100
120
140
160
180
200
220
240
260
280
300
320
340
360
380
400
420
440
460
480
500
520
540
560
580
600
620
640
660
680
700
720
740
760
780
800
820
840
860
880
900
920
940
960
980
1000
1020
1040
1060
1080
1100
1120
1140
1160
1180
1200
1220
1240
1260
1280
1300
1320
1340
1360
1380
1400
1420
1440
1460
1480
1500
1520
1540
1560
1580
1600
1620
1640
1660
1680
1700
1720
1740
1760
1780
1800
1820
1840
1860
1880
1900
1920
1940
1960
1980
2000
2020
2040
2060
2080
2100
2120
2140
2160
2180
2200
2220
2240
2260
2280
2300
2320
2340
2360
2380
2400
2420
2440
2460
2480
2500
2520
2540
2560
2580
2600
2620
2640
2660
2680
2700
2720
2740
2760
2780
2800
2820
2840
2860
2880
2900
2920
2940
2960
2980
3000
3020
3040
3060
3080
3100
3120
3140
3160


In [None]:
df.to_csv('news_text_COMPLETE.csv')

# Formatting the dataframe to match the desired output format

Simply as the title, out of the numerous columns created during this process, we want to have a resulting dataframe which has the exact same structure as the dataframe coming directly from Sigwatch.

In [None]:
df = pd.read_csv('news_text_COMPLETE.csv')
del df['Unnamed: 0']

In [None]:
df

Unnamed: 0,uid_archive,date,company_code,company,company_type,company_parent_code,company_parent,company_parent_country,sentiment,prominence,...,report,link1,link2,link3,link4,link5,news_text,text_lang,is_article,is_article_dummy
0,40883,2011-12-21,49,World Bank,Parent,49,World Bank,US,-1,2,...,environmental impact assessment of the contro...,http://www.noticiasser.pe/16/12/2011/cajamarca...,https://www.rainforest-rescue.org/mailalert/80...,,,,Presidente Asociación SER\nALEJANDRO LAOS FERN...,es,,0
1,40856,2011-12-16,568,Lloyds Banking Group plc,Parent,568,Lloyds Banking Group plc,UK,-1,3,...,would be there.</em><br />See,http://occupylsx.org/?p=2487,,,,,Forms of protests Forms of protests Large...,en,0.950527,1
2,40833,2011-12-16,568,Lloyds Banking Group plc,Parent,568,Lloyds Banking Group plc,UK,-1,4,...,"In the UK, Which? welcomed the announcement by...",http://www.which.co.uk/news/2011/12/co-operati...,,,,,\n We couldn’t find the page you’re l...,en,0.884532,0
3,40833,2011-12-16,2509,Co-operative Bank plc,Subsidiary,604,Co-operative Group (UK),UK,2,4,...,"In the UK, Which? welcomed the announcement by...",http://www.which.co.uk/news/2011/12/co-operati...,,,,,\n We couldn’t find the page you’re l...,en,0.884532,0
4,40829,2011-12-16,2067,RSA Insurance,Parent,2067,RSA Insurance,UK,0,4,...,published with the insurer RSA (Royal Sun All...,http://www.wwfrsapartners.com/static/uploads/p...,http://www.wwfrsapartners.com/static/uploads/p...,http://wwf.panda.org/wwf_news/?202839/Insuranc...,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3172,80635,2018-01-15,90,RBS Group,Parent,90,RBS Group,UK,1,2,...,from all sides should encourage government to...,http://www.globaljustice.org.uk/news/2018/jan/...,,,,,,,,0
3173,80560,2018-01-04,170,BNP Paribas,Parent,170,BNP Paribas,France,-2,4,...,.<br />UPDATED (Feb 23): Attac claimed victory...,https://france.attac.org/se-mobiliser/iphonere...,https://eelv.fr/apple-contre-attac-une-poursui...,,,,Voir la liste de tous les évènements L’année d...,fr,,0
3174,80511,2018-01-08,4981,Caixa Geral de Depositos (CGD),Parent,4981,Caixa Geral de Depositos (CGD),Portugal,-2,2,...,"amounts for transfers, with Caixa Geral de De...",https://www.deco.proteste.pt/dinheiro/poupanca...,,,,,DICAS Como limpar um forno DICAS Que materiais...,pt,,0
3175,80470,2018-01-05,1364,DNB Norway,Parent,1364,DNB Norway,Norway,-1,4,...,as activist-led protests had allegedly persua...,https://www.framtiden.no/aktuelt/etiske-penger...,,,,,,,,0


In [None]:
sigwatch = pd.read_csv('sigwatch_merged.csv')
del sigwatch['Unnamed: 0']

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


## V1 -> substituting values computed in the original 'sentiment' column, using sigwatch's values for the ones we were not able to compute ours

In [None]:
df_1 = df.copy()

In [None]:
for i in range(len(df_1)):
    if not np.isnan(df_1.loc[i,'sentiment_computed']):
        df_1.loc[i, 'sentiment'] = df_1.loc[i,'sentiment_computed']

#storing only the columns found also in sigwatch dataset
common_columns = sigwatch.columns.intersection(df_1.columns)
df_1 = df_1[common_columns]

In [None]:
df_1

Unnamed: 0,uid_archive,date,company_code,company,company_type,company_parent_code,company_parent,company_parent_country,sentiment,prominence,...,industry_sector_code3,industry_sector3,industry_sector_code4,industry_sector4,report,link1,link2,link3,link4,link5
0,40883,2011-12-21,49,World Bank,Parent,49,World Bank,US,-1.000000,2,...,,,,,environmental impact assessment of the contro...,http://www.noticiasser.pe/16/12/2011/cajamarca...,https://www.rainforest-rescue.org/mailalert/80...,,,
1,40856,2011-12-16,568,Lloyds Banking Group plc,Parent,568,Lloyds Banking Group plc,UK,0.229507,3,...,,,,,would be there.</em><br />See,http://occupylsx.org/?p=2487,,,,
2,40833,2011-12-16,568,Lloyds Banking Group plc,Parent,568,Lloyds Banking Group plc,UK,-1.000000,4,...,,,,,"In the UK, Which? welcomed the announcement by...",http://www.which.co.uk/news/2011/12/co-operati...,,,,
3,40833,2011-12-16,2509,Co-operative Bank plc,Subsidiary,604,Co-operative Group (UK),UK,2.000000,4,...,,,,,"In the UK, Which? welcomed the announcement by...",http://www.which.co.uk/news/2011/12/co-operati...,,,,
4,40829,2011-12-16,2067,RSA Insurance,Parent,2067,RSA Insurance,UK,0.000000,4,...,,,,,published with the insurer RSA (Royal Sun All...,http://www.wwfrsapartners.com/static/uploads/p...,http://www.wwfrsapartners.com/static/uploads/p...,http://wwf.panda.org/wwf_news/?202839/Insuranc...,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3172,80635,2018-01-15,90,RBS Group,Parent,90,RBS Group,UK,1.000000,2,...,,,,,from all sides should encourage government to...,http://www.globaljustice.org.uk/news/2018/jan/...,,,,
3173,80560,2018-01-04,170,BNP Paribas,Parent,170,BNP Paribas,France,-2.000000,4,...,,,,,.<br />UPDATED (Feb 23): Attac claimed victory...,https://france.attac.org/se-mobiliser/iphonere...,https://eelv.fr/apple-contre-attac-une-poursui...,,,
3174,80511,2018-01-08,4981,Caixa Geral de Depositos (CGD),Parent,4981,Caixa Geral de Depositos (CGD),Portugal,-2.000000,2,...,,,,,"amounts for transfers, with Caixa Geral de De...",https://www.deco.proteste.pt/dinheiro/poupanca...,,,,
3175,80470,2018-01-05,1364,DNB Norway,Parent,1364,DNB Norway,Norway,-1.000000,4,...,,,,,as activist-led protests had allegedly persua...,https://www.framtiden.no/aktuelt/etiske-penger...,,,,


In [None]:
df_1.to_csv('news_text_COMPLETE_option1.csv')

## V2 -> 'sentiment' column is renamed in 'sentiment_old', while 'sentiment_computed' becomes the new 'sentiment' column

In [None]:
df_2 = df.copy()

In [None]:
df_2['sentiment_old'] = df_2['sentiment']
del df_2['sentiment']
df_2.rename(columns={'sentiment_computed': 'sentiment'}, inplace=True)

#storing only the columns found also in sigwatch dataset, plus the old sentiment column named 'sentiment_old'
common_columns = sigwatch.columns.intersection(df_2.columns).to_list()
if 'sentiment_old' in df_2.columns:
    common_columns.append('sentiment_old')
df_2 = df_2[common_columns]

In [None]:
df_2

Unnamed: 0,uid_archive,date,company_code,company,company_type,company_parent_code,company_parent,company_parent_country,sentiment,prominence,...,industry_sector3,industry_sector_code4,industry_sector4,report,link1,link2,link3,link4,link5,sentiment_old
0,40883,2011-12-21,49,World Bank,Parent,49,World Bank,US,,2,...,,,,environmental impact assessment of the contro...,http://www.noticiasser.pe/16/12/2011/cajamarca...,https://www.rainforest-rescue.org/mailalert/80...,,,,-1
1,40856,2011-12-16,568,Lloyds Banking Group plc,Parent,568,Lloyds Banking Group plc,UK,0.229507,3,...,,,,would be there.</em><br />See,http://occupylsx.org/?p=2487,,,,,-1
2,40833,2011-12-16,568,Lloyds Banking Group plc,Parent,568,Lloyds Banking Group plc,UK,,4,...,,,,"In the UK, Which? welcomed the announcement by...",http://www.which.co.uk/news/2011/12/co-operati...,,,,,-1
3,40833,2011-12-16,2509,Co-operative Bank plc,Subsidiary,604,Co-operative Group (UK),UK,,4,...,,,,"In the UK, Which? welcomed the announcement by...",http://www.which.co.uk/news/2011/12/co-operati...,,,,,2
4,40829,2011-12-16,2067,RSA Insurance,Parent,2067,RSA Insurance,UK,,4,...,,,,published with the insurer RSA (Royal Sun All...,http://www.wwfrsapartners.com/static/uploads/p...,http://www.wwfrsapartners.com/static/uploads/p...,http://wwf.panda.org/wwf_news/?202839/Insuranc...,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3172,80635,2018-01-15,90,RBS Group,Parent,90,RBS Group,UK,,2,...,,,,from all sides should encourage government to...,http://www.globaljustice.org.uk/news/2018/jan/...,,,,,1
3173,80560,2018-01-04,170,BNP Paribas,Parent,170,BNP Paribas,France,,4,...,,,,.<br />UPDATED (Feb 23): Attac claimed victory...,https://france.attac.org/se-mobiliser/iphonere...,https://eelv.fr/apple-contre-attac-une-poursui...,,,,-2
3174,80511,2018-01-08,4981,Caixa Geral de Depositos (CGD),Parent,4981,Caixa Geral de Depositos (CGD),Portugal,,2,...,,,,"amounts for transfers, with Caixa Geral de De...",https://www.deco.proteste.pt/dinheiro/poupanca...,,,,,-2
3175,80470,2018-01-05,1364,DNB Norway,Parent,1364,DNB Norway,Norway,,4,...,,,,as activist-led protests had allegedly persua...,https://www.framtiden.no/aktuelt/etiske-penger...,,,,,-1


In [None]:
df_2.to_csv('news_text_COMPLETE_option2.csv')

# Bonus section

## Translating articles' text not in English to English

In [None]:
def chunk_text(text, max_length=400):
    """Split the text into chunks of max_length."""
    return [text[i:i+max_length] for i in range(0, len(text), max_length)]

In [None]:
def translate_to_english(text, source_lang):
    model_name = f"Helsinki-NLP/opus-mt-{source_lang}-en"
    model = MarianMTModel.from_pretrained(model_name)
    tokenizer = MarianTokenizer.from_pretrained(model_name)

    # Split the text into chunks
    text_chunks = chunk_text(text)
    translations = []

    for chunk in text_chunks:
        batch = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512)
        translated_tokens = model.generate(**batch)
        translation = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
        translations.append(translation)

    return " ".join(translations)

In [None]:
df['news_text_en'] = ''
df['translated'] = 0
df.loc[df.text_lang == 'en', 'news_text_en'] = df.loc[df.text_lang == 'en', 'news_text']

for language in df.text_lang.unique():
    df_filter = df[df.text_lang == language]
    
    for i in df_filter.index:
        df.loc[i,'news_text_en'] = translate_to_english(df.loc[i,'news_text'], language)
        df.loc[i, 'translated'] = 1

Downloading (…)lve/main/config.json: 100%|██████████| 1.44k/1.44k [00:00<00:00, 187kB/s]
Downloading pytorch_model.bin: 100%|██████████| 312M/312M [00:01<00:00, 302MB/s]
Downloading (…)neration_config.json: 100%|██████████| 293/293 [00:00<00:00, 35.1kB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 44.0/44.0 [00:00<00:00, 23.5kB/s]
Downloading (…)olve/main/source.spm: 100%|██████████| 826k/826k [00:00<00:00, 41.1MB/s]
Downloading (…)olve/main/target.spm: 100%|██████████| 802k/802k [00:00<00:00, 40.8MB/s]
Downloading (…)olve/main/vocab.json: 100%|██████████| 1.59M/1.59M [00:00<00:00, 57.2MB/s]


KernelInterrupted: Execution interrupted by the Jupyter kernel.

In [None]:
df.to_csv('/datasets/ludovicos-drive/news_text_COMPLETE.csv')

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=80f2b4d4-87b5-4cce-b2ad-28e5548d48b3' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>