In [1]:
import pandas as pd 
import PyPDF2
import os
import spacy
import re
import string
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

# own module
import helper_functions.models as models
import helper_functions.preprocessing as preprocess

In [2]:
# Load the sustainability report and split its text into statements.
# The path is bound to `report_path` rather than `dir` so the builtin dir()
# stays usable in later cells; `dir=` below is only the helper's keyword name.
report_path = 'data/Morgan-Stanley_2019-Sustainability-Report_Final.pdf'

# One-row frame: a label for the report's issuer plus the extracted content.
# The label matches the file being loaded (a Morgan Stanley report).
df = pd.DataFrame(
    {'Ticker': 'Morgan Stanley', 'content': preprocess.extract_content(dir=report_path)},
    index=[0],
)

# The frame has exactly one row (index=[0]), so take its content positionally.
# NOTE(review): assumes extract_content returns a single text value - confirm.
articles = preprocess.extract_statements(text=df['content'].iloc[0])

In [3]:
# Instantiate the topic and sentiment models from their checkpoint identifiers.
TOPIC_CHECKPOINT = "joeddav/xlm-roberta-large-xnli"
SENTIMENT_CHECKPOINT = "distilbert-base-uncased-finetuned-sst-2-english"

topic_model = models.topic_load_model(model_dir=TOPIC_CHECKPOINT)
sentiment_model = models.sentiment_load_model(model_dir=SENTIMENT_CHECKPOINT)

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
# Candidate topic labels and the number of leading statements to score.
topics = ["ESG", "Food", "Finance", "Social"]
nr_paragraphs = 3

# The same leading window of `articles` is used for prediction and for display.
leading = slice(0, nr_paragraphs)

predictions_all = models.predict(
    topic_model=topic_model,
    sentiment_model=sentiment_model,
    doc=articles[leading],
    topics=topics,
)

# Last expression of the cell: the per-statement topic/sentiment table.
models.format_topic_sent(dic=predictions_all, doc=articles[leading])

Unnamed: 0,Original text,Social,ESG,Finance,Food,NEGATIVE,POSITIVE
0,Sustainability Report TABLE OF CONTENTS 3Our S...,92.943,86.936,39.765,0.18,0.323,99.677
1,"Committed to facilitate the prevention, reduct...",55.083,44.916,19.968,0.11,2.189,97.811
2,Introduced a new Sustainability at Work progra...,48.869,93.394,0.131,0.027,0.179,99.821


In [6]:
# Mean score of each topic over the scored statements, expressed as a
# percentage rounded to three decimals and listed from highest to lowest.
topic_confidence = predictions_all["Topic"].mean().mul(100).round(3)
topic_confidence.sort_values(ascending=False).to_frame('Confidence (%)')

Unnamed: 0,Confidence (%)
ESG,75.082
Social,65.632
Finance,19.954
Food,0.106


In [7]:
# Weighted, normalized sentiment confidence per topic, as computed by the
# project helper from the predictions above; displayed as the cell output.
weighted_sentiment = models.compute_weighted_sentiment(dic=predictions_all)
weighted_sentiment

Unnamed: 0,Unnamed: 1,Confidence (%)
NEGATIVE,Social,0.809
NEGATIVE,ESG,0.635
NEGATIVE,Finance,0.945
NEGATIVE,Food,0.958
POSITIVE,Social,99.191
POSITIVE,ESG,99.365
POSITIVE,Finance,99.055
POSITIVE,Food,99.042
