# Text Classification with ESG-BERT

In [1]:
import os
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

os.chdir("../..")
tqdm.pandas()

### Load transcripts

In [2]:
transcripts = pd.read_csv("data/processed/transcripts.csv")
print(transcripts.shape)

(3762, 4)


In [3]:
transcripts.head()

Unnamed: 0,ticker,year,quarter,transcript
0,A,2022,1,"Hello, and welcome to the Q1 2022 Agilent Tech..."
1,A,2022,3,Good afternoon. Thank you for attending today'...
2,AAL,2022,1,"Good morning, and welcome to the American Airl..."
3,AAL,2022,2,"Good morning, and welcome to the American Airl..."
4,AAL,2022,2,"Good morning, and welcome to the American Airl..."


### Load text classifier

In [4]:
# load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("nbroad/ESG-BERT")
model = AutoModelForSequenceClassification.from_pretrained("nbroad/ESG-BERT")

# pipeline for text classification
text_classifier = pipeline('text-classification', model=model, tokenizer=tokenizer)

In [5]:
# demo
demo_scores = text_classifier([
    "Our production line releases a lot of carbon dioxide.",
    "Men are paid 3 times more than women.",
    "Last year, the revenue has increased by 75%."
])
demo_scores

[{'label': 'GHG_Emissions', 'score': 0.7811035513877869},
 {'label': 'Labor_Practices', 'score': 0.9579222798347473},
 {'label': 'Management_Of_Legal_And_Regulatory_Framework',
  'score': 0.23833788931369781}]

### Perform text classification on training dataset

In [6]:
demo = transcripts.transcript[0]
print(demo)

Hello, and welcome to the Q1 2022 Agilent Technologies earnings conference call. My name is Emily, and I will be coordinating the call today. [Operator instructions] I now have the pleasure of handing the call over to our host, Parmeet Ahuja, vice president, investor relations at Agilent Technologies. Please go ahead.
Thank you Emily, and welcome everyone to Agilent's conference call for the first quarter of fiscal year 2022. With me are Mike McMullen, Agilent president and CEO, and Bob McMahon, Agilent senior vice president and CFO. Joining in the Q&A after Mike and Bob's comments will be Jacob Thaysen, president of the Agilent life science and applied markets group; Sam Raha, president of the Agilent diagnostics and genomics group; and Padraig McDonnell, president of the Agilent CrossLab group. This presentation is being webcast live.
The news release for our Q1 financial results, investor presentation, and information to supplement today's discussion, along with a recording of this 

In [7]:
demo_scores = text_classifier(demo.split("\n"))
demo = pd.DataFrame({
    "transcript": demo.split("\n"),
    "label" : [x["label"] for x in demo_scores],
    "score": [x["score"] for x in demo_scores],
})
demo

Unnamed: 0,transcript,label,score
0,"Hello, and welcome to the Q1 2022 Agilent Tech...",Business_Model_Resilience,0.577523
1,"Thank you Emily, and welcome everyone to Agile...",Business_Model_Resilience,0.669292
2,"The news release for our Q1 financial results,...",Systemic_Risk_Management,0.301881
3,Core revenue growth excludes the impact of cur...,Systemic_Risk_Management,0.496821
4,These changes have no impact on our company's ...,Systemic_Risk_Management,0.773721
5,"Thanks, Parmeet, and thanks to everyone for jo...",Business_Model_Resilience,0.743204
6,This is up 9% core and up 8% reported. This is...,Systemic_Risk_Management,0.656891
7,"In fact, Q1 orders grew roughly twice as fast ...",Systemic_Risk_Management,0.560982
8,These strong results have been achieved in a v...,Business_Model_Resilience,0.548486
9,"Our pharma business, Agilent's largest market,...",Product_Design_And_Lifecycle_Management,0.813935


In [128]:
# split long paragraphs
def split_paragraphs(paragraphs, max_char=2000):
    i = 0
    i_max = len(paragraphs)
    while i < len(paragraphs):
        if len(paragraphs[i]) < max_char:
            i += 1
            continue
        para = paragraphs[i]
        sep = ". " if ". " in para else ", "
        m = len(para) // 2
        j = 0
        while j < m:
            if para[m-j-1:m-j+1] == sep:
                m = m-j
                break
            if para[m+j-1:m+j+1] == sep:
                m = m+j
                break
            m += 1
        paragraphs = paragraphs[:i] + [para[:m], para[m+1:]] + paragraphs[i+1:]
    return paragraphs

# text classification
def text_classification(transcript):
    transcript = split_paragraphs(transcript.split("\n"))
    scores = text_classifier(transcript)
    return [{
        "transcript": transcript[i],
        "label": scores[i]["label"],
        "score": scores[i]["score"],
    } for i in range(len(transcript))]

classification_results = []
for i, row in tqdm(transcripts.iterrows(), total=transcripts.shape[0]):
    if i < len(classification_results):
        continue
    classification_results.append(text_classification(row.transcript))

transcripts["transcript_esg"] = pd.Series(classification_results).apply(
    lambda x: "\n".join([c["transcript"] for c in x if c["score"] > 0.6])
)
transcripts.head()

Unnamed: 0,ticker,year,quarter,transcript,transcript_short,transcript_esg
0,A,2022,1,"Hello, and welcome to the Q1 2022 Agilent Tech...","Thank you Emily, and welcome everyone to Agile...","Thank you Emily, and welcome everyone to Agile..."
1,A,2022,3,Good afternoon. Thank you for attending today'...,"Thank you, Hannah, and welcome, everyone, to A...","Thank you, Hannah, and welcome, everyone, to A..."
2,AAL,2022,1,"Good morning, and welcome to the American Airl...","Good morning, and welcome to the American Airl...","Good morning, and welcome to the American Airl..."
3,AAL,2022,2,"Good morning, and welcome to the American Airl...","Good morning, and welcome to the American Airl...","Good morning, and welcome to the American Airl..."
4,AAL,2022,2,"Good morning, and welcome to the American Airl...","Good morning, and welcome to the American Airl...","Good morning, and welcome to the American Airl..."


In [126]:
print("Initial avg. no. of words:", transcripts.transcript.apply(lambda t: len(t.split())).mean())
print("Final avg. no. of words:", transcripts.transcript_short.apply(lambda t: len(t.split())).mean())

Initial avg. no. of words: 3012.4683678894207
Final avg. no. of words: 1298.3466241360977


### Save shortened transcripts

In [131]:
transcripts = transcripts.drop(columns=["transcript"])
# transcripts.to_csv("data/processed/transcripts_esg.csv", index=False)