In [1]:
import pandas as pd 
import string
from nltk.tokenize import word_tokenize 
import re
from nltk.corpus import stopwords
import nltk
from nltk.stem import WordNetLemmatizer

In [2]:
# Read the processed train and test splits from their CSV files.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../../data/processed/train.csv', '../../data/processed/test.csv')
)

# Show the training frame as this cell's output.
train

Unnamed: 0,symbol,year,quarter,transcript_esg,esg_score,esg_risk_level
0,A,2022,1,"Thank you Emily, and welcome everyone to Agile...",15.0,Low
1,A,2022,3,"Thank you, Hannah, and welcome, everyone, to A...",15.0,Low
2,AAPL,2022,1,"Good day, and welcome to the Apple Q1 FY 2022 ...",17.0,Low
3,AAPL,2022,2,"Good day, and welcome to the Apple Q2 FY 2022 ...",17.0,Low
4,AAPL,2022,3,"Good day, and welcome to the Apple Q3 FY 2022 ...",17.0,Low
...,...,...,...,...,...,...
603,WYNN,2022,4,Here we are three years in the global pandemic...,26.0,
604,YUM,2022,1,Welcome to the Q1 2022 Yum! Brands Earnings co...,21.0,Medium
605,YUM,2022,2,"Before we get started, I would like to remind ...",21.0,Medium
606,ZTS,2022,1,"Thank you, operator. Good morning, everyone, a...",18.0,Low


In [3]:
# Token count of the transcript at index label 1 before cleaning.
# split(" ") splits on single spaces only, so runs of spaces produce empty tokens that are counted.
len(train.loc[1, 'transcript_esg'].split(" "))

962

In [4]:
# English stopword set used by remove_stopwords().
# On a machine where the NLTK "stopwords" corpus has not been installed yet,
# stopwords.words() raises LookupError; fetch the corpus once and retry so the
# notebook survives Restart & Run All in a fresh environment. If the download
# fails (e.g. offline), the retry raises the same LookupError as before.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))

In [5]:
def convert_to_lowercase(msg):
    '''
    aim: return a lower-cased copy of the given text
    '''
    lowered = msg.lower()
    return lowered


def remove_punctuation(msg):
    '''
    aim: delete every character listed in string.punctuation from the text
    Characters are removed outright (not replaced by a space), so the pieces
    on either side of a punctuation mark are joined together.
    '''
    # Translation table mapping each punctuation code point to None (= delete).
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return msg.translate(deletions)

def remove_stopwords(msg):
    '''
    aim: drop every token that appears in the module-level stop_words set
    The text is tokenized with word_tokenize and the surviving tokens are
    joined back together with single spaces.
    '''
    kept_tokens = []
    for token in word_tokenize(msg):
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

def remove_urls(msg):
    '''
    aim: remove all the urls contained inside the text

    A URL is taken to be a run of non-whitespace characters that begins, at a
    word boundary, with "http" or "www". The match is case-sensitive.

    msg: text to clean
    returns: msg with every such run deleted (surrounding whitespace is kept)
    '''
    # \b stops the pattern from firing in the middle of an ordinary token
    # (previously "awwww" was cut down to "a"). "http" already covers "https",
    # so no separate alternative is needed for it.
    return re.sub(r'\b(?:http|www)\S+', '', msg)

def remove_numbers(msg):
    '''
    aim: delete every run of digits from the text, leaving all other characters in place
    '''
    digit_runs = re.compile(r'\d+')
    return digit_runs.sub('', msg)

def clean_transcript(msg):
    '''
    aim: run the full cleaning pipeline on one transcript
    Anything that is not a str (for example a missing value) yields "".
    Otherwise the steps run in this order: remove numbers, lower-case,
    remove urls, remove punctuation, remove stopwords.
    '''
    if not isinstance(msg, str):
        return ""
    pipeline = (
        remove_numbers,
        convert_to_lowercase,
        remove_urls,
        remove_punctuation,
        remove_stopwords,
    )
    for step in pipeline:
        msg = step(msg)
    return msg

In [6]:
# Clean the transcript column of both splits with the same pipeline.
for frame in (train, test):
    frame['transcript_esg'] = frame['transcript_esg'].apply(clean_transcript)

# Preview the first five cleaned training rows.
train.head()

Unnamed: 0,symbol,year,quarter,transcript_esg,esg_score,esg_risk_level
0,A,2022,1,thank emily welcome everyone agilents conferen...,15.0,Low
1,A,2022,3,thank hannah welcome everyone agilents confere...,15.0,Low
2,AAPL,2022,1,good day welcome apple q fy earnings conferenc...,17.0,Low
3,AAPL,2022,2,good day welcome apple q fy earnings conferenc...,17.0,Low
4,AAPL,2022,3,good day welcome apple q fy earnings conferenc...,17.0,Low


In [7]:
# Token count of the same transcript (index label 1) after cleaning, split on single spaces.
len(train.loc[1, 'transcript_esg'].split(" "))

538

In [8]:
def lemmatization(tweet):
    '''
    aim: lemmatize every token of the text
    Tokens come from nltk.word_tokenize; each one is passed to the
    module-level lemmatizer and the results are joined with single spaces.
    '''
    tokens = nltk.word_tokenize(tweet)
    return ' '.join(map(lemmatizer.lemmatize, tokens))

lemmatizer = WordNetLemmatizer()

# Lemmatize the cleaned transcripts of BOTH splits. Previously only `train`
# was lemmatized, so the train and test text went through different
# preprocessing even though both frames are written out together.
train['transcript_esg'] = train['transcript_esg'].apply(lemmatization)
test['transcript_esg'] = test['transcript_esg'].apply(lemmatization)
train

Unnamed: 0,symbol,year,quarter,transcript_esg,esg_score,esg_risk_level
0,A,2022,1,thank emily welcome everyone agilents conferen...,15.0,Low
1,A,2022,3,thank hannah welcome everyone agilents confere...,15.0,Low
2,AAPL,2022,1,good day welcome apple q fy earnings conferenc...,17.0,Low
3,AAPL,2022,2,good day welcome apple q fy earnings conferenc...,17.0,Low
4,AAPL,2022,3,good day welcome apple q fy earnings conferenc...,17.0,Low
...,...,...,...,...,...,...
603,WYNN,2022,4,three year global pandemic later wynn la vega ...,26.0,
604,YUM,2022,1,welcome q yum brand earnings conference call n...,21.0,Medium
605,YUM,2022,2,get started would like remind conference call ...,21.0,Medium
606,ZTS,2022,1,thank operator good morning everyone welcome z...,18.0,Low


In [9]:
required_columns = ['transcript_esg', 'esg_score', 'esg_risk_level']


def _drop_incomplete_rows(frame):
    '''
    aim: keep only rows that have a non-blank transcript and non-missing labels

    clean_transcript() turns a missing transcript into "" and a transcript can
    also end up empty after cleaning. dropna() does not treat "" as missing,
    so blank transcripts are filtered out explicitly before dropna() handles
    the remaining NaN values.

    frame: DataFrame containing the columns listed in required_columns
    returns: a filtered DataFrame (the input is not modified)
    '''
    has_text = frame['transcript_esg'].fillna('').str.strip().ne('')
    return frame[has_text].dropna(subset=required_columns)


train = _drop_incomplete_rows(train)
test = _drop_incomplete_rows(test)

In [10]:
# Write both processed splits to CSV without the index column.
outputs = (
    (train, '../../data/processed/train_esg_shortened.csv'),
    (test, '../../data/processed/test_esg_shortened.csv'),
)
for frame, destination in outputs:
    frame.to_csv(destination, index=False)

In [5]:
# Reload the unprocessed training split (this rebinds `train`).
train = pd.read_csv('../../data/processed/train.csv')

# Missing transcripts become empty text so they count as zero words.
train['transcript_esg'] = train['transcript_esg'].fillna('')

# Length, in whitespace-separated words, of the longest 'transcript_esg'.
max_words = train['transcript_esg'].str.split().str.len().max()

print(f"The longest 'transcript_esg' contains {max_words} words.")

The longest 'transcript_esg' contains 4502 words.


In [4]:
import pandas as pd 
# Load the cleaned training split that was written to train_esg_shortened.csv (this rebinds `train`).
train = pd.read_csv('../../data/processed/train_esg_shortened.csv')

# Missing transcripts become empty text so they count as zero words.
train['transcript_esg'] = train['transcript_esg'].fillna('')

# Length, in whitespace-separated words, of the longest 'transcript_esg'.
max_words = train['transcript_esg'].str.split().str.len().max()

print(f"The longest 'transcript_esg' contains {max_words} words.")

The longest 'transcript_esg' contains 2429 words.


In [8]:
import pandas as pd 
# Load transcripts.csv; note that the frame is still bound to the name `train`.
train = pd.read_csv('../../data/processed/transcripts.csv')

# Missing transcripts become empty text so they count as zero words.
train['transcript'] = train['transcript'].fillna('')

# Length, in whitespace-separated words, of the longest 'transcript'.
max_words = train['transcript'].str.split().str.len().max()

print(f"The longest 'transcript' contains {max_words} words.")

The longest 'transcript' contains 9929 words.
