In [None]:
import re
import requests
from bs4 import BeautifulSoup
import textstat
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import MultipleLocator

#### Main function

In [None]:
# Driver cell: compute one row of complexity scores per transcript listed in locations.csv
# and save the rows to a CSV file.
# Requires parse_html, get_mgmt_names, get_mgmt_discussion and complexity_score to be defined
# in the kernel before this cell runs.

# Root folder of the course material; parse_html also reads this module-level name.
root_path = "C:/Users/34433/Desktop/MFFT/Courses/MFIN7036 NLP"
project_dir = root_path + "/Group project"

# NOTE(review): sort_values keeps the original index labels and the loop below looks rows up by
# label 0..n-1, so rows appear to be processed in file order, not in sorted order - confirm intent.
locations = pd.read_csv(project_dir + "/locations.csv").sort_values(by=['Company'], ascending=True)
locations['Path'] = locations['Path'].str.replace(':', '_')  # replace ':' by '_' to adapt to Windows system
scores_summary = []  # create a blank list to collect the complexity scores of each transcript

for i in range(len(locations)):
    path = locations['Path'][i]
    company = locations['Company'][i]
    date = locations['Date'][i]
    # children will take "True" or "False". We will only preserve parent companies' transcripts
    # and drop children companies.
    children = locations['Children'][i]

    if pd.isna(path):
        # No transcript for this company/date: add a placeholder frame. Its metric columns are
        # empty lists, so it appears to contribute column names only and no rows.
        q_score = {'Date': date,
                   'Company': company,
                   'Flesch score': [], 'Reading time': [], 'Sentence number': [],
                   'Polysyllable number': [], 'Polysyllable number per sentence': [],
                   'Children': children}
        scores_summary.append(pd.DataFrame(q_score))
        continue

    try:
        text_list = parse_html(path)  # parse the html and get its texts in a list
    except Exception as err:
        # Best effort: a transcript that cannot be read or parsed is skipped, but the reason is
        # reported instead of being swallowed. Unlike a bare `except:`, this does not trap
        # KeyboardInterrupt, so the loop can still be interrupted.
        print("Skipping", path, "-", repr(err))
        continue

    mgmt_names, analyst_names = get_mgmt_names(text_list)  # speaker names used as hint words
    combined_strs = get_mgmt_discussion(text_list, mgmt_names, analyst_names)
    scores_summary.append(complexity_score(combined_strs, date, company, children))

summary_frame = pd.concat(scores_summary)
# NOTE(review): this keeps the rows whose 'Children' value equals that of the FIRST row, which
# only selects parent companies if the first row belongs to a parent - confirm.
first_children_value = list(summary_frame['Children'])[0]
summary_frame_parent = summary_frame[summary_frame['Children'] == first_children_value]
summary_frame_parent.to_csv(project_dir + "/complexity_220318.csv", index=False)

In [None]:
#f = open("C:/Users/34433/Desktop/MFFT/Courses/MFIN7036 NLP/pure_articles/AAPL/2011-01-18T23_05_21-05_00_Apple Management Discusses Q1 2011 Results - Earnings Call Transcript.html",'r',encoding='utf-8')
def parse_html(file_path):
    """Read a transcript HTML file and return its text fragments.

    Parameters
    ----------
    file_path : str
        Location of the HTML file; it is appended as-is to the module-level ``root_path``.

    Returns
    -------
    list of str
        Every string yielded by BeautifulSoup's ``stripped_strings`` for the parsed document.

    Raises
    ------
    OSError
        If the file cannot be opened.
    UnicodeDecodeError
        If the file content is not valid UTF-8.
    """
    # The context manager closes the file handle even when reading or decoding fails.
    with open(root_path + file_path, 'r', encoding='utf-8') as f:
        htmlhandle = f.read()
    s = BeautifulSoup(htmlhandle, 'lxml')  # use Beautiful Soup to parse the html
    return list(s.stripped_strings)

In [None]:
def _first_hint_index(raw, hints):
    """Return the position in ``raw`` of the first entry of ``hints`` (in hint order) that occurs, else None."""
    for hint in hints:
        if hint in raw:
            return raw.index(hint)
    return None


def _first_long_text_index(raw, min_words=35):
    """Return the position of the first entry with more than ``min_words`` space-separated tokens, else None."""
    for position, text in enumerate(raw):
        if len(text.split(" ")) > min_words:
            return position
    return None


def get_mgmt_names(raw):
    """Extract the management and analyst names listed at the top of a transcript.

    The function locates three positions in ``raw``:

    * the management header (first of ``hint_mgmt`` that occurs),
    * the analyst header (first of ``hint_analyst``; otherwise the first "Operator" entry;
      otherwise one position before the first long paragraph),
    * the start of the discussion (first of ``hint_operator``; otherwise the first long
      paragraph).

    Names are the entries between those positions.

    Parameters
    ----------
    raw : list of str
        Text fragments of one transcript, in document order.

    Returns
    -------
    tuple (list of str, list of str)
        ``(mgmt_names, analyst_names)``.

    Raises
    ------
    IndexError
        If none of the management headers occurs in ``raw`` (same exception type as before).
    ValueError
        If the end of the participant list or the start of the discussion cannot be located,
        i.e. there is neither a marker entry nor a long paragraph to fall back on.
    """
    hint_mgmt = ['Executives','Company Participants','Unverified Participant','Company Representatives','Corporate Participants']
    hint_analyst = ['Analysts','Conference Call Participants']
    hint_operator = ['Unidentified Company Representative','Operator']

    mgmt_index_flag = _first_hint_index(raw, hint_mgmt)
    if mgmt_index_flag is None:
        raise IndexError("no management participant header found in transcript")

    # Some earnings calls are without analysts
    analyst_index_flag = _first_hint_index(raw, hint_analyst)
    if analyst_index_flag is None:
        if "Operator" in raw:
            analyst_index_flag = raw.index("Operator")
        else:
            # Some transcripts do not even have an Operator: an entry of more than 35 words is
            # treated as a sentence rather than a name.
            long_index = _first_long_text_index(raw)
            if long_index is None:
                raise ValueError("cannot locate the end of the participant list: no analyst "
                                 "header, no 'Operator' entry and no long paragraph")
            analyst_index_flag = long_index - 1  # "-1" to make the following analyst names blank

    # Deal with transcripts that do not start from Operator
    operator_index_flag = _first_hint_index(raw, hint_operator)
    if operator_index_flag is None:
        operator_index_flag = _first_long_text_index(raw)
        if operator_index_flag is None:
            raise ValueError("cannot locate the start of the discussion: no operator marker "
                             "and no long paragraph")
    elif operator_index_flag > 25:
        # A marker found past position 25 is replaced by the first long paragraph when one
        # exists - presumably such a late match is no longer the opening of the call (TODO confirm).
        long_index = _first_long_text_index(raw)
        if long_index is not None:
            operator_index_flag = long_index

    listed_mgmt = raw[mgmt_index_flag+1:analyst_index_flag]
    if set(listed_mgmt) & set(raw[operator_index_flag:]):
        # The listed entries occur verbatim in the discussion, so they are used unchanged.
        return listed_mgmt, raw[analyst_index_flag+1:operator_index_flag]

    # Some transcripts introduce management with their title in the company, like "Bob - CFO":
    # keep only the part before the separator (hyphen or en dash).
    mgmt_names = [name.split(' - ')[0].split(' – ')[0] for name in listed_mgmt]
    if "Operator" in raw and analyst_index_flag != raw.index("Operator"):
        # Set this condition to make sure analysts exist
        analyst_names = [name.split(' - ')[0].split(' – ')[0] for name in raw[analyst_index_flag+1:operator_index_flag]]
    else:
        analyst_names = []
    return mgmt_names, analyst_names

#### Get texts for each member in management and put in a list

In [None]:
def get_mgmt_discussion(text_list, mgmt_names,analyst_names):
    """Collect, for each management member, the text that follows their speaker label.

    An entry of ``text_list`` that equals a management name, an analyst name, 'Operator' or
    'Unidentified Company Representative' is treated as a speaker label. Text is collected for
    a manager from their own label up to the next label of a different speaker.

    Parameters
    ----------
    text_list : list of str
        Text fragments of one transcript, in document order.
    mgmt_names : list of str
        Management names; one output string is produced per entry, in this order.
    analyst_names : list of str
        Analyst names, used only to detect where another speaker starts.

    Returns
    -------
    list of str
        One string per management name: the collected fragments joined by a blank line
        (an empty string when nothing was collected).
    """
    # A set makes the per-fragment membership test constant-time.
    speaker_labels = set(mgmt_names) | set(analyst_names) | {'Operator', 'Unidentified Company Representative'}
    combined_strs = []
    for name in mgmt_names:
        disc = []
        capturing = False
        for text in text_list:
            if text in speaker_labels:
                # A label starts a new turn: keep capturing only if it is this manager's label.
                # (Previously a repeated label of the same manager switched capturing off and
                # the text of that second turn was lost.)
                capturing = (text == name)
            elif capturing:
                disc.append(text)
        combined_strs.append("\n\n".join(disc))
    return combined_strs

#### Complexity analysis using Textstat
##### Flesch reading ease score: 
0-29: Very confusing · 30-49: Difficult · 50-59: Fairly difficult · 60-69: Standard · 70-79: Fairly easy · >=80: Easy & Very easy
https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch%E2%80%93Kincaid_grade_level

In [None]:
def complexity_score(combined_strs,date,company,children):
    """Build a one-row DataFrame of Textstat complexity scores for one transcript.

    Parameters
    ----------
    combined_strs : list of str
        One text per management member.
    date, company, children
        Values copied unchanged into the 'Date', 'Company' and 'Children' columns.

    Returns
    -------
    pandas.DataFrame
        Columns 'Date', 'Company', 'Flesch score' (mean over texts, negative means set to 0),
        'Reading time' (sum), 'Sentence number' (sum), 'Polysyllable number' (sum),
        'Polysyllable number per sentence' (mean of the per-text ratios) and 'Children';
        numeric values are rounded to 2 decimals.
    """
    # Flesch reading ease: averaged over the texts, with a negative average floored at 0
    flesch_ease_scores = list(map(textstat.flesch_reading_ease, combined_strs))
    average_flesch = np.mean(flesch_ease_scores)
    mean_flesch_score = 0 if average_flesch < 0 else average_flesch

    # Reading time assumes 14.69ms per character
    reading_times = [textstat.reading_time(passage, ms_per_char=14.69) for passage in combined_strs]
    sentence_num = list(map(textstat.sentence_count, combined_strs))
    # Polysyllable count: the number of words with syllable >= 3
    polysyllable_num = list(map(textstat.polysyllabcount, combined_strs))

    # Polysyllable words per sentence, computed text by text
    polysyllable_per_sentence = [
        round(poly / sentences, 2)
        for poly, sentences in zip(polysyllable_num, sentence_num)
    ]

    return pd.DataFrame({
        'Date': date,
        'Company': company,
        'Flesch score': [np.round(mean_flesch_score, 2)],
        'Reading time': [np.round(np.sum(reading_times), 2)],
        'Sentence number': [np.round(np.sum(sentence_num), 2)],
        'Polysyllable number': [np.round(np.sum(polysyllable_num), 2)],
        'Polysyllable number per sentence': [np.round(np.mean(polysyllable_per_sentence), 2)],
        'Children': children,
    })