# Summariza Sales Contracts 

<b>Objective:</b> The goal of this project is to identify the amount of soft versus hard information in SEC filings of sales contracts that summarize a sales contract. Soft information is often forward-looking, uncertain, or qualitative. Hard information is often specific, historical, certain, and quantitative. 

<b>Input Data:</b> These transcripts are probably not in standardized format. But, to goal is to extract the content of information disseminated publicly through webcasts or press releases, so please process all text in each transcript or press release. Please remove boilerplate safe harbor statement and company description, and header indices created by the transcript or newswire companies. Each transcript or press release is in a separate .txt file with a unique identifier as the file name. 

<b>Output items:</b>

•	Total_words	
•	Number_Entities	
•	Words_in_Entities	
•	Number_of_Times	
•	Words_in_Times	
•	Number_of_Locations	
•	Words_in_Locations	
•	Number_of_Organizations	
•	Words_in_Organizations	
•	Number_of_Persons	
•	Words_in_Persons	
•	Number_of_Money	
•	Words_in_Money	
•	Number_of_Percentages	
•	Words_in_Percentages	
•	Number_of_Dates	
•	Words_in_Dates

•	Number of forward-looking words (Bozanic Roulstone Buskirk 2016 Appendix A word list)

•	Number of uncertain words (Bozanic et al. 2018 use Loughran and McDonald’s uncertainty measure)

•	Number of positive words (Harvard dictionary)

•	Number of negative words (Harvard dictionary)


In [2]:
# Standard Library
import os
import re
import csv
import sys
import time
import string
import datetime

# Third Party Libraries
import nltk
import nltk.data
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import opinion_lexicon

# Name Entity Recognitation
# https://juejin.im/post/5971a4b9f265da6c42353332?utm_source=gold_browser_extension%5D
import spacy 
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

# Measure the Readability
# https://pypi.org/project/textstat/
import textstat
import csv
import re
from bs4 import BeautifulSoup
import requests
import unicodedata

# Measure the Sentiment 
# https://www.analyticsvidhya.com/blog/2018/02/natural-language-processing-for-beginners-using-textblob/
from textblob import TextBlob

REGEX = r'\d{5} \d+/\d+/\d+ \d+/\d+/\d+'
TAG = r'<(.*?)>(.*?)</(.*?)>'

# Measure the forward looking statements
LINES = [temp.strip() for temp in open('expressions.txt', 'r').readlines()]
FWD_REGEX = re.compile(r'%s' % (r'\b' + r'\b|\b'.join(LINES) + r'\b'),
                   re.IGNORECASE)
IGNORE = ['call', r'questions?', 'press release', 'slides?', 'webcast',
          r'\?', r'(can|do|will|have) you', r'Q ?:', r'\[Q', r'\[?Operator\]?']
REG_IGNORE = re.compile(r'%s' %  r'|'.join(IGNORE), re.IGNORECASE)

In [2]:
#Example: Write a python list into person.csv file

import csv
csvData = [['Person', 'Age'], ['Peter', '22'], ['Jasmine', '21'], ['Sam', '24']]
with open('person.csv', 'w',newline='') as csvFile:
    writer = csv.writer(csvFile)
    writer.writerows(csvData)
csvFile.close()

In [3]:
import unicodedata
def get_content(filein):
    "Get the main content of each investor meeting disclosure"
    #current_dir = os.path.dirname(os.path.abspath('__file__'))
    #folder = os.path.join(current_dir, 'Factiva week 1&2_v2')
    
    #path = os.path.join(folder, filein)
    
    data = open(filein, 'r',encoding='utf-8',errors="surrogateescape")

    mess = data.read()
    
    # presentation = 1 means it is a investor meeting including some conversations
    # presentation = 0 means it is a press releases that only include a summary
    
    presentation = 1
    
    if '\x00' in mess:
        data = open(path, 'r',encoding='utf-16')
        mess = data.read()
        
    if re.findall(r'[^\|] [A-Z]+: \w',mess) == []:
        presentation = 0
    
    # make the '\n' optional to include the missing ones and some company contact info as well as the copayright statement
    mess = mess.strip()
    content = re.findall(r'([A-Z][a-z].+)\.?\n?',mess)
    
    content = ' '.join(content)
    content = re.sub(r"[\x97\x95\xa0]"," ", content)
    content = re.sub(r"\\" + "'","", content)
    
    content = unicodedata.normalize("NFKD", content)
    content = preprocess_text(content)
    #content = content.strip()
    
    
    
    
    return content,presentation

#This is an example of a general description of the company as a press release classified as 0
# get_content('73681_11272012.txt')
# get_content('23297_09252003.txt')

In [4]:
def get_tokens(text):
    """Get a list of tokens (words) for a given text."""
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    
    filtered = [i for i in tokens if not all(j in string.punctuation for j in i)]
    final = [w.upper() for w in filtered if not w in stop_words]
    
    return final

# get_tokens(content)

In [5]:
def get_articles(headline=True):
    """Split the document file into articles."""
    path = os.path.join(current_dir, '2.Disclosure_sample.txt')
    lines = codecs.open(path, 'rU', 'latin').readlines()
    docs = list(dump_splitter(lines, headline=headline))
    return docs

In [6]:
def dump_splitter(data, headline=True):
    """Generator of article chunks."""
    buff = []
    for line in data:
        if re.findall(REGEX, line):
            if buff:
                if not headline:
                    buff.pop(1)
                yield u' '.join(buff)
                buff[:] = []
        if line.strip():
            buff.append(line.strip())
    yield u' '.join(buff)


In [7]:
def preprocess_text(doc):
    """Preprocess text."""
    # Extract preamble
    #preamble = re.findall(REGEX, doc)[0].split()
    text = re.sub(REGEX, '', doc).strip()

    # Remove irrelevant text
    text = re.sub((r'. (More information|For information on|For more '
                   'information) .*?$'), '', text)

    # Titlecase uppercase headlines
    capital = ''
    for char in text:
        if char.isupper() or char in string.punctuation + ' ':
            capital += char
        else:
            break
    if len(capital.split()) > 3:
        text = text.replace(capital, capital.title())

    return text


In [8]:
def get_readability(text):
    "Calculate some readability measures from the textstat package "
    
    #Return the Flesch Reading Ease Score
    read_ease = textstat.flesch_reading_ease(text)
    
    #Return the Fog Index Grade
    read_grade = textstat.gunning_fog(text)
    
    return read_ease,read_grade

In [9]:
def is_fwd(sentence):
    """Return true if the sentece is a fwd looking statement."""
    if sentence.isupper():
        return False
    if REG_IGNORE.search(sentence):
        return False
    return bool(FWD_REGEX.search(sentence))

is_fwd("Now we will move to page 21. And I'm going to ask Drew to go over the pro forma financial impact.")

True

In [10]:
def get_sentences(text):
    """Sentence tokenizer."""
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    return sent_detector.tokenize(text.strip())

get_sentences("Now we will move to page 21. And I'm going to ask Drew to go over the pro forma financial impact.")

['Now we will move to page 21.',
 "And I'm going to ask Drew to go over the pro forma financial impact."]

In [11]:
def get_fwd_statements(text):
    """Get number of forward-looking statements."""
    all_sents = get_sentences(text)
    
    len_all = len(all_sents)
    if not len_all:
        return None, None, None
    fwd = 0
    fwd_sents = []
    for sent in all_sents:
        if is_fwd(sent):
            fwd += 1
            fwd_sents.append(sent)
    return len_all, fwd, fwd * 1.0 / len_all, fwd_sents

get_fwd_statements("Now we will move to page 21. And I'm going to ask Drew to go over the pro forma financial impact.")

(2, 1, 0.5, ['Now we will move to page 21.'])

In [12]:
def get_results(text):
    "Count and sum the number of words in each entity and the number of entities "
    if len(text)>= 1000000:
        print('len of text exceeds ', len(text))
        text = text[:999999]
        
    text = nlp(text)

    labels = set([w.label_ for w in text.ents])
    entity_results = dict()
    word_results = dict()

    for label in labels:
        entities = [e.string for e in text.ents if label==e.label_]
        
        #get the number of words
        entity_list = " ".join(entities).strip()
        tokens = word_tokenize(entity_list)
        word_results[label] = len(tokens)
        
        #get the number of entities
        entity_results[label] = len(entities)


    for cat in ['TIME','LOC','ORG','PERSON','MONEY','PERCENT','DATE']:
            if not cat in entity_results.keys():
                entity_results[cat] = 0
            if not cat in word_results.keys():
                word_results[cat] = 0

    
    
    total_entities = sum(entity_results.values())
    e_times = entity_results['TIME']
    e_locations = entity_results['LOC']
    e_organizations = entity_results['ORG']
    e_persons = entity_results['PERSON']
    e_money = entity_results['MONEY']
    e_percentages = entity_results['PERCENT']
    e_dates = entity_results['DATE']
    
    total_entity_words = sum(word_results.values())
    w_times = word_results['TIME']
    w_locations = word_results['LOC']
    w_organizations = word_results['ORG']
    w_persons = word_results['PERSON']
    w_money = word_results['MONEY']
    w_percentages = word_results['PERCENT']
    w_dates = word_results['DATE']
        
    
    return total_entities, e_times, e_locations, e_organizations, e_persons, e_money, e_percentages, e_dates,\
    total_entity_words, w_times, w_locations, w_organizations, w_persons, w_money, w_percentages, w_dates


In [13]:
def get_uncertainty(text):
    "Count the frequencies of uncertain words from a list stored in uncertainty text file"
    
    cnt = Counter()
    wanted = re.findall('\w+',open('LM_uncertainty.txt').read())
    words = get_tokens(text)
    
    for word in words:
        if word in wanted:
            cnt[word] += 1
            
    total_cnt = sum(dict(cnt).values())
    
    return total_cnt,cnt

In [14]:
def get_sentiments(text):
    "Count the number of positive and negative words based off the LoughranMcDonald_SentimentWordLists_2018"
  
    cnt_pos = Counter()
    cnt_neg = Counter()
    words = get_tokens(text)
    
    wanted_pos = re.findall('\w+',open('LM_positive.txt').read())
    wanted_neg = re.findall('\w+',open('LM_negative.txt').read())
    
    for word in words:
        if word in wanted_pos:
            cnt_pos[word] += 1
        elif word in wanted_neg:
            cnt_neg[word] += 1
            
    pos = sum(dict(cnt_pos).values())
    neg = sum(dict(cnt_neg).values())
    
    return pos,neg,cnt_pos,cnt_neg

In [19]:
#use this cell for processing transcripts
import csv

results = []
with open("Input_transcript.csv", encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile) # change contents to floats
    for row in reader: # each row is a list
        results.append(row)

#create the header
header = [['eventid', 'year', 'companyName', 'companyTicker', 'startDate', 'eventTitle',
           'Content Format:\nInvestor Meeting - 1\nPress Release - 0',
           'Total Words',
           'Flesch Reading Ease Score','Gunning Fog Index',
           'Number of Entities','Words in Entities',
           'Number of Times','Words in Times',
           'Number of Locations','Words in Locations',
           'Number of Organizations','Words in Organizations',
           'Number of Persons','Words in Persons',
           'Number of Money','Words in Money',
           'Number of Percentages','Words in Percentages',
           'Number of Dates','Words in Dates',
           'Total Sentences', 'Total Forward Sentences', 'Forward Ratio',
           'Uncertainty Words',
           'Polarity','Subjectivity',
           'Positive Words','Negative Words'
           ]]

missing = []
#open a new output csv file 
with open('Investor_Meeting_Transcripts_new.csv', 'w',newline='') as fileout:
    writer = csv.writer(fileout)
    writer.writerows(header)
    #print ('\nScanning %d files in "%s"\n' % (len(files), folder))
    
    for kk in results[1:]:
        
        #Check the current file in the process
#         print(filein)
        
#         if not (num + 1) % 1000:
#             secs = time.time() - start
#             print ('\tFile %d done (%d secs)' % (num + 1, secs))
#             start = time.time()
        filein = 'all_texts/' + kk[0] + '.txt'
        with open(filein, 'w', encoding='utf-8') as text_file:
            text_file.write(kk[-1])

            
        #ID,Date = filein.strip('.txt').split('_')
        
        #Date = datetime.datetime.strptime(Date,'%m%d%Y').strftime('%Y-%m-%d')
    
        eventid, year, companyName, companyTicker, startDate, eventTitle = kk[0], kk[1],  kk[2], kk[3], kk[4], kk[5] 
        
        content,presentation = get_content(filein)
        
        words = len(get_tokens(content))
        
        #get the readability measures
        read_ease,read_grade = get_readability(content)
        
        if content == '':
            missing.append(filein)
            print ('\tERROR for file %s: 0 length' % filein)
            continue
            
        total_entities,e_times,e_locations, e_organizations, e_persons, e_money, e_percentages, e_dates,\
        total_entity_words,w_times,w_locations, w_organizations, w_persons, w_money, w_percentages, w_dates,\
        = get_results(content)

        fwd = get_fwd_statements(content)
        
        #count the frequencies of uncertain words 
        total_cnt,cnt = get_uncertainty(content)
        
        #use textblob package to analyze the sentiment
        blob = TextBlob(content)
        polarity = round(blob.sentiment.polarity,2)
        subjectivity = round(blob.sentiment.subjectivity,2)
        
        #count the positive and negative words
        pos, neg, cnt_pos, cnt_neg = get_sentiments(content)

#             if verbose:
#                 os.system('clear')
#                 print ('\n%d)' % num)
#                 for sent in res[3]:
#                     print ('\t%s' % sent)
#                 _ = raw_input('')

        row = [eventid, year, companyName, companyTicker, startDate, eventTitle, presentation, words,\
            read_ease,read_grade,\
            total_entities, total_entity_words, e_times, w_times, e_locations, w_locations, \
            e_organizations, w_organizations, e_persons, w_persons, e_money, w_money, \
            e_percentages, w_percentages, e_dates, w_dates,\
            fwd[0], fwd[1], round(fwd[2],2),\
            total_cnt,\
            polarity, subjectivity,\
            pos,neg]

        writer.writerow(row)
        
    print('\nAll done.\n These text files do not have required contents:\n')
    print(missing)
    

	ERROR for file all_texts/694710.txt: 0 length
	ERROR for file all_texts/721313.txt: 0 length
	ERROR for file all_texts/724140.txt: 0 length
	ERROR for file all_texts/937676.txt: 0 length
	ERROR for file all_texts/1125157.txt: 0 length
	ERROR for file all_texts/1105592.txt: 0 length

All done.
 These text files do not have required contents:

['all_texts/694710.txt', 'all_texts/721313.txt', 'all_texts/724140.txt', 'all_texts/937676.txt', 'all_texts/1125157.txt', 'all_texts/1105592.txt']


In [16]:
#use this for processing sales contracts
with open('sales_contracts.csv') as f:
    sales = [s for line in f.readlines() for s in line[:-1].split(',')]
sales = sales[1:] 
#create the header
header = [[ 'Total Words',
           'Flesch Reading Ease Score','Gunning Fog Index',
           'Number of Entities','Words in Entities',
           'Number of Times','Words in Times',
           'Number of Locations','Words in Locations',
           'Number of Organizations','Words in Organizations',
           'Number of Persons','Words in Persons',
           'Number of Money','Words in Money',
           'Number of Percentages','Words in Percentages',
           'Number of Dates','Words in Dates',
           'Total Sentences', 'Total Forward Sentences', 'Forward Ratio',
           'Uncertainty Words',
           'Polarity','Subjectivity',
           'Positive Words','Negative Words'
           ]]



missing = []
#open a new output csv file 
m = 1
with open('sales_contracts_results.csv', 'w',newline='') as fileout:
    writer = csv.writer(fileout)
    writer.writerows(header)
    #print ('\nScanning %d files in "%s"\n' % (len(files), folder))
    
    for url in sales:
        try:
            res = requests.get(url)
            page = BeautifulSoup(res.text)
            raw_text = page.body.get_text(" ", strip = True)
            filein = 'all_texts_asu/' + str(m)
            m+=1
            with open(filein, 'w', encoding='utf-8') as text_file:
                text_file.write(raw_text)

            content,presentation = get_content(filein)

            words = len(get_tokens(content))

            #get the readability measures
            read_ease,read_grade = get_readability(content)


            total_entities,e_times,e_locations, e_organizations, e_persons, e_money, e_percentages, e_dates,\
            total_entity_words,w_times,w_locations, w_organizations, w_persons, w_money, w_percentages, w_dates,\
            = get_results(content)

            fwd = get_fwd_statements(content)

            #count the frequencies of uncertain words 
            total_cnt,cnt = get_uncertainty(content)

            #use textblob package to analyze the sentiment
            blob = TextBlob(content)
            polarity = round(blob.sentiment.polarity,2)
            subjectivity = round(blob.sentiment.subjectivity,2)

            #count the positive and negative words
            pos, neg, cnt_pos, cnt_neg = get_sentiments(content)


            row = [words,\
                read_ease,read_grade,\
                total_entities, total_entity_words, e_times, w_times, e_locations, w_locations, \
                e_organizations, w_organizations, e_persons, w_persons, e_money, w_money, \
                e_percentages, w_percentages, e_dates, w_dates,\
                fwd[0], fwd[1], round(fwd[2],2),\
                total_cnt,\
                polarity, subjectivity,\
                pos,neg]

            writer.writerow(row)
        except Exception as e:
            print(e)
            writer.writerow([0]*27)
        
    print('\nAll done.\n These text files do not have required contents:\n')
    print(missing)

type NoneType doesn't define __round__ method
type NoneType doesn't define __round__ method
type NoneType doesn't define __round__ method
type NoneType doesn't define __round__ method
type NoneType doesn't define __round__ method
type NoneType doesn't define __round__ method
type NoneType doesn't define __round__ method

All done.
 These text files do not have required contents:

[]


[0, 0, 0, 0, 0, 0, 0, 0]