## Extracting Text from PDF files

### Extracting relevant pages of PDF

In [1]:
import PyPDF2 as pdf

In [2]:
def relevant_pages(file_name):
    file = open(file_name, 'rb')
    pdf_reader = pdf.PdfFileReader(file)
    pdf_writer = pdf.PdfFileWriter()
    for i in range(3,pdf_reader.getNumPages()-1):
        page_i = pdf_reader.getPage(i)
        pdf_writer.addPage(page_i)
    
    output = open('../Data/Transcripts/Pages.pdf','wb')
    pdf_writer.write(output)
    output.close()

### Extracting text from each PDF

In [3]:
from pdfminer3.layout import LAParams, LTTextBox
from pdfminer3.pdfpage import PDFPage
from pdfminer3.pdfinterp import PDFResourceManager
from pdfminer3.pdfinterp import PDFPageInterpreter
from pdfminer3.converter import PDFPageAggregator
from pdfminer3.converter import TextConverter
import io
import os

In [4]:
def text_extractor(file_name):
    '''
    input: file name of an earnings transcript
    output: extracted text from the transcript
    '''  
    resource_manager = PDFResourceManager()
    fake_file_handle = io.StringIO()
    converter = TextConverter(resource_manager, fake_file_handle, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resource_manager, converter)
    
    with open(file_name, 'rb') as fh:

        for page in PDFPage.get_pages(fh, caching=True, check_extractable=True):
            page_interpreter.process_page(page)
            
        text = fake_file_handle.getvalue()

    # close open handles
    converter.close()
    fake_file_handle.close()
    
    return text

In [5]:
directory = r'/Users/mike/Desktop/GitHub Repositories/project4_microsoft_transformation/Data/Transcripts'
msft_earnings_dict_orig = {}
for filename in os.listdir(directory):
    if filename.endswith(".pdf"):
        msft_earnings_dict_orig[filename[0:-4]] = text_extractor(os.path.join('../Data/Transcripts', filename))
    else:
        continue

In [6]:
msft_earnings_dict_orig.keys()

dict_keys(['msft_21q2', 'msft_21q1', 'msft_07q3', 'msft_18q4', 'msft_18q1', 'msft_18q3', 'msft_07q4', 'msft_18q2', 'msft_20q1', 'msft_19q4', 'msft_20q2', 'msft_20q3', 'msft_19q3', 'msft_19q2', 'msft_20q4', 'msft_19q1', 'msft_08q4', 'msft_15q1', 'msft_17q3', 'msft_11q4', 'msft_17q2', 'msft_15q2', 'msft_13q4', 'msft_17q1', 'msft_15q3', 'msft_08q2', 'msft_11q3', 'msft_13q1', 'msft_11q2', 'msft_17q4', 'msft_08q3', 'msft_08q1', 'msft_15q4', 'msft_13q2', 'msft_13q3', 'msft_11q1', 'msft_12q4', 'msft_14q2', 'msft_14q3', 'msft_16q1', 'msft_16q3', 'msft_09q4', 'msft_14q1', 'msft_16q2', 'msft_10q4', 'msft_12q2', 'msft_09q1', 'msft_14q4', 'msft_10q1', 'msft_12q3', 'msft_12q1', 'msft_10q3', 'msft_09q2', 'msft_09q3', 'msft_16q4', 'msft_10q2'])

## Text Preprocessing

In [7]:
msft_earnings_dict = msft_earnings_dict_orig.copy()

In [8]:
msft_earnings_dict_orig['msft_09q2']

"MICROSOFT CORPORATION FQ2 2009 EARNINGS CALL |  JAN 22, 2009\n\nPresentation\n\nUnknown Speaker\n<strong>Operator</strong>\nWelcome to the Microsoft fiscal year 2009 second quarter earnings call. Today’s call is being recorded. If\nyou have any objections, you may disconnect at this time. I would now like to turn the call over to Mr. Bill\nKoefoed, General Manager, Investor Relations. Sir, you may begin.\n<strong>Bill Koefoed</strong>\nThank you, operator, and thanks everyone for joining us a little earlier than normal today for Microsoft’s\nsecond quarter 2009 earnings conference call. We decided to align the timing of the earnings release this\nquarter with the cost management initiatives that we announced this morning. We will talk further about\nthese initiatives later on the call.\nI am delighted today to be joined by Steve Ballmer, our Chief Executive Officer, as well as Chris Liddell,\nSenior Vice President and Chief Financial Officer, Frank Brod, Corporate Vice President and C

### Remove line breaks and apostrophes

In [9]:
for tscript in msft_earnings_dict.keys():
    msft_earnings_dict[tscript] = msft_earnings_dict[tscript].replace('\n',' ')
    msft_earnings_dict[tscript] = msft_earnings_dict[tscript].replace("\'s",'')
    msft_earnings_dict[tscript] = msft_earnings_dict[tscript].replace("\'ll",'')
    msft_earnings_dict[tscript] = msft_earnings_dict[tscript].replace("\'re",'')

### Remove punctuations

In [10]:
import string

In [11]:
for tscript in msft_earnings_dict.keys():
    msft_earnings_dict[tscript] = msft_earnings_dict[tscript].translate(str.maketrans('', '', string.punctuation))

### Remove numbers

In [12]:
import re

In [13]:
for tscript in msft_earnings_dict.keys():
    msft_earnings_dict[tscript] = re.sub('\w*\d\w*', '', msft_earnings_dict[tscript])

In [14]:
for tscript in msft_earnings_dict.keys():
    msft_earnings_dict[tscript] = msft_earnings_dict[tscript].replace('strong','')

### Tokenization (words)

In [15]:
import nltk

In [16]:
for tscript in msft_earnings_dict.keys():
    msft_earnings_dict[tscript] = nltk.word_tokenize(msft_earnings_dict[tscript])

### Correct spelling errors

In [17]:
from spellchecker import SpellChecker

In [18]:
spell = SpellChecker()

def correct_spellings(text):
    corrected_text = []
    misspelled_words = spell.unknown(text)
    for word in text:
        if word in misspelled_words:
            corrected_text.append(spell.correction(word))
        else:
            corrected_text.append(word)
    return corrected_text

for tscript in msft_earnings_dict.keys():
    msft_earnings_dict[tscript] = correct_spellings(msft_earnings_dict[tscript])

In [19]:
msft_earnings_dict['msft_09q2']

['MICROSOFT',
 'CORPORATION',
 'EARNINGS',
 'CALL',
 'JAN',
 'Presentation',
 'Unknown',
 'Speaker',
 'Operator',
 'Welcome',
 'to',
 'the',
 'Microsoft',
 'fiscal',
 'year',
 'second',
 'quarter',
 'earnings',
 'call',
 'Today',
 'i',
 'i',
 'call',
 'is',
 'being',
 'recorded',
 'If',
 'you',
 'have',
 'any',
 'objections',
 'you',
 'may',
 'disconnect',
 'at',
 'this',
 'time',
 'I',
 'would',
 'now',
 'like',
 'to',
 'turn',
 'the',
 'call',
 'over',
 'to',
 'Mr',
 'Bill',
 'Koefoed',
 'General',
 'Manager',
 'Investor',
 'Relations',
 'Sir',
 'you',
 'may',
 'begin',
 'Bill',
 'Koefoed',
 'Thank',
 'you',
 'operator',
 'and',
 'thanks',
 'everyone',
 'for',
 'joining',
 'us',
 'a',
 'little',
 'earlier',
 'than',
 'normal',
 'today',
 'for',
 'Microsoft',
 'i',
 'i',
 'second',
 'quarter',
 'earnings',
 'conference',
 'call',
 'We',
 'decided',
 'to',
 'align',
 'the',
 'timing',
 'of',
 'the',
 'earnings',
 'release',
 'this',
 'quarter',
 'with',
 'the',
 'cost',
 'management',


### Remove stop words

In [20]:
from nltk.corpus import stopwords

In [21]:
stopwords = set(stopwords.words('english'))

In [22]:
msft_earnings_dict2 = {}

for tscript in msft_earnings_dict.keys():
    msft_earnings_dict2[tscript] = [word for word in msft_earnings_dict[tscript] if word.lower() not in stopwords]

In [23]:
len(msft_earnings_dict2['msft_09q2'])

4705

### Lemmatization

In [24]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from collections import defaultdict
from nltk import pos_tag

In [25]:
lemmatizer = WordNetLemmatizer()
tag_map = defaultdict(lambda : wordnet.NOUN)
tag_map['V'] = wordnet.VERB 
tag_map['J'] = wordnet.ADJ
tag_map['R'] = wordnet.ADV

In [26]:
for tscript in msft_earnings_dict2.keys():
    msft_earnings_dict2[tscript] = [lemmatizer.lemmatize(word.lower(), tag_map[tag[0]]) 
                                   for word, tag in pos_tag(msft_earnings_dict2[tscript])]

In [27]:
for tscript in msft_earnings_dict2.keys():
    msft_earnings_dict2[tscript] = [word for word in msft_earnings_dict2[tscript] if len(word) > 1]

In [28]:
msft_earnings_dict2['msft_09q2']

['microsoft',
 'corporation',
 'earnings',
 'call',
 'jan',
 'presentation',
 'unknown',
 'speaker',
 'operator',
 'welcome',
 'microsoft',
 'fiscal',
 'year',
 'second',
 'quarter',
 'earnings',
 'call',
 'today',
 'call',
 'record',
 'objection',
 'may',
 'disconnect',
 'time',
 'would',
 'like',
 'turn',
 'call',
 'mr',
 'bill',
 'koefoed',
 'general',
 'manager',
 'investor',
 'relation',
 'sir',
 'may',
 'begin',
 'bill',
 'koefoed',
 'thank',
 'operator',
 'thanks',
 'everyone',
 'join',
 'little',
 'early',
 'normal',
 'today',
 'microsoft',
 'second',
 'quarter',
 'earnings',
 'conference',
 'call',
 'decide',
 'align',
 'timing',
 'earnings',
 'release',
 'quarter',
 'cost',
 'management',
 'initiative',
 'announce',
 'morning',
 'talk',
 'initiatives',
 'later',
 'call',
 'delight',
 'today',
 'join',
 'steve',
 'ballmer',
 'chief',
 'executive',
 'officer',
 'well',
 'chris',
 'liddell',
 'senior',
 'vice',
 'president',
 'chief',
 'financial',
 'officer',
 'frank',
 'brod',

### Remove people's names

In [29]:
person_name_dict = {}
for tscript in msft_earnings_dict2.keys():
    msft_earnings_dict2[tscript] = [word for word, tag in pos_tag(msft_earnings_dict2[tscript]) if tag!='NNP' or word=='subscriber' or word=='xbox']
#     person_name_dict[tscript] = [word for word, tag in pos_tag(msft_earnings_dict2[tscript]) if tag=='NNP']

### Remove frequent words that have no information value

In [30]:
frequent_words = ['quarter', 'revenue', 'microsoft', 'year', 'window', 'business', 'so', 'think', 'call', 'see',
                  'pa', 'go', 'earnings', 'question', 'fiscal', 'operator', 'billion', 'inc', 'like', 'also', 'look',
                  'good', 'come', 'well', 'get', 'say', 'make', 'right', 'chris', 'copyright', 'expect', 'use', 
                  'next', 'corporation', 'would', 'give', 'weve', 'saw', 'im', 'chief', 'officer', 'today', 'yes',
                  'investor', 'relation', 'release', 'thing', 'spglobalcommarketintelligence', 'could', 'lot', 'let',
                  'result', 'one', 'talk', 'really', 'want', 'million', 'thank', 'thanks', 'first', 'second', 'grow',
                  'growth', 'market', 'point', 'last', 'global', 'within', 'us', 'satya', 'across', 'line', 'point'
                  'even', 'up', 'include', 'cfo', 'overall', 'way', 'take', 'around', 'due', 'division', 'continue',
                  'liddle', 'server', 'presentation', 'welcome', 'jan', 'apr', 'jul', 'oct', 'third', 'fourth', 
                  'conference', 'instruction', 'my', 'turn', 'record', 'participant', 'colleen', 'healy', 'general',
                  'manager', 'bill', 'koefoed', 'may', 'sir', 'please', 'objection', 'disconnect', 'greeting',
                  'reminder', 'mike', 'spencer', 'pleasure', 'host', 'suh', 'proceed', 'afternoon', 'still', 'till',
                  'della', 'amy', 'hood', 'peter', 'klein', 'adam', 'cio', 'ceo', 'likely', 'it', 'hi', 'john',
                  'feel', 'much', 'wwwmicrosoftcommsft', 'alan', 'karl', 'ian', 'says', 'keith', 'difucci', 'steve',
                  'pc', 'sp', 'liddell', 'nadella', 'charlie', 'william', 'vice', 'president', 'ubs', 'bellini',
                  'holt', 'lync', 'fy', 'director', 'deutsche', 'keirstead', 'christopher', 'join', 'jason', 'frank',
                  'brod', 'michael', 'financial', 'increase', 'constant', 'currency', 'intelligence','former',
                  'charge', 'guarantee', 'three', 'tech', 'client','friar','senior','corporate','accounting','deputy',
                  'counsel','vp','six','five','four','sara','breza', 'db', 'ross', 'wei', 'seethoff','research',
                  'maguire', 'sarah', 'inaudible', 'mbd', 'japan', 'intelligent', 'xp', 'oppenheimer', 'ive', 'ag', 
                  'fx', 'thill', 'citigroup', 'egbert']

msft_earnings_dict3 = {}

for tscript in msft_earnings_dict2.keys():
    msft_earnings_dict3[tscript] = [word for word in msft_earnings_dict2[tscript] if word not in frequent_words]

## Pickle the Dictionary of Documents (Corpus)

In [31]:
import pickle

In [32]:
with open('cleaned_corpus.pickle', 'wb') as file:
    pickle.dump(msft_earnings_dict3, file)

## Bigrams 

In [33]:
from nltk.util import ngrams

In [34]:
msft_earnings_dict_bi = {}

for tscript in msft_earnings_dict2.keys():
    msft_earnings_dict_bi[tscript] = [word1 + ' ' + word2 for word1, word2 in list(ngrams(msft_earnings_dict2[tscript], 2))]
    

In [35]:
### Remove frequent words that have no information value

frequent_words_bi = ['former chief','peter klein','klein former','chris liddell','call oct','call jul','call jan',
                  'call apr','officer yes','full fiscal','former general','william koefoed','koefoed former',
                  'amy hood','hood executive','executive vp','vp cfo','point view','colleen healy',
                  'christopher liddell','year year','question please','division peter','bellini ubs','chris suh',
                  'please operator','spencer general','michael spencer','rbc capital','macmillan rbc',
                  'first quarter','fourth quarter','second quarter','third quarter','tech guarantee','satya nadella',
                  'sp global','constant currency','ceo director','nadella ceo','bill koefoed','microsoft business',
                  'business division','division revenue','increase constant','grow constant','business pc',
                  'currency drive','business process','quarter full','segment gross','personal computing','suh general',
                  'director yes','friar goldman','percentage point','currency gross','cloud gross','dollar increase',
                  'point yearoveryear','officer thanks','thill citigroup']

for tscript in msft_earnings_dict_bi.keys():
    msft_earnings_dict_bi[tscript] = [word for word in msft_earnings_dict_bi[tscript] if word not in frequent_words_bi]

In [36]:
with open('cleaned_corpus_bi.pickle', 'wb') as file:
    pickle.dump(msft_earnings_dict_bi, file)