## Extracting Text from PDF files

### Extracting relevant pages of a PDF

In [1]:
import PyPDF2 as pdf

In [2]:
def relevant_pages(file_name, output_path='../Data/Transcripts/Pages.pdf',
                   skip_leading=3, skip_trailing=1):
    '''
    input: file name of a PDF; optionally the output path and the number of
           pages to drop from the start (skip_leading) and end (skip_trailing)
    output: None; a new PDF holding only the kept pages is written to output_path

    With the defaults, the pages at 0-based indices 3 .. (page count - 2) are
    copied, i.e. the first three pages and the last page are left out.
    '''
    # Context managers close both files even if reading or writing raises.
    with open(file_name, 'rb') as source:
        pdf_reader = pdf.PdfFileReader(source)
        pdf_writer = pdf.PdfFileWriter()
        for i in range(skip_leading, pdf_reader.getNumPages() - skip_trailing):
            pdf_writer.addPage(pdf_reader.getPage(i))

        # Write while the source is still open: the writer may still need to
        # read page data from the reader's stream at this point.
        with open(output_path, 'wb') as output:
            pdf_writer.write(output)

### Extracting text from each PDF

In [3]:
from pdfminer3.layout import LAParams, LTTextBox
from pdfminer3.pdfpage import PDFPage
from pdfminer3.pdfinterp import PDFResourceManager
from pdfminer3.pdfinterp import PDFPageInterpreter
from pdfminer3.converter import PDFPageAggregator
from pdfminer3.converter import TextConverter
import io
import os

In [4]:
def text_extractor(file_name):
    '''
    input: file name of an earnings transcript
    output: extracted text from the transcript
    '''
    resource_manager = PDFResourceManager()
    # In-memory text buffer that the converter writes the extracted text into
    fake_file_handle = io.StringIO()
    converter = TextConverter(resource_manager, fake_file_handle, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resource_manager, converter)

    try:
        with open(file_name, 'rb') as fh:
            for page in PDFPage.get_pages(fh, caching=True, check_extractable=True):
                page_interpreter.process_page(page)

        # Read the buffer before the finally clause closes it
        return fake_file_handle.getvalue()
    finally:
        # close open handles, also when opening or parsing the PDF fails
        converter.close()
        fake_file_handle.close()

In [5]:
# Folder that holds the transcript PDFs (machine-specific absolute path).
directory = r'/Users/mike/Desktop/GitHub Repositories/project4_microsoft_transformation/Data/Transcripts'
# Maps transcript name (file name without the ".pdf" extension) -> extracted text.
msft_earnings_dict_orig = {}
for filename in os.listdir(directory):
    if filename.endswith(".pdf"):
        # Open each file from the same folder that was listed, so the cell
        # does not depend on the notebook's current working directory.
        msft_earnings_dict_orig[os.path.splitext(filename)[0]] = text_extractor(os.path.join(directory, filename))

In [6]:
# Display the transcript names that were collected as dictionary keys.
msft_earnings_dict_orig.keys()

dict_keys(['msft_21q2', 'msft_21q1', 'msft_07q3', 'msft_18q4', 'msft_18q1', 'msft_18q3', 'msft_07q4', 'msft_18q2', 'msft_20q1', 'msft_19q4', 'msft_20q2', 'msft_20q3', 'msft_19q3', 'msft_19q2', 'msft_20q4', 'msft_19q1', 'msft_08q4', 'msft_15q1', 'msft_17q3', 'msft_11q4', 'msft_17q2', 'msft_15q2', 'msft_13q4', 'msft_17q1', 'msft_15q3', 'msft_08q2', 'msft_11q3', 'msft_13q1', 'msft_11q2', 'msft_17q4', 'msft_08q3', 'msft_08q1', 'msft_15q4', 'msft_13q2', 'msft_13q3', 'msft_11q1', 'msft_12q4', 'msft_14q2', 'msft_14q3', 'msft_16q1', 'msft_16q3', 'msft_09q4', 'msft_14q1', 'msft_16q2', 'msft_10q4', 'msft_12q2', 'msft_09q1', 'msft_14q4', 'msft_10q1', 'msft_12q3', 'msft_12q1', 'msft_10q3', 'msft_09q2', 'msft_09q3', 'msft_16q4', 'msft_10q2'])

## Text Preprocessing

In [122]:
# Work on a shallow copy so the originally extracted texts stay available
# under msft_earnings_dict_orig.
msft_earnings_dict = dict(msft_earnings_dict_orig)

In [123]:
# Display the unprocessed text stored for one transcript.
msft_earnings_dict_orig['msft_14q2']

'MICROSOFT CORPORATION FQ2 2014 EARNINGS CALL |  JAN 23, 2014\n\nPresentation\n\nOperator\nGreetings, and welcome to the Microsoft Second Quarter Fiscal Year 2014 Earnings Conference Call.\n[Operator Instructions] . As a reminder, this conference is being recorded. It is now my pleasure to\nintroduce your host, Chris Suh, with -- Manager -- General Manager, Investor Relations for Microsoft.\nThank you, Chris. You may begin.\nChris Suh\nGeneral Manager of Cloud & Enterprise Finance\nThank you, operator. On our website, microsoft.com/investor, is our financial summary slide deck, which\nis intended to follow our prepared remarks and provides a reconciliation of differences between GAAP and\nnon-GAAP financial measures.\nAs a reminder, we will post today\'s prepared remarks to our website immediately following the call until\nthe complete transcript is available. Today\'s call is being webcast live and recorded. If you ask a question,\nit will be included in our live transmission, in the 

### Remove line breaks and possessive/contraction suffixes ('s, 'll, 're)

In [124]:
# Put each transcript on a single line, then delete the 's, 'll and 're suffixes.
for transcript_id, raw_text in msft_earnings_dict.items():
    flattened = raw_text.replace('\n',' ')
    for suffix in ("\'s", "\'ll", "\'re"):
        flattened = flattened.replace(suffix,'')
    # Only the value of an existing key is replaced, so iterating is safe.
    msft_earnings_dict[transcript_id] = flattened

### Remove punctuation

In [125]:
import re
import string

In [126]:
# Translation table that deletes every character in string.punctuation;
# it is the same for all transcripts, so it is built once.
punctuation_table = str.maketrans('', '', string.punctuation)
for transcript_id, text in msft_earnings_dict.items():
    msft_earnings_dict[transcript_id] = text.translate(punctuation_table)

### Tokenization (words)

In [127]:
import nltk
import spacy

In [128]:
# Download NLTK's 'punkt' data package (presumably needed by
# nltk.word_tokenize, which is used for tokenization below; confirm).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/mike/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [129]:
# Replace each transcript's text with the list of tokens NLTK produces for it.
for transcript_id, text in msft_earnings_dict.items():
    tokens = nltk.word_tokenize(text)
    msft_earnings_dict[transcript_id] = tokens

In [130]:
# Display the current contents stored for one transcript.
msft_earnings_dict['msft_14q2']

['MICROSOFT',
 'CORPORATION',
 'FQ2',
 '2014',
 'EARNINGS',
 'CALL',
 'JAN',
 '23',
 '2014',
 'Presentation',
 'Operator',
 'Greetings',
 'and',
 'welcome',
 'to',
 'the',
 'Microsoft',
 'Second',
 'Quarter',
 'Fiscal',
 'Year',
 '2014',
 'Earnings',
 'Conference',
 'Call',
 'Operator',
 'Instructions',
 'As',
 'a',
 'reminder',
 'this',
 'conference',
 'is',
 'being',
 'recorded',
 'It',
 'is',
 'now',
 'my',
 'pleasure',
 'to',
 'introduce',
 'your',
 'host',
 'Chris',
 'Suh',
 'with',
 'Manager',
 'General',
 'Manager',
 'Investor',
 'Relations',
 'for',
 'Microsoft',
 'Thank',
 'you',
 'Chris',
 'You',
 'may',
 'begin',
 'Chris',
 'Suh',
 'General',
 'Manager',
 'of',
 'Cloud',
 'Enterprise',
 'Finance',
 'Thank',
 'you',
 'operator',
 'On',
 'our',
 'website',
 'microsoftcominvestor',
 'is',
 'our',
 'financial',
 'summary',
 'slide',
 'deck',
 'which',
 'is',
 'intended',
 'to',
 'follow',
 'our',
 'prepared',
 'remarks',
 'and',
 'provides',
 'a',
 'reconciliation',
 'of',
 'dif

In [131]:
# Display how many items are currently stored for this transcript.
len(msft_earnings_dict['msft_14q2'])

8306

### Remove stop words

In [132]:
from nltk.corpus import stopwords

In [133]:
# Import the corpus reader under an alias: the assignment below rebinds the
# name `stopwords` to a set, and without the alias re-running this cell would
# fail because a set has no .words() method.
from nltk.corpus import stopwords as stopwords_corpus

# English stop words as a set, which gives fast membership tests.
stopwords = set(stopwords_corpus.words('english'))

In [134]:
# Keep only the tokens whose lower-cased form is not in the stop-word set.
for transcript_id, tokens in msft_earnings_dict.items():
    msft_earnings_dict[transcript_id] = [
        token for token in tokens
        if token.lower() not in stopwords
    ]

In [135]:
# Display the current contents stored for one transcript.
msft_earnings_dict['msft_14q2']

['MICROSOFT',
 'CORPORATION',
 'FQ2',
 '2014',
 'EARNINGS',
 'CALL',
 'JAN',
 '23',
 '2014',
 'Presentation',
 'Operator',
 'Greetings',
 'welcome',
 'Microsoft',
 'Second',
 'Quarter',
 'Fiscal',
 'Year',
 '2014',
 'Earnings',
 'Conference',
 'Call',
 'Operator',
 'Instructions',
 'reminder',
 'conference',
 'recorded',
 'pleasure',
 'introduce',
 'host',
 'Chris',
 'Suh',
 'Manager',
 'General',
 'Manager',
 'Investor',
 'Relations',
 'Microsoft',
 'Thank',
 'Chris',
 'may',
 'begin',
 'Chris',
 'Suh',
 'General',
 'Manager',
 'Cloud',
 'Enterprise',
 'Finance',
 'Thank',
 'operator',
 'website',
 'microsoftcominvestor',
 'financial',
 'summary',
 'slide',
 'deck',
 'intended',
 'follow',
 'prepared',
 'remarks',
 'provides',
 'reconciliation',
 'differences',
 'GAAP',
 'nonGAAP',
 'financial',
 'measures',
 'reminder',
 'post',
 'today',
 'prepared',
 'remarks',
 'website',
 'immediately',
 'following',
 'call',
 'complete',
 'transcript',
 'available',
 'Today',
 'call',
 'webcast'

In [136]:
# Display how many items are currently stored for this transcript.
len(msft_earnings_dict['msft_14q2'])

4645

### Lemmatization