In [1]:
from io import StringIO
from bs4 import BeautifulSoup
from tika import parser
import re

file_path = "Books Index for Phrases/Wooldridge - Introductory Econometrics (A Modern Approach) [Cengage 5th Ed - 2013].pdf"

file_data = []
_buffer = StringIO()
data = parser.from_file(file_path, xmlContent=True)
xhtml_data = BeautifulSoup(data['content'])
for page, content in enumerate(xhtml_data.find_all('div', attrs={'class': 'page'})):
    print('Parsing page {} of pdf file...'.format(page+1))
    _buffer = StringIO()
    _buffer.write(str(content))
    parsed_content = parser.from_buffer(_buffer.getvalue())
    _buffer.truncate()
    file_data.append({'id': 'page_'+str(page+1), 'content': parsed_content['content']})

Parsing page 1 of pdf file...
Parsing page 2 of pdf file...
Parsing page 3 of pdf file...
Parsing page 4 of pdf file...
Parsing page 5 of pdf file...
Parsing page 6 of pdf file...
Parsing page 7 of pdf file...
Parsing page 8 of pdf file...
Parsing page 9 of pdf file...
Parsing page 10 of pdf file...
Parsing page 11 of pdf file...
Parsing page 12 of pdf file...
Parsing page 13 of pdf file...
Parsing page 14 of pdf file...
Parsing page 15 of pdf file...
Parsing page 16 of pdf file...
Parsing page 17 of pdf file...
Parsing page 18 of pdf file...
Parsing page 19 of pdf file...
Parsing page 20 of pdf file...


In [2]:
START = "2SLS. See two stage least squares"
END = "assumptions"
PAGE_NUM = '862'

In [3]:
text = file_data[0]['content']

In [4]:
page_num_regex = re.compile(r', \d{1,3}')
# re.search(page_num_regex, 'adaptive expectations, 390, 392')
page_num_regex.sub('', 'adaptive expectations, 390, 392')

'adaptive expectations'

In [5]:
page_num_regex = re.compile(r', \d{1,3}–\d{1,3}')
page_num_regex.sub('', 'adjusted R-squareds, 202–205, 414')


'adjusted R-squareds, 414'

In [6]:
page_num_regex = re.compile(r'^\d{1,3}–\d{1,3}')
page_num_regex.sub('', '349–352')


''

In [7]:
page_num_regex = re.compile(r'^\d{1,3}–\d{1,3}')
page_num_regex.sub('', ' 349–352')

' 349–352'

In [8]:
see_regex = re.compile(r'\. See.+')
see_regex.sub('','2SLS. See two stage least squares')

'2SLS'

In [9]:
see_regex = re.compile(r'\. See.+')
see_regex.sub('','achievement test scores. See college GPA')



'achievement test scores'

In [10]:
res = []

for line in text.split('\n'):
    if START < line.lower() < END:
        if len(line) == 1 or line == 'Index' or line == PAGE_NUM:
            continue

        # clean page number
        page_num_regex_2 = re.compile(r', \d{1,3}–\d{1,3}')
        line = page_num_regex_2.sub('', line)

        page_num_regex_1 = re.compile(r', \d{1,3}')
        line = page_num_regex_1.sub('', line)

        page_num_regex_3 = re.compile(r'^\d{1,3}–\d{1,3}')
        line = page_num_regex_3.sub('', line)

        # remove See ...
        see_regex = re.compile(r'\. See.+')
        line = see_regex.sub('', line)

        if line:
            res.append(line)

In [11]:
res

['2SLS',
 '401(k) plans',
 'ability and wage',
 'achievement test scores',
 'adaptive expectations',
 'adjusted R-squareds',
 'AFDC participation',
 'age',
 'aggregate consumption function',
 'air pollution and housing prices',
 'airline and reservations',
 'alcohol drinking',
 'alternative hypotheses',
 'antidumping filings and chemical imports',
 'AR(3) serial correlation',
 'apples, ecolabeled',
 'AR(1) models, consistency example',
 'AR(1) serial correlation',
 'AR(2) models',
 'ARCH model',
 'AR(q) serial correlation',
 'arrests',
 'ASCII files',
 'assumption']

In [12]:
def extract_phrase(start_phrase, end_phrase, page_num, text): 

    res = []

    for line in text.split('\n'):
        if start_phrase.lower() <= line.lower() <= end_phrase.lower() or start_phrase.lower() >= line.lower() >= end_phrase.lower():
            if len(line) == 1 or line == 'Index' or line == page_num or len(line) > 100:
                continue

            # clean page number
            page_num_regex_2 = re.compile(r', \d{1,3}–\d{1,3}')
            line = page_num_regex_2.sub('', line)

            page_num_regex_1 = re.compile(r', \d{1,3}')
            line = page_num_regex_1.sub('', line)

            page_num_regex_3 = re.compile(r'^\d{1,3}–\d{1,3}')
            line = page_num_regex_3.sub('', line)

            # remove See ...
            see_regex = re.compile(r'\. See.+')
            line = see_regex.sub('', line)
                
            if line:
                res.append(line)

    return res

In [13]:
extract_phrase(start_phrase='2SLS. See two stage least squares', 
              end_phrase="assumptions", page_num="862", text=file_data[0]['content'])

['2SLS',
 '401(k) plans',
 'ability and wage',
 'achievement test scores',
 'adaptive expectations',
 'adjusted R-squareds',
 'AFDC participation',
 'age',
 'aggregate consumption function',
 'air pollution and housing prices',
 'airline and reservations',
 'alcohol drinking',
 'alternative hypotheses',
 'antidumping filings and chemical imports',
 'AR(3) serial correlation',
 'apples, ecolabeled',
 'AR(1) models, consistency example',
 'AR(1) serial correlation',
 'AR(2) models',
 'ARCH model',
 'AR(q) serial correlation',
 'arrests',
 'ASCII files',
 'assumptions',
 'assumption']

In [14]:
extract_phrase(start_phrase='asymptotic bias', 
              end_phrase="chemical imports. See antidumping filings and", page_num="863", text=file_data[1]['content'])

['bias',
 'attenuation',
 'biased estimators',
 'biased toward zero',
 'binary response models',
 'binary variables',
 'binomial distribution',
 'birth weight',
 'asymptotic standard error',
 'bivariate linear regression model',
 'BLUE (best linear unbiased estimator)',
 'bootstrapping',
 'Breusch-Godfrey test',
 'Breusch-Pagan test',
 'calculus, differential',
 'campus crimes, t test',
 'causality',
 'cdf (cumulative distribution functions)',
 'censored regression models',
 'Center for Research in Security Prices',
 'central limit theorem',
 'CEO salaries',
 'ceteris paribus',
 'chemical firms, nonnested models',
 'chemical imports',
 'asymptotic bias, deriving',
 'asymptotic confidence interval',
 'asymptotic efficiency of OLS',
 'asymptotic normality of estimators, in general,  ',
 'asymptotic normality of OLS',
 'asymptotic sample properties of estimators,  ',
 'asymptotic standard errors',
 'asymptotic t statistics',
 'asymptotically uncorrelated sequences',
 'asymptotics, OLS',
 

In [15]:
input_start = ['2SLS', 'asymptotic bias, deriving', 'chi-square distribution', 'criminologists', 'economic models', 'exogenous explanatory variables', 'firm sales', 'heteroskedasticity', 'independently pooled cross sections', 'Lagrange multiplier (LM) statistics', 'measurement error', 'OLS asymptotics', 'Panel Study of Income Dynamics', 'quadratic form for matrices', 'returns on equity and CEO salaries', 'serial correlation-robust standard errors, 431–434', 'stepwise regression, 686', 'theoretical framework, 687', 'unbalanced panels, 491–492', 'Wald test/statistics, 588–589, 598, 818', ]
input_end = ['assumptions', 'chemical imports. See antidumping filings and', 'crimes. See also arrests', 'economic growth and government policies', 'exclusion restrictions', 'finite sample properties', 'heterogeneity bias', 'independent variables', 'lagged explanatory variables', 'mean squared error (MSE)', 'OLS (ordinary least squares). See also', 'panel data', 'p-values', 'retrospective data, 2', 'serial correlation', 'statistics. See also hypothesis testing', 'theorems', 'unanticipated inflation, 390', 'wages. See also CEO salaries; income; minimum', 'zero-one variables, 227. See also qualitative']
input_page_num = [str(862 + i) for i in range(20)]
# input_text = [file_data[i]['content'] if i == 0 else file_data[i]['content'].replace(file_data[i-1]['content'], '') for i in range(20) ]
input_text = [file_data[i]['content'] for i in range(20) ]


In [16]:
import pandas as pd
pd.DataFrame({'page': input_page_num, 'start': input_start, 'end': input_end})

Unnamed: 0,page,start,end
0,862,2SLS,assumptions
1,863,"asymptotic bias, deriving",chemical imports. See antidumping filings and
2,864,chi-square distribution,crimes. See also arrests
3,865,criminologists,economic growth and government policies
4,866,economic models,exclusion restrictions
5,867,exogenous explanatory variables,finite sample properties
6,868,firm sales,heterogeneity bias
7,869,heteroskedasticity,independent variables
8,870,independently pooled cross sections,lagged explanatory variables
9,871,Lagrange multiplier (LM) statistics,mean squared error (MSE)


In [17]:
print(input_text[10])











municipal bond interest rates, 237–238
murder rates


SEM, 557
static Phillips curve, 346


n
natural experiments, 457, 521
natural logarithms, 712–715. See also logarithms
netting out, 78
no perfect collinearity assumption


for multiple linear regressions, 84–86, 87
for OLS in matrix form, 810
for time series regressions, 350, 385


no serial correlation assumption. See also serial  
correlation


for 2SLS, 553
for OLS in matrix form, 811
for time series regressions, 353–354, 387–388


nominal dollars, 360
nonexperimental data, 2
nonlinear functions, 710–716
nonlinearities, incorporating in simple regressions,  


41–44
nonnested models


choosing between, 203–205
functional form misspecification and, 307–308


nonrandom samples, 324–326, 615
nonstationary time series processes, 381–382
normal distribution, 745–749
normal sampling distributions


for multiple linear regressions, 120–121
for time series regressions, 355–356


normality assumption
for multiple linear regressi

In [18]:
total_extracted_key_phrase = []
for i in range(20):
    # print(extract_phrase(start_phrase=input_start[i], 
    #                            end_phrase=input_end[i], 
    #                            page_num=input_page_num[i], 
    #                            text=input_text[i]))
    extracted_key_phrase_by_page = extract_phrase(start_phrase=input_start[i], 
                                        end_phrase=input_end[i], 
                                        page_num=input_page_num[i], 
                                        text=input_text[i])
    total_extracted_key_phrase += extracted_key_phrase_by_page
    print('{keyword_num} keywords extracted from page {page_num}'.format(keyword_num=len(extracted_key_phrase_by_page), page_num=input_page_num[i]))

25 keywords extracted from page 862
60 keywords extracted from page 863
58 keywords extracted from page 864
55 keywords extracted from page 865
29 keywords extracted from page 866
34 keywords extracted from page 867
46 keywords extracted from page 868
38 keywords extracted from page 869
46 keywords extracted from page 870
53 keywords extracted from page 871
47 keywords extracted from page 872
27 keywords extracted from page 873
2 keywords extracted from page 874
49 keywords extracted from page 875
38 keywords extracted from page 876
38 keywords extracted from page 877
37 keywords extracted from page 878
28 keywords extracted from page 879
28 keywords extracted from page 880
30 keywords extracted from page 881


In [20]:
sorted(total_extracted_key_phrase)

['2SLS',
 '401(k) plans',
 'AFDC participation',
 'AR(1) models, consistency example',
 'AR(1) serial correlation',
 'AR(2) models',
 'AR(3) serial correlation',
 'AR(q) serial correlation',
 'ARCH model',
 'ASCII files',
 'BLUE (best linear unbiased estimator)',
 'Becker, Gary',
 'Bernoulli random variables',
 'Breusch-Godfrey test',
 'Breusch-Pagan test',
 'CEO salaries',
 'CPI (consumer price index)',
 'Center for Research in Security Prices',
 'Chow tests',
 'Cochrane-Orcutt (CO) estimation',
 'Compustat',
 'Davidson-MacKinnon test',
 'Dickey-Fuller test',
 'Durbin-Watson test',
 'EMH',
 'Eagle-Granger test',
 'EconLit',
 'Engle-Granger two-step procedure',
 'Excel',
 'F and t statistics',
 'F distribution',
 'F statistics',
 'F tests',
 'FDL (finite distributed lag) models, ',
 'FDL model',
 'Federal Bureau of Investigation',
 'GDL (geometric distributed lag)',
 'GDP',
 'GLS estimators',
 'GPA',
 'Gauss-Markov Theorem',
 'Gauss-Markov assumptions',
 'Goldberger, Arthur',
 'Google 

In [28]:
df = pd.DataFrame({'ID': [i for i in range(len(total_extracted_key_phrase))],
            'phrase': sorted(total_extracted_key_phrase),
            'textbook': ['Wooldridge - Introductory Econometrics' for _ in range(len(total_extracted_key_phrase))],
            'definition_or_explanations': ['' for _ in range(len(total_extracted_key_phrase))]})

In [29]:
df.head()

Unnamed: 0,ID,phrase,textbook,definition_or_explanations
0,0,2SLS,Wooldridge - Introductory Econometrics,
1,1,401(k) plans,Wooldridge - Introductory Econometrics,
2,2,AFDC participation,Wooldridge - Introductory Econometrics,
3,3,"AR(1) models, consistency example",Wooldridge - Introductory Econometrics,
4,4,AR(1) serial correlation,Wooldridge - Introductory Econometrics,


In [30]:
df.to_csv('Wooldridge_extracted_phrases.csv', index=False)