In [1]:
import requests
from bs4 import BeautifulSoup
import re
import json
import pandas as pd
from tqdm import tqdm
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nbzhy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# get all files
import os
import glob

# Directory (relative to the notebook's start directory) that holds the article HTML files.
HTML_DIR = 'papers/RFS_2000_2021_full_text_html'

# change dir -- later cells use paths relative to this directory ('../', '../../').
# The chdir is skipped when the kernel is already inside HTML_DIR, so re-running
# this cell no longer fails on the relative path that can no longer be resolved.
if not os.getcwd().endswith(os.path.normpath(HTML_DIR)):
    os.chdir(HTML_DIR)

# glob does not guarantee any ordering; sorting makes the processing order
# (and therefore the row order of the extracted table) reproducible.
files = sorted(glob.glob("*.html"))

In [3]:
## sample file for test purpose
# from random import sample
# import random
# random.seed(42)
# files = sample(files,100)


In [4]:
# Preview the first three file names as a quick check that the glob found the HTML files.
files[:3]

['0.html', '1.html', '10.html']

In [5]:
# Rules 2-5: (rule id, compiled pattern) pairs; each is searched once per paragraph
# and the whole matched span is kept. Raw strings are used so that '\.' and '\d'
# reach the regex engine without relying on invalid string escapes.
_INLINE_RULES = (
    (2, re.compile(r'we.{1,10}hypothesize that.{1,300}\.', re.I)),
    (3, re.compile(r'hypothesis \d[a-z]?[\s:].{1,300}', re.I)),
    (4, re.compile(r'Our.{1,10}hypothesis (\bis\b|\bwas\b) that.{1,300}\.', re.I)),
    (5, re.compile(r'we.{5,15}the hypothesis that.{1,300}\.', re.I)),
)

# Rule 6: a 100-400 character window that ends right before a
# "To test this hypothesis" / "To test these hypotheses" lead-in (case-sensitive).
_LEAD_IN_PATTERNS = (
    re.compile(r'.{100,400}\. To test this hypothesis'),
    re.compile(r'.{100,400}\. To test these hypotheses'),
)


def get_hypothesis(soup):
    """Extract candidate hypothesis statements from one parsed article.

    Parameters
    ----------
    soup : object exposing ``find_all(name=..., attrs=...)`` whose results have
        a ``.text`` attribute (in this notebook, a ``BeautifulSoup`` document).

    Returns
    -------
    (hypotheses, rules) : tuple of two lists of equal length
        ``hypotheses[i]`` is an extracted text span and ``rules[i]`` is the id
        (1-6) of the rule that produced it. Results are ordered by rule id;
        within rules 1-5 they follow document order of the matched elements,
        and rule 6 visits paragraphs in document order trying the
        "this hypothesis" pattern before the "these hypotheses" pattern.

    Rules
    -----
    1. Full text of every ``div`` found with ``attrs='statement'`` that
       contains "hypothes" (case-insensitive).
    2-5. First match, per ``p`` found with ``attrs='chapter-para'``, of the
       corresponding pattern in ``_INLINE_RULES``.
    6. The sentence immediately preceding a "To test this hypothesis" /
       "To test these hypotheses" lead-in, as split by ``nltk.sent_tokenize``.

    Notes
    -----
    Exceptions raised by ``nltk.sent_tokenize`` propagate to the caller instead
    of being swallowed, so a broken tokenizer setup cannot silently remove all
    rule-6 results.
    """
    hypotheses = []
    rules = []

    # Rule 1: formally stated hypotheses.
    for statement in soup.find_all(name='div', attrs='statement'):
        text = statement.text
        if re.search('hypothes', text, re.I):
            hypotheses.append(text)
            rules.append(1)

    # Rules 2-6 all scan the same paragraphs; extract their text once.
    paragraphs = [para.text for para in soup.find_all(name='p', attrs='chapter-para')]

    # Rules 2-5: the rule loop is the outer loop so results stay grouped by rule id.
    for rule_id, pattern in _INLINE_RULES:
        for text in paragraphs:
            match = pattern.search(text)
            if match:
                hypotheses.append(match.group(0))
                rules.append(rule_id)

    # Rule 6.
    for text in paragraphs:
        for pattern in _LEAD_IN_PATTERNS:
            match = pattern.search(text)
            if match is None:
                continue
            # '.' never matches a newline, so the window is always a single line
            # and needs no further trimming before sentence splitting.
            sentences = nltk.sent_tokenize(match.group(0))
            # The last sentence is the lead-in itself; the one before it is the
            # hypothesis. If the tokenizer did not split the window there is no
            # preceding sentence -- skip this pattern only, so the other lead-in
            # pattern is still tried on the same paragraph.
            if len(sentences) < 2:
                continue
            hypotheses.append(sentences[-2])
            rules.append(6)

    return hypotheses, rules

In [6]:
def main(file):
    """Parse one article HTML file and return its extracted hypotheses.

    The file is opened as UTF-8 text and parsed with the 'html.parser' backend;
    the result of ``get_hypothesis`` on the parsed document is returned as-is.
    """
    with open(file, encoding='utf-8') as html_handle:
        parsed_page = BeautifulSoup(html_handle, 'html.parser')
    extracted = get_hypothesis(parsed_page)
    return extracted

## Get hypotheses from all free articles

In [7]:
# Three parallel lists: entry i of each describes one extracted hypothesis
# (source file, extracted text, id of the rule that produced it).
file_names = []
H = []
R = []
for file in tqdm(files):
    hypotheses, rules = main(file)
    H.extend(hypotheses)
    R.extend(rules)
    # One file-name entry per hypothesis keeps the three lists aligned;
    # files without any hypothesis contribute nothing.
    file_names.extend([file] * len(hypotheses))
    

100%|██████████| 1813/1813 [43:06<00:00,  1.43s/it]


In [8]:
# The three lists are filled in lockstep by the loop above, so the lengths should be equal.
len(file_names), len(H), len(R)

(940, 940, 940)

In [9]:
import pandas as pd
# One row per extracted hypothesis: source file, extracted text, producing rule id.
df = pd.DataFrame(dict(file=file_names, hypothesis=H, rules=R))

In [10]:
# Show the first rows of the extracted-hypothesis table.
df.head()

Unnamed: 0,file,hypothesis,rules
0,0.html,Banks with low Tier 1 ratios are closer to reg...,6
1,0.html,If firms cannot easily substitute to external ...,6
2,1000.html,we examine the hypothesis that the observed di...,5
3,1000.html,we test the hypothesis that investors and lend...,5
4,1002.html,We hypothesize that skilled managers with inno...,2


In [11]:
# Drop rows that are identical in every column (file, hypothesis, rules).
# The same text found by two different rules is kept, because the rule ids differ.
df = df.drop_duplicates()

In [17]:
# Write the de-duplicated hypotheses to CSV; the path is relative to the
# current working directory, and the DataFrame index is written as the first column.
# NOTE(review): this cell's execution count (17) is higher than that of the cells
# below it, i.e. it was run out of order -- confirm with Restart & Run All.
df.to_csv('../../RFS_extracted_hypothesis.csv')

In [12]:
# Load the per-article metadata table, using the first CSV column as the index.
# The path is relative to the current working directory.
metadata = pd.read_csv('../RFS_2000_2021.csv', index_col=0)
metadata.head()

Unnamed: 0,doi,title,volume,issue,abstract,full_text_filename,full_text_html_filename
0,https://doi-org.libproxy1.nus.edu.sg/10.1093/r...,The Rise of Shadow Banking: Evidence from Capi...,34,5,We investigate the connections between bank ca...,,0.html
1,https://doi-org.libproxy1.nus.edu.sg/10.1093/r...,Mortgage Securitization and Shadow Bank Lending,34,5,We show how securitization affects the size of...,,1.html
2,https://doi-org.libproxy1.nus.edu.sg/10.1093/r...,Learning about Competitors: Evidence from SME ...,34,5,We study how small and medium enterprise (SME)...,,2.html
3,https://doi-org.libproxy1.nus.edu.sg/10.1093/r...,"Marketplace Lending, Information Aggregation, ...",34,5,Lending marketplaces aimed at directly connect...,,3.html
4,https://doi-org.libproxy1.nus.edu.sg/10.1093/r...,Mutual Funds as Venture Capitalists? Evidence ...,34,5,“Founder-friendly” venture financings and nont...,,4.html


In [13]:
# Attach article metadata to every extracted hypothesis. The right join keeps
# all rows of `df`; the join key from `df` ('file') is dropped afterwards since
# it duplicates 'full_text_html_filename' for matched rows.
H_merged = (
    pd.merge(
        metadata,
        df,
        how='right',
        left_on='full_text_html_filename',
        right_on='file',
    )
    .drop(columns='file')
)

In [14]:
# Show the first rows of the merged table (metadata columns plus hypothesis and rule id).
H_merged.head()

Unnamed: 0,doi,title,volume,issue,abstract,full_text_filename,full_text_html_filename,hypothesis,rules
0,https://doi-org.libproxy1.nus.edu.sg/10.1093/r...,The Rise of Shadow Banking: Evidence from Capi...,34,5,We investigate the connections between bank ca...,,0.html,Banks with low Tier 1 ratios are closer to reg...,6
1,https://doi-org.libproxy1.nus.edu.sg/10.1093/r...,The Rise of Shadow Banking: Evidence from Capi...,34,5,We investigate the connections between bank ca...,,0.html,If firms cannot easily substitute to external ...,6
2,https://doi-org.libproxy1.nus.edu.sg/10.1093/r...,"Marketplace Lending, Information Aggregation, ...",34,5,Lending marketplaces aimed at directly connect...,,3.html,we reject the hypothesis that the interest rat...,5
3,https://doi-org.libproxy1.nus.edu.sg/10.1093/r...,"Marketplace Lending, Information Aggregation, ...",34,5,Lending marketplaces aimed at directly connect...,,3.html,we cannot reject the hypothesis that such a sw...,5
4,https://doi-org.libproxy1.nus.edu.sg/10.1093/r...,Mutual Funds as Venture Capitalists? Evidence ...,34,5,“Founder-friendly” venture financings and nont...,,4.html,Hypothesis 1.Mutual funds are more likely to i...,1


In [15]:
# Write the merged table to CSV; the path is relative to the current working
# directory, and the DataFrame index is written as the first column.
H_merged.to_csv('../../RFS_extracted_hypothesis_with_paper_info.csv')

In [16]:
# Inspect only the rows produced by rule 6 (sentence preceding a "To test this/these ..." lead-in).
H_merged[H_merged['rules']==6]

Unnamed: 0,doi,title,volume,issue,abstract,full_text_filename,full_text_html_filename,hypothesis,rules
0,https://doi-org.libproxy1.nus.edu.sg/10.1093/r...,The Rise of Shadow Banking: Evidence from Capi...,34,5,We investigate the connections between bank ca...,,0.html,Banks with low Tier 1 ratios are closer to reg...,6
1,https://doi-org.libproxy1.nus.edu.sg/10.1093/r...,The Rise of Shadow Banking: Evidence from Capi...,34,5,We investigate the connections between bank ca...,,0.html,If firms cannot easily substitute to external ...,6
11,https://doi-org.libproxy1.nus.edu.sg/10.1093/r...,Mutual Funds as Venture Capitalists? Evidence ...,34,5,“Founder-friendly” venture financings and nont...,,4.html,If mutual funds are indeed negotiating and sel...,6
14,https://doi-org.libproxy1.nus.edu.sg/10.1093/r...,Public Market Players in the Private World: Im...,34,5,We investigate the effect of pre-IPO investmen...,,5.html,"In a similar spirit, we expect that institutio...",6
15,https://doi-org.libproxy1.nus.edu.sg/10.1093/r...,Public Market Players in the Private World: Im...,34,5,We investigate the effect of pre-IPO investmen...,,5.html,We hypothesize that the institutions’ pre-IPO ...,6
...,...,...,...,...,...,...,...,...,...
863,https://doi-org.libproxy1.nus.edu.sg/10.1093/r...,Confronting Information Asymmetries: Evidence ...,17,2,There are relatively few direct tests of the e...,,1759.html,Predictions 6 (VTB and bank debt substitution)...,6
871,https://doi-org.libproxy1.nus.edu.sg/10.1093/r...,Greener Pastures and the Impact of Dynamic Ins...,16,4,Although institutional investors have a prefer...,,1779.html,If institutions are partially responsible for ...,6
872,https://doi-org.libproxy1.nus.edu.sg/10.1093/r...,Greener Pastures and the Impact of Dynamic Ins...,16,4,Although institutional investors have a prefer...,,1779.html,We hypothesize that institutional investors ma...,6
920,https://doi-org.libproxy1.nus.edu.sg/10.1093/r...,Financial Constraints and Stock Returns,14,2,We test whether the impact of financial constr...,,1894.html,They conjecture that common variation in stock...,6
