In [1]:
import requests
from bs4 import BeautifulSoup
import re
import json
import pandas as pd
from tqdm import tqdm

In [2]:
# get all files
import os
import glob

# Directory holding the downloaded full-text HTML pages, relative to the
# directory the notebook is launched from.
HTML_DIR = os.path.join('papers', 'RFS_2000_2021_full_text_html')

# change dir -- later cells use paths relative to this directory.  Skip the
# chdir when we are already inside it, so re-running this cell does not fail
# on the (now unresolvable) relative path.
if not os.path.normpath(os.getcwd()).endswith(os.path.normpath(HTML_DIR)):
    os.chdir(HTML_DIR)

# glob returns names in arbitrary, OS-dependent order; sort so that every run
# processes the files in the same sequence.
files = sorted(glob.glob("*.html"))

In [3]:
# Sanity check: show the first three discovered file names.
files[:3]

['0.html', '1.html', '10.html']

In [4]:
def get_hypothesis(soup):
    """Extract hypothesis statements from a parsed article page.

    Six heuristics are applied in order and every hit is appended to the
    returned list (duplicates across heuristics are NOT removed here):

    1. Full text of every ``div`` matched by ``attrs='statement'`` whose text
       mentions "hypothes" (case-insensitive).
    2. "we ... hypothesize that ... ." snippets.
    3. "hypothesis <digit>[letter]" labels followed by up to 300 characters.
    4. "Our ... hypothesis is/was that ... ." snippets.
    5. "we ... the hypothesis that ... ." snippets.
    6. The 100-400 characters (ending in a period) that precede
       "To test this hypothesis" / "To test these hypotheses"
       (case-sensitive).

    Heuristics 2-6 only look at ``p`` elements matched by
    ``attrs='chapter-para'`` and yield at most one hit per paragraph per
    pattern (``re.search`` returns the first match only). ``.`` does not match
    newlines, so a snippet never spans a line break.

    Args:
        soup: Object exposing ``find_all(name=..., attrs=...)`` whose results
            have a ``.text`` attribute (e.g. a ``BeautifulSoup`` document).

    Returns:
        list[str]: Matched snippets grouped by heuristic; within heuristics
        1-5 they follow document order, and within heuristic 6 they follow
        paragraph order ("this hypothesis" before "these hypotheses").
    """
    hypotheses = []

    # case 1: dedicated statement boxes that mention a hypothesis.
    for box in soup.find_all(name='div', attrs='statement'):
        content = box.text
        if re.search('hypothes', content, re.I):
            hypotheses.append(content)

    # Paragraph text is extracted once and reused by cases 2-6.
    paragraphs = [para.text for para in soup.find_all(name='p', attrs='chapter-para')]

    # cases 2-5: case-insensitive phrase patterns; the whole match is kept.
    # Raw strings avoid the invalid-escape warnings of '\.' / '\d' / '\s'.
    phrase_patterns = (
        r'we.{1,10}hypothesize that.{1,300}\.',                     # case 2
        r'hypothesis \d[a-z]?[\s:].{1,300}',                        # case 3
        r'Our.{1,10}hypothesis (\bis\b|\bwas\b) that.{1,300}\.',    # case 4
        r'we.{5,15}the hypothesis that.{1,300}\.',                  # case 5
    )
    # Pattern-major iteration keeps the historical result order
    # (all case-2 hits, then all case-3 hits, ...).
    for pattern in phrase_patterns:
        for content in paragraphs:
            res = re.search(pattern, content, re.I)
            if res:
                hypotheses.append(res.group(0))

    # case 6: keep only the context preceding the trigger phrase.  A capture
    # group replaces the old fixed-width slice, which left a trailing space
    # for the (one character longer) "these hypotheses" variant.
    context_patterns = (
        r'(.{100,400}\.) To test this hypothesis',
        r'(.{100,400}\.) To test these hypotheses',
    )
    for content in paragraphs:
        for pattern in context_patterns:
            res = re.search(pattern, content)
            if res:
                hypotheses.append(res.group(1))

    return hypotheses

In [7]:
def main(file):
    """Parse one saved HTML page and return the hypotheses found in it.

    Args:
        file: Path of the HTML file to read (opened as UTF-8).

    Returns:
        Whatever ``get_hypothesis`` returns for the parsed page.
    """
    with open(file, encoding='utf-8') as html_handle:
        parsed_page = BeautifulSoup(html_handle, 'html.parser')
    found = get_hypothesis(parsed_page)
    return found

## Extract hypotheses from all free articles

In [9]:
# Run the extractor on every page; pages without any hit are left out of `results`.
results = {}
for file in tqdm(files):
    extracted = main(file)
    if not extracted:
        continue
    results[file] = extracted
    

100%|██████████| 1813/1813 [45:38<00:00,  1.51s/it]


In [10]:
import pandas as pd
# One row per page that produced at least one hit: file name plus its list of snippets.
df = pd.DataFrame(
    data={
        'file': [*results.keys()],
        'hypotheses': [*results.values()],
    }
)

In [11]:
# Preview the first rows of the per-file results table.
df.head()

Unnamed: 0,file,hypotheses
0,0.html,[We begin our analysis by examining the statis...
1,1000.html,[we examine the hypothesis that the observed d...
2,1002.html,[We hypothesize that skilled managers with inn...
3,1005.html,[we hypothesize that the market wide investor ...
4,1006.html,[we cannot reject the hypothesis that the inve...


In [12]:
# drop duplicates
df['hypotheses'] = df['hypotheses'].apply(lambda x: list(set(x)))
df['num_hypotheses'] = df['hypotheses'].apply(len)

In [13]:
# Total number of de-duplicated snippets across all files.
df['num_hypotheses'].sum()

928

In [14]:
# Save the per-file table (without the DataFrame index) two directories above the current one.
df.to_csv(path_or_buf='../../RFS_extracted_hypothesis.csv', index=False)

In [19]:
### reformat dataframe ###
# Flatten to two parallel lists: one entry per (file, snippet) pair.
file_name = []
h = []
for paper_file, paper_snippets in zip(df['file'], df['hypotheses']):
    for snippet in paper_snippets:
        file_name.append(paper_file)
        h.append(snippet)

In [20]:
# Number of flattened (file, snippet) pairs; should equal len(h).
len(file_name)

928

In [21]:
# Number of flattened snippets; should equal len(file_name).
len(h)

928

In [22]:
# Long-format table: one row per (file, snippet) pair.
df_hypotheses_reformatted = pd.DataFrame(
    data={'file_name': file_name, 'hypothesis': h},
)

In [23]:
# Preview the long-format table.
df_hypotheses_reformatted.head()

Unnamed: 0,file_name,hypothesis
0,0.html,ether this reduction in syndicated credit matt...
1,0.html,We begin our analysis by examining the statist...
2,1000.html,we examine the hypothesis that the observed di...
3,1000.html,we test the hypothesis that investors and lend...
4,1002.html,We hypothesize that skilled managers with inno...


In [24]:
# Load the paper metadata table (path is relative to the current working
# directory); the first CSV column becomes the index.
metadata = pd.read_csv(filepath_or_buffer='../RFS_2000_2021.csv', index_col=0)
metadata.head(n=5)

Unnamed: 0,doi,title,volume,issue,abstract,full_text_filename,full_text_html_filename
0,https://doi-org.libproxy1.nus.edu.sg/10.1093/r...,The Rise of Shadow Banking: Evidence from Capi...,34,5,We investigate the connections between bank ca...,,0.html
1,https://doi-org.libproxy1.nus.edu.sg/10.1093/r...,Mortgage Securitization and Shadow Bank Lending,34,5,We show how securitization affects the size of...,,1.html
2,https://doi-org.libproxy1.nus.edu.sg/10.1093/r...,Learning about Competitors: Evidence from SME ...,34,5,We study how small and medium enterprise (SME)...,,2.html
3,https://doi-org.libproxy1.nus.edu.sg/10.1093/r...,"Marketplace Lending, Information Aggregation, ...",34,5,Lending marketplaces aimed at directly connect...,,3.html
4,https://doi-org.libproxy1.nus.edu.sg/10.1093/r...,Mutual Funds as Venture Capitalists? Evidence ...,34,5,“Founder-friendly” venture financings and nont...,,4.html


In [26]:
# Attach paper metadata to every snippet.  how='right' keeps every snippet row
# even when no metadata row matches; the join key from the right-hand side is
# dropped afterwards because 'full_text_html_filename' carries the same value
# for matched rows.
H_merged = (
    metadata
    .merge(
        df_hypotheses_reformatted,
        how='right',
        left_on='full_text_html_filename',
        right_on='file_name',
    )
    .drop(columns='file_name')
)

In [27]:
# Preview the merged table (metadata columns plus one snippet per row).
H_merged.head()

Unnamed: 0,doi,title,volume,issue,abstract,full_text_filename,full_text_html_filename,hypothesis
0,https://doi-org.libproxy1.nus.edu.sg/10.1093/r...,The Rise of Shadow Banking: Evidence from Capi...,34,5,We investigate the connections between bank ca...,,0.html,ether this reduction in syndicated credit matt...
1,https://doi-org.libproxy1.nus.edu.sg/10.1093/r...,The Rise of Shadow Banking: Evidence from Capi...,34,5,We investigate the connections between bank ca...,,0.html,We begin our analysis by examining the statist...
2,https://doi-org.libproxy1.nus.edu.sg/10.1093/r...,"Marketplace Lending, Information Aggregation, ...",34,5,Lending marketplaces aimed at directly connect...,,3.html,we reject the hypothesis that the interest rat...
3,https://doi-org.libproxy1.nus.edu.sg/10.1093/r...,"Marketplace Lending, Information Aggregation, ...",34,5,Lending marketplaces aimed at directly connect...,,3.html,we cannot reject the hypothesis that such a sw...
4,https://doi-org.libproxy1.nus.edu.sg/10.1093/r...,Mutual Funds as Venture Capitalists? Evidence ...,34,5,“Founder-friendly” venture financings and nont...,,4.html,Hypothesis 3.Mutual funds are more likely to i...


In [29]:
# Save the merged table; no `index` argument is passed, so pandas' default
# index handling applies (unlike the earlier export, which used index=False).
H_merged.to_csv(path_or_buf='../../RFS_extracted_hypothesis_with_paper_info.csv')