In [1]:
import re
import glob
import pandas as pd
from tqdm import tqdm

In [2]:
# Collect the article full-text files.
import os

# Later cells use paths relative to the full-text directory (e.g. '../...'),
# so make it the working directory. The guard keeps this cell re-runnable:
# a second os.chdir() into the same relative path would raise
# FileNotFoundError because we are already inside the directory.
TEXT_DIR = 'JoF_2000_2021_full_text'
if os.path.basename(os.getcwd()) != TEXT_DIR:
    os.chdir(TEXT_DIR)

# glob returns names in arbitrary (OS-dependent) order; sort so positional
# lookups such as files[50] and the row order of the results are reproducible.
files = sorted(glob.glob("*.txt"))


In [3]:
def get_text(path_to_file):
    """Return the full contents of a UTF-8 text file as a single string.

    Parameters
    ----------
    path_to_file : str or path-like
        Location of the file to read.

    Returns
    -------
    str
        The decoded file contents.

    Raises
    ------
    OSError
        If the file cannot be opened.
    UnicodeDecodeError
        If the file is not valid UTF-8.
    """
    # The context manager closes the handle even if read() raises; this
    # function is called once per article, so leaked handles would pile up.
    with open(path_to_file, "r", encoding='utf8') as f:
        return f.read()

In [4]:
def get_hypothesis(text):
    """Extract hypothesis statements from an article's text using regex rules.

    Newline characters are removed (not replaced by spaces) before matching,
    so each pattern runs against one continuous line; this matters because
    `.` does not match a newline. Quantifiers are greedy, so a match extends
    to the last period that still fits inside the pattern's character window.

    Rules 2-5 are matched case-insensitively; rule 6 is case-sensitive and
    returns the 100-400 characters preceding (and including) the phrase
    "To test this hypothesis" / "To test these hypotheses".

    Parameters
    ----------
    text : str
        Full text to search.

    Returns
    -------
    tuple of (list of str, list of int)
        The matched snippets and, in the same positions, the number of the
        rule that produced each snippet. Results are grouped by rule in the
        order the rules are listed below.
    """
    # (rule number, pattern, regex flags) in emission order.
    rule_table = (
        (2, r'we.{1,10}hypothesize that.{1,300}\.', re.I),
        (3, r'hypothesis \d[a-z]?[\s:].{1,300}\.', re.I),
        (4, r'Our.{1,10}hypothesis is that.{1,300}\.', re.I),
        (4, r'Our.{1,10}hypothesis was that.{1,300}\.', re.I),
        (5, r'we.{5,15}the hypothesis that.{1,300}\.', re.I),
        (6, r'.{100,400}\. To test this hypothesis', 0),
        (6, r'.{100,400}\. To test these hypotheses', 0),
    )

    flattened = text.replace('\n', '')

    hypotheses = []
    rules = []
    for rule_id, pattern, flags in rule_table:
        found = re.findall(pattern, flattened, flags)
        hypotheses.extend(found)
        rules.extend([rule_id] * len(found))

    return hypotheses, rules

In [5]:
def main(file):
    """Read one article file and extract its hypothesis statements.

    Parameters
    ----------
    file : str or path-like
        Path of the text file, passed unchanged to ``get_text``.

    Returns
    -------
    tuple
        Whatever ``get_hypothesis`` returns for the file's text: a pair of
        parallel lists (matched snippets, rule numbers).
    """
    return get_hypothesis(get_text(file))

In [6]:
# Spot-check the extractor on a single file (position 50 in `files`).
main(files[50])

([], [])

## Get hypotheses from all free articles

In [7]:
results = {}  # not used in this cell; retained so the name stays defined

# Run the extractor over every file and gather one
# (file_name, hypothesis, rule) record per match. The frame is built once
# after the loop: calling pd.concat inside the loop re-copies all rows
# collected so far on every iteration.
records = []
for file in tqdm(files):
    hypotheses, rules = main(file)
    records.extend(
        (file, hypothesis, rule) for hypothesis, rule in zip(hypotheses, rules)
    )

# Explicit columns keep the schema stable even when nothing matched, so later
# cells that access H.rule / H.file_name still work on an empty result.
H = pd.DataFrame(records, columns=['file_name', 'hypothesis', 'rule'])


100%|██████████| 1889/1889 [05:57<00:00,  5.28it/s]


In [8]:
# Remove rows that are exact duplicates across all columns, then renumber the
# rows from 0 without keeping the old index as a column.
H = H.drop_duplicates().reset_index(drop=True)


In [9]:
# Preview the first rows of the extracted hypotheses.
H.head()

Unnamed: 0,file_name,hypothesis,rule
0,10.txt,"We hypothesize that, by increasing the demand ...",2
1,10.txt,we examine the hypothesis that investors who l...,5
2,1002.txt,We reject the hypothesis that time effects can...,5
3,1004.txt,Our hypothesis is that the past changes in pri...,4
4,1004.txt,Our null hypothesis is that breaking the B/M r...,4


In [10]:
# (number of rows, number of columns) after de-duplication.
H.shape

(1055, 3)

In [11]:
# Number of extracted rows per rule number, most frequent first.
H.rule.value_counts()

3    418
2    212
5    193
6    132
4    100
Name: rule, dtype: int64

In [12]:
# Save the extracted hypotheses. The path is relative to the current working
# directory, which an earlier cell changed with os.chdir(); index=False leaves
# the row index out of the file.
H.to_csv('JoF_extracted_hypothesis.csv', index=False)

In [13]:
# Load the article metadata table from the parent of the current working
# directory; index_col=0 uses the first CSV column as the row index.
JoF_metadata = pd.read_csv('../JoF_2000_2021.csv', index_col=0)
# Preview the first rows.
JoF_metadata.head()

Unnamed: 0,doi,title,publish_date,publish_month,publish_year,volume,issue,abstract,full_text_filename
0,https://onlinelibrary-wiley-com.libproxy1.nus....,Rent Extraction with Securities Plus Cash,16,March,2021,N.A.,Accepted Articles,"In our target‐initiated theory of takeovers, a...",0.txt
1,https://onlinelibrary-wiley-com.libproxy1.nus....,Are CEOs Different?,9,March,2021,N.A.,Accepted Articles,"Using 2,603 executive assessments, we study ho...",1.txt
2,https://onlinelibrary-wiley-com.libproxy1.nus....,Subjective Cash Flow and Discount Rate Expecta...,26,February,2021,N.A.,Accepted Articles,Why do stock prices vary? Using survey forecas...,2.txt
3,https://onlinelibrary-wiley-com.libproxy1.nus....,Banking on Deposits: Maturity Transformation w...,15,February,2021,N.A.,Accepted Articles,We show that maturity transformation does not ...,3.txt
4,https://onlinelibrary-wiley-com.libproxy1.nus....,"For Richer, for Poorer: Bankers' Liability and...",11,February,2021,N.A.,Accepted Articles,We study whether banks are riskier if managers...,4.txt


In [16]:
# Attach article metadata to every extracted hypothesis. how='right' keeps all
# rows of H even when no metadata row has a matching full_text_filename; the
# join key from H is dropped afterwards because full_text_filename is kept.
H_merged = JoF_metadata.merge(
    H,
    how='right',
    left_on='full_text_filename',
    right_on='file_name',
).drop(columns='file_name')

In [17]:
# Preview the merged table (metadata columns followed by hypothesis and rule).
H_merged.head()

Unnamed: 0,doi,title,publish_date,publish_month,publish_year,volume,issue,abstract,full_text_filename,hypothesis,rule
0,https://onlinelibrary-wiley-com.libproxy1.nus....,The Economics of Hedge Fund Startups: Theory a...,5,February,2021,N.A.,Accepted Articles,This paper examines how market frictions influ...,7.txt,HYPOTHESIS 1 (Two types of inceptions): Hot in...,3
1,https://onlinelibrary-wiley-com.libproxy1.nus....,The Economics of Hedge Fund Startups: Theory a...,5,February,2021,N.A.,Accepted Articles,This paper examines how market frictions influ...,7.txt,HYPOTHESIS 2 (Value‐creating inceptions): Ince...,3
2,https://onlinelibrary-wiley-com.libproxy1.nus....,The Economics of Hedge Fund Startups: Theory a...,5,February,2021,N.A.,Accepted Articles,This paper examines how market frictions influ...,7.txt,HYPOTHESIS 3 (The impact of family structure o...,3
3,https://onlinelibrary-wiley-com.libproxy1.nus....,The Economics of Hedge Fund Startups: Theory a...,5,February,2021,N.A.,Accepted Articles,This paper examines how market frictions influ...,7.txt,HYPOTHESIS 4 (Two types of inceptions within f...,3
4,https://onlinelibrary-wiley-com.libproxy1.nus....,The Economics of Hedge Fund Startups: Theory a...,5,February,2021,N.A.,Accepted Articles,This paper examines how market frictions influ...,7.txt,Hypothesis 3 that stand‐alone inceptions outpe...,3


In [18]:
# Save the merged table to the parent of the current working directory.
# No index=False here, so the row index is written as the first CSV column.
H_merged.to_csv('../JoF_extracted_hypothesis_with_paper_info.csv')