In [1]:
import glob
import os
import re

import pandas as pd
from tqdm import tqdm

In [2]:
# Journal identifier; used below as the prefix of the exported CSV file names.
journal = 'JoFE'

In [3]:
# Collect every full-text article file.
# The working directory is switched to the full-text folder so that the glob
# below yields bare file names ("<id>.txt"). NOTE: this changes kernel-wide
# state, so re-running this cell without restarting the kernel will fail
# because the relative path no longer resolves.
os.chdir('papers/JoFE_2000_2021_full_text/')
# glob returns matches in arbitrary (filesystem-dependent) order; sorting makes
# positional lookups such as files[50] and the output row order reproducible.
files = sorted(glob.glob("*.txt"))


In [4]:
# Read the article metadata table that sits one level above the full-text
# folder; the first CSV column is used as the row index.
metadata_path = '../JoFE_2000_2021.csv'
metadata = pd.read_csv(metadata_path, index_col=0)
metadata.head()

Unnamed: 0,title,doi,volume,issue,month,year,abstract,full_text_filename
0,Commonality in liquidity,https://doi.org/10.1016/S0304-405X(99)00057-4,56,1,April,2000,"Traditionally and understandably, the microsco...",0.txt
1,Just another day in the inter-bank foreign exc...,https://doi.org/10.1016/S0304-405X(99)00058-6,56,1,April,2000,"In this paper, I develop a theory of bid–ask q...",1.txt
2,The costs and determinants of order aggressive...,https://doi.org/10.1016/S0304-405X(99)00059-8,56,1,April,2000,This paper examines the costs and determinants...,2.txt
3,The relative pricing of U.S. Treasury STRIPS: ...,https://doi.org/10.1016/S0304-405X(99)00060-4,56,1,April,2000,We investigate pricing relations and the poten...,3.txt
4,"Eighths, sixteenths, and market depth: changes...",https://doi.org/10.1016/S0304-405X(99)00061-6,56,1,April,2000,"Using limit order data provided by the NYSE, w...",4.txt


In [5]:
def get_text(path_to_file):
    """Read a UTF-8 encoded text file and return its entire content.

    Args:
        path_to_file: Path of the text file to read.

    Returns:
        The complete file content as a single string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    # The context manager guarantees the handle is closed, even if read()
    # raises; this matters because the function is called once per article.
    with open(path_to_file, "r", encoding='utf8') as f:
        return f.read()

In [6]:
def get_hypothesis(text):
    """Extract hypothesis-like passages from an article using regex rules.

    All newline characters are removed first (without inserting a space, so
    the last word of one line is joined to the first word of the next), then
    each rule pattern is applied in a fixed order to the flattened text.

    Rules (the number is what ends up in the returned ``rules`` list):
        2: "we ... hypothesize that ..."            (case-insensitive)
        3: "hypothesis <digit>[letter] ..."         (case-insensitive)
        4: "Our ... hypothesis is/was that ..."     (case-insensitive)
        5: "we ... the hypothesis that ..."         (case-insensitive)
        6: 100-400 characters preceding ". To test this hypothesis" or
           ". To test these hypotheses"             (case-sensitive)

    The trailing ``.{1,300}\\.`` part is greedy, so a match extends to the
    last period found within 300 characters and may span several sentences.
    The patterns carry no word-boundary anchors, so e.g. "we" can also match
    inside a longer word.

    Args:
        text: Full article text.

    Returns:
        A tuple ``(hypotheses, rules)`` of two equally long lists: the matched
        strings and, at the same position, the rule number that produced each.
    """
    # (rule number, pattern, regex flags), listed in the order results are emitted.
    rule_table = (
        (2, r'we.{1,10}hypothesize that.{1,300}\.', re.I),
        (3, r'hypothesis \d[a-z]?[\s:].{1,300}\.', re.I),
        (4, r'Our.{1,10}hypothesis is that.{1,300}\.', re.I),
        (4, r'Our.{1,10}hypothesis was that.{1,300}\.', re.I),
        (5, r'we.{5,15}the hypothesis that.{1,300}\.', re.I),
        (6, r'.{100,400}\. To test this hypothesis', 0),
        (6, r'.{100,400}\. To test these hypotheses', 0),
    )

    # Drop line breaks so that "." in the patterns can run across original lines.
    flattened = ''.join(text.split('\n'))

    hypotheses = []
    rules = []
    for rule_number, pattern, flags in rule_table:
        found = re.findall(pattern, flattened, flags)
        hypotheses.extend(found)
        rules.extend([rule_number] * len(found))

    return hypotheses, rules

In [7]:
def main(file):
    """Load one article file and run the hypothesis extraction on its text.

    Args:
        file: Path of the article text file.

    Returns:
        Whatever ``get_hypothesis`` returns for the file's content, i.e. the
        pair of lists (matched passages, rule numbers).
    """
    return get_hypothesis(get_text(file))

In [8]:
# Spot-check the extraction on a single article (position 50 in the file list).
main(files[50])

(['Our central hypothesis is that greater transparency in fund operating expenditures results in lower agency costs, i.e., better return performance. A key requirement for our empirical test is comparability of services received for expensed versus bundled payments.'],
 [4])

## Extract hypotheses from all free articles

In [9]:
# Run the extractor over every article and gather one row per matched passage.
results = {}  # NOTE(review): not read by any cell visible here; kept in case a later cell relies on it.

# Accumulate plain records and build the frame once at the end; growing a
# DataFrame with pd.concat inside the loop copies all previous rows each time.
records = []
for file in tqdm(files):
    hypotheses, rules = main(file)
    records.extend(
        {'file_name': file, 'hypothesis': hypothesis, 'rule': rule}
        for hypothesis, rule in zip(hypotheses, rules)
    )

# Explicit columns keep the schema stable even when nothing was extracted, so
# later cells that access H.rule or merge on file_name do not fail.
H = pd.DataFrame(records, columns=['file_name', 'hypothesis', 'rule'])


100%|██████████| 2216/2216 [05:43<00:00,  6.45it/s]


In [10]:
# Remove exact duplicate rows and renumber the rows 0..n-1, discarding the old index.
H = H.drop_duplicates().reset_index(drop=True)


In [11]:
# Preview the first rows of the de-duplicated extraction table.
H.head()

Unnamed: 0,file_name,hypothesis,rule
0,1001.txt,we hypothesize that similar agents react simil...,2
1,1001.txt,Hypothesis 2 that if small shareholder similar...,3
2,1001.txt,Hypothesis 4 posits that the impact of noncont...,3
3,1001.txt,n that large changes in similarity are rare an...,6
4,1002.txt,Our null hypothesis is that the alphas of thes...,4


In [12]:
# (rows, columns) of the extraction table: one row per extracted passage.
H.shape

(1198, 3)

In [13]:
# How many passages each extraction rule contributed.
H.rule.value_counts()

2    417
3    357
5    167
6    133
4    124
Name: rule, dtype: int64

In [14]:
# Step back up two directory levels (undoing the earlier move into the
# full-text folder) and write the extraction table there, without the row index.
os.chdir('../../')
hypothesis_csv = journal + '_extracted_hypothesis.csv'
H.to_csv(hypothesis_csv, index=False)

In [15]:
# Attach the article metadata to every extracted passage. A right join keeps
# all rows of H; the join key from H is dropped afterwards because it
# duplicates metadata's 'full_text_filename'.
H_merged = (
    metadata
    .merge(H, how='right', left_on='full_text_filename', right_on='file_name')
    .drop(columns='file_name')
)

In [16]:
# Preview the merged table (metadata columns followed by hypothesis and rule).
H_merged.head()

Unnamed: 0,title,doi,volume,issue,month,year,abstract,full_text_filename,hypothesis,rule
0,Abnormal returns to rivals of acquisition targ...,https://doi.org/10.1016/S0304-405X(99)00048-3,55,2,February,2000,We develop and test the Acquisition Probabilit...,14.txt,"wever, reject the hypothesis that mean abnorma...",5
1,When an event is not an event: the curious cas...,https://doi.org/10.1016/S0304-405X(99)00045-8,55,1,January,2000,Shares trading in the Bolsa Mexicana de Valore...,21.txt,Our null hypothesis is that excess returns for...,4
2,When an event is not an event: the curious cas...,https://doi.org/10.1016/S0304-405X(99)00045-8,55,1,January,2000,Shares trading in the Bolsa Mexicana de Valore...,21.txt,Our null hypothesis is that absolute returns d...,4
3,When an event is not an event: the curious cas...,https://doi.org/10.1016/S0304-405X(99)00045-8,55,1,January,2000,Shares trading in the Bolsa Mexicana de Valore...,21.txt,we can reject the hypothesis that adding anoth...,5
4,The `repricing’ of executive stock options,https://doi.org/10.1016/S0304-405X(00)00053-2,57,1,July,2000,We examine a sample of firms that reset the ex...,27.txt,We hypothesize that firms that reprice have a ...,2


In [17]:
# Write the merged table next to the plain extraction CSV.
# NOTE(review): unlike the earlier export, index=False is not passed here, so
# the row index is written as an extra first column; confirm this is intended.
H_merged.to_csv(journal+'_extracted_hypothesis_with_paper_info.csv')