In [1]:
import glob
import os
import re

import pandas as pd
from tqdm import tqdm

In [2]:
# Journal abbreviation; used as the prefix of the output CSV file names.
journal = 'JoAE'

In [3]:
# Collect the full-text files to scan.
# NOTE: os.chdir changes the kernel's working directory, so this cell is not
# safe to re-run without restarting the kernel. Later cells depend on the new
# working directory (the metadata CSV is read via '../' and the export cell
# changes back with '../../').
os.chdir('papers/JoAE_2000_2021_full_text/')

# glob returns names in arbitrary, filesystem-dependent order; sort them so
# positional lookups such as files[50] and the output row order are reproducible.
files = sorted(glob.glob("*.txt"))


In [4]:
# Read the per-paper metadata table; the first CSV column becomes the index.
# The path is relative to the full-text directory entered earlier.
metadata_path = '../JoAE_2000_2021.csv'
metadata = pd.read_csv(metadata_path, index_col=0)
metadata.head()

Unnamed: 0,title,full_text_filename,doi,abstract,volume,issue,page_from,page_to,url,month,year
0,Accounting for the impairment of long-lived as...,0.txt,https://doi-org.libproxy1.nus.edu.sg/10.1016/S...,We investigate write-downs of assets of oil an...,29,2,151.0,172.0,https://www-sciencedirect-com.libproxy1.nus.ed...,April,2000
1,Auditor resignations: clientele effects and le...,1.txt,https://doi.org/10.1016/S0165-4101(00)00019-7,I examine two hypotheses of auditor resignatio...,29,2,173.0,205.0,https://www-sciencedirect-com.libproxy1.nus.ed...,April,2000
2,Nuclear decommissioning costs: The impact of r...,2.txt,https://doi.org/10.1016/S0165-4101(00)00020-3,Mounting nuclear plant decommissioning costs a...,29,2,207.0,230.0,https://www-sciencedirect-com.libproxy1.nus.ed...,April,2000
3,Market valuation and deregulation of electric ...,3.txt,https://doi.org/10.1016/S0165-4101(00)00021-5,This study examines the effect of ongoing dere...,29,2,231.0,260.0,https://www-sciencedirect-com.libproxy1.nus.ed...,April,2000
4,The effect of tax accounting rules on capital ...,4.txt,https://doi.org/10.1016/S0165-4101(00)00027-6,This study investigates the effect of changing...,30,1,1.0,31.0,https://www-sciencedirect-com.libproxy1.nus.ed...,August,2000


In [5]:
def get_text(path_to_file):
    """Read a UTF-8 text file and return its entire contents as one string.

    Parameters
    ----------
    path_to_file : str or path-like
        Location of the file to read.

    Returns
    -------
    str
        The complete decoded contents of the file.

    Raises
    ------
    OSError
        If the file cannot be opened.
    UnicodeDecodeError
        If the file content is not valid UTF-8.
    """
    # The context manager closes the handle deterministically (even if read()
    # raises), which matters when this is called for hundreds of files in a loop.
    with open(path_to_file, "r", encoding='utf8') as f:
        return f.read()

In [6]:
def get_hypothesis(text):
    """Extract hypothesis-like passages from ``text`` using fixed regex rules.

    All newline characters are removed first (nothing is inserted in their
    place), then each rule's pattern is applied to the flattened text with
    ``re.findall``.

    Parameters
    ----------
    text : str
        Full text of one article.

    Returns
    -------
    tuple[list[str], list[int]]
        ``(hypotheses, rules)``: two parallel lists where ``rules[i]`` is the
        rule number (2-6) of the pattern that produced ``hypotheses[i]``.
        Matches are grouped by pattern, in the order the patterns are listed
        below. Both lists are empty when nothing matches.
    """
    # (rule number, pattern, regex flags), in reporting order.
    # Rules 2-5 are case-insensitive; the rule 6 patterns are case-sensitive and
    # capture the 100-400 characters that precede the "To test ..." sentence.
    rule_table = (
        (2, r'we.{1,10}hypothesize that.{1,300}\.', re.I),
        (3, r'hypothesis \d[a-z]?[\s:].{1,300}\.', re.I),
        (4, r'Our.{1,10}hypothesis is that.{1,300}\.', re.I),
        (4, r'Our.{1,10}hypothesis was that.{1,300}\.', re.I),
        (5, r'we.{5,15}the hypothesis that.{1,300}\.', re.I),
        (6, r'.{100,400}\. To test this hypothesis', 0),
        (6, r'.{100,400}\. To test these hypotheses', 0),
    )

    # Collapse the document onto one line so '.' in the patterns can span
    # what used to be line breaks.
    flattened = ''.join(text.split('\n'))

    hypotheses, rules = [], []
    for rule_id, pattern, flags in rule_table:
        found = re.findall(pattern, flattened, flags)
        hypotheses.extend(found)
        rules.extend([rule_id] * len(found))

    return hypotheses, rules

In [7]:
def main(file):
    """Run hypothesis extraction on a single full-text file.

    Reads ``file`` with ``get_text`` and passes the contents to
    ``get_hypothesis``; returns that function's ``(hypotheses, rules)`` pair.
    """
    return get_hypothesis(get_text(file))

In [8]:
# Spot-check the extraction on a single file (position 50 in the file list).
main(files[50])

([], [])

## Get hypotheses from all free articles

In [9]:
# Run the extractor over every file and gather one record per matched passage.
# Records are accumulated in a plain list and turned into a DataFrame once at
# the end; concatenating onto a growing DataFrame inside the loop is quadratic
# and starts from an empty frame, which recent pandas versions warn about.
records = []
for file in tqdm(files):
    hypotheses, rules = main(file)
    records.extend(
        {'file_name': file, 'hypothesis': hypothesis, 'rule': rule}
        for hypothesis, rule in zip(hypotheses, rules)
    )

# Explicit columns keep the schema stable even when no file yields a match,
# so later cells (H.rule, the merge on 'file_name') still work.
H = pd.DataFrame(records, columns=['file_name', 'hypothesis', 'rule'])


100%|██████████| 744/744 [01:40<00:00,  7.43it/s]


In [10]:
# Remove exact duplicate rows, then renumber the rows 0..n-1 and discard the
# old index rather than keeping it as a column.
H = H.drop_duplicates().reset_index(drop=True)


In [11]:
# Preview the first rows of the extraction table.
H.head()

Unnamed: 0,file_name,hypothesis,rule
0,10.txt,Our null hypothesis is that the sales producti...,4
1,10.txt,different employees are best modeled as varyi...,6
2,100.txt,Hypothesis 1 using long-horizon performance. A...,3
3,100.txt,Hypothesis 2 are likely to be more powerful wh...,3
4,100.txt,Hypothesis 3 predicts this row should be negat...,3


In [12]:
# Size of the extraction table as (rows, columns).
H.shape

(452, 3)

In [13]:
# Number of extracted passages per rule number.
H.rule.value_counts()

2    202
3    158
4     50
6     24
5     18
Name: rule, dtype: int64

In [15]:
# Step back up two directory levels (the notebook earlier changed into
# 'papers/JoAE_2000_2021_full_text/'), then save the extraction table there
# without the row index. Like the earlier chdir, this is not safe to re-run.
os.chdir('../../')
extraction_csv = journal + '_extracted_hypothesis.csv'
H.to_csv(extraction_csv, index=False)

In [18]:
# Attach paper metadata to every extracted passage. A right join keeps all
# rows of H; the join key from H is dropped afterwards because it duplicates
# 'full_text_filename'.
H_merged = (
    metadata
    .merge(H, how='right', left_on='full_text_filename', right_on='file_name')
    .drop(columns='file_name')
)

In [19]:
# Preview the first rows of the merged table.
H_merged.head()

Unnamed: 0,title,full_text_filename,doi,abstract,volume,issue,page_from,page_to,url,month,year,hypothesis,rule
0,Nuclear decommissioning costs: The impact of r...,2.txt,https://doi.org/10.1016/S0165-4101(00)00020-3,Mounting nuclear plant decommissioning costs a...,29,2,207.0,230.0,https://www-sciencedirect-com.libproxy1.nus.ed...,April,2000,We hypothesize that utility value is negativel...,2
1,Nuclear decommissioning costs: The impact of r...,2.txt,https://doi.org/10.1016/S0165-4101(00)00020-3,Mounting nuclear plant decommissioning costs a...,29,2,207.0,230.0,https://www-sciencedirect-com.libproxy1.nus.ed...,April,2000,we hypothesize that the probability that nucle...,2
2,Nuclear decommissioning costs: The impact of r...,2.txt,https://doi.org/10.1016/S0165-4101(00)00020-3,Mounting nuclear plant decommissioning costs a...,29,2,207.0,230.0,https://www-sciencedirect-com.libproxy1.nus.ed...,April,2000,we hypothesize that it is less likely that suc...,2
3,Nuclear decommissioning costs: The impact of r...,2.txt,https://doi.org/10.1016/S0165-4101(00)00020-3,Mounting nuclear plant decommissioning costs a...,29,2,207.0,230.0,https://www-sciencedirect-com.libproxy1.nus.ed...,April,2000,We hypothesize that utilities operating in mor...,2
4,Nuclear decommissioning costs: The impact of r...,2.txt,https://doi.org/10.1016/S0165-4101(00)00020-3,Mounting nuclear plant decommissioning costs a...,29,2,207.0,230.0,https://www-sciencedirect-com.libproxy1.nus.ed...,April,2000,We hypothesize that rate recovery of unfunded ...,2


In [21]:
# Save the merged table to the current working directory.
# NOTE(review): unlike the export of H, no index=False is passed here, so the
# row index is written as the first CSV column; confirm that this is intended.
H_merged.to_csv(journal+'_extracted_hypothesis_with_paper_info.csv')