In [1]:
import glob
import os
import re

import pandas as pd
from tqdm import tqdm

In [2]:
# Journal abbreviation; used as the prefix of the CSV files exported below.
journal = "JoAR"

In [3]:
# Collect the full-text article files.
# Switch into the full-text folder so each article can be addressed by its
# bare filename; later cells rely on this working directory (the metadata CSV
# is read via '../' and the export cell steps back up with '../../').
# NOTE: because the path is relative, this cell must be run exactly once per
# kernel session.
os.chdir('papers/JoAR_2001_2021_full_text/')

# glob returns matches in an order that depends on the file system, so sort
# them: positional lookups such as files[50] and the row order of the
# extracted table are then reproducible across machines and runs.
files = sorted(glob.glob("*.txt"))


In [4]:
# Load the article metadata. The CSV sits one level above the full-text
# folder that is the current working directory; its first column becomes the
# row index.
metadata_csv = '../JoAR_2001_2021.csv'
metadata = pd.read_csv(metadata_csv, index_col=0)
metadata.head()

Unnamed: 0,doi,title,publish_date,publish_month,publish_year,volume,issue,abstract,full_text_filename
0,https://onlinelibrary-wiley-com.libproxy1.nus....,"Talk Less, Learn More: Strategic Disclosure in...",26,April,2021,,Accepted Articles,We examine how options trading affects volunta...,
1,https://onlinelibrary-wiley-com.libproxy1.nus....,Economic Downturns and the Informativeness of ...,22,April,2021,,Accepted Articles,We examine how options trading affects volunta...,
2,https://onlinelibrary-wiley-com.libproxy1.nus....,Voice of the Customers: Local Trust Culture an...,21,April,2021,,Accepted Articles,We use complaints filed with the Consumer Fina...,
3,https://onlinelibrary-wiley-com.libproxy1.nus....,Debiasing the Measurement of Conditional Conse...,21,April,2021,,Accepted Articles,Basu's [1997] measurement of conditional conse...,
4,https://onlinelibrary-wiley-com.libproxy1.nus....,Do Majority‐of‐Minority Shareholder Voting Rig...,30,March,2021,,Accepted Articles,"In the presence of business groups, the exprop...",4.txt


In [5]:
def get_text(path_to_file):
    """Read a UTF-8 encoded text file and return its full contents.

    Parameters
    ----------
    path_to_file : str or path-like
        Location of the file to read.

    Returns
    -------
    str
        The decoded contents of the file.

    Raises
    ------
    OSError
        If the file cannot be opened.
    UnicodeDecodeError
        If the contents are not valid UTF-8.
    """
    # The context manager closes the handle even when read() raises; without
    # it one open file descriptor is leaked per article processed.
    with open(path_to_file, "r", encoding='utf8') as f:
        return f.read()

In [6]:
def get_hypothesis(text):
    """Extract candidate hypothesis statements from an article's full text.

    All newline characters are removed first, then a fixed series of regular
    expressions is applied to the flattened text. Every match is reported
    together with the number of the rule that produced it.

    Rules 2-5 are case-insensitive and capture from the trigger phrase up to
    a period; the quantifier is greedy, so the match runs to the last period
    found within the following 300 characters. Rule 6 is case-sensitive and
    captures the 100-400 characters that precede the phrase
    ". To test this hypothesis" / ". To test these hypotheses", including the
    phrase itself.

    Parameters
    ----------
    text : str
        Full text of one article.

    Returns
    -------
    tuple of (list of str, list of int)
        ``hypotheses`` and ``rules``; the two lists have equal length and
        ``rules[i]`` is the rule number that matched ``hypotheses[i]``.
        Results are grouped by pattern in the order the patterns are listed
        below, not by position in the text.
    """
    # (rule number, pattern, regex flags), in reporting order.
    rule_table = (
        (2, r'we.{1,10}hypothesize that.{1,300}\.', re.I),
        (3, r'hypothesis \d[a-z]?[\s:].{1,300}\.', re.I),
        (4, r'Our.{1,10}hypothesis is that.{1,300}\.', re.I),
        (4, r'Our.{1,10}hypothesis was that.{1,300}\.', re.I),
        (5, r'we.{5,15}the hypothesis that.{1,300}\.', re.I),
        (6, r'.{100,400}\. To test this hypothesis', 0),
        (6, r'.{100,400}\. To test these hypotheses', 0),
    )

    # '.' does not match a newline, so the text is flattened before matching.
    # NOTE(review): newlines are removed without inserting a space, so words
    # separated only by a line break end up fused together - confirm that
    # this is intended.
    flat_text = text.replace('\n', '')

    hypotheses, rules = [], []
    for rule_number, pattern, flags in rule_table:
        matches = re.findall(pattern, flat_text, flags)
        hypotheses.extend(matches)
        rules.extend([rule_number] * len(matches))

    return hypotheses, rules

In [7]:
def main(file):
    """Extract hypotheses from a single full-text file.

    Reads ``file`` with ``get_text`` and passes the contents to
    ``get_hypothesis``.

    Parameters
    ----------
    file : str
        Path of the article text file.

    Returns
    -------
    tuple of (list, list)
        The ``(hypotheses, rules)`` pair returned by ``get_hypothesis``.
    """
    return get_hypothesis(get_text(file))

In [8]:
# Spot-check the extractor on a single article.
sample_file = files[50]
main(sample_file)

([], [])

## Extract hypotheses from all free articles

In [9]:
# Run the extractor over every article and gather one record per hypothesis.
# Records are accumulated in a plain list and the DataFrame is built once at
# the end: concatenating onto a growing frame inside the loop re-copies all
# previous rows on every iteration.
records = []
for file in tqdm(files):
    hypotheses, rules = main(file)
    records.extend(
        {'file_name': file, 'hypothesis': hypothesis, 'rule': rule}
        for hypothesis, rule in zip(hypotheses, rules)
    )

# Naming the columns explicitly keeps the expected schema even when no
# hypothesis was found in any file.
H = pd.DataFrame(records, columns=['file_name', 'hypothesis', 'rule'])


100%|██████████| 523/523 [01:39<00:00,  5.25it/s]


In [10]:
# Drop exact duplicate rows, then renumber the remaining rows 0..n-1 and
# discard the old index.
H = H.drop_duplicates().reset_index(drop=True)


In [11]:
# Preview the first rows of the de-duplicated hypothesis table.
H.head()

Unnamed: 0,file_name,hypothesis,rule
0,10.txt,We hypothesize that firms with less severe age...,2
1,10.txt,We hypothesize that firms with relatively low ...,2
2,10.txt,"We hypothesize that, in weak‐institution count...",2
3,10.txt,We hypothesize that in weak‐institution countr...,2
4,10.txt,We hypothesize that the decreased reliance of ...,2


In [12]:
# (number of rows, number of columns) of the hypothesis table.
H.shape

(300, 3)

In [13]:
# Count how many extracted hypotheses each rule number produced.
H.rule.value_counts()

2    147
3     83
4     33
6     28
5      9
Name: rule, dtype: int64

In [14]:
# Step back up two directory levels, then export the hypothesis table
# without its row index.
os.chdir('../../')
hypothesis_csv = journal + '_extracted_hypothesis.csv'
H.to_csv(hypothesis_csv, index=False)

In [15]:
# Attach the paper metadata to each hypothesis row. The right join keeps
# every row of H; metadata's full_text_filename is matched against H's
# file_name, which is dropped afterwards because it duplicates the join key.
H_merged = (
    metadata
    .merge(H, how='right', left_on='full_text_filename', right_on='file_name')
    .drop(columns='file_name')
)

In [16]:
# Preview the first rows of the hypotheses joined with their paper metadata.
H_merged.head()

Unnamed: 0,doi,title,publish_date,publish_month,publish_year,volume,issue,abstract,full_text_filename,hypothesis,rule
0,https://onlinelibrary-wiley-com.libproxy1.nus....,Show Me the Money! Dividend Policy in Countrie...,30,March,2021,,Accepted Articles,"We hypothesize that, in weak‐institution count...",10.txt,We hypothesize that firms with less severe age...,2
1,https://onlinelibrary-wiley-com.libproxy1.nus....,Show Me the Money! Dividend Policy in Countrie...,30,March,2021,,Accepted Articles,"We hypothesize that, in weak‐institution count...",10.txt,We hypothesize that firms with relatively low ...,2
2,https://onlinelibrary-wiley-com.libproxy1.nus....,Show Me the Money! Dividend Policy in Countrie...,30,March,2021,,Accepted Articles,"We hypothesize that, in weak‐institution count...",10.txt,"We hypothesize that, in weak‐institution count...",2
3,https://onlinelibrary-wiley-com.libproxy1.nus....,Show Me the Money! Dividend Policy in Countrie...,30,March,2021,,Accepted Articles,"We hypothesize that, in weak‐institution count...",10.txt,We hypothesize that in weak‐institution countr...,2
4,https://onlinelibrary-wiley-com.libproxy1.nus....,Show Me the Money! Dividend Policy in Countrie...,30,March,2021,,Accepted Articles,"We hypothesize that, in weak‐institution count...",10.txt,We hypothesize that the decreased reliance of ...,2


In [17]:
# Export the merged table; the row index is written as the first column.
merged_csv = journal + '_extracted_hypothesis_with_paper_info.csv'
H_merged.to_csv(merged_csv)