In [1]:
# requirement
# ! pip install textract

import re
import textract
import pandas as pd

def read_word_text(file_path):
    # reading the text from file
    text = textract.process(file_path)
    text = text.decode('utf-8')

    # replacing more than three new line characters with only two
    text = re.sub(r'[\n]{3,}', '\n\n', text)
    
    return text

## Report vs. Opinion

In [9]:
file_path = '../data/wolfe/Report_vs_Opinion/Report_Opinion_Comparison.docx'
text = read_word_text(file_path)

In [10]:
df = pd.DataFrame(columns=['title', 'type', 'url', 'text'])
lines = text.split('\n')

i = 4
doc_text = []
doc_title = lines[0] 
doc_url = lines[2]

while i < len(lines):
    if lines[i].startswith('Opinion-') or lines[i].startswith('Report-'):
        df = df.append({'title': doc_title, 'type': doc_title.split('-')[0].lower(), 'url': doc_url, 'text': '\n'.join(doc_text)}, ignore_index=True)
        doc_title = lines[i]
        doc_url = lines[i+2]
        doc_text = []
        i += 4
    else:
        doc_text.append(lines[i])
        i += 1

In [11]:
df.head(3)

Unnamed: 0,title,type,url,text
0,Report-Hawking,report,https://www.usatoday.com/story/news/2018/03/14...,"Stephen Hawking, one of the world's foremost t..."
1,Opinion-Hawking,opinion,https://www.usatoday.com/story/news/2018/03/14...,"Stephen Hawking, one of the greatest scientist..."
2,Report-Gun control,report,https://www.usatoday.com/story/news/politics/2...,WASHINGTON — As students across the country pr...


In [4]:
df.to_excel('report_vs_opinion.xlsx')

## Methods vs. Discussion

In [52]:
file_path = '../data/wolfe/Methods_vs_Discussion/Wolfe_methods_and_discussion_comparison.docx'
text = read_word_text(file_path)

# ARTICLE
# METHODS
# DISCUSSION

df = pd.DataFrame(columns=['article_title', 'methods', 'discussion'])
lines = text.split('\n')

article_title = ""
methods = []
discussion = []

i = 0

while i < len(lines):
    if lines[i].strip().lower().startswith('article'):
        article_title = ' '.join(lines[i].split(':')[1:]).strip()
        i += 4
        while i < len(lines) and not lines[i].lower().startswith('discussion'):
            methods.append(lines[i])
            i += 1
        i += 2
        while i < len(lines) and not lines[i].strip().lower().startswith('article'):
            discussion.append(lines[i])
            i += 1

        if article_title != "" and len(methods) > 0 and len(discussion) > 0:
            df = df.append({'article_title': article_title, 'methods': '\n'.join(methods), 'discussion': '\n'.join(discussion)}, ignore_index=True)
            article_title = ""
            methods = []
            discussion = []

print(len(df), i)

25 545


In [53]:
df

Unnamed: 0,article_title,methods,discussion
0,Common Modality Effects in Immediate Free Reca...,Forty participants from the University of Esse...,A third feature of our data that may constrain...
1,Retuning of Lexical-Semantic Representations ...,Thirty-three native British English speakers p...,"Finally, in contrast to these two views, which..."
2,Threat of Shock and Aversive Inhibition Induc...,Sixty-two healthy participants (39 females; ag...,"As a potential caveat, it should be noted that..."
3,Exploring the understanding and application of...,This study applied qualitative methods to capt...,Participants in the current study outlined a r...
4,Reprioritizing life A conceptual model of how...,Grounded theory methodology aims to explain a ...,In line with what was described in our study r...
5,Classroom interaction in effective and ineffec...,Twelve school systems had comprised the LSES-I...,"Third, at Adams, and at almost all positive ou..."
6,Exploring the nature of counterfactual thinkin...,"Seven athletes (4 male, 3 female) were recruit...",It is acknowledged that the small sample size ...
7,A positive psychology intervention in a hindu ...,The pilot program was conducted in a migratory...,"Many Hindu populations, particularly in lower-..."
8,Methodological pluralism and mixed methodology...,Articles were located through a PsycInfo searc...,By reflecting on the research methods that hav...
9,Examining the effectiveness of a rational emot...,Participants were 20 Greek male adolescent ath...,"The present study has limitations that, if add..."


In [54]:
df.to_excel('methods_vs_discussion.xlsx')