In [1]:
import re
import string
import pandas as pd
import numpy as np
from collections import Counter
import sqlite3
import json
import csv

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# import fasttext
# import gensim
# from gensim.models import Word2Vec
# from gensim.models import ldaseqmodel
# from gensim import corpora
# import gensim.downloader as api

from lxml import etree

import matplotlib.pyplot as plt


from time import time
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

from tqdm import tqdm

# Notebook-wide matplotlib defaults: 300 dpi figures drawn with a serif font.
plt.rcParams.update({
    'figure.dpi': 300,
    "font.family": "serif",
})


def clean_text(text, tokenizer, stopwords):
    """Normalise a raw string and turn it into a list of lemmatised tokens.

    Args:
        text: Value to clean; it is coerced with ``str()`` and lower-cased
            before any other processing.
        tokenizer: Callable that receives the cleaned string and returns an
            iterable of string tokens.
        stopwords: Container queried with ``in``; tokens found in it are
            dropped.

    Returns:
        List of lemmatised tokens, without stopwords, digit-only tokens, or
        tokens of length one or less.
    """
    # Substitutions are applied in this order; punctuation is stripped last so
    # the bracket, ellipsis and dash rules still see their delimiters.
    substitutions = (
        (r"\[(.*?)\]", ""),  # drop bracketed spans such as "[+XYZ chars]"
        (r"\s+", " "),  # collapse whitespace runs into one space
        (r"\w+…|…", ""),  # drop an ellipsis together with the word before it
        (r"(?<=\w)-(?=\w)", " "),  # split words joined by a dash
        (f"[{re.escape(string.punctuation)}]", ""),  # strip ASCII punctuation
    )
    cleaned = str(text).lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    lemmatizer = WordNetLemmatizer()
    kept = []
    for token in tokenizer(cleaned):
        if token in stopwords:
            continue
        # Digit-only tokens and tokens of at most one character are discarded.
        if token.isdigit() or len(token) <= 1:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return kept

In [2]:
path = '../data/s2orc_hci/s2orc_hci/pdf_parses/pdf_parses_0.jsonl'

# Load the first shard: a JSON-lines file with one paper record per line.
# Blank lines are skipped so a trailing newline cannot break json.loads.
with open(path, 'r', encoding='utf8') as f:
    data = [json.loads(line) for line in f if line.strip()]

# Exploratory scan of a single paper: print every body section, followed by
# any span that looks like a labelled research question ("RQ ... ?").
# The section is lower-cased before matching, so the pattern has to be
# case-insensitive; a case-sensitive 'RQ' can never match lower-cased text.
regexp = re.compile(r'RQ.*\?', re.IGNORECASE)

for dSec in data[3]['body_text']:
    text = str(dSec).lower()
    print(dSec)
    matches = regexp.findall(text)
    if matches:
        print(matches)



{'section': 'II. LITERATURE REVIEW', 'text': 'Moving over to the FinTech industry, [4] uses CNNs to make predictions for stock price changes based on the image of the time series plot. The author also attempts to colour code the time series, however the results of this approach were not positive. On the other hand, [5] use CNNs in a bank telemarketing case study, whereby the aim is to predict whether a customer will take up a particular marketing campaign based on a number of numeric and nominal features per customer. The results for this study yield an impressive 76.70% accuracy, which yields the highest accuracy amongst 7 classifiers. In order to incorporate external features in the forecasting model, [6] use a deep convolutional neural network to model short and long term influences of events of stock price movements. Results from this study show that CNNs can capture longer-term influence of news events than standard feed-forward networks.', 'cite_spans': [{'start': 37, 'end': 40, 

In [3]:
def get_introduction_text(dPaper):
    """Collect the text of every body section whose title mentions "intro".

    Args:
        dPaper: Paper record holding a 'body_text' list of section dicts,
            each with a 'section' title and a 'text' body.

    Returns:
        List of 'text' values, in document order, for the sections whose
        lower-cased title contains 'intro'; empty when none match.
    """
    return [
        section['text']
        for section in dPaper['body_text']
        if 'intro' in section['section'].lower()
    ]

def get_relatedWork_text(dPaper):
    """Collect the text of every body section titled like "related work".

    Args:
        dPaper: Paper record holding a 'body_text' list of section dicts,
            each with a 'section' title and a 'text' body.

    Returns:
        List of 'text' values, in document order, for the sections whose
        lower-cased title contains 'related work'; empty when none match.
    """
    keyword = 'related work'
    matching_sections = filter(
        lambda section: keyword in section['section'].lower(),
        dPaper['body_text'],
    )
    return [section['text'] for section in matching_sections]

def get_RQ_text(dPaper):
    """Extract question-like spans that follow a number or colon marker.

    Each section dict is converted to its ``str()`` form and lower-cased as a
    whole, so the pattern runs over the section title and every other field
    as well as the body text, and the returned matches are lower-case.

    Args:
        dPaper: Paper record holding a 'body_text' list of section dicts.

    Returns:
        Flat list of matched strings across all sections, in section order;
        empty when nothing matches.
    """
    # Lazy prefix, then a marker (a digit plus any one character, or ':'),
    # a space, a question word, and everything up to the next '?' that does
    # not cross a '['.
    question_pattern = re.compile(r'[- a-z([]*?(?:\d.|:) (?:what|how|why|is|are|can|to what extent) [^[?]*\?')
    # Alternative without the marker requirement:
    # question_pattern = re.compile(r'(?:what|how|why|is|are|can|to what extent) [^[?]*\?')
    found = []
    for section in dPaper['body_text']:
        found += question_pattern.findall(str(section).lower())
    return found

In [115]:
# "1. How do students assess the contribution of the flipped-classroom approach to the learning process and the watching of videos between classes as against the watching of videos in class? 
# 2. What are the relations between the assessment of the contribution of the flippedclassroom approach to the learning process and the students' background characteristics, feelings about having the lecturer and classmates nearby, and self-assessment of the learning ability?"

In [116]:
# Accumulators: each receives one entry per paper, so the three lists stay
# index-aligned with one another.
rqs, intros, relatedWorks = [], [], []

for i in tqdm(range(100)):
    path = '../data/s2orc_hci/s2orc_hci/pdf_parses/pdf_parses_%d.jsonl'%i

    # Each shard is a JSON-lines file: one paper record per line.
    with open(path, 'r', encoding='utf8') as f:
        data = [json.loads(line) for line in f]

    # Run the three extractors over every paper of the shard.
    rqs.extend(map(get_RQ_text, data))
    intros.extend(map(get_introduction_text, data))
    relatedWorks.extend(map(get_relatedWork_text, data))

100%|██████████| 100/100 [01:51<00:00,  1.11s/it]


In [117]:
# Number of entries in rqs; the extraction loop appends one (possibly empty) list per paper.
len(rqs)

42260

In [118]:
# Number of papers for which at least one research-question match was found.
sum(1 for rq in rqs if rq)

468

In [119]:
# Persist one JSON line per paper that yielded at least one research question,
# together with that paper's introduction and related-work texts.
# with open('allQuestions_s2orc.jsonl', 'w', encoding='utf8') as f:
with open('RQs_s2orc.jsonl', 'w', encoding='utf8') as f:
    f.writelines(
        json.dumps(
            {
                'intro': intros[idx],
                'relatedWork': relatedWorks[idx],
                'rq': rq,
            }
        ) + '\n'
        for idx, rq in enumerate(rqs)
        if rq
    )

In [4]:
# Count the records in RQs_s2orc.jsonl that have research questions plus at
# least one of the two context sections (introduction or related work).
with open('RQs_s2orc.jsonl', 'r', encoding='utf8') as f:
    data = [json.loads(line) for line in f]

cnt = sum(1 for d in data if (d['intro'] or d['relatedWork']) and d['rq'])
print(cnt)

283


In [5]:
# Count the records in allQuestions_s2orc.jsonl that have an introduction,
# a related-work section and at least one extracted question.
with open('allQuestions_s2orc.jsonl', 'r', encoding='utf8') as f:
    data = list(map(json.loads, f))

cnt = len([d for d in data if d['intro'] and d['relatedWork'] and d['rq']])
print(cnt)

197


In [6]:
# Re-run the extraction over all shards. Only the related-work results are
# inspected right after this cell, but the loop rebuilds all three lists
# (research questions, introductions and related work), one entry per paper.
rqs, intros, relatedWorks = [], [], []

for i in tqdm(range(100)):
    path = '../data/s2orc_hci/s2orc_hci/pdf_parses/pdf_parses_%d.jsonl'%i

    # Each shard is a JSON-lines file: one paper record per line.
    with open(path, 'r', encoding='utf8') as f:
        data = list(map(json.loads, f))

    for dPaper in data:
        rqs += [get_RQ_text(dPaper)]
        intros += [get_introduction_text(dPaper)]
        relatedWorks += [get_relatedWork_text(dPaper)]

100%|██████████| 100/100 [01:56<00:00,  1.17s/it]


In [7]:
# Number of papers with at least one section matched as related work.
sum(1 for r in relatedWorks if r)

7345