In [57]:
import re
import string
import pandas as pd
import numpy as np
from collections import Counter
import sqlite3
import json
import csv

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# import fasttext
# import gensim
# from gensim.models import Word2Vec
# from gensim.models import ldaseqmodel
# from gensim import corpora
# import gensim.downloader as api

from lxml import etree

import matplotlib.pyplot as plt


from time import time
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

from tqdm import tqdm

plt.rcParams['figure.dpi'] = 300
plt.rcParams["font.family"] = "serif"


def clean_text(text, tokenizer, stopwords):
    """Pre-process text and generate tokens

    Args:
        text: Text to tokenize.

    Returns:
        Tokenized text.
    """
    text = str(text).lower()  # Lowercase words
    text = re.sub(r"\[(.*?)\]", "", text)  # Remove [+XYZ chars] in content
    text = re.sub(r"\s+", " ", text)  # Remove multiple spaces in content
    text = re.sub(r"\w+…|…", "", text)  # Remove ellipsis (and last word)
    text = re.sub(r"(?<=\w)-(?=\w)", " ", text)  # Replace dash between words
    text = re.sub(
        f"[{re.escape(string.punctuation)}]", "", text
    )  # Remove punctuation

    tokens = tokenizer(text)  # Get tokens from text
    tokens = [t for t in tokens if not t in stopwords]  # Remove stopwords
    tokens = ["" if t.isdigit() else t for t in tokens]  # Remove digits
    tokens = [t for t in tokens if len(t) > 1]  # Remove short tokens
    
    # stemming and lem
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(w) for w in tokens]
    
    return tokens

In [42]:
path = '../data/s2orc_hci/s2orc_hci/pdf_parses/pdf_parses_0.jsonl'

# load jsonl file
with open(path, 'r', encoding='utf8') as f:
    data = f.readlines()
    data = [json.loads(line) for line in data]

# for dSec in data[1]['body_text']:
#     text = dSec['section'].lower()
#     regexp = re.compile(r'introduction')
#     if regexp.search(text):
#         print(dSec)

# for dSec in data[1]['body_text']:
#     text = dSec['section'].lower()
#     regexp = re.compile(r'related work')
#     if regexp.search(text):
#         print(dSec)


for dSec in data[3]['body_text']:
    text = str(dSec).lower()
    print(dSec)
    regexp = re.compile(r'RQ.*\?')
    matches = regexp.findall(text)
    if matches:
        print(matches)



{'section': 'II. LITERATURE REVIEW', 'text': 'Moving over to the FinTech industry, [4] uses CNNs to make predictions for stock price changes based on the image of the time series plot. The author also attempts to colour code the time series, however the results of this approach were not positive. On the other hand, [5] use CNNs in a bank telemarketing case study, whereby the aim is to predict whether a customer will take up a particular marketing campaign based on a number of numeric and nominal features per customer. The results for this study yield an impressive 76.70% accuracy, which yields the highest accuracy amongst 7 classifiers. In order to incorporate external features in the forecasting model, [6] use a deep convolutional neural network to model short and long term influences of events of stock price movements. Results from this study show that CNNs can capture longer-term influence of news events than standard feed-forward networks.', 'cite_spans': [{'start': 37, 'end': 40, 

In [109]:
def get_introduction_text(dPaper):
    texts = []
    for dSec in dPaper['body_text']:
        text = dSec['section'].lower()
        regexp = re.compile(r'intro')
        if regexp.search(text):
            texts.append(dSec['text'])
    return texts

def get_relatedWork_text(dPaper):
    texts = []
    for dSec in dPaper['body_text']:
        text = dSec['section'].lower()
        regexp = re.compile(r'related work')
        if regexp.search(text):
            texts.append(dSec['text'])
    return texts

def get_RQ_text(dPaper):
    texts = []
    for dSec in dPaper['body_text']:
        text = str(dSec).lower()
        # non greedy match
        # regexp = re.compile(r'[- a-z([]*?(?:\d.|:) (?:what|how|why|is|are|can|to what extent) [^[?]*\?')
        regexp = re.compile(r'(?:what|how|why|is|are|can|to what extent) [^[?]*\?')
        matches = regexp.findall(text)
        if matches:
            texts.extend(matches)
    return texts

In [101]:
# "1. How do students assess the contribution of the flipped-classroom approach to the learning process and the watching of videos between classes as against the watching of videos in class? 
# 2. What are the relations between the assessment of the contribution of the flippedclassroom approach to the learning process and the students' background characteristics, feelings about having the lecturer and classmates nearby, and self-assessment of the learning ability?"

In [110]:
rqs = []
intros = []
relatedWorks = []

for i in tqdm(range(100)):
    path = '../data/s2orc_hci/s2orc_hci/pdf_parses/pdf_parses_%d.jsonl'%i

    # load jsonl file
    with open(path, 'r', encoding='utf8') as f:
        data = f.readlines()
        data = [json.loads(line) for line in data]


    for dPaper in data:
        rqs.append(get_RQ_text(dPaper))
        intros.append(get_introduction_text(dPaper))
        relatedWorks.append(get_relatedWork_text(dPaper))

100%|██████████| 100/100 [00:23<00:00,  4.29it/s]


In [111]:
len(rqs)

42260

In [112]:
len([rq for rq in rqs if rq])

2441

In [113]:
# write results to jsonl file
# with open('allQuestions_s2orc.jsonl', 'w', encoding='utf8') as f:
with open('RQs_s2orc.jsonl', 'w', encoding='utf8') as f:
    for idx, rq in enumerate(rqs):
        if rq:
            f.write(json.dumps(
                {
                    'intro': intros[idx],
                    'relatedWork': relatedWorks[idx],
                    'rq': rq
                }
            ))
            f.write('\n')

In [6]:
get_RQ_text(path)

"Although all these works rely on rapport-building conversational strategies, few of them investigate how rapport-building dialogues influence the perceived quality of the items recommended, or people's compliance towards the recommendations. Moreover, they do not investigate the impact of users' interaction mode on users' perceptions. In this paper, we aim at building a conversational recommender system that recommends recipes while building rapport with its users. More specifically, in this paper, we focus on the following research questions: RQ1: How does the way users interact with a conversational recommender system influence their perception of and their intention to cook recommended recipes? RQ2: How do a conversational recommender system's conversational strategies influence users' perception of and their intention to cook recommended recipes?"

In [7]:
get_relatedWork_text(path)

"Food Recommender Systems. A common approach for food recommender systems is to recommend a recipe based on its ingredients. In [8], for example, the authors developed a system that relies on recipes that people like to infer their preferred ingredients. The system then recommends new recipes containing the previously inferred ingredients. In [9], the authors developed a system that collects users' preferences by asking them to rate and tag the recipes they usually cook at home. The system then relies on user's preferences to rank recipes and deliver recommendations with the highest scores. This Matrix Factorization algorithm outperformed the content-based approach proposed by [8]. Other approaches only rely on dietary information to recommend recipes that would match users' needs. YumMe, the recommender system developed in [36], automatically extracts dietary information from pictures of recipes to form a user profile. The system then relies on this user profile to deliver subsequent 

In [47]:
import json
import glob
import os

# picking out papers with no RQs detected
# papers = glob.glob('./papers/conversationalAgent/HC_paper_all/xml/*.xml')
with open('RQs.jsonl', 'r') as f:
    dPapers = [json.loads(line) for line in f]
paperPaths = ['./' + '/'.join(d['path'].split('/')[-5:]) for d in dPapers]

In [53]:
allPapers = glob.glob('./papers/**/*.xml', recursive=True)
# allPapers

In [75]:
# delete and recreate folder nonRQs
os.system('rm -r ./papers/nonRQs')
os.makedirs('./papers/nonRQs', exist_ok=False)
os.makedirs('./papers/nonRQs/conversationalAgent', exist_ok=False)
os.makedirs('./papers/nonRQs/multimodalHI', exist_ok=False)

for paper in allPapers:
    if paper not in paperPaths:
        os.system('cp ' + paper + ' ./papers/nonRQs/%s'%paper.split('/')[2])
        try:
            os.system('cp ' + '/'.join(paper.split('/')[:4] + [paper.split('/')[5].split('.')[0] + '.pdf']) + ' ' + ' ./papers/nonRQs/%s'%paper.split('/')[2])
        except:
            pass
    # else:
        # print(paper)


cp: ./papers/conversationalAgent/HC_paper_all/162.pdf: No such file or directory
cp: ./papers/conversationalAgent/HC_paper_all/88.pdf: No such file or directory
cp: ./papers/conversationalAgent/HC_paper_all/176.pdf: No such file or directory
cp: ./papers/conversationalAgent/HC_paper_all/63.pdf: No such file or directory
cp: ./papers/conversationalAgent/HC_paper_all/77.pdf: No such file or directory
cp: ./papers/conversationalAgent/HC_paper_all/189.pdf: No such file or directory
cp: ./papers/conversationalAgent/HC_paper_all/214.pdf: No such file or directory
cp: ./papers/conversationalAgent/HC_paper_all/200.pdf: No such file or directory
cp: ./papers/conversationalAgent/HC_paper_all/201.pdf: No such file or directory
cp: ./papers/conversationalAgent/HC_paper_all/215.pdf: No such file or directory
cp: ./papers/conversationalAgent/HC_paper_all/76.pdf: No such file or directory
cp: ./papers/conversationalAgent/HC_paper_all/188.pdf: No such file or directory
cp: ./papers/conversationalAgent

In [74]:
os.system('cp ' + '/'.join(paper.split('/')[:4] + [paper.split('/')[5].split('.')[0] + '.pdf']) + ' ' + ' ./papers/nonRQs/%s'%paper.split('/')[2])

0

In [73]:
'/'.join(paper.split('/')[:4] + [paper.split('/')[5].split('.')[0] + '.pdf'])

'./papers/multimodalHI/MHI/85.pdf'