In [1]:
import re
import string
import pandas as pd
import numpy as np
from collections import Counter
import sqlite3
import json
import csv

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# import fasttext
# import gensim
# from gensim.models import Word2Vec
# from gensim.models import ldaseqmodel
# from gensim import corpora
# import gensim.downloader as api

from lxml import etree

import matplotlib.pyplot as plt


from time import time
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

plt.rcParams['figure.dpi'] = 300
plt.rcParams["font.family"] = "serif"


def clean_text(text, tokenizer, stopwords):
    """Pre-process text and generate tokens

    Args:
        text: Text to tokenize.

    Returns:
        Tokenized text.
    """
    text = str(text).lower()  # Lowercase words
    text = re.sub(r"\[(.*?)\]", "", text)  # Remove [+XYZ chars] in content
    text = re.sub(r"\s+", " ", text)  # Remove multiple spaces in content
    text = re.sub(r"\w+…|…", "", text)  # Remove ellipsis (and last word)
    text = re.sub(r"(?<=\w)-(?=\w)", " ", text)  # Replace dash between words
    text = re.sub(
        f"[{re.escape(string.punctuation)}]", "", text
    )  # Remove punctuation

    tokens = tokenizer(text)  # Get tokens from text
    tokens = [t for t in tokens if not t in stopwords]  # Remove stopwords
    tokens = ["" if t.isdigit() else t for t in tokens]  # Remove digits
    tokens = [t for t in tokens if len(t) > 1]  # Remove short tokens
    
    # stemming and lem
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(w) for w in tokens]
    
    return tokens

In [2]:
# Read in xmls
# select sections containing discussion
tree = etree.parse('./papers/conversationalAgent/HC_paper_all/xml/17.xml')

# select sections containing discussion
nodes = tree.xpath("(//*[("
        "contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz')"
        ",'rq1:') or "
        "contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz')"
        ",'rq 1:')"
        ")]) [last()]")
if len(nodes) != 0:
    node = nodes[0]
    title = ''.join(node.xpath("text()"))
    text = ''.join(node.xpath("../descendant::*/text()")[1:])
    # print(title)
else:
    print("NOT FOUND!")
title

"Although all these works rely on rapport-building conversational strategies, few of them investigate how rapport-building dialogues influence the perceived quality of the items recommended, or people's compliance towards the recommendations. Moreover, they do not investigate the impact of users' interaction mode on users' perceptions. In this paper, we aim at building a conversational recommender system that recommends recipes while building rapport with its users. More specifically, in this paper, we focus on the following research questions: RQ1: How does the way users interact with a conversational recommender system influence their perception of and their intention to cook recommended recipes? RQ2: How do a conversational recommender system's conversational strategies influence users' perception of and their intention to cook recommended recipes?"

In [3]:
def get_introduction_text(path):
    """Get discussion text from xml file.
    
    Args:
        path: Paper path.
    
    Returns:
        Discussion text.
    """
    tree = etree.parse(path)
    nodes = tree.xpath("(//*[local-name()='head' and ("
            "contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz')"
            ",'introduction')"
            ")]) [last()]")
    if len(nodes) != 0:
        node = nodes[0]
        title = ''.join(node.xpath("text()"))
        text = ''.join(node.xpath("../descendant::*/text()")[1:])
        # print(title)
    else:
        print("NOT FOUND!")
        text = ''
    return text


def get_discussion_text(path):
    """Get discussion text from xml file.
    
    Args:
        path: Paper path.
    
    Returns:
        Discussion text.
    """
    tree = etree.parse(path)
    nodes = tree.xpath("(//*[local-name()='head' and ("
            "contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz')"
            ",'discussion') or "
            "contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz')"
            ",'implication')"
            ")]) [last()]")
    if len(nodes) != 0:
        node = nodes[0]
        title = ''.join(node.xpath("text()"))
        text = ''.join(node.xpath("../descendant::*/text()")[1:])
        # print(title)
    else:
        print("NOT FOUND!")
        text = ''
    return text


def get_relatedWork_text(path):
    """Get discussion text from xml file.
    
    Args:
        path: Paper path.
    
    Returns:
        related work text.
    """
    tree = etree.parse(path)
    nodes = tree.xpath("(//*[local-name()='head' and ("
            "contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz')"
            ",'related work') or "
            "contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz')"
            ",'background')"
            ")]) [last()]")
    if len(nodes) != 0:
        node = nodes[0]
        title = ''.join(node.xpath("text()"))
        text = ''.join(node.xpath("../descendant::*/text()")[1:])
        # print(title)
    else:
        print("NOT FOUND!")
        text = ''
    return text

def get_RQ_text(path):
    # Read in xmls
    # select sections containing discussion
    tree = etree.parse(path)

    # select sections containing discussion
    nodes = tree.xpath("(//*[("
            "contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz')"
            ",'rq1:') or "
            "contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz')"
            ",'rq 1:')"
            ")]) [last()]")
    if len(nodes) != 0:
        node = nodes[0]
        title = ''.join(node.xpath("text()"))
        text = ''.join(node.xpath("../descendant::*/text()")[1:])
        # print(title)
    else:
        print("NOT FOUND!")
    return title

In [4]:
path = r'./papers/conversationalAgent/HC_paper_all/xml/17.xml'

In [5]:
get_introduction_text(path)

'Healthy eating implies complex decision making processes [6], including being aware of healthy options and choosing among them [24]. One solution to overcome this issue and help people to make healthier choices is to develop health-aware food recommender systems [31]. While significant effort has been put recently into optimizing the food selection algorithms [30], many other factors can also influence users\' overall experience when interacting with a recommender system [14]. Indeed, the way the recommendation is presented [18], the system\'s response time [33], or even the length of the system\'s utterances [20] can have an influence on users\' perception of the system.One trend to improve users\' experience is to make the interaction more natural by designing the recommendation process as a conversation [23]. Besides helping users to achieve task-oriented goals, conversations can also fulfill interpersonal functions, such as building rapport [29]. Rapport can be described as a dyna

In [6]:
get_RQ_text(path)

"Although all these works rely on rapport-building conversational strategies, few of them investigate how rapport-building dialogues influence the perceived quality of the items recommended, or people's compliance towards the recommendations. Moreover, they do not investigate the impact of users' interaction mode on users' perceptions. In this paper, we aim at building a conversational recommender system that recommends recipes while building rapport with its users. More specifically, in this paper, we focus on the following research questions: RQ1: How does the way users interact with a conversational recommender system influence their perception of and their intention to cook recommended recipes? RQ2: How do a conversational recommender system's conversational strategies influence users' perception of and their intention to cook recommended recipes?"

In [7]:
get_relatedWork_text(path)

"Food Recommender Systems. A common approach for food recommender systems is to recommend a recipe based on its ingredients. In [8], for example, the authors developed a system that relies on recipes that people like to infer their preferred ingredients. The system then recommends new recipes containing the previously inferred ingredients. In [9], the authors developed a system that collects users' preferences by asking them to rate and tag the recipes they usually cook at home. The system then relies on user's preferences to rank recipes and deliver recommendations with the highest scores. This Matrix Factorization algorithm outperformed the content-based approach proposed by [8]. Other approaches only rely on dietary information to recommend recipes that would match users' needs. YumMe, the recommender system developed in [36], automatically extracts dietary information from pictures of recipes to form a user profile. The system then relies on this user profile to deliver subsequent 

In [47]:
import json
import glob
import os

# picking out papers with no RQs detected
# papers = glob.glob('./papers/conversationalAgent/HC_paper_all/xml/*.xml')
with open('RQs.jsonl', 'r') as f:
    dPapers = [json.loads(line) for line in f]
paperPaths = ['./' + '/'.join(d['path'].split('/')[-5:]) for d in dPapers]

In [53]:
allPapers = glob.glob('./papers/**/*.xml', recursive=True)
# allPapers

In [75]:
# delete and recreate folder nonRQs
os.system('rm -r ./papers/nonRQs')
os.makedirs('./papers/nonRQs', exist_ok=False)
os.makedirs('./papers/nonRQs/conversationalAgent', exist_ok=False)
os.makedirs('./papers/nonRQs/multimodalHI', exist_ok=False)

for paper in allPapers:
    if paper not in paperPaths:
        os.system('cp ' + paper + ' ./papers/nonRQs/%s'%paper.split('/')[2])
        try:
            os.system('cp ' + '/'.join(paper.split('/')[:4] + [paper.split('/')[5].split('.')[0] + '.pdf']) + ' ' + ' ./papers/nonRQs/%s'%paper.split('/')[2])
        except:
            pass
    # else:
        # print(paper)


cp: ./papers/conversationalAgent/HC_paper_all/162.pdf: No such file or directory
cp: ./papers/conversationalAgent/HC_paper_all/88.pdf: No such file or directory
cp: ./papers/conversationalAgent/HC_paper_all/176.pdf: No such file or directory
cp: ./papers/conversationalAgent/HC_paper_all/63.pdf: No such file or directory
cp: ./papers/conversationalAgent/HC_paper_all/77.pdf: No such file or directory
cp: ./papers/conversationalAgent/HC_paper_all/189.pdf: No such file or directory
cp: ./papers/conversationalAgent/HC_paper_all/214.pdf: No such file or directory
cp: ./papers/conversationalAgent/HC_paper_all/200.pdf: No such file or directory
cp: ./papers/conversationalAgent/HC_paper_all/201.pdf: No such file or directory
cp: ./papers/conversationalAgent/HC_paper_all/215.pdf: No such file or directory
cp: ./papers/conversationalAgent/HC_paper_all/76.pdf: No such file or directory
cp: ./papers/conversationalAgent/HC_paper_all/188.pdf: No such file or directory
cp: ./papers/conversationalAgent

In [74]:
os.system('cp ' + '/'.join(paper.split('/')[:4] + [paper.split('/')[5].split('.')[0] + '.pdf']) + ' ' + ' ./papers/nonRQs/%s'%paper.split('/')[2])

0

In [73]:
'/'.join(paper.split('/')[:4] + [paper.split('/')[5].split('.')[0] + '.pdf'])

'./papers/multimodalHI/MHI/85.pdf'