In [66]:
import os
import requests
import re
import json

from bs4 import BeautifulSoup


In [103]:

accepted_papers_url = {
    2016: ['https://sigir.org/chiir2016/accepted-papers.html'],
    2017: ['https://sigir.org/chiir2017/accepted-papers.html'],
    2018: [
        'https://sigir.org/chiir2018/papers.php',
        'https://sigir.org/chiir2018/shortpapers.php',
        'https://sigir.org/chiir2018/demos.php',
        
    ],
    2019: ['https://sigir.org/chiir2019/accepted.html'],
    2020: ['https://sigir.org/chiir2020/accepted.html'],
    2021: ['https://acm-chiir.github.io/chiir2021/accepted.html'],
    2022: ['https://ai.ur.de/chiir2022/program/papers']
}

accepted_paper_page = {}

for year in accepted_papers_url:
    for ui, url in enumerate(accepted_papers_url[year]):
        paper_html_file = f'accepted-papers-CHIIR-{year}-{ui+1}.html'
        if os.path.exists(paper_html_file):
            with open(paper_html_file, 'rt') as fh:
                accepted_paper_page[year] = fh.read()
        else:
            response = requests.get(url)
            if response.status_code != 200:
                print('error GETting paper page for year', year)
            with open(paper_html_file, 'wt') as fh:
                fh.write(response.text)
            accepted_paper_page[year] = response.text


In [293]:
def read_page_html(html_file):
    with open(html_file, 'rt') as fh:
        html = fh.read()
        return BeautifulSoup(html, 'lxml')
    

def init_author():
    return {
        'name': None,
        'affiliation': None
    }

def init_paper():
    return {
        'title': None,
        'id': None,
        'authors': [],
        'paper_type': None,
        'year': None,
        'acm_dl_url': None,
        'authors_string': None
    }




In [294]:
def extract_paper_info_2016(section):
    paper = init_paper()
    link = section.find('a')
    paper['acm_dl_url'] = link['href']
    paper['title'] = link.text.strip()
    paper['authors'] = []
    for author in section.find_all('li'):
        match = re.match(r'(.+)   \((.+)\)', author.text.strip())
        if match:
            author = init_author()
            author['name'] = match.group(1)
            author['affiliation'] = match.group(2)
            paper['authors'].append(author)
    return paper
    

def extract_papers_info_2016(page_soup):
    content_div = page_soup.find('div', id='content')
    print('CALLING extract_papers_info_2016')
    papers = []

    paper_type = None
    for child in content_div:
        if child.name == 'h2':
            paper_type = child.text
        elif child.name == 'section' and 'scheduled-paper' in child.attrs['class']:
            paper = extract_paper_info_2016(child)
            paper['paper_type'] = paper_type
            paper['year'] = 2016
            papers.append(paper)
    print(papers)
    return papers


def extract_papers_info_2017(page_soup):
    page_soup.find_all('div', class_='container')[2]
    main_col = page_soup.find('div', class_='col-md-8')
    main_col = page_soup.find('div', id='page-content')

    paper_type = None
    papers = []
    paper = None
    for child in main_col:
        if child.name == 'h2':
            paper_type = child.text.strip()
        elif child.name == 'p':
            paper = init_paper()
            paper['title'] = child.text.strip()
            paper['paper_type'] = paper_type
            paper['year'] = 2017
        elif child.name == 'ul':
            ul = child
            for li in child.find_all('li'):
                author = init_author()
                author['name'] = li.text.strip()
                paper['authors'].append(author)
            papers.append(paper)
        else:
            pass
    return papers


def extract_papers_info_2018(page_soup):
    papers = []
    paper_type = None
    paper_type = page_soup.find('title').text.strip()
    papers_div = page_soup.find('section', class_='container main').find('div', class_='span12')
    papers_div

    for child in papers_div:
        if child.name == 'h4':
            paper = init_paper()
            paper['title'] = child.text.strip()
            paper['paper_type'] = paper_type
            paper['year'] = 2018
        elif child.name == 'ul':
            for author in child.find_all('li'):
                match = re.match(r'(.+) \((.+)\)', author.text.strip())
                if match:
                    author = init_author()
                    author['name'] = match.group(1)
                    author['affiliation'] = match.group(2)
                    #print(match)
                    #print(author)
                    paper['authors'].append(author)
                else:
                    print('NO PATTERN MATCH:', author.text)
            papers.append(paper)
        else:
            pass
            #print(child)
    return papers


def extract_papers_info_2019(page_soup):
    content_div = page_soup.find('div', id='content').find('div', class_='paragraph-style')
    content_div
    papers = []
    for child in content_div:
        if child.name == 'h4':
            paper_type = child.text.strip()
        if child.name == 'p':
            paper = init_paper()
            paper['title'] = child.find('b').text.strip()
            paper['paper_type'] = paper_type
            paper['year'] = 2019
            authors_string = [s for s in child.stripped_strings][1]
            paper['full_author_string'] = authors_string
            try:
                for author_string in re.split(r'\), ', authors_string):
                    author = init_author()
                    name, affil = author_string.split(' (', 1)
                    author['name'] = name
                    author['affiliation'] = affil
                    paper['authors'].append(author)
            except ValueError:
                pass
            papers.append(paper)
    return papers
        

def extract_papers_info_2020(page_soup):
    main_div = page_soup.find('div', id='main')
    papers = []
    paper_type = None
    for child in main_div:
        if child.name == 'h2':
            paper_type = child.text.strip()
        elif child.name == 'div':
            paper = init_paper()
            title_soup = child.find('li', class_='doc-title')
            if title_soup is None:
                break
            paper['title'] = title_soup.text.strip()
            paper['year'] = 2020
            paper['paper_type'] = paper_type
            author_soup = child.find('li', class_='doc-author')
            if author_soup is None:
                author_soup = child.find('li', class_='doc-author&gt;')
                author_soup = child.find('li', class_='doc-author>')
            if author_soup is None:
                for li in child.find_all('li'):
                    print(li.attrs['class'])
            authors_string = author_soup.text.strip()
            for author_string in authors_string.split(' and '):
                author = init_author()
                author['name'] = author_string
                paper['authors'].append(author)
            papers.append(paper)
        else:
            #print(child)
            pass
    return papers
        

def extract_papers_info_2021(page_soup):
    content_div = page_soup.find('div', class_='col-xs-12 col-sm-12')
    papers = []
    paper_type = None
    for child in content_div:
        if child.name == 'h3':
            paper_type = child.text.strip()
        if child.name == 'div':
            paper = init_paper()
            paper_div = child
            title_soup = paper_div.find('h4')
            if title_soup is None:
                continue
            paper['title'] = title_soup.text.strip()
            paper['paper_type'] = paper_type
            paper['year'] = 2021
            #print(child)
            author_soup = paper_div.find('p')
            authors_string = author_soup.text.strip()
            paper['full_author_string'] = authors_string
            if authors_string[-1] == ')':
                authors_string = authors_string[:-1]
            try:
                for author_string in re.split(r'\), ', authors_string):
                    author = init_author()
                    name, affil = author_string.split(' (', 1)
                    author['name'] = name
                    author['affiliation'] = affil
                    paper['authors'].append(author)
            except ValueError:
                print(author_string)
                pass
            papers.append(paper)
    return papers


def extract_papers_info_2022(page_soup):
    papers = []
    paper_type = None
    for div in page_soup.find_all('div'):
        if 'id' not in div.attrs:
            continue
        #print(div)
        for child in div:
            if child.name == 'h2':
                paper_type = child.text.strip()
            elif child.name == 'dl':
                paper = init_paper()
                title_soup = child.find('dt')
                if title_soup is None:
                    continue
                    print(child)
                paper['title'] = title_soup.text.strip()
                paper['paper_type'] = paper_type
                paper['year'] = 2022
                authors_string = child.find('dd').text.strip()
                if ' and ' in authors_string:
                    head, tail = authors_string.split(' and ')
                    authors = head.split(', ') + [tail]
                else:
                    authors_string = [authors_string]
                for author_string in authors:
                    author = init_author()
                    author['name'] = author_string
                    paper['authors'].append(author)
                papers.append(paper)
            else:
                pass
    return papers




In [295]:
extract_papers = {
    2016: extract_papers_info_2016,
    2017: extract_papers_info_2017,
    2018: extract_papers_info_2018,
    2019: extract_papers_info_2019,
    2020: extract_papers_info_2020,
    2021: extract_papers_info_2021,
    2022: extract_papers_info_2022,
}

years = [2016, 2017, 2018, 2019, 2020, 2021, 2022]

for year in years:
    papers = []
    for i in range(len(accepted_papers_url[year])):
        paper_html_file = f'accepted-papers-CHIIR-{year}-{i+1}.html'
        page_soup = read_page_html(paper_html_file)
        papers += extract_papers[year](page_soup)
    paper_json_file = f'accepted-papers-CHIIR-{year}.json'
    with open(paper_json_file, 'wt') as fh:
        json.dump(papers, fh)



CALLING extract_papers_info_2016
[{'title': 'Deepening the Role of the User: Neuro-Physiological Evidence as a Basis for Studying and Improving Search', 'id': None, 'authors': [{'name': 'Javed Mostafa', 'affiliation': 'University of North Carolina at Chapel Hill, USA'}, {'name': 'Jacek Gwizdka', 'affiliation': 'University of Texas at Austin, USA'}], 'paper_type': 'Perspectives papers', 'year': 2016, 'acm_dl_url': 'http://dl.acm.org/authorize?N09498', 'authors_string': None}, {'title': 'Active and Passive Utility of Search Interface Features in Different Information Seeking Task Stages', 'id': None, 'authors': [{'name': 'Hugo C. Huurdeman', 'affiliation': 'University of Amsterdam, Netherlands'}, {'name': 'Max L. Wilson', 'affiliation': 'University of Nottingham, United Kingdom'}, {'name': 'Jaap Kamps', 'affiliation': 'University of Amsterdam, Netherlands'}], 'paper_type': 'Full-length papers', 'year': 2016, 'acm_dl_url': 'http://dl.acm.org/authorize?N09482', 'authors_string': None}, {'t

In [302]:
from openpyxl import Workbook

wb = Workbook()

ws_papers = wb.active
ws_papers.title = 'papers'
ws_authors = wb.create_sheet(title="authors")
ws_papers.append(['year', 'paper_type', 'paper_id', 'paper_title', 'acm_dl_url', 'authors_string'])
ws_authors.append(['year', 'paper_type', 'paper_id', 'paper_title', 'author_name', 'author_affiliation'])


In [303]:
for year in years:
    paper_json_file = f'accepted-papers-CHIIR-{year}.json'
    print(year)
    with open(paper_json_file, 'rt') as fh:
        papers_json = json.load(fh)
        for paper in papers_json:
            #print(paper)
            paper_row = [paper['year'], paper['paper_type'], paper['id'], paper['title'], paper['acm_dl_url'], paper['authors_string']]
            ws_papers.append(paper_row)
            for author in paper['authors']:
                row = [year, paper['paper_type'], paper['id'], paper['title'], author['name'], author['affiliation']]
                ws_authors.append(row)

wb.save('CHIIR_paper_info.xlsx')


2016
2017
2018
2019
2020
2021
2022


In [316]:
papers_json = []

for year in years:
    paper_json_file = f'accepted-papers-CHIIR-{year}.json'
    with open(paper_json_file, 'rt') as fh:
        papers_json += json.load(fh)


In [468]:
doi_sheet_url = 'https://docs.google.com/spreadsheets/d/1-FhHxvdCA6m58mr8f_cD1e0LD9D2dXK6y8duOoeh2dU/export?gid=0&format=tsv'

response = requests.get(doi_sheet_url)


if response.status_code == 200:
    doi_sheet = response.text

In [469]:
import copy

def score_levenshtein_distance(term1: str, term2: str) -> int:
    """Calculate Levenshtein distance between two string.

    :param term1: a term string
    :type term1: str
    :param term2: a term string
    :type term2: str
    :return: the number of overlapping ngrams
    :rtype: int
    """
    if len(term1) > len(term2):
        term1, term2 = term2, term1
    distances = range(len(term1) + 1)
    for i2, c2 in enumerate(term2):
        distances_ = [i2 + 1]
        for i1, c1 in enumerate(term1):
            if c1 == c2:
                distances_.append(distances[i1])
            else:
                distances_.append(1 + min((distances[i1], distances[i1 + 1], distances_[-1])))
        distances = distances_
    return distances[-1]





def parse_doi_sheet(doi_sheet):
    rows = doi_sheet.split('\r\n')
    headers = rows.pop(0).split('\t')
    print(headers)
    for row in rows:
        row = row.split('\t')
        row_json = {header: row[hi] for hi, header in enumerate(headers)}
        yield row_json
    return None


def calculate_title_overlap(paper, title_string):
    dist = score_levenshtein_distance(paper['title'], title_string)
    sim = (len(paper['title']) - dist) / len(paper['title'])
    if sim > 0.3:
        #print(dist, sim)
        #print('\tJSON TITLE:', paper['title'])
        #print('\tDOI TITLE:', title_string)
        #print('\tAUTHORS:', paper['authors'])
        pass
    return sim


def calculate_title_word_overlap(paper, title_string):
    json_words = {w for w in re.split(r'\W+', paper['title'].lower()) if len(w) > 4}
    doi_words = {w for w in re.split(r'\W+', title_string.lower()) if len(w) > 4}
    overlap = json_words.intersection(doi_words)
    sim = (len(overlap)) / len(doi_words)
    if sim > 0.5:
        #print('\t', overlap, sim)
        #print('\tJSON TITLE:', paper['title'])
        #print('\tDOI TITLE:', title_string)
        #print('\tAUTHORS:', paper['authors'])
        pass
    return sim


def calculate_author_overlap(paper, doi_author_string):
    # remove the trailing dot
    doi_author_string = doi_author_string[:-1]
    if ' and ' in doi_author_string and ', ' not in doi_author_string:
        # two authors
        doi_authors = doi_author_string.split(' and ')
    elif ', ' in doi_author_string:
        # more than two authors
        doi_authors = doi_author_string.split(', ')
        if doi_authors[-1].startswith('and '):
            doi_authors[-1] = doi_authors[-1][4:]
    else:
        # single author
        doi_authors = [doi_author_string]
    same_authors = 0
    for doi_author in doi_authors:
        for paper_author in paper['authors']:
            dist = score_levenshtein_distance(paper_author['name'], doi_author)
            sim = (len(doi_author) - dist) / len(doi_author)
            if sim > 0.6:
                same_authors += 1
                #print(sim, same_authors, len(doi_authors))
                #print('SAME JSON AUTHOR:', paper_author['name'])
                #print('SAME DOI AUTHOR:', doi_author)
    sim = same_authors / len(doi_authors)
    return sim


def find_paper_same_title(doi_paper: str, papers_json):
    author_string, title_string, ref_string, doi_string = split_reference_string(doi_paper)
    doi_year = int(doi_paper['Year'])
    year_papers = [paper for paper in papers_json if paper['year'] == doi_year]
    for paper in year_papers:
        if paper['title'] in title_string:
            #print(paper['title'])
            #print(title_string)
            return paper


def find_paper(doi_paper: str, papers_json):
    author_string, title_string, ref_string, doi_string = split_reference_string(doi_paper)
    #print(author_string)
    #print(title_string)
    #print(ref_string)
    #print(doi_string)
    doi_year = int(doi_paper['Year'])
    year_papers = [paper for paper in papers_json if paper['year'] == doi_year]
    for paper in year_papers:
        title_sim = calculate_title_overlap(paper, title_string)
        title_word_sim = calculate_title_word_overlap(paper, title_string)
        author_sim = calculate_author_overlap(paper, author_string)
        if title_sim > 0.9:
            return paper
        if title_sim > 0.40 and author_sim > 0.9:
            return paper
        elif title_sim > 0.40:
            print('OVERLAP TITLE:', paper['title'])
            print('OVERLAP TITLE:', title_string)
            print(author_string)
            print(paper['authors'])
            print('title_sim:', title_sim, 'author_sim:', author_sim)
        if author_sim > 0.5:
            if title_word_sim > 0.7:
                return paper
            print(author_sim)
            print(title_word_sim)
            print('OVERLAP JSON AUTHOR:', paper['authors'])
            print('OVERLAP DOI AUTHOR:', author_string)
            print(paper)
            print(doi_paper)
            
    return None


def split_reference_string(doi_paper: dict) -> dict:
    author_string, ref_string = doi_paper['Reference string'].split(f" {doi_paper['Year']}. ")
    proc_string = None
    if 'In Proceedings of the 20' in ref_string:
        proc_string = 'In Proceedings of the 20'
    elif 'In ACM SIGIR Conference' in ref_string:
        proc_string = 'In ACM SIGIR Conference' 
    elif 'Proceedings of the 2020 Conference' in ref_string:
        proc_string = 'Proceedings of the 2020 Conference'
    title_string, ref_string = ref_string.split(proc_string)
    if ' DOI: ' in ref_string:
        doi_string = ref_string.split(' DOI: ')[1]
    else:
        doi_index = ref_string.index('https://')
        doi_string = ref_string[doi_index:]
    return author_string, title_string, ref_string, doi_string

    

match_count = 0
no_match_count = 0
skip_types = {
    'keynote',
    'workshop',
    'tutorial',
    'doctoral consortium'
}

web_papers = copy.deepcopy(papers_json)


doi_papers = [doi_paper for doi_paper in parse_doi_sheet(doi_sheet)]
print('total doi papers:', len(doi_papers))
doi_papers = [doi_paper for doi_paper in doi_papers if doi_paper['Conference'] == 'CHIIR']
print('chiir doi papers:', len(doi_papers))
doi_papers = [doi_paper for doi_paper in doi_papers if doi_paper['Type'] not in skip_types]
print('research doi papers:', len(doi_papers))
matched = {}

for doi_paper in doi_papers:
    paper = find_paper_same_title(doi_paper, web_papers)
    if paper is not None:
        web_papers.remove(paper)
        match_count += 1
        #matched.append(doi_paper, paper)
        matched[doi_paper['Reference string']] = paper
        author_string, title_string, ref_string, doi_string = split_reference_string(doi_paper)
        paper['doi'] = doi_string
        
doi_papers = [d for d in doi_papers if d['Reference string'] not in matched]
print('matched title doi papers:', len(matched))
print('different title doi papers:', len(doi_papers))


for doi_paper in doi_papers:
    paper = find_paper(doi_paper, web_papers)
    if paper is None:
        no_match_count += 1
        print('\nNO MATCH:', doi_paper, '\n')
    else:
        web_papers.remove(paper)
        match_count += 1
        matched[doi_paper['Reference string']] = paper
        author_string, title_string, ref_string, doi_string = split_reference_string(doi_paper)
        paper['doi'] = doi_string
    
doi_papers = [d for d in doi_papers if d['Reference string'] not in matched]
print('matched title doi papers:', len(matched))
print('different title doi papers:', len(doi_papers))


['Conference', 'Year', 'Reference string', 'Type', 'Award', 'IIR study paper?', 'Notes']
total doi papers: 682
chiir doi papers: 414
research doi papers: 356
matched title doi papers: 234
different title doi papers: 122

NO MATCH: {'Conference': 'CHIIR', 'Year': '2017', 'Reference string': "HÃ¤kon Wium Lie. 2017. CSS and User-Adapted Web Presentations. In Proceedings of the 2017 Conference on Conference Human Information Interaction and Retrieval (CHIIR '17). ACM, New York, NY, USA, 5-5. DOI: https://doi.org/10.1145/3020165.3038294", 'Type': '', 'Award': '', 'IIR study paper?': '', 'Notes': ''} 

0.75
0.1
OVERLAP JSON AUTHOR: [{'name': 'Ion Madrazo', 'affiliation': 'Boise State University'}, {'name': 'Oghenemaro Anuyah', 'affiliation': 'Boise State University'}, {'name': 'Nevena Dragovic', 'affiliation': 'Boise State University'}, {'name': 'Maria Soledad Pera', 'affiliation': 'Boise State University'}]
OVERLAP DOI AUTHOR: Ion Madrazo Azpiazu, Nevena Dragovic, Oghenemaro Anuyah, and Mar

In [470]:
doi_papers

[{'Conference': 'CHIIR',
  'Year': '2017',
  'Reference string': "HÃ¤kon Wium Lie. 2017. CSS and User-Adapted Web Presentations. In Proceedings of the 2017 Conference on Conference Human Information Interaction and Retrieval (CHIIR '17). ACM, New York, NY, USA, 5-5. DOI: https://doi.org/10.1145/3020165.3038294",
  'Type': '',
  'Award': '',
  'IIR study paper?': '',
  'Notes': ''},
 {'Conference': 'CHIIR',
  'Year': '2018',
  'Reference string': "Ion Madrazo Azpiazu, Nevena Dragovic, Oghenemaro Anuyah, and Maria Soledad Pera. 2018. Looking for the Movie Seven or Sven from the Movie Frozen?: A Multi-perspective Strategy for Recommending Queries for Children. In Proceedings of the 2018 Conference on Human Information Interaction & Retrieval (CHIIR '18). ACM, New York, NY, USA, 92-101. DOI: https://doi.org/10.1145/3176349.3176379",
  'Type': '',
  'Award': '',
  'IIR study paper?': '',
  'Notes': ''},
 {'Conference': 'CHIIR',
  'Year': '2018',
  'Reference string': "Rebecca Reynolds and P

In [471]:
[p for p in web_papers if 'Full' in p['paper_type'] or 'Short' in p['paper_type']]
#[p['paper_type'] for p in web_papers]


[{'title': 'Supporting Critical Thinking and Creativity in Search',
  'id': None,
  'authors': [{'name': 'Soo Young Rieh', 'affiliation': None}],
  'paper_type': 'Full papers',
  'year': 2017,
  'acm_dl_url': None,
  'authors_string': None},
 {'title': 'ReQuIK: Facilitating Information Discovery for Children Through Query Suggestions',
  'id': None,
  'authors': [{'name': 'Ion Madrazo', 'affiliation': 'Boise State University'},
   {'name': 'Oghenemaro Anuyah', 'affiliation': 'Boise State University'},
   {'name': 'Nevena Dragovic', 'affiliation': 'Boise State University'},
   {'name': 'Maria Soledad Pera', 'affiliation': 'Boise State University'}],
  'paper_type': 'Full Papers | CHIIR 2018',
  'year': 2018,
  'acm_dl_url': None,
  'authors_string': None},
 {'title': 'What Information and Learning Scientists Have to Learn from One Another, About Inquiry During Learning',
  'id': None,
  'authors': [{'name': 'Rebecca Reynolds',
    'affiliation': 'Rutgers University'},
   {'name': 'Prebe

In [472]:
metadata.keys()

dict_keys(['Conference', 'Year', 'Reference string', 'Type', 'Award', 'IIR study paper?', 'Notes', 'Reference author_string', 'Reference title_string', 'DOI', 'website_title', 'website_id', 'website_authors', 'website_paper_type', 'website_year', 'website_authors_string', 'website_doi'])

In [473]:
wb = Workbook()

ws_papers = wb.active
ws_papers.title = 'papers'
ws_authors = wb.create_sheet(title="authors")
paper_headers = [
    'Conference', 'Year', 'Reference string', 'Type', 'Award', 'IIR study paper?', 'Notes', 
    'Reference author_string', 'Reference title_string', 'DOI', 
    'website_title', 'website_paper_type', 'website_year', 'website_authors_string'
]
author_headers = [
    'Conference', 'Year', 'DOI', 
    'website_title', 'website_author_name', 'website_author_affiliation'
]
ws_papers.append(paper_headers)
ws_authors.append(author_headers)




In [474]:
published_papers = [doi_paper for doi_paper in parse_doi_sheet(doi_sheet)]
chiir_papers = [doi_paper for doi_paper in published_papers if doi_paper['Conference'] == 'CHIIR']
print('chiir doi papers:', len(chiir_papers))
research_papers = [doi_paper for doi_paper in chiir_papers if doi_paper['Type'] not in skip_types]
print('research doi papers:', len(research_papers))

for di, doi_paper in enumerate(research_papers):
    metadata = copy.deepcopy(doi_paper)
    author_string, title_string, ref_string, doi_string = split_reference_string(doi_paper)
    metadata['Reference author_string'] = author_string
    metadata['Reference title_string'] = title_string
    metadata['DOI'] = doi_string
    if doi_paper['Reference string'] in matched:
        web_paper = matched[doi_paper['Reference string']]
        #print()
        for field in web_paper:
            if field == 'acm_dl_url':
                continue
            metadata[f'website_{field}'] = web_paper[field]
    author_string, title_string, ref_string, doi_string = split_reference_string(doi_paper)
    print(di, doi_string)
    paper_row = [None if header not in metadata else metadata[header] for header in paper_headers]
    ws_papers.append(paper_row)
    if 'website_authors' in metadata:
        for author in metadata['website_authors']:
            author_row = []
            for header in author_headers:
                if header == 'website_author_name':
                    author_row.append(author['name'])
                elif header == 'website_author_affiliation':
                    author_row.append(author['affiliation'])
                elif header in metadata:
                    author_row.append(metadata[header])
                else:
                    author_row.append(None)
            ws_authors.append(author_row)
    else:
        author_row = []
        for header in author_headers:
            if header in {'website_author_name', 'website_author_affiliation'}:
                row.append(None)
            elif header in metadata:
                author_row.append(metadata[header])
            else:
                author_row.append(None)
        ws_authors.append(author_row)
    #print(metadata)

wb.save('CHIIR_paper_info.xlsx')


['Conference', 'Year', 'Reference string', 'Type', 'Award', 'IIR study paper?', 'Notes']
chiir doi papers: 414
research doi papers: 356
0 https://doi.org/10.1145/2854946.2854957
1 https://doi.org/10.1145/2854946.2854971
2 https://doi.org/10.1145/2854946.2854966
3 https://doi.org/10.1145/2854946.2854963
4 https://doi.org/10.1145/2854946.2854977
5 https://doi.org/10.1145/2854946.2854964
6 https://doi.org/10.1145/2854946.2854979
7 https://doi.org/10.1145/2854946.2854973
8 https://doi.org/10.1145/2854946.2854965
9 https://doi.org/10.1145/2854946.2854958
10 https://doi.org/10.1145/2854946.2854975
11 https://doi.org/10.1145/2854946.2854978
12 https://doi.org/10.1145/2854946.2854961
13 https://doi.org/10.1145/2854946.2854967
14 https://doi.org/10.1145/2854946.2854976
15 https://doi.org/10.1145/2854946.2854962
16 https://doi.org/10.1145/2854946.2854972
17 https://doi.org/10.1145/2854946.2854968
18 https://doi.org/10.1145/2854946.2854959
19 https://doi.org/10.1145/2854946.2854969
20 https://doi

In [476]:
for paper in papers_json:
    for author in paper['authors']:
        if 'Nirmal' in author['name']:
            print(paper)

{'title': 'Exploring Usersâ\x80\x99 Learning Gains Within Search Sessions', 'id': None, 'authors': [{'name': 'Nirmal Roy, Felipe Moraes', 'affiliation': None}, {'name': 'Claudia Hauff', 'affiliation': None}], 'paper_type': 'Short Papers', 'year': 2020, 'acm_dl_url': None, 'authors_string': None}
{'title': 'Searching to Learn with Instructional Scaffolding', 'id': None, 'authors': [{'name': 'Arthur Câmara', 'affiliation': 'Delft University of Technology, Netherlands'}, {'name': 'Nirmal Roy', 'affiliation': 'Delft University of Technology, Netherlands'}, {'name': 'David Maxwell', 'affiliation': 'Delft University of Technology, Netherlands'}, {'name': 'Claudia Hauff', 'affiliation': 'Delft University of Technology, Netherlands'}], 'paper_type': 'Full Papers', 'year': 2021, 'acm_dl_url': None, 'authors_string': None, 'full_author_string': 'Arthur Câmara (Delft University of Technology, Netherlands), Nirmal Roy (Delft University of Technology, Netherlands), David Maxwell (Delft University o