# Detailed discussion structures

This notebook collects more detailed features describing the structure of discussions. The output is a pandas DataFrame containing the following columns:

* `forum_id` official forum id
* `permalink` link to forum on OpenReview
* Number of comments
  * `total_comments` all comments on forum
  * `total_author_comments` all comments from author (continuations counted as separate comments)
  * `total_reviewer_comments` total number of comments by all reviewers (continuations are rare)
* Number of tokens, as produced by Stanza tokenizer
  * `total_tokens` tokens from all comments on forum
  * `author_tokens`
  * `reviewer_tokens`
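* `max_path_length` maximum length of a thread in the forum (number of comments from a top-level comment down to a comment with no replies)
* `mean_path_length` mean length of the threads in the forum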
* `max_num_participants` maximum number of unique participants in a thread
* `num_unofficial_participants` participants besides authors, reviewers, ACs

You can add more features in the `gather_stats` method of `NoteNode`.

This code has not been tested with conferences besides ICLR 2019.

In [1]:
import collections
import openreview
import pandas as pd
import pptree
import stanza
# Build the Stanza pipeline once, at module level, so every NoteNode reuses it.
# Only the tokenizer is loaded because the notebook only needs token counts.
STANZA_PIPELINE = stanza.Pipeline(lang="en",
                                  processors="tokenize")
import tqdm


# A client is required for any OpenReview API actions.
# No username/password is passed explicitly here.
guest_client = openreview.Client(baseurl='https://api.openreview.net')

# Change these values according to your needs
# INVITATION: invitation id used to list the submissions whose forums are analyzed.
# LIMIT: number of submissions to process before the main loop stops.
INVITATION = 'ICLR.cc/2019/Conference/-/Blind_Submission'
LIMIT = 10

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.0.json:   0%|   …

2023-01-29 14:59:32 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |

2023-01-29 14:59:32 INFO: Use device: cpu
2023-01-29 14:59:32 INFO: Loading: tokenize
2023-01-29 14:59:32 INFO: Done loading processors!


The cell below contains helper functions and a `NoteNode` class, which is intended to be a wrapper around the `Note` object in the OpenReview API. The main code that builds the DataFrame is in the cell after it.

In [2]:
def _trailing_index(sig):
  """Returns the run of digits that ends `sig`, or its last character if there is none."""
  prefix = sig.rstrip("0123456789")
  return sig[len(prefix):] or sig[-1]


def get_author_code(signature):
  """Returns a short code representing author type.

     This code is only tested for ICLR 2019, but small changes should make it work for other years.

     Codes returned:
       "A"      authors
       "C"      the conference itself
       "N"      anonymous commenter
       "AC<n>"  area chair number <n>
       "R<n>"   anonymous reviewer number <n>
       "S"      someone signing with a "~" profile id

     Raises:
       AssertionError: if the signature matches none of the patterns above.
  """

  # The signature is almost always of the format
  # "ICLR.cc/<year>/Conference/Paper1592/<entity>"
  sig = signature.split("/")[-1] # extracting the 'entity' part of the signature

  if sig == 'Authors':
    return "A"
  elif sig == "Conference":
    return "C"
  elif sig == "(anonymous)":
    return "N"
  elif "Area_Chair" in sig:
    # Keep the whole numeric suffix so that e.g. Area_Chair1 and Area_Chair11 stay distinct.
    return "AC" + _trailing_index(sig)
  elif "AnonReviewer" in sig:
    # Same here: AnonReviewer2 and AnonReviewer12 must not both collapse to "R2".
    return "R" + _trailing_index(sig)
  elif sig.startswith("~"):
    return "S" # Someone

  # Should not get here for ICLR 2019. Raised explicitly (rather than `assert False`) so the
  # check survives `python -O` and the offending entity is part of the error message.
  raise AssertionError(f"Unrecognized signature entity: {sig!r}")
    
  
def get_path(node):
  """Get the list of nodes leading from the root down to this node, root excluded.

  The walk follows `parent` links upwards until it reaches a node whose `reply_to` is None
  (the root); the returned list is ordered top-down and ends with `node` itself.
  """
  ancestors = []
  current = node
  while current.reply_to is not None:
    ancestors.append(current)
    current = current.parent
  # Collected bottom-up; flip to top-down order.
  ancestors.reverse()
  return ancestors

def get_commenter_path(path):
  """Get a list of commenters using their author codes, accounting for continuations.
  """
  authors = []
  for node in path:
    if authors and authors[-1] == node.author:
      continue
    else:
      authors.append(node.author) 
  return [get_author_code(signature) for signature in authors]

def mean(l):
  """Arithmetic mean of a non-empty sequence of numbers (asserts that it is non-empty)."""
  assert l
  total = sum(l)
  count = len(l)
  return total / count
    
def get_review_discussions(notes):
  """Assembles a tree from a list of notes, and calculates features of the tree.

  Args:
    notes: the notes of one forum; each needs `id`, `tcdate`, `replyto` and whatever
      `NoteNode` reads from a note.

  Returns:
    The stats dict produced by `gather_stats()` on the root node (the note with no
    `replyto`), or None when the tree cannot be assembled: a note replies to a note that is
    not in `notes` (e.g. a deleted one), or there is no root note at all.
  """
  # Newest first. Sorting on tcdate alone means two notes sharing a timestamp are never
  # compared with each other (Note objects are not orderable).
  ordered_notes = sorted(notes, key=lambda note: note.tcdate, reverse=True)
  # NoteNode.__init__ already tokenizes the text, so no extra tokenize() call is made here.
  nodes = [NoteNode(note) for note in ordered_notes]
  note_objs = {node.note_id: node for node in nodes}

  root_node = None
  for note_node in nodes:
    if note_node.reply_to is None:
      root_node = note_node
    elif note_node.reply_to in note_objs:
      # Attach parent and children
      parent = note_objs[note_node.reply_to]
      parent.children.append(note_node)
      note_node.parent = parent
      note_node.assembled = True
    else: # A deleted node
      return None

  if root_node is None:
    # No note without a parent: there is nothing to root the tree at.
    return None
  return root_node.gather_stats()

def get_content_from_note(note):
  """Extracts the main text from a Note.

  The first of 'review', 'comment', 'metareview' found in `note.content` is returned. A
  note that has none of these but has an 'abstract' (presumably the submission itself)
  yields an empty string.

  Raises:
    AssertionError: if `note.content` has none of the fields above.
  """
  for field in ('review', 'comment', 'metareview'):
    if field in note.content:
      return note.content[field]
  if 'abstract' in note.content:
    return ""
  # These are all the valid options for ICLR 2019. Raised explicitly (rather than
  # `assert False`) so the check survives `python -O` and reports what was found.
  raise AssertionError(f"No known text field in note content: {list(note.content)}")

class NoteNode(object):
  """A node of a forum's reply tree, wrapping one OpenReview Note.

  `parent` and `children` start empty and are filled in by whoever assembles the tree;
  `assembled` records whether this node has been linked to its parent (the root, which has
  no `reply_to`, counts as assembled from the start).
  """

  def __init__(self, note):
    """Copies the fields needed from `note` and tokenizes its text."""
    self.note_id = note.id
    self.text = get_content_from_note(note)
    # Exactly one signature is expected; the unpacking raises ValueError otherwise.
    self.author, = note.signatures
    self.author_code = get_author_code(self.author)
    self.short_info = f'{self.author_code}___{self.note_id}'
    self.forum_permalink = f"https://openreview.net/forum?id={note.forum}"

    self.reply_to = note.replyto
    self.parent = None
    self.children = []
    self.assembled = self.reply_to is None
    self.tokenized_text = self.tokenize()

  def tokenize(self):
    """Tokenize this node's text using Stanza.

    Returns:
      A list of sentences, each a list of token strings.
    """
    doc = STANZA_PIPELINE(self.text)
    return [
      [token.to_dict()[0]["text"] for token in sentence.tokens]
      for sentence in doc.sentences
    ]

  def pretty_print(self):
    """Print subtree rooted at this node using pptree."""
    pp_root = pptree.Node(self.short_info)
    pp_ancestors = {self.note_id: pp_root}
    # Breadth-first, so a node's pptree parent always exists before the node is added.
    descendants = collections.deque(self.children)
    while descendants:
      descendant = descendants.popleft()
      descendants.extend(descendant.children)
      pp_ancestors[descendant.note_id] = pptree.Node(
          descendant.short_info, pp_ancestors[descendant.reply_to])

    pptree.print_tree(pp_root)

  def gather_stats(self):
    """Computes discussion statistics for the subtree rooted at this node.

    Returns:
      A dict with keys `forum_id`, `permalink`, `num_unofficial_participants`,
      `total_comments`, `total_tokens`, `total_author_comments`, `author_tokens`,
      `total_reviewer_comments`, `reviewer_tokens`, `max_path_length`,
      `mean_path_length` and `max_num_participants`. All count keys are always present
      (0 when nothing was counted).

    Raises:
      AssertionError: if a node in the subtree has not been attached to its parent.
    """
    # Seed every counter: a forum with, say, no reviewer comments must report 0 instead of
    # omitting the key (which would show up as NaN once the dicts become a DataFrame).
    bfs_stats = collections.Counter({
      "total_comments": 0,
      "total_tokens": 0,
      "total_author_comments": 0,
      "author_tokens": 0,
      "total_reviewer_comments": 0,
      "reviewer_tokens": 0,
    })

    # Collect some stats on a BFS
    queue = collections.deque([self])
    leaves = [] # Nodes with no children; each one ends a thread
    unofficial_participants = set()
    while queue:
      curr = queue.popleft()
      if not curr.assembled:
        # Shouldn't be able to get here if the nodes in the subtree are all assembled
        raise AssertionError(f"Node {curr.note_id} is in the tree but was never assembled")

      if not curr.children:
        leaves.append(curr)

      # Counting tokens
      num_tokens = sum(len(sentence) for sentence in curr.tokenized_text)
      bfs_stats["total_comments"] += 1
      bfs_stats['total_tokens'] += num_tokens
      code = curr.author_code
      if code.startswith('R'):
        bfs_stats["total_reviewer_comments"] += 1
        bfs_stats['reviewer_tokens'] += num_tokens
      elif code == 'A':
        bfs_stats["total_author_comments"] += 1
        bfs_stats['author_tokens'] += num_tokens
      elif not (code == "C" or code.startswith("AC")):
        # Neither reviewer, author, conference (C) nor area chair (AC): unofficial
        unofficial_participants.add(curr.author)
      queue.extend(curr.children)

    stats = {
      "forum_id": self.note_id,
      "permalink": self.forum_permalink,
      "num_unofficial_participants": len(unofficial_participants),
    }

    stats.update(bfs_stats)

    # Instead of doing a DFS, collect the path to each node without children, then calculate
    # features of this set of paths.
    paths = [get_path(leaf) for leaf in leaves]
    path_lengths = [len(path) for path in paths]
    stats["max_path_length"] = max(path_lengths)
    stats["mean_path_length"] = mean(path_lengths)
    stats["max_num_participants"] = max(len(set(get_commenter_path(path))) for path in paths)

    return stats

In [3]:
# Compute the discussion statistics of each submission's forum, stopping after LIMIT forums.
df_dicts = []
submission_notes = openreview.tools.iterget_notes(guest_client, invitation=INVITATION)
for i, forum_note in tqdm.tqdm(enumerate(submission_notes)):
  this_forum_notes = guest_client.get_notes(forum=forum_note.id)
  forum_stats = get_review_discussions(this_forum_notes)
  df_dicts.append(forum_stats)
  if i + 1 == LIMIT:
    break

# Forums for which no statistics were produced (None) are left out of the table.
valid_stats = [d for d in df_dicts if d is not None]
df = pd.DataFrame.from_dict(valid_stats)

display(df.sort_values(by='max_num_participants', ascending=False))

9it [00:53,  5.94s/it]


Unnamed: 0,forum_id,permalink,num_unofficial_participants,total_comments,total_tokens,total_author_comments,author_tokens,total_reviewer_comments,reviewer_tokens,max_path_length,mean_path_length,max_num_participants
5,SJf6BhAqK7,https://openreview.net/forum?id=SJf6BhAqK7,0,18,8380,9,5066,7,3283,5,3.166667,3
0,rJl0r3R9KX,https://openreview.net/forum?id=rJl0r3R9KX,0,11,4359,5,2668,4,1571,4,2.0,2
1,SylCrnCcFX,https://openreview.net/forum?id=SylCrnCcFX,1,15,3648,7,2379,3,1061,2,1.555556,2
2,H1xAH2RqK7,https://openreview.net/forum?id=H1xAH2RqK7,1,13,5726,7,4404,3,1195,2,1.75,2
3,HJeABnCqKQ,https://openreview.net/forum?id=HJeABnCqKQ,0,12,2703,4,937,6,1364,3,2.0,2
4,SyVpB2RqFX,https://openreview.net/forum?id=SyVpB2RqFX,0,22,9551,12,7097,6,2042,6,3.25,2
6,HylTBhA5tQ,https://openreview.net/forum?id=HylTBhA5tQ,0,15,4270,8,2585,5,1654,7,2.333333,2
7,B1gTShAct7,https://openreview.net/forum?id=B1gTShAct7,0,13,5934,5,3746,6,2136,4,2.4,2
