In [None]:
import pandas as pd
import numpy as np
import re
import math
import xml.etree.ElementTree as ET
import os

In [None]:
# Working directory of the kernel; used below as the base path for output folders and CSV files.
cwd = os.getcwd()

In [None]:
# Input TSV files (relative paths, so they are resolved against the working directory).
cmv_delta_submissions = 'cmw_submissions_sample_1.tsv'
cmv_delta_threads = 'cmw_comments_sample_1_deltas_thread.tsv'
cmv_delta_comments = 'cmw_comments_sample_1_only_delta_comments.tsv'

# Read the three tab-separated tables in the same order as the paths above.
submissions_df, comments_df, delta_comments_df = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (cmv_delta_submissions, cmv_delta_threads, cmv_delta_comments)
)

In [None]:
# Number of rows (submissions) in the loaded sample.
len(submissions_df)

500

## 1. Collecting IDs of submissions from delta-awarded threads and detecting delta-awarded comments

In [None]:
def find_delta_submissions(comments_df):
  """Return the set of submission ids that have at least one delta-awarded comment.

  params:
      comments_df (pd.DataFrame): comments table with a ``delta`` column and a
          ``link_id`` column (the id of the submission a comment belongs to).
  returns:
      set: the ``link_id`` values of all rows whose ``delta`` equals True.
  """
  # An empty frame may not even have the expected columns; nothing to collect.
  if comments_df.empty:
    return set()

  # Element-wise comparison (not a truthiness test) so that NaN or other
  # non-boolean values are treated as "no delta", exactly like `== True` per row.
  is_delta = comments_df['delta'] == True  # noqa: E712
  return set(comments_df.loc[is_delta, 'link_id'])

In [None]:
# Ids of the submissions that contain at least one delta-awarded comment.
delta_sub_ids = find_delta_submissions(comments_df)

In [None]:
# Unique ids of the comments listed in the delta-only comments table.
delta_comments_ids = set(delta_comments_df['id'])

In [None]:
len(delta_sub_ids) # number of submissions (posts) that have at least one delta-awarded comment

269

In [None]:
len(delta_comments_ids) # number of distinct delta-awarded comment ids

631

## 2. Keeping only submissions with deltas awarded in the comments

In [None]:
# Keep only the submissions whose id is in `delta_sub_ids`.
# NOTE: this overwrites `submissions_df`; the full table is re-read from disk in section 5.
submissions_df = submissions_df.loc[submissions_df['id'].isin(delta_sub_ids)]

## 3. Reconstructing conversations where deltas were awarded to direct replies

In [None]:
# Build one conversation per delta-awarded *direct* reply (a delta comment with no parent
# comment). Keys are "<submission id>_<n>", where n counts the direct delta replies of
# that submission in the order they appear in `delta_comments_df`.
conversations = {}
comment_dict = {}  # not filled in this cell; kept so the cell still defines the same names
saved_submissions = []
direct_delta_counts = {}  # submission id -> number of direct delta replies seen so far

direct_delta_replies = delta_comments_df[delta_comments_df['parent_comment_id'].isna()]

for _, comment in direct_delta_replies.iterrows():
  post_id = comment['link_id']

  matching = submissions_df[submissions_df['id'] == post_id]
  if matching.empty:
    # The submission is not in `submissions_df`: skip the comment.
    continue
  submission = matching.iloc[0]

  # Per-submission running counter (replaces the quadratic list.count() call and no
  # longer overwrites the loop index).
  occurrence = direct_delta_counts.get(post_id, 0) + 1
  direct_delta_counts[post_id] = occurrence
  saved_submissions.append(post_id)
  submission_id = str(post_id) + "_" + str(occurrence)

  conversations[submission_id] = {'link': submission['permalink'],
                                  'title': submission['title'],
                                  'original_poster': submission['author'],
                                  'original_post': submission['selftext'],
                                  'comment_user': comment['author'],
                                  'comment_id': comment['id'],
                                  'delta_comments': {'text': comment['body']},
                                  'threads': []}

In [None]:
# OLD VERSION WITH TXT FILES (archived for reference only; nothing in this cell runs)
#
# NOTE: this code used to be disabled by wrapping it in a triple-quoted string, but the
# inner f-string is triple-quoted as well, so it closed the outer string early and the
# cell raised a SyntaxError. It is kept as line comments instead.
# NOTE: `f.write()` without an argument would raise a TypeError if this code were revived.
#
# filename = "_direct_comment_delta.txt"
# folder = "/content/drive/MyDrive/Marina Pavlova thesis project/samples/reddit CMV/dialogues/"
#
# for submission_id, data in conversations.items():
#   post_id = submission_id.split('_')[0]
#
#   text = f"""<Submission ID>: {post_id}\n<Link> {data['link']}\n<Title>\
#   {data['title']}\n<OP>{data['original_poster']}</OP>\n\
# <Original post>\n{data['original_post']}\n\n<Comment_id> {data['comment_id']}\n\
# <User> {data['comment_user']}\n<Comment>\n{data['delta_comments']['text']}"""
#
#   print(f"Submission ID: {post_id}")
#   #print(f"Title: {data['title']}")
#   #print(f"Original post: {data['original_post']}")
#   #print(f"\nDirect comment awarded with delta: {data['delta_comments']['text']}")
#
#   current_filename = folder + submission_id + filename
#
#   with open(current_filename, 'w', encoding="utf-8") as f:
#     f.write()
#     f.write(text)
#   break

Submission ID: 78227727.0


In [None]:
def conversation_to_xml(submission_id, data):
    """Build the XML tree for one submission plus one delta-awarded direct reply.

    params:
        submission_id (str): key of the form "<submission id>_<index>".
        data (dict): conversation record; the keys read are "link", "title",
            "original_poster", "original_post", "comment_id", "comment_user" and
            "delta_comments" (a dict with a "text" entry). Missing keys become "".
    returns:
        xml.etree.ElementTree.ElementTree with root <thread>, holding one
        <submission> and one <comment delta="True"> element.
    raises:
        ValueError: if `submission_id` has no "_<index>" suffix.
    """
    def _as_text(value):
        # ElementTree can only serialize strings; a missing value (None / float NaN,
        # e.g. a deleted author) would otherwise make tree.write() raise TypeError.
        if value is None or (isinstance(value, float) and math.isnan(value)):
            return ""
        return str(value)

    # Split off only the trailing "_<index>" suffix.
    post_id, index = submission_id.rsplit('_', 1)

    thread = ET.Element("thread")

    submission = ET.SubElement(thread, "submission", {
        "id": str(post_id),
        "index": str(index),
        "link": _as_text(data.get("link", ""))
    })

    for tag in ("title", "original_poster", "original_post"):
        ET.SubElement(submission, tag).text = _as_text(data.get(tag, ""))

    comment = ET.SubElement(thread, "comment", {
        "id": str(data.get("comment_id", "")),
        "user": _as_text(data.get("comment_user", "")),
        "delta": str(True)
    })

    ET.SubElement(comment, "text").text = _as_text(data.get("delta_comments", {}).get("text", ""))

    return ET.ElementTree(thread)

In [None]:
def indent(elem, level=0):
    """Pretty-print helper: add newline/indent whitespace to `elem` in place.

    Blank (None, empty or whitespace-only) `text`/`tail` values are replaced by a
    newline followed by two spaces per nesting level; non-blank values are kept.
    """
    pad = "\n" + "  " * level

    def is_blank(value):
        return not value or not value.strip()

    if len(elem) == 0:
        # Leaf element: only the tail is touched, and never for the top-level call.
        if level and is_blank(elem.tail):
            elem.tail = pad
        return

    if is_blank(elem.text):
        elem.text = pad + "  "
    for child in elem:
        indent(child, level + 1)
    if is_blank(elem.tail):
        elem.tail = pad

In [None]:
def process_conversations(conversations, output_dir):
    """
    Write every conversation to its own pretty-printed XML file.

    params:
        conversations (dict): maps "<submission id>_<index>" keys to conversation records,
        output_dir (str): target folder; created if it does not exist yet

    One file named "<key>_direct_comment_delta.xml" is written per entry and a
    confirmation line is printed for each.
    """
    os.makedirs(output_dir, exist_ok=True)

    for submission_id in conversations:
        xml_tree = conversation_to_xml(submission_id, conversations[submission_id])
        indent(xml_tree.getroot())
        xml_tree.write(
            os.path.join(output_dir, f"{submission_id}_direct_comment_delta.xml"),
            encoding="utf-8",
            xml_declaration=True,
        )
        print(f"Wrote XML: {submission_id}")

In [None]:
# Write one XML file per direct-delta conversation into <cwd>/direct_deltas/.
output_folder = f"{cwd}/direct_deltas/"
process_conversations(conversations, output_folder)

Wrote XML: 78227727.0_1
Wrote XML: 78227727.0_2
Wrote XML: 80229182.0_1
Wrote XML: 84617375.0_1
Wrote XML: 87228342.0_1
Wrote XML: 87228342.0_2
Wrote XML: 93803852.0_1
Wrote XML: 95163485.0_1
Wrote XML: 95220169.0_1
Wrote XML: 96123581.0_1
Wrote XML: 99329272.0_1
Wrote XML: 99329272.0_2
Wrote XML: 104870418.0_1
Wrote XML: 112092849.0_1
Wrote XML: 126209817.0_1
Wrote XML: 134214340.0_1
Wrote XML: 142721845.0_1
Wrote XML: 145491128.0_1
Wrote XML: 154839924.0_1
Wrote XML: 154839924.0_2
Wrote XML: 154839924.0_3
Wrote XML: 154839924.0_4
Wrote XML: 156866222.0_1
Wrote XML: 172456552.0_1
Wrote XML: 173157689.0_1
Wrote XML: 173572831.0_1
Wrote XML: 173814707.0_1
Wrote XML: 197802628.0_1
Wrote XML: 197802628.0_2
Wrote XML: 199744342.0_1
Wrote XML: 200394744.0_1
Wrote XML: 207157398.0_1
Wrote XML: 207157398.0_2
Wrote XML: 207157398.0_3
Wrote XML: 208058488.0_1
Wrote XML: 209099690.0_1
Wrote XML: 211207339.0_1
Wrote XML: 248557201.0_1
Wrote XML: 248557201.0_2
Wrote XML: 248893922.0_1
Wrote XML: 2

In [None]:
print(len(conversations)) # number of delta-awarded direct comments kept as conversations

354


In [None]:
# Length statistics per direct-delta conversation (character counts for the texts).
# A direct-delta conversation always holds exactly one comment, so its thread length is 1.
# (The previous `len(thread['delta_comments'])` counted the keys of the {'text': ...} dict,
# which is 1 only by coincidence.)
# Non-string texts (e.g. NaN for an empty post) count as length 0 instead of raising.
direct_deltas_info = {
    submission_id: {
        "thread_length": 1,
        "original_post_length": (len(thread['original_post'])
                                 if isinstance(thread['original_post'], str) else 0),
        "comment_length": (len(thread['delta_comments']['text'])
                           if isinstance(thread['delta_comments']['text'], str) else 0),
    }
    for submission_id, thread in conversations.items()
}

In [None]:
# Summary statistics over the direct-delta conversations.
# The Series methods are called directly: passing NumPy callables such as np.mean to
# Series.aggregate emits a FutureWarning in recent pandas versions.
df_direct = pd.DataFrame.from_dict(direct_deltas_info, orient="index")
avg_length_direct_threads = round(float(df_direct["thread_length"].mean()), 2)
med_length_direct_threads = int(df_direct["thread_length"].median())
max_length_direct_threads = int(df_direct["thread_length"].max())
avg_op_direct_threads = round(float(df_direct["original_post_length"].mean()), 2)
avg_comms_direct_threads = round(float(df_direct["comment_length"].mean()), 2)

  avg_length_direct_threads = round(float(df_direct["thread_length"].aggregate(np.mean)),2)
  med_length_direct_threads = int(df_direct["thread_length"].aggregate(np.median))
  max_length_direct_threads = int(df_direct["thread_length"].aggregate(np.max))
  avg_op_direct_threads = round(float(df_direct["original_post_length"].aggregate(np.mean)),2)
  avg_comms_direct_threads = round(float(df_direct["comment_length"].aggregate(np.mean)),2)


In [None]:
# Order: mean thread length, max thread length, median thread length,
# mean original-post length, mean comment length.
print(avg_length_direct_threads, max_length_direct_threads, med_length_direct_threads, avg_op_direct_threads, avg_comms_direct_threads)

1.0 1 1 2019.09 1268.26


In [None]:
# Persist the per-conversation length statistics next to the notebook.
df_direct.to_csv(f"{cwd}/info_direct_deltas_May17.csv",
          index_label=None)

## 4. Reconstructing conversation trees where deltas were awarded somewhere in the threads

In [None]:
def collect_delta_comms(delta_comments_df):
  """Collect the delta-awarded comments that are replies to another comment.

  params:
      delta_comments_df (pd.DataFrame): delta comments with the columns 'id', 'link_id',
          'parent_comment_id', 'delta', 'author' and 'body'.
  returns:
      dict: comment id -> record with the keys 'submission_id', 'parent_id', 'delta',
          'depth', 'user', 'comment_id' and 'text'. Rows without a parent comment
          (direct replies to the submission) are skipped.
  """
  if delta_comments_df.empty:
    return {}

  nested = delta_comments_df[delta_comments_df['parent_comment_id'].notna()]

  return {
      comment['id']: {'submission_id': comment['link_id'],
                      'parent_id': comment['parent_comment_id'],
                      'delta': comment['delta'], # should be True
                      'depth': 0, # the delta-awarded comment is presumably the last one in a thread
                      'user': comment['author'],
                      'comment_id': comment['id'],
                      'text': comment['body']}
      for _, comment in nested.iterrows()
  }

In [None]:
# Delta-awarded comments that reply to another comment (i.e. sit deeper in a thread).
delta_comment_dict = collect_delta_comms(delta_comments_df)

In [None]:
# Number of delta-awarded comments that are not direct replies to a submission.
len(delta_comment_dict)

275

In [None]:
def build_conversation_threads(delta_comment_dict, comments_df, submissions_df, verbose=False):
  """Rebuild, for every delta-awarded comment, the chain of comments leading up to it.

  params:
      delta_comment_dict (dict): comment id -> record with the keys 'submission_id',
          'parent_id', 'depth', 'user', 'delta' and 'text',
      comments_df (pd.DataFrame): comments used to look up the ancestors (by 'id'),
      submissions_df (pd.DataFrame): submissions, passed on to extract_submission,
      verbose (bool): print each delta comment id while processing (off by default)
  returns:
      list of single-entry dicts {submission_id: {'link', 'title', 'original_poster',
      'original_post', 'thread'}}. 'thread' is a list of
      (depth, comment_id, user, delta, text) tuples starting with the delta comment
      (depth 0) and walking up its ancestors (depth 1, 2, ...).

  The upward walk stops when a comment has no parent (NaN), when the parent is not in
  `comments_df`, when the parent belongs to a different submission, or when a comment id
  repeats. The previous version looped forever or raised IndexError in those cases.
  """
  # One set lookup per step instead of scanning the whole id column each time.
  known_comment_ids = set(comments_df['id'].values)
  output_threads = []

  for comm, info in delta_comment_dict.items():
    if verbose:
      print(comm)

    submission_id = info['submission_id']
    title, original_post, link, original_poster = extract_submission(submission_id, submissions_df)

    thread = [(info['depth'], comm, info['user'], info['delta'], info['text'])]

    parent_id = info['parent_id']
    depth = 1
    visited = {comm}  # guards against cyclic parent references
    while pd.notna(parent_id) and parent_id in known_comment_ids and parent_id not in visited:
      visited.add(parent_id)
      comment_dict, _, depth = extract_comment(parent_id, comments_df, depth)
      data = comment_dict[parent_id]
      if data['submission_id'] != submission_id:
        break
      thread.append((data['depth'], parent_id, data['user'], data['delta'], data['text']))
      parent_id = data['parent_id']

    output_threads.append({submission_id: {'link': link,
                                           'title': title,
                                           'original_poster': original_poster,
                                           'original_post': original_post,
                                           'thread': thread}})
  return output_threads

In [None]:
def extract_submission(submission_id, submission_df):
  """Look up a submission and return its main fields.

  params:
      submission_id: value to match against the 'id' column,
      submission_df (pd.DataFrame): submissions with the columns 'id', 'title',
          'selftext', 'permalink' and 'author'
  returns:
      tuple: (title, original_post, link, original_poster) of the first matching row.
  raises:
      KeyError: if no row has this id. (The previous version retried forever in that
          case, and it read the global `submissions_df` instead of its own argument.)
  """
  matches = submission_df[submission_df['id'] == submission_id]
  if matches.empty:
    raise KeyError(f"submission {submission_id!r} not found in submission_df")

  submission = matches.iloc[0]
  return (submission['title'], submission['selftext'],
          submission['permalink'], submission['author'])

In [None]:
def extract_comment(comment_id, comments_df, i):
  """Fetch one comment by id and package it as a single-entry dict.

  params:
      comment_id: value to match against the 'id' column,
      comments_df (pd.DataFrame): comments table,
      i (int): depth to record for this comment
  returns:
      tuple: ({comment_id: record}, raw 'parent_comment_id' value, i + 1), where record
      has the keys 'submission_id', 'parent_id', 'user', 'delta', 'depth' and 'text'.
      A missing author is stored as an empty string.
  """
  row = comments_df.loc[comments_df['id'] == comment_id].iloc[0]
  author = row['author']

  record = {
      'submission_id': row['link_id'],
      'parent_id': float(row['parent_comment_id']),
      'user': author if pd.notna(author) else '',
      'delta': bool(row['delta']),
      'depth': i,
      'text': row['body'],
  }

  return {comment_id: record}, row['parent_comment_id'], i + 1

In [None]:
# NOTE: `submissions_df` is still the delta-only subset produced in section 2 at this point.
result = build_conversation_threads(delta_comment_dict, comments_df, submissions_df) # a list of threads, each thread is a dict

26643733268.0
26643775105.0
26686699717.0
26687698517.0
26740134444.0
26750979253.0
26873158216.0
26872984537.0
26908571076.0
27011089247.0
27018164445.0
27040111048.0
27134184706.0
27218962908.0
27220947540.0
27321547339.0
27321651285.0
27321932295.0
27369937081.0
27422318450.0
27422397933.0
27422447719.0
27422700831.0
27441627715.0
27473589748.0
27499349401.0
27506495783.0
27679008903.0
27679123596.0
27842818277.0
27848977344.0
27922510893.0
27932979967.0
28006828121.0
28036829628.0
28063196573.0
28072831886.0
28063561003.0
28078399630.0
28157136344.0
28198082495.0
28257923699.0
28288171155.0
28351918789.0
28421521192.0
28568802844.0
28580591781.0
28662264557.0
28697392869.0
28898238598.0
29005966665.0
29005971671.0
29005992409.0
29005997443.0
29125014980.0
29125066011.0
29126683568.0
29127230711.0
29215079927.0
29215092994.0
29217859451.0
29251021378.0
29272818934.0
29322755514.0
29427799985.0
29427867599.0
29427958090.0
29428338295.0
29433097574.0
29493401402.0
29520323059.0
295203

In [None]:
print(len(result)) # one reconstructed thread per delta-awarded comment (275 in this sample)

275


In [None]:
def conversation_to_xml_deltathreads(submission_id, data):
    """Build the XML tree for a submission and the comment chain ending in a delta comment.

    params:
        submission_id (str): key of the form "<submission id>_<index>",
        data (dict): record with 'thread' (list of (depth, comment_id, user, delta, text)
            tuples) and optionally 'link', 'title', 'original_poster', 'original_post'
    returns:
        tuple: (ElementTree with root <thread>, average comment length). Comments are
        emitted from the highest depth down to depth 0. The average is the length of all
        comment texts joined by single spaces divided by the number of comments, rounded
        to 2 decimals; it is 0.0 for an empty thread.
    raises:
        ValueError: if `submission_id` has no "_<index>" suffix.
    """
    def _as_text(value):
        # Missing submission fields (None / float NaN) cannot be serialized by ElementTree.
        if value is None or (isinstance(value, float) and math.isnan(value)):
            return ""
        return str(value)

    post_id, index = submission_id.rsplit('_', 1)
    threaded = data['thread']

    thread = ET.Element("thread")
    submission = ET.SubElement(thread, "submission", {
        "id": str(post_id),
        "index": str(index),
        "link": _as_text(data.get("link", ""))
    })
    for tag in ("title", "original_poster", "original_post"):
        ET.SubElement(submission, tag).text = _as_text(data.get(tag, ""))

    comments = []
    for turn in sorted(threaded, key=lambda x: x[0], reverse=True):
        # str() keeps the length statistic consistent with the text written to the XML
        # and avoids a TypeError in join() for non-string bodies.
        text = str(turn[4])
        comments.append(text)

        comment = ET.SubElement(thread, "comment", {
            "id": str(turn[1]),
            "user": str(turn[2]),
            "delta": str(turn[3])
        })
        ET.SubElement(comment, "text").text = text

    # Guard against an empty thread instead of dividing by zero.
    comment_length = round(len(" ".join(comments)) / len(threaded), 2) if threaded else 0.0

    return ET.ElementTree(thread), comment_length

In [None]:
# Write one XML file per reconstructed delta thread and collect length statistics.
output_dir = f"{cwd}/delta_threads/"
# ElementTree.write() does not create missing folders, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)
saved_submissions= []
delta_threads_comments_id = []
delta_threads_info = {}

for exchange in result:
  # Each exchange is a single-entry dict: {submission_id: data}.
  submission_id = list(exchange.keys())[0]
  data = exchange[submission_id]
  thread = data['thread']

  # Running index of this thread within its submission (1-based).
  i = saved_submissions.count(submission_id) + 1
  saved_submissions.append(submission_id)
  submission_name = str(submission_id) + "_" + str(i) # for the filename


  tree, comments_length = conversation_to_xml_deltathreads(submission_name, data)


  delta_threads_info[submission_name] = {"thread_length": len(thread),
                                         "original_post_length": len(data['original_post']),
                                         "comments_avg_length": comments_length}

  indent(tree.getroot())
  outfile = os.path.join(output_dir, f"{submission_name}_delta_threads.xml")
  tree.write(outfile, encoding="utf-8", xml_declaration=True)
  print(f"Wrote XML: {submission_name}")


Wrote XML: 76196414.0_1
Wrote XML: 76196414.0_2
Wrote XML: 80733377.0_1
Wrote XML: 80733377.0_2
Wrote XML: 86147245.0_1
Wrote XML: 87228342.0_1
Wrote XML: 99290486.0_1
Wrote XML: 99329272.0_1
Wrote XML: 102987767.0_1
Wrote XML: 112874326.0_1
Wrote XML: 113567594.0_1
Wrote XML: 115882088.0_1
Wrote XML: 125791631.0_1
Wrote XML: 134214340.0_1
Wrote XML: 134470458.0_1
Wrote XML: 144404372.0_1
Wrote XML: 144404372.0_2
Wrote XML: 144404372.0_3
Wrote XML: 149345306.0_1
Wrote XML: 154839924.0_1
Wrote XML: 154839924.0_2
Wrote XML: 154839924.0_3
Wrote XML: 154839924.0_4
Wrote XML: 156866222.0_1
Wrote XML: 160170780.0_1
Wrote XML: 162950857.0_1
Wrote XML: 163717378.0_1
Wrote XML: 181959121.0_1
Wrote XML: 181959121.0_2
Wrote XML: 199744342.0_1
Wrote XML: 200394744.0_1
Wrote XML: 208058488.0_1
Wrote XML: 209151241.0_1
Wrote XML: 217468142.0_1
Wrote XML: 220698466.0_1
Wrote XML: 223614694.0_1
Wrote XML: 223614694.0_2
Wrote XML: 223826952.0_1
Wrote XML: 225482448.0_1
Wrote XML: 234217718.0_1
Wrote XM

In [None]:
# Summary statistics over the delta threads.
# Series.mean() is called directly: passing np.mean to Series.aggregate emits a
# FutureWarning in recent pandas versions.
df = pd.DataFrame.from_dict(delta_threads_info, orient="index")
avg_length_delta_threads = round(float(df["thread_length"].mean()), 2)
avg_op_delta_threads = round(float(df["original_post_length"].mean()), 2)
avg_comms_delta_threads = round(float(df["comments_avg_length"].mean()), 2)

  avg_length_delta_threads = round(float(df["thread_length"].aggregate(np.mean)),2)
  avg_op_delta_threads = round(float(df["original_post_length"].aggregate(np.mean)),2)
  avg_comms_delta_threads = round(float(df["comments_avg_length"].aggregate(np.mean)),2)


In [None]:
# Persist the per-thread length statistics of the delta threads.
df.to_csv(f"{cwd}/info_delta_threads.csv",
          index_label=None)

In [None]:
# OLD VERSION WITH TXT FILES
# Archived code, disabled by wrapping it in a string literal (it is evaluated as a plain
# string and nothing inside runs).
# NOTE(review): the thread tuples built by build_conversation_threads are
# (depth, comment_id, user, delta, text), so `turn[2]` / `turn[3]` below would read the
# user and the delta flag, not the delta flag and the text. Fix the indices before reviving.
"""
filename = "_thread_delta.txt"
folder = f"{cwd}/threads/"
saved_submissions= []
thread_length = {}
delta_threads_comments_id = []

for exchange in result:
  submission_id = list(exchange.keys())[0]
  data = exchange[submission_id]
  thread = data['thread']

  i = saved_submissions.count(submission_id) + 1
  saved_submissions.append(submission_id)
  submission_name = str(submission_id) + "_" + str(i) # for the filename
  current_filename = folder + submission_name + filename

  comment = ""

  for turn in sorted(thread, key=lambda x: x[0], reverse=True):
    if submission_name not in thread_length:
      thread_length[submission_name] = len(thread)
    comment_id = turn[1]
    delta = turn[2]
    original_text = turn[3]
    delta_threads_comments_id.append(comment_id)
    comment += f"Comment ID: {comment_id}\nDelta: {delta}\n\nComment\n{original_text}\n\n"

  with open(current_filename, 'w', encoding="utf-8") as f:
    post = f"Submission ID: {submission_id} {data['link']}\nTitle: {data['title']}\n\nOriginal post\n{data['original_post']}\n\n"
    text = post + comment
    print(f"Submission ID: {submission_id}")
    f.write(text)
"""

## 5. Extracting no-delta threads

In [None]:
# Re-read the delta-thread comments and collect their ids; these are excluded below
# to obtain the comments that belong to threads without a delta.
comments_df = pd.read_csv(cmv_delta_threads, sep='\t') # delta threads
comms_in_delta_threads = comments_df['id'].to_list()

In [None]:
# Full comment sample (relative path, resolved against the working directory).
all_comments = 'cmw_comments_sample_1.tsv'
all_comments_df = pd.read_csv(all_comments, sep='\t')

In [None]:
# Total number of comments in the sample.
len(all_comments_df)

47447

In [None]:
# Keep the comments whose id does not appear in the delta-thread comments table.
delta_less_comments_df = all_comments_df[~all_comments_df['id'].isin(comms_in_delta_threads)]

In [None]:
# Mark every remaining comment as not delta-awarded. `.assign` returns a new frame, which
# avoids the SettingWithCopyWarning raised when assigning a column on a filtered slice.
delta_less_comments_df = delta_less_comments_df.assign(delta=False)

In [None]:
# Reload the full submissions table (section 2 narrowed `submissions_df` to delta submissions).
submissions_df = pd.read_csv(cmv_delta_submissions, sep='\t')

In [None]:
def collect_deltaless_comms(deltaless_comments_df):
  """Index the comments of delta-less threads, skipping '[deleted]' bodies.

  params:
      deltaless_comments_df (pd.DataFrame): comments with the columns 'id', 'link_id',
          'parent_comment_id', 'author', 'delta' and 'body'
  returns:
      tuple: (comment_dict, comments_info)
        - comment_dict: comment id -> {'submission_id', 'parent_id', 'user', 'delta',
          'depth', 'text'}. Direct replies (no parent comment) get the submission id as
          parent and depth 0; all other comments keep their parent comment id and get
          depth -1 (unknown).
        - comments_info: list of (comment_id, link_id, parent_comment_id, author, delta,
          depth, body) tuples in row order.
  """
  records_by_id = {}
  flat_records = []

  for _, row in deltaless_comments_df.iterrows():
    if row['body'] == '[deleted]':
      continue

    cid = row['id']
    is_direct_reply = pd.isna(row['parent_comment_id'])
    depth = 0 if is_direct_reply else -1
    # Direct replies hang off the submission itself.
    parent = row['link_id'] if is_direct_reply else row['parent_comment_id']

    flat_records.append((cid, row['link_id'], row['parent_comment_id'],
                         row['author'], row['delta'], depth, row['body']))

    records_by_id[cid] = {'submission_id': row['link_id'],
                          'parent_id': parent,
                          'user': row['author'],
                          'delta': row['delta'], # should be False
                          'depth': depth,
                          'text': row['body']}

  return records_by_id, flat_records

In [None]:
# Index of all non-deleted comments from threads without a delta, plus the flat tuple list.
all_comment_dict, all_comment_info = collect_deltaless_comms(delta_less_comments_df)

In [None]:
from collections import defaultdict

# Group the (comment id, record) pairs by the submission they belong to.
submissions = defaultdict(list)
for entry in all_comment_dict.items():
  submissions[entry[1]['submission_id']].append(entry)

In [None]:
def build_threads(submissions):
    """Turn flat per-submission comment lists into nested reply trees.

    params:
        submissions (dict): submission id -> list of (comment_id, record) pairs, where
            each record has the keys 'parent_id', 'text', 'user', 'delta' and 'depth'
    returns:
        dict: submission id -> list of trees, one per root comment (a comment whose
        'parent_id' equals the submission id). Each node is a dict with the keys 'id',
        'text', 'user', 'delta', 'depth' and 'children' (nested nodes, in input order).
    """
    threads_by_submission = {}

    for submission_id, comment_list in submissions.items():
        # comment id -> record, and parent id -> ids of its replies
        lookup = dict(comment_list)
        replies = defaultdict(list)
        for cid, record in comment_list:
            replies[record['parent_id']].append(cid)

        def build_tree(comment_id):
            record = lookup[comment_id]
            node = {'id': comment_id}
            for field in ('text', 'user', 'delta', 'depth'):
                node[field] = record[field]
            node['children'] = [build_tree(reply_id)
                                for reply_id in replies.get(comment_id, [])]
            return node

        # Root comments are the ones that reply to the submission itself.
        threads_by_submission[submission_id] = [
            build_tree(cid)
            for cid, record in comment_list
            if record['parent_id'] == submission_id
        ]

    return threads_by_submission

In [None]:
# Nested reply trees for every submission, built from the delta-less comments.
non_delta_threads = build_threads(submissions)

In [None]:
def dfs(flat, node, depth):
    """Flatten a reply tree depth-first (pre-order) into `flat` and return it.

    params:
        flat (list): accumulator; entries are appended in place,
        node (dict): tree node with the keys 'id', 'user', 'delta', 'text' and
            optionally 'children' (list of nodes),
        depth (int): depth to record for `node`
    returns:
        list: `flat`, extended with ([depth, id, user, delta], text) entries.
    """
    flat.append(([depth, node['id'], node['user'], node['delta']], node['text']))

    for child in node.get('children', []):
        # Every child sits exactly one level below its parent. The previous version
        # incremented `depth` inside the loop, so later siblings were recorded deeper
        # than earlier ones.
        dfs(flat, child, depth + 1)
    return flat

In [None]:
# Flatten every reply tree and attach the submission fields; one entry per root comment,
# keyed "<submission id>_<n>" with n counting the root comments of that submission.
output_non_delta_threads = []

for submission_id, comment_threads in non_delta_threads.items():
  if math.isnan(submission_id):
    # comments without a submission id cannot be matched to a post
    continue

  title, original_post, link, original_poster = extract_submission(submission_id, submissions_df)

  for i, thread in enumerate(comment_threads, 1):
    submission_name = f"{submission_id}_{i}"
    thread_comms = dfs([], thread, 0)
    to_write = {submission_name: {'title': title,
                                  'original_poster': original_poster,
                                  'original_post': original_post,
                                  'link': link,
                                  'thread': thread_comms}}
    output_non_delta_threads.append(to_write)

In [None]:
# Number of delta-less threads (one per root comment).
print(len(output_non_delta_threads))

8482


In [None]:
# Inspect the first reconstructed thread (prints the full record, including all comment texts).
output_non_delta_threads[0]

{'75326877.0_1': {'title': "I think non-violent criminals shouldn't be sent to prison. CMV?",
  'original_poster': 'dichotomie',
  'original_post': "I think anything that isn't stuff like murder, rape, arson, robbery (but not burglary) or the attempt or intention to do those things shouldn't be given prison time right off the bat.\n\nI think there should be some sort of three strike system where you could be fined, put on house arrest, or probation before receiving jail time. I think this could be a better system because we wouldn't be mixing relatively okay people with career criminals and it would be less of a strain on our prison system which is already overcrowded due to things like mandatory minimums and the war on drugs.\n\nThis is something I just thought about on a whim so I'd like to hear the negatives and why something like this wouldn't work in practice.",
  'link': 'https://www.reddit.com//r/changemyview/comments/18uil9/i_think_nonviolent_criminals_shouldnt_be_sent_to/',
  

In [None]:
def conversation_to_xml_deltaless(submission_id, data):
    """Build the XML tree for a submission and one flattened delta-less thread.

    params:
        submission_id (str): key of the form "<submission id>_<index>",
        data (dict): record with 'thread' (list of ([depth, comment_id, user, delta], text)
            entries) and optionally 'link', 'title', 'original_poster', 'original_post'
    returns:
        tuple: (ElementTree with root <thread>, average comment length). Comments are
        emitted in the order of `data['thread']`. The average is the length of all
        comment texts joined by single spaces divided by the number of comments, rounded
        to 2 decimals; it is 0.0 for an empty thread.
    raises:
        ValueError: if `submission_id` has no "_<index>" suffix.
    """
    def _as_text(value):
        # Missing submission fields (None / float NaN) cannot be serialized by ElementTree.
        if value is None or (isinstance(value, float) and math.isnan(value)):
            return ""
        return str(value)

    post_id, index = submission_id.rsplit('_', 1)
    threaded = data['thread']

    thread = ET.Element("thread")
    submission = ET.SubElement(thread, "submission", {
        "id": str(post_id),
        "index": str(index),
        "link": _as_text(data.get("link", ""))
    })
    for tag in ("title", "original_poster", "original_post"):
        ET.SubElement(submission, tag).text = _as_text(data.get(tag, ""))

    comments = []
    for turn in threaded:
        # str() keeps the length statistic consistent with the text written to the XML
        # and avoids a TypeError in join() for non-string bodies.
        text = str(turn[1])
        comments.append(text)

        comment = ET.SubElement(thread, "comment", {
            "id": str(turn[0][1]),
            "user": str(turn[0][2]),
            "delta": str(turn[0][3])
        })
        ET.SubElement(comment, "text").text = text

    # Guard against an empty thread instead of dividing by zero.
    comment_length = round(len(" ".join(comments)) / len(threaded), 2) if threaded else 0.0

    return ET.ElementTree(thread), comment_length


In [None]:
# Write one XML file per delta-less thread and collect length statistics.
output_dir = f"{cwd}/deltaless_threads/"
# ElementTree.write() does not create missing folders, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)
deltaless_threads_info = {}

for exchange in output_non_delta_threads:
  # Each exchange is a single-entry dict: {"<submission id>_<n>": data}.
  submission_id = list(exchange.keys())[0]
  data = exchange[submission_id]
  thread = data['thread']

  tree, comments_length = conversation_to_xml_deltaless(submission_id, data)


  deltaless_threads_info[submission_id] = {"thread_length": len(thread),
                                         "original_post_length": len(data['original_post']),
                                         "comments_avg_length": comments_length}

  indent(tree.getroot())
  outfile = os.path.join(output_dir, f"{submission_id}_deltaless_thread.xml")
  tree.write(outfile, encoding="utf-8", xml_declaration=True)
  print(f"Wrote XML: {submission_id}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Wrote XML: 565121219.0_96
Wrote XML: 565121219.0_97
Wrote XML: 565121219.0_98
Wrote XML: 565121219.0_99
Wrote XML: 565121219.0_100
Wrote XML: 565121219.0_101
Wrote XML: 565121219.0_102
Wrote XML: 565121219.0_103
Wrote XML: 565121219.0_104
Wrote XML: 567468370.0_1
Wrote XML: 567468370.0_2
Wrote XML: 567468370.0_3
Wrote XML: 567468370.0_4
Wrote XML: 567468370.0_5
Wrote XML: 567468370.0_6
Wrote XML: 572887756.0_1
Wrote XML: 572887756.0_2
Wrote XML: 572887756.0_3
Wrote XML: 572887756.0_4
Wrote XML: 572887756.0_5
Wrote XML: 572887756.0_6
Wrote XML: 572887756.0_7
Wrote XML: 572887756.0_8
Wrote XML: 572887756.0_9
Wrote XML: 577631717.0_1
Wrote XML: 577631717.0_2
Wrote XML: 577631717.0_3
Wrote XML: 577631717.0_4
Wrote XML: 577631717.0_5
Wrote XML: 577631717.0_6
Wrote XML: 577631717.0_7
Wrote XML: 577631717.0_8
Wrote XML: 577631717.0_9
Wrote XML: 577631717.0_10
Wrote XML: 577631717.0_11
Wrote XML: 577631717.0_12
Wrote XML: 5851983

In [None]:
# OLD VERSION TXT
# Archived code, disabled by wrapping it in a string literal (it is evaluated as a plain
# string and nothing inside runs).
# NOTE(review): dfs() stores [depth, id, user, delta] as the first element of each turn,
# so `turn[0][2]` below would read the user, not the delta flag. Fix the index before reviving.

"""
filename = "_deltaless_thread.txt"
folder = f"{cwd}/deltaless_threads/"

thread_length = {}

for exchange in output_non_delta_threads:
  submission_id = list(exchange.keys())[0]
  data = exchange[submission_id]
  thread = data['thread']

  current_filename = folder + submission_id + filename

  comment = ""
  for turn in thread:
    comment_id = turn[0][1]
    delta = turn[0][2]
    original_text = turn[1]
    #print(comment_id, delta)
    comment += f"Comment ID: {comment_id}\nDelta: {delta}\n\nComment\n{original_text}\n\n"
    depth = turn[0][0]

  thread_length[submission_id] = depth + 1

  with open(current_filename, 'w', encoding="utf-8") as f:
    post = f"Submission ID: {submission_id} {data['link']}\nTitle: {data['title']}\n\nOriginal post\n{data['original_post']}\n\n"
    text = post + comment
    print(f"Submission ID: {submission_id}")
    f.write(text)
"""

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Submission ID: 607392345.0_58
Submission ID: 607392345.0_59
Submission ID: 607392345.0_60
Submission ID: 607392345.0_61
Submission ID: 607392345.0_62
Submission ID: 607392345.0_63
Submission ID: 607392345.0_64
Submission ID: 607392345.0_65
Submission ID: 607392345.0_66
Submission ID: 607392345.0_67
Submission ID: 607392345.0_68
Submission ID: 607392345.0_69
Submission ID: 607392345.0_70
Submission ID: 607392345.0_71
Submission ID: 607392345.0_72
Submission ID: 607392345.0_73
Submission ID: 607392345.0_74
Submission ID: 607392345.0_75
Submission ID: 607392345.0_76
Submission ID: 607392345.0_77
Submission ID: 607392345.0_78
Submission ID: 607392345.0_79
Submission ID: 607392345.0_80
Submission ID: 607392345.0_81
Submission ID: 607392345.0_82
Submission ID: 607392345.0_83
Submission ID: 607392345.0_84
Submission ID: 608332769.0_1
Submission ID: 608332769.0_2
Submission ID: 608332769.0_3
Submission ID: 608332769.0_4
Submissio

In [None]:
# Summary statistics over the delta-less threads.
# The Series methods are called directly: passing NumPy callables such as np.mean to
# Series.aggregate emits a FutureWarning in recent pandas versions.
df2 = pd.DataFrame.from_dict(deltaless_threads_info, orient="index")
avg_length_deltaless_threads = round(float(df2["thread_length"].mean()), 2)
med_length_deltaless_threads = int(df2["thread_length"].median())
max_length_deltaless_threads = int(df2["thread_length"].max())
avg_op_deltaless_threads = round(float(df2["original_post_length"].mean()), 2)
avg_comms_deltaless_threads = round(float(df2["comments_avg_length"].mean()), 2)

  avg_length_deltaless_threads = round(float(df2["thread_length"].aggregate(np.mean)),2)
  med_length_deltaless_threads = int(df2["thread_length"].aggregate(np.median))
  max_length_deltaless_threads = int(df2["thread_length"].aggregate(np.max))
  avg_op_deltaless_threads = round(float(df2["original_post_length"].aggregate(np.mean)),2)
  avg_comms_deltaless_threads = round(float(df2["comments_avg_length"].aggregate(np.mean)),2)


In [None]:
# Order: mean thread length, max thread length, median thread length,
# mean original-post length, mean of the per-thread average comment length.
print(avg_length_deltaless_threads, max_length_deltaless_threads, med_length_deltaless_threads, avg_op_deltaless_threads, avg_comms_deltaless_threads)

4.14 603 1 1766.03 580.04


In [None]:
# Same statistics for the `df2_long` subset of the delta-less threads.
# NOTE(review): `df2_long` is not defined in any visible cell of this notebook, so this cell
# raises NameError after Restart & Run All. It presumably is a length-filtered subset of
# `df2` -- TODO: add the defining cell (the exact filter cannot be recovered from here).
# The Series methods are called directly: passing NumPy callables such as np.mean to
# Series.aggregate emits a FutureWarning in recent pandas versions.
avg_length_deltaless2_threads = round(float(df2_long["thread_length"].mean()), 2)
med_length_deltaless2_threads = int(df2_long["thread_length"].median())
max_length_deltaless2_threads = int(df2_long["thread_length"].max())
avg_op_deltaless2_threads = round(float(df2_long["original_post_length"].mean()), 2)
avg_comms_deltaless2_threads = round(float(df2_long["comments_avg_length"].mean()), 2)

  avg_length_deltaless2_threads = round(float(df2_long["thread_length"].aggregate(np.mean)),2)
  med_length_deltaless2_threads = int(df2_long["thread_length"].aggregate(np.median))
  max_length_deltaless2_threads = int(df2_long["thread_length"].aggregate(np.max))
  avg_op_deltaless2_threads = round(float(df2_long["original_post_length"].aggregate(np.mean)),2)
  avg_comms_deltaless2_threads = round(float(df2_long["comments_avg_length"].aggregate(np.mean)),2)


In [None]:
# Order: mean thread length, max thread length, median thread length,
# mean original-post length, mean of the per-thread average comment length.
print(avg_length_deltaless2_threads, max_length_deltaless2_threads, med_length_deltaless2_threads, avg_op_deltaless2_threads, avg_comms_deltaless2_threads)

8.41 603 4 1911.67 517.83


In [None]:
# Persist the per-thread length statistics of the delta-less threads.
df2.to_csv(f"{cwd}/info_deltaless_threads.csv",
          index_label=None)