In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import os
import glob
import json
import re
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

In [None]:
# Lexical similarity: TF-IDF vectors compared with cosine similarity.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Shared module-level vectorizer; lexical_sim() refits it on each pair of texts.
vectorizer = TfidfVectorizer()

In [None]:
# Semantic similarity: sentence embeddings compared with cosine similarity.

from sentence_transformers import SentenceTransformer
import numpy as np
# Loaded once at module level; semantic_sim() calls model.encode() on each pair of texts.
model = SentenceTransformer('all-MiniLM-L6-v2') # sbert

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# for detecting questions: sent_tokenize splits comment text into sentences
import re
import nltk
nltk.download('punkt_tab')  # Punkt sentence-tokenizer data used by sent_tokenize
from nltk.tokenize import sent_tokenize
nltk.download('stopwords')  # corpus backing nltk.corpus.stopwords
from nltk.corpus import stopwords  # in the visible notebook, only the disabled clean_text variant refers to it

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Lexical similarity (TF-IDF cosine similarity)

In [None]:
def lexical_sim(text1, text2):
  """Return the TF-IDF cosine similarity between two texts.

  The shared module-level ``vectorizer`` is refit on just these two texts, so
  the IDF weights reflect only this pair, not a wider corpus.

  Args:
    text1: First text.
    text2: Second text.

  Returns:
    Cosine similarity of the two TF-IDF vectors, or 0.0 when neither text
    contains a token the vectorizer keeps (for example, both texts are empty
    after cleaning).

  Raises:
    ValueError: Re-raised from the vectorizer for any problem other than an
      empty vocabulary.
  """
  try:
    tfidf_matrix = vectorizer.fit_transform([text1, text2])
  except ValueError as err:
    # scikit-learn signals "no usable tokens in any document" with a
    # ValueError mentioning an empty vocabulary; two token-less texts share
    # no lexical content, so score them 0.0 instead of aborting the whole run.
    if "empty vocabulary" in str(err):
      return 0.0
    raise
  return cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]

## Semantic similarity (embeddings)

In [None]:
def semantic_sim(text1, text2):
  """Return the cosine similarity between the embeddings of two texts.

  Both texts are encoded in a single call to the module-level ``model``; the
  score is the dot product of the two embeddings divided by the product of
  their Euclidean norms.

  Args:
    text1: First text.
    text2: Second text.

  Returns:
    The cosine similarity of the two embedding vectors.
  """
  first_vec, second_vec = model.encode([text1, text2])
  norm_product = np.linalg.norm(first_vec) * np.linalg.norm(second_vec)
  return np.dot(first_vec, second_vec) / norm_product

## Quote vs new content

In [None]:
def compute_quote_ratio(comment_text):
    """Separate quoted lines from newly written lines and measure the new share.

    A line counts as quoted when, once surrounding whitespace is stripped, it
    starts with the escaped quote marker ``&gt;``. All other lines (including
    blank ones) count as the commenter's own text.

    Args:
        comment_text: Full comment text, lines separated by ``\\n``.

    Returns:
        A 4-tuple ``(ratio, original_text, len_quoted, len_original)``:
        the proportion of whitespace-separated words that are not quoted
        (0 when the comment has no words at all), the non-quoted lines joined
        with single spaces and stripped, the quoted word count, and the
        non-quoted word count.
    """
    quoted_lines, fresh_lines = [], []
    for raw_line in comment_text.split('\n'):
        is_quote = raw_line.strip().startswith('&gt;')  # escaped ">" marker
        (quoted_lines if is_quote else fresh_lines).append(raw_line)

    original_text = ' '.join(fresh_lines).strip()
    len_quoted = len(' '.join(quoted_lines).strip().split())
    len_original = len(original_text.split())

    word_total = len_quoted + len_original
    if word_total > 0:
        ratio = len_original / word_total
    else:
        # No words at all: report 0 rather than dividing by zero.
        ratio = 0

    return ratio, original_text, len_quoted, len_original

In [None]:
def count_questions_in_original(original_text):
    """Count the sentences of ``original_text`` that end with a question mark.

    The text is split into sentences with ``sent_tokenize``; a sentence counts
    as a question when, after stripping whitespace, its last character is ``?``.

    Args:
        original_text: Text to inspect.

    Returns:
        The number of sentences ending in ``?``.
    """
    question_count = 0
    for sentence in sent_tokenize(original_text):
        if sentence.strip().endswith('?'):
            question_count += 1
    return question_count

## File parsing

In [None]:
"""
STOPWORDS = set(stopwords.words("english"))

def clean_text(text, STOPWORDS):
  text = text.lower()
  text1 = re.sub(r'http\S+|www\.\S+', '', text)
  text1 = re.sub(r'u\/\w+|r\/\w+', '', text1)
  tokens = [word for word in text1.split() if word.isalpha() and word not in STOPWORDS]
  text1 = " ".join(tokens)
  if len(text1) > 0:
    return text1
  else:
    return text
"""

In [None]:
def clean_text(text):
  """Lowercase ``text`` and remove URLs and Reddit user/subreddit mentions.

  Args:
    text: Raw post or comment text.

  Returns:
    The lowercased text with links and ``u/name`` / ``r/name`` mentions
    removed. Surrounding whitespace is left as is, so removed items may leave
    double spaces behind.
  """
  text = text.lower()
  # Drop links: anything starting with "http" or "www." up to the next whitespace.
  text = re.sub(r'http\S+|www\.\S+', '', text)
  # Drop u/name and r/name mentions, with or without a leading slash.
  # The lookbehind stops the pattern from firing inside ordinary words
  # ("your/their" used to be cut down to "you"); "-" is accepted because
  # usernames may contain hyphens.
  text = re.sub(r'(?<!\w)/?[ur]/[\w-]+', '', text)
  return text

In [None]:
def process_thread(xml_content: str):
    """Extract the cleaned original post and cleaned comments from a thread XML.

    The document is expected to contain a ``<submission>`` element with an
    ``<original_post>`` child, and ``<comment>`` elements that each contain a
    ``<text>`` child. Every extracted text is stripped and passed through
    ``clean_text``.

    Args:
        xml_content: The thread XML as a string.

    Returns:
        A tuple ``(original_post, clean_comms)``: the cleaned post text and a
        list with the cleaned text of every ``<comment>`` found.
    """
    parsed = BeautifulSoup(xml_content, features="xml")

    post_node = parsed.find('submission').find('original_post')
    original_post = clean_text(post_node.text.strip())

    clean_comms = [
        clean_text(comment_node.find('text').text.strip())
        for comment_node in parsed.find_all('comment')
    ]

    return original_post, clean_comms

In [None]:
def analyze_thread(original_post, comment_texts):
    """Compute similarity and content metrics for each comment against its predecessor.

    The first comment is compared with the original post; every later comment
    is compared with the comment directly before it in ``comment_texts``.

    Args:
        original_post: Text of the thread's original post.
        comment_texts: Comment texts in thread order.

    Returns:
        A list with one dict per comment, in order. Each dict holds
        ``comment_id`` ("comment_1", "comment_2", ...), ``lexical_similarity``,
        ``semantic_similarity``, ``new_content_proportion``, ``len_quote``,
        ``len_original`` and ``question_count``. The list is empty when there
        are no comments.
    """
    # Reference text for each comment: the post for the first one, then the
    # preceding comment for all the others.
    references = [original_post, *comment_texts[:-1]]

    results = []
    for position, (reference, comment) in enumerate(zip(references, comment_texts), start=1):
        lexical = lexical_sim(reference, comment)
        semantic = semantic_sim(reference, comment)
        ratio, new_text, len_quote, len_original = compute_quote_ratio(comment)
        results.append({
            'comment_id': f"comment_{position}",
            'lexical_similarity': lexical,
            'semantic_similarity': semantic,
            'new_content_proportion': ratio,
            'len_quote': len_quote,
            'len_original': len_original,
            # Questions are counted only in the non-quoted part of the comment.
            'question_count': count_questions_in_original(new_text),
        })

    return results

In [None]:
def analyze_thread_2(original_post, comment_texts):
    """Compute similarity and content metrics for each comment against the original post.

    Unlike ``analyze_thread``, every comment (not just the first) is compared
    with ``original_post``.

    Args:
        original_post: Text of the thread's original post.
        comment_texts: Comment texts in thread order.

    Returns:
        A list with one dict per comment, in order. Each dict holds
        ``comment_id`` ("comment_1", "comment_2", ...), ``lexical_similarity``,
        ``semantic_similarity``, ``new_content_proportion``, ``len_quote``,
        ``len_original`` and ``question_count``. The list is empty when there
        are no comments.
    """
    results = []

    # every comment vs the original post
    for position, comment in enumerate(comment_texts, start=1):
        lexical = lexical_sim(original_post, comment)
        semantic = semantic_sim(original_post, comment)
        ratio, new_text, len_quote, len_original = compute_quote_ratio(comment)
        results.append({
            'comment_id': f"comment_{position}",
            'lexical_similarity': lexical,
            'semantic_similarity': semantic,
            'new_content_proportion': ratio,
            'len_quote': len_quote,
            'len_original': len_original,
            # Questions are counted only in the non-quoted part of the comment.
            'question_count': count_questions_in_original(new_text),
        })

    return results

## Running the code

In [None]:
# Working directory of the kernel; the input folder and the output CSV path are built from it.
cwd = os.getcwd()

In [None]:
# Folder holding the sampled thread XML files, relative to the working directory.
xml_folder = f"{cwd}/100_sample/"
# glob returns matches in an arbitrary, filesystem-dependent order; sort them so
# the row order of the results (and of the exported CSV) is the same on every run.
xml_files = sorted(glob.glob(os.path.join(xml_folder, "*.xml")))

In [None]:
# Parse every thread file and score each of its comments against the original post.
all_results = []

for file in xml_files:
    with open(file, mode='r', encoding='utf-8') as f:
        op, comms = process_thread(f.read())
    all_results.append(
        {
            'file': os.path.basename(file),
            'similarity_metrics': analyze_thread_2(op, comms),
        }
    )



In [None]:
# Inspect the collected metrics: one dict per XML file, each holding a list of per-comment dicts.
# NOTE(review): this prints the whole nested list; consider slicing (e.g. all_results[:2]) for larger samples.
all_results

[{'file': '113567594.0_1_delta_threads.xml',
  'similarity_metrics': [{'comment_id': 'comment_1',
    'lexical_similarity': np.float64(0.48840128378835784),
    'semantic_similarity': np.float32(0.6072525),
    'new_content_proportion': 1.0,
    'len_quote': 0,
    'len_original': 360,
    'question_count': 0},
   {'comment_id': 'comment_2',
    'lexical_similarity': np.float64(0.44564922223342424),
    'semantic_similarity': np.float32(0.50172573),
    'new_content_proportion': 0.728,
    'len_quote': 34,
    'len_original': 91,
    'question_count': 0},
   {'comment_id': 'comment_3',
    'lexical_similarity': np.float64(0.37546164666506615),
    'semantic_similarity': np.float32(0.5452645),
    'new_content_proportion': 1.0,
    'len_quote': 0,
    'len_original': 215,
    'question_count': 0},
   {'comment_id': 'comment_4',
    'lexical_similarity': np.float64(0.21270756717870878),
    'semantic_similarity': np.float32(0.39616746),
    'new_content_proportion': 1.0,
    'len_quote':

## Transforming into a df

In [None]:
import pandas as pd

In [None]:
# Wide layout: one row per thread, with each comment's metrics stored under a
# "c<position>_" column prefix (c1_, c2_, ...).
flattened = []

for entry in all_results:
  flat_row = {'filename': entry['file']}
  for position, metrics in enumerate(entry['similarity_metrics'], start=1):
    flat_row.update({
      # The two similarity scores are numpy scalars; store them as plain floats.
      f'c{position}_lexical_similarity': float(metrics['lexical_similarity']),
      f'c{position}_semantic_similarity': float(metrics['semantic_similarity']),
      f'c{position}_new_content_proportion': metrics['new_content_proportion'],
      f'c{position}_len_quote': metrics['len_quote'],
      f'c{position}_len_original': metrics['len_original'],
      f'c{position}_question_count': metrics['question_count'],
    })
  flattened.append(flat_row)

In [None]:
# One row per thread; threads with fewer comments than the longest one get NaN in the cN_ columns they lack.
df = pd.DataFrame(flattened)

In [None]:
# Preview the wide table (columns still grouped by comment: c1_*, c2_*, ...).
df

Unnamed: 0,filename,c1_lexical_similarity,c1_semantic_similarity,c1_new_content_proportion,c1_len_quote,c1_len_original,c1_question_count,c2_lexical_similarity,c2_semantic_similarity,c2_new_content_proportion,...,c15_new_content_proportion,c15_len_quote,c15_len_original,c15_question_count,c16_lexical_similarity,c16_semantic_similarity,c16_new_content_proportion,c16_len_quote,c16_len_original,c16_question_count
0,113567594.0_1_delta_threads.xml,0.488401,0.607252,1.0,0,360,0,0.445649,0.501726,0.728,...,,,,,,,,,,
1,1082495263.0_2_delta_threads.xml,0.516155,0.326257,0.886525,16,125,0,0.319109,0.540175,1.0,...,,,,,,,,,,
2,1409948101.0_2_delta_threads.xml,0.141375,0.157165,1.0,0,28,0,0.145488,0.150579,1.0,...,,,,,,,,,,
3,1378810771.0_1_delta_threads.xml,0.4956,0.667209,1.0,0,136,1,0.245747,0.477267,1.0,...,,,,,,,,,,
4,1437482501.0_1_delta_threads.xml,0.370636,0.497295,1.0,0,45,2,0.224723,0.50589,1.0,...,,,,,,,,,,
5,154839924.0_4_delta_threads.xml,0.458793,0.51284,1.0,0,576,1,0.40472,0.324843,0.321429,...,,,,,,,,,,
6,1719432989.0_3_delta_threads.xml,0.365017,0.409463,1.0,0,35,0,0.293389,0.557498,1.0,...,,,,,,,,,,
7,1719432989.0_1_delta_threads.xml,0.463852,0.662579,1.0,0,171,3,0.847646,0.855992,0.655319,...,,,,,,,,,,
8,1821161756.0_2_delta_threads.xml,0.496218,0.610117,0.758621,56,176,5,0.336253,0.545164,1.0,...,,,,,,,,,,
9,1821161756.0_1_delta_threads.xml,0.319614,0.46753,1.0,0,30,2,0.058738,0.258601,1.0,...,,,,,,,,,,


In [None]:
# Per-comment metrics, listed in the order their column groups should appear.
features = [
    "lexical_similarity", "semantic_similarity", "new_content_proportion",
    "len_quote", "len_original", "question_count",
]
# Candidate per-comment columns: every column whose name starts with "c".
c_cols = [name for name in df.columns if name.startswith("c")]

In [None]:
def parse_col(col):
    """Split a ``c<N>_<feature>`` column name into its number and feature name.

    Args:
        col: Column name, e.g. ``"c3_len_quote"``.

    Returns:
        ``(N, feature)`` with ``N`` as an int when ``col`` starts with
        ``c<digits>_`` followed by at least one character, otherwise
        ``(None, None)``.
    """
    found = re.match(r"c(\d+)_(.+)", col)
    if found is None:
        return (None, None)
    number_text, feature_name = found.groups()
    return (int(number_text), feature_name)

In [None]:
# Column order: filename first, then one group per feature (c1_<feat>, c2_<feat>, ...),
# then any column not placed by the steps above.
ordered_cols = ["filename"]
for feat in features:
    # Sort numerically on N so that c10 follows c9 rather than c1.
    ordered_cols += sorted(
        [name for name in c_cols if name.endswith(feat)],
        key=lambda name: parse_col(name)[0],
    )

# The list is fully built before it is appended, so the membership test only
# sees the columns placed so far.
ordered_cols += [name for name in df.columns if name not in ordered_cols]

In [None]:
# Apply the feature-grouped column order. ordered_cols also holds the leftover columns, so none are dropped.
df = df[ordered_cols]

In [None]:
# Preview the table after reordering (columns now grouped by feature instead of by comment).
df

Unnamed: 0,filename,c1_lexical_similarity,c2_lexical_similarity,c3_lexical_similarity,c4_lexical_similarity,c5_lexical_similarity,c6_lexical_similarity,c7_lexical_similarity,c8_lexical_similarity,c9_lexical_similarity,...,c7_question_count,c8_question_count,c9_question_count,c10_question_count,c11_question_count,c12_question_count,c13_question_count,c14_question_count,c15_question_count,c16_question_count
0,113567594.0_1_delta_threads.xml,0.488401,0.445649,0.375462,0.212708,0.230426,0.282581,0.188461,0.217827,,...,1.0,0.0,,,,,,,,
1,1082495263.0_2_delta_threads.xml,0.516155,0.319109,0.518953,0.5533,0.674431,0.183618,0.273141,,,...,0.0,,,,,,,,,
2,1409948101.0_2_delta_threads.xml,0.141375,0.145488,0.053159,0.074178,0.276373,0.137113,0.510727,,,...,1.0,,,,,,,,,
3,1378810771.0_1_delta_threads.xml,0.4956,0.245747,0.178331,0.179095,0.257632,0.049083,0.125001,,,...,4.0,,,,,,,,,
4,1437482501.0_1_delta_threads.xml,0.370636,0.224723,0.330418,0.123331,0.075396,0.078745,,,,...,,,,,,,,,,
5,154839924.0_4_delta_threads.xml,0.458793,0.40472,0.090714,0.074942,0.281379,0.477594,0.206816,,,...,0.0,,,,,,,,,
6,1719432989.0_3_delta_threads.xml,0.365017,0.293389,0.264342,0.151523,0.311666,0.225627,0.214413,0.215563,0.183693,...,0.0,0.0,0.0,,,,,,,
7,1719432989.0_1_delta_threads.xml,0.463852,0.847646,0.14563,0.243258,0.081223,0.331908,0.406744,0.022549,0.487805,...,1.0,0.0,1.0,0.0,1.0,,,,,
8,1821161756.0_2_delta_threads.xml,0.496218,0.336253,0.105017,0.039874,0.120076,0.040936,0.040936,,,...,1.0,,,,,,,,,
9,1821161756.0_1_delta_threads.xml,0.319614,0.058738,0.333366,0.269887,0.259431,0.067672,0.134662,,,...,1.0,,,,,,,,,


In [None]:
# Write the reordered wide table to the working directory, without the index column.
# NOTE(review): the file name says "97" while the input folder is "100_sample" — confirm this is intended.
df.to_csv(f"{cwd}/97_op_nlp.csv", index=False)