In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import os
import glob
import json
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.



## File parsing (original post followed by all of its comments, in one list)

In [None]:
"""
STOPWORDS = set(stopwords.words("english"))

def clean_text(text, STOPWORDS):
  text = text.lower()
  text1 = re.sub(r'http\S+|www\.\S+', '', text)
  text1 = re.sub(r'u\/\w+|r\/\w+', '', text1)
  tokens = [word for word in text1.split() if word.isalpha() and word not in STOPWORDS]
  text1 = " ".join(tokens)
  if len(text1) > 1:
    return text1
  else:
    return text
"""

In [None]:
def clean_text(text):
  """Lower-case a Reddit post/comment and strip links and user/subreddit mentions.

  Args:
    text: Raw text of a post or comment.

  Returns:
    The lower-cased text with ``http...`` / ``www....`` links and
    ``u/<name>`` / ``r/<name>`` mentions (optionally written with a leading
    slash) removed. Whitespace around the removed spans is left as is.
  """
  text = text.lower()
  text = re.sub(r'http\S+|www\.\S+', '', text)
  # Only treat "u/..." and "r/..." as mentions when they start a token.
  # Without the look-behind, ordinary text such as "her/his" or "or/and"
  # was truncated to "he" / "o". User names may contain hyphens.
  text = re.sub(r'(?<!\w)/?(?:u/[\w-]+|r/\w+)', '', text)
  return text

In [None]:
def process_thread(xml_content, filename):
    """Parse one thread XML document into cleaned texts and user metadata.

    Args:
        xml_content: XML source of the thread, as a string.
        filename: Name stored in the returned metadata record and used in
            error messages.

    Returns:
        A tuple ``(original_post, comment_texts, json_line)``:
        ``original_post`` is the cleaned text of ``<original_post>``;
        ``comment_texts`` is a list holding the original post first, followed
        by the cleaned text of every ``<comment>`` in document order;
        ``json_line`` is a dict with the keys ``filename``, ``unique_users``
        and ``users`` (``'op'`` / ``'comment_N'`` -> user name).

    Raises:
        ValueError: If the document has no ``<submission>`` element or the
            submission has no ``<original_post>`` element.
    """
    soup = BeautifulSoup(xml_content, features="xml")

    submission = soup.find('submission')
    if submission is None:
        raise ValueError(f"{filename}: no <submission> element found")

    post_node = submission.find('original_post')
    if post_node is None:
        raise ValueError(f"{filename}: <submission> has no <original_post> element")
    original_post = clean_text(post_node.text.strip())

    # Look the poster up once; fall back to a placeholder when it is absent.
    poster_node = submission.find('original_poster')
    op_user = poster_node.text.strip() if poster_node is not None else "unknown_op"

    user_map = {'op': op_user}

    # The original post is always element 0 of the returned text list.
    comment_texts = [original_post]
    for i, comment in enumerate(soup.find_all('comment')):
        text_node = comment.find('text')
        # A comment without a <text> child contributes an empty string so the
        # positions in comment_texts stay aligned with the comment_N keys.
        raw_text = text_node.text.strip() if text_node is not None else ""
        comment_texts.append(clean_text(raw_text))
        # The user name is read from the comment's `user` attribute.
        user_map[f"comment_{i+1}"] = comment.get('user', f'user_{i}')

    json_line = {"filename": filename,
                 "unique_users": len(set(user_map.values())),
                 "users": user_map}

    return original_post, comment_texts, json_line

# emotion pipeline

In [None]:
import os
# Set the CUDA_LAUNCH_BLOCKING environment variable for this process.
# NOTE(review): this is normally a debugging switch (synchronous kernel
# launches) and needs to be set before CUDA is initialised — confirm it is
# still wanted for regular runs.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch
from tqdm import tqdm

In [None]:
# Hugging Face checkpoint used for emotion classification.
model_name = "tae898/emoberta-large"

# Load the tokenizer and cap the sequence length it reports at 512 tokens.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.model_max_length = 512

# Load the sequence-classification model from the same checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Device index handed to the pipeline: 0 when CUDA is available, -1 otherwise.
if torch.cuda.is_available():
    device = 0
else:
    device = -1

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/408 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [None]:
# Display whether PyTorch can see a CUDA device in this session.
torch.cuda.is_available()

True

In [None]:
# Text-classification pipeline around the loaded model and tokenizer.
# `top_k=None` already requests the score of every label; the deprecated
# `return_all_scores=True` flag was redundant alongside it and is dropped.
emotion_pipeline = pipeline("text-classification", model=model,
                            tokenizer=tokenizer, top_k=None, device=device)

Device set to use cuda:0


# pipeline execution

In [None]:
def analyze_reddit_thread(comments, row_data):
    """Score every text of a thread with the emotion classifier.

    Args:
        comments: Sequence of texts. The first one is labelled ``op``, the
            following ones ``c1``, ``c2``, ...
        row_data: Dict with the values already known for the row (for example
            the filename). It is updated in place with the emotion columns.

    Returns:
        A one-row ``pandas.DataFrame`` built from ``row_data``. For every text
        it contains ``<name>_main`` (label with the highest score) and one
        ``<name>_<label>`` column per label with the score rounded to three
        decimals.
    """
    for i, comment in enumerate(comments):
        c_name = 'op' if i == 0 else f"c{i}"

        emotion_scores = emotion_pipeline(comment, truncation=True,
                                          padding='max_length', max_length=512)[0]

        top_emotion = max(emotion_scores, key=lambda x: x['score'])
        row_data[f'{c_name}_main'] = top_emotion['label']

        # Insert the per-label columns in label order. The pipeline output
        # order follows the scores, which made the column layout of the
        # resulting frame depend on the first text that was processed.
        for item in sorted(emotion_scores, key=lambda x: x['label']):
            label = item['label']
            row_data[f'{c_name}_{label}'] = round(item['score'], 3)

    return pd.DataFrame([row_data])

In [None]:
# Working directory of the kernel; the input and output paths below are built from it.
cwd = os.getcwd()

In [None]:
input_dir = f"{cwd}/100_sample/"
# Sorted so the row order of the result does not depend on the order in which
# the file system happens to list the directory.
xml_files = sorted(glob.glob(os.path.join(input_dir, "*.xml")))
jsonl_output = f"{cwd}/users.jsonl"

# NOTE(review): `filenames` is not defined in the cells shown here; it has to
# exist in the kernel already (collection of file names to process).
# NOTE: users.jsonl is opened in append mode, so re-running this cell adds the
# same records again.
emo_frames = []  # one single-row frame per thread, concatenated once at the end

for file in tqdm(xml_files):
  filename = os.path.basename(file)
  if filename not in filenames:
    continue

  with open(file, 'r', encoding='utf-8') as f:
    xml_content = f.read()
  op, comms, json_line = process_thread(xml_content, filename)

  with open(jsonl_output, 'a', encoding="utf-8") as f:
    f.write(json.dumps(json_line) + "\n")

  try:
    row_data = {'filename': filename}
    emo_scores = analyze_reddit_thread(comms, row_data)
    emo_frames.append(emo_scores)
  except Exception as e:
    # Best effort: report the failing thread and keep going. The previous
    # message referenced an undefined `i`, which raised a NameError here and
    # hid the real error.
    print(f" CRASHED on file {filename}")
    print(e)

# Concatenate once instead of growing the frame inside the loop.
emo_df = pd.concat(emo_frames, ignore_index=True) if emo_frames else pd.DataFrame()


100%|██████████| 97/97 [01:44<00:00,  1.07s/it]


In [None]:
# Label each thread: 0 when its filename contains 'deltaless', otherwise 1.
emo_df['outcome'] = emo_df['filename'].map(
    lambda name: 1 if 'deltaless' not in str(name) else 0
)

  emo_df['outcome'] = emo_df['filename'].apply(lambda x: 0 if 'deltaless' in str(x) else 1)


In [None]:
# Show the emotion-score table, which now includes the `outcome` column.
emo_df

Unnamed: 0,filename,op_main,op_anger,op_sadness,op_neutral,op_joy,op_disgust,op_fear,op_surprise,c1_main,...,c15_disgust,c16_main,c16_neutral,c16_anger,c16_joy,c16_sadness,c16_disgust,c16_fear,c16_surprise,outcome
0,113567594.0_1_delta_threads.xml,anger,0.686,0.22,0.053,0.029,0.006,0.004,0.002,neutral,...,,,,,,,,,,1
1,1082495263.0_2_delta_threads.xml,anger,0.619,0.068,0.236,0.056,0.012,0.007,0.002,anger,...,,,,,,,,,,1
2,1409948101.0_2_delta_threads.xml,neutral,0.16,0.018,0.74,0.068,0.012,0.002,0.001,neutral,...,,,,,,,,,,1
3,1378810771.0_1_delta_threads.xml,anger,0.523,0.168,0.169,0.086,0.049,0.003,0.002,anger,...,,,,,,,,,,1
4,1437482501.0_1_delta_threads.xml,neutral,0.319,0.068,0.408,0.15,0.045,0.007,0.002,neutral,...,,,,,,,,,,1
5,154839924.0_4_delta_threads.xml,neutral,0.045,0.009,0.603,0.34,0.001,0.001,0.001,neutral,...,,,,,,,,,,1
6,1719432989.0_3_delta_threads.xml,neutral,0.259,0.026,0.583,0.11,0.014,0.005,0.002,anger,...,,,,,,,,,,1
7,1719432989.0_1_delta_threads.xml,neutral,0.259,0.026,0.583,0.11,0.014,0.005,0.002,anger,...,,,,,,,,,,1
8,1821161756.0_2_delta_threads.xml,anger,0.697,0.045,0.078,0.058,0.116,0.003,0.004,anger,...,,,,,,,,,,1
9,1821161756.0_1_delta_threads.xml,anger,0.697,0.045,0.078,0.058,0.116,0.003,0.004,anger,...,,,,,,,,,,1


In [None]:
# Destination CSV for the emotion scores, placed in the working directory.
file_path = f'{cwd}/97_emo_scores.csv'

In [None]:
# Write the table to disk without the DataFrame index column.
emo_df.to_csv(file_path, index=False)

In [None]:
# Preview the first five rows of the table.
emo_df.head()

Unnamed: 0,filename,op_main,op_neutral,op_anger,op_joy,op_sadness,op_disgust,op_surprise,op_fear,c1_main,...,c19_fear,c20_main,c20_anger,c20_neutral,c20_joy,c20_sadness,c20_disgust,c20_surprise,c20_fear,outcome
0,1062071645.0_1_delta_threads.xml,neutral,0.5,0.305,0.167,0.019,0.005,0.003,0.002,anger,...,,,,,,,,,,1
1,1075040167.0_1_delta_threads.xml,anger,0.078,0.616,0.058,0.082,0.13,0.018,0.019,anger,...,,,,,,,,,,1
2,115882088.0_1_delta_threads.xml,neutral,0.716,0.153,0.086,0.015,0.019,0.003,0.009,neutral,...,,,,,,,,,,1
3,1102614149.0_2_delta_threads.xml,neutral,0.598,0.244,0.098,0.052,0.003,0.002,0.004,neutral,...,,,,,,,,,,1
4,134214340.0_1_delta_threads.xml,neutral,0.465,0.167,0.33,0.027,0.003,0.003,0.004,anger,...,,,,,,,,,,1
