In [1]:
import pandas as pd
import pickle
import tiktoken
from typing import Dict, List, Tuple
import os
import xml.etree.ElementTree as ET
from xml.dom import minidom
from collections import defaultdict
from data_preprocessor import DataPreprocessor

'''
The modified cutoff_after_last_interaction method now better aligns with your requirements. Here's how it works:

a. We first traverse all threads to find the user's last interaction time.
b. We then remove any comments (including root comments) that occur after the user's last interaction.
c. This approach ensures that the context sphere only contains information up to the user's last interaction, assuming that anything after that should not be part of the context.

This is a good approach because:
- It preserves the chronological context of the user's interactions.
- It removes potentially irrelevant information that the user hasn't seen or interacted with.
- It helps in creating a more focused and relevant context sphere for the user.
'''

class CommentThreadManager:
    def __init__(self, df: pd.DataFrame):
        self.article_comments = {article_id: group for article_id, group in df.groupby('ID_Article')}

    def build_comment_thread(self, comments: pd.DataFrame, parent_id: int, depth: int = 0) -> List[Dict]:
        replies = comments[comments['ID_Posting_Parent'] == parent_id]
        return [{
            'id': int(reply['ID_Posting']),
            'parent_id': int(reply['ID_Posting_Parent']) if pd.notnull(reply['ID_Posting_Parent']) else None,
            'user_id': int(reply['ID_CommunityIdentity']),
            'user_name': reply['UserCommunityName'],
            'user_gender': reply['UserGender'],
            'user_created_at': reply['UserCreatedAt'].isoformat() if pd.notnull(reply['UserCreatedAt']) else None,
            'comment_headline': reply['PostingHeadline'],
            'comment_text': reply['PostingComment'],
            'comment_created_at': reply['PostingCreatedAt'].isoformat() if pd.notnull(reply['PostingCreatedAt']) else None,
            'comment_length': int(reply['CommentLength']),
            'depth': depth,
            'replies': self.build_comment_thread(comments, int(reply['ID_Posting']), depth + 1)
        } for _, reply in replies.iterrows()]

    def get_article_threads(self, article_id: int) -> Dict:
        if article_id not in self.article_comments:
            return None

        article_df = self.article_comments[article_id]
        root_comments = article_df[article_df['ID_Posting_Parent'].isnull() | (article_df['ID_Posting_Parent'] == 0)]

        threads = self.build_comment_thread(article_df, 0)
        article_meta = article_df.iloc[0]

        return {
            'article_id': int(article_id),
            'article_title': article_meta['ArticleTitle'],
            'article_publish_date': article_meta['ArticlePublishingDate'].isoformat() if pd.notnull(article_meta['ArticlePublishingDate']) else None,
            'article_channel': article_meta['ArticleChannel'],
            'article_ressort_name': article_meta['ArticleRessortName'],
            'total_comments': len(article_df),
            'root_comments': len(root_comments),
            'comment_threads': threads
        }

class UserContextSphere:
    def __init__(self, df: pd.DataFrame, thread_manager: CommentThreadManager):
        self.df = df
        self.thread_manager = thread_manager
        self.user_comments = defaultdict(list)
        self._populate_user_comments()

    def _populate_user_comments(self):
        for _, row in self.df.iterrows():
            self.user_comments[row['ID_CommunityIdentity']].append(row)

    def get_user_context(self, user_id: int) -> Dict:
        if user_id not in self.user_comments:
            return None

        user_df = pd.DataFrame(self.user_comments[user_id])
        total_comments = len(user_df)
        total_replies = len(user_df[user_df['ID_Posting_Parent'].notnull()])

        user_context = {
            'user_id': int(user_id),
            'user_name': user_df['UserCommunityName'].iloc[0],
            'user_gender': user_df['UserGender'].iloc[0],
            'user_created_at': user_df['UserCreatedAt'].iloc[0].isoformat(),
            'total_comments': total_comments,
            'total_replies': total_replies,
            'articles': {}
        }

        for article_id, article_comments in user_df.groupby('ID_Article'):
            article_id = int(article_id)
            article_thread = self.thread_manager.get_article_threads(article_id)

            if article_thread:
                user_context['articles'][article_id] = {
                    'article_title': article_thread['article_title'],
                    'article_publish_date': article_thread['article_publish_date'],
                    'user_comments_count': len(article_comments),
                    'user_replies_count': len(article_comments[article_comments['ID_Posting_Parent'].notnull()]),
                    'threads': [self.find_thread_for_comment(article_thread['comment_threads'], int(comment['ID_Posting']))
                                for _, comment in article_comments.iterrows()]
                }

        return user_context

    def find_thread_for_comment(self, threads: List[Dict], comment_id: int) -> Dict:
        for thread in threads:
            if thread['id'] == comment_id:
                return thread
            result = self.find_thread_for_comment(thread['replies'], comment_id)
            if result:
                return thread
        return None

    def cutoff_after_last_interaction(self, user_context: Dict, user_id: int) -> Tuple[Dict, int]:
        removed_comments = 0
        last_interaction_time = None

        def process_thread(thread: Dict) -> Tuple[Dict, int]:
            nonlocal removed_comments, last_interaction_time
            if thread is None or 'comment_created_at' not in thread:
                return None, 1  # Remove this thread as it's invalid

            thread_time = pd.to_datetime(thread['comment_created_at'])
            if last_interaction_time and thread_time > last_interaction_time:
                return None, 1  # Remove this thread and all its replies

            if thread['user_id'] == user_id:
                if last_interaction_time is None or thread_time > last_interaction_time:
                    last_interaction_time = thread_time

            new_replies = []
            for reply in thread.get('replies', []):
                processed_reply, removed_count = process_thread(reply)
                removed_comments += removed_count
                if processed_reply:
                    new_replies.append(processed_reply)

            thread['replies'] = new_replies
            return thread, 0

        for article_id in user_context['articles']:
            new_threads = []
            for thread in user_context['articles'][article_id]['threads']:
                processed_thread, _ = process_thread(thread)
                if processed_thread:
                    new_threads.append(processed_thread)

            user_context['articles'][article_id]['threads'] = new_threads

        # Remove comments after the last interaction
        if last_interaction_time:
            for article_id in user_context['articles']:
                user_context['articles'][article_id]['threads'] = [
                    thread for thread in user_context['articles'][article_id]['threads']
                    if pd.to_datetime(thread['comment_created_at']) <= last_interaction_time
                ]
                removed_comments += len(user_context['articles'][article_id]['threads'])

        return user_context, removed_comments


    def escape_markdown(self, text: str) -> str:
        escape_chars = ['\\', '`', '*', '_', '{', '}', '[', ']', '(', ')', '#', '+', '-', '.', '!']
        for char in escape_chars:
            text = text.replace(char, '\\' + char)
        return text.replace('`', "'")

    def format_comment_thread(self, comment: Dict) -> str:
        root = ET.Element("comment_thread")
        self.add_comment_to_xml(root, comment)
        xml_str = ET.tostring(root, encoding='unicode')
        pretty_xml = minidom.parseString(xml_str).toprettyxml(indent="  ")
        return pretty_xml

    def add_comment_to_xml(self, parent: ET.Element, comment: Dict):
        comment_elem = ET.SubElement(parent, "comment")
        author_elem = ET.SubElement(comment_elem, "author")
        author_elem.text = self.escape_markdown(comment['user_name'])
        content_elem = ET.SubElement(comment_elem, "content")
        content_elem.text = self.escape_markdown(comment['comment_text'])
        if comment['replies']:
            replies_elem = ET.SubElement(comment_elem, "replies")
            for reply in comment['replies']:
                self.add_comment_to_xml(replies_elem, reply)

    def generate_formatted_user_context(self, user_id: int) -> str:
        user_context = self.get_user_context(user_id)
        if not user_context:
            return f"No data found for user ID {user_id}"

        output = []

        output.append("# User Context\n")
        output.append(f"- **User ID:** {user_context['user_id']}\n")
        output.append(f"- **Username:** {self.escape_markdown(user_context['user_name'])}\n")
        output.append(f"- **Gender:** {self.escape_markdown(user_context['user_gender'])}\n")
        output.append(f"- **Created At:** {user_context['user_created_at']}\n")
        output.append(f"- **Total Comments:** {user_context['total_comments']}\n")
        output.append(f"- **Total Replies:** {user_context['total_replies']}\n")
        output.append("\n---\n\n")

        for article_id, article_data in user_context['articles'].items():
            output.append("# Article Context\n")
            output.append(f"- **Article ID:** {article_id}\n")
            output.append(f"- **Article Title:** {self.escape_markdown(article_data['article_title'])}\n")
            output.append(f"- **Article Publish Date:** {article_data['article_publish_date']}\n")
            output.append(f"- **User Comments Count:** {article_data['user_comments_count']}\n")
            output.append(f"- **User Replies Count:** {article_data['user_replies_count']}\n")
            output.append("\n---\n\n")

            output.append("# Comment Threads\n\n")
            for i, thread in enumerate(article_data['threads'], 1):
                output.append(f"## Thread {i}\n\n")
                output.append(self.format_comment_thread(thread))
                output.append("\n---\n\n")

        output.append("# End of Context")
        return "".join(output)

    def generate_user_report_with_cutoff(self, user_id: int) -> Tuple[str, int, int]:
        user_context = self.get_user_context(user_id)
        if not user_context:
            return f"No data found for user ID {user_id}", 0, 0

        user_context, removed_comments = self.cutoff_after_last_interaction(user_context, user_id)

        report = self.generate_formatted_user_context(user_id)
        encoding = tiktoken.get_encoding("cl100k_base")
        token_count = len(encoding.encode(report))

        return report, token_count, removed_comments

    def find_users_with_few_comments(self, min_comments: int = 1, max_comments: int = 5) -> List[int]:
        user_comment_counts = self.df['ID_CommunityIdentity'].value_counts()
        users_with_few_comments = user_comment_counts[
            (user_comment_counts >= min_comments) & (user_comment_counts <= max_comments)
            ].index.tolist()
        return users_with_few_comments

In [2]:
# Load File
preprocessed_file = "../data/preprocessed/preprocessed_data.pkl"

if not os.path.exists(preprocessed_file):
    print("Preprocessed data not found. Preprocessing...")
    preprocessor = DataPreprocessor('../data/raw_csv/Postings_01052019_31052019.csv')
    preprocessor.process()
    with open(preprocessed_file, 'wb') as f:
        pickle.dump(preprocessor.df, f)
    print(f"Preprocessed data saved to {preprocessed_file}")
else:
    print("Loading preprocessed data...")
    preprocessor = DataPreprocessor.load_preprocessed_data(preprocessed_file)

Loading preprocessed data...
Preprocessed data loaded from ../data/preprocessed/preprocessed_data.pkl


In [3]:
# Main execution
thread_manager = CommentThreadManager(preprocessor.df)
user_context_sphere = UserContextSphere(preprocessor.df, thread_manager)

spheres_dir_no_cutoff = "spheres/no_cutoff"
spheres_dir_cutoff = "spheres/cutoff"
os.makedirs(spheres_dir_no_cutoff, exist_ok=True)
os.makedirs(spheres_dir_cutoff, exist_ok=True)

user_id = 26373  # Replace with the desired user ID

formatted_context = user_context_sphere.generate_formatted_user_context(user_id)

if formatted_context != f"No data found for user ID {user_id}":
    filename_no_cutoff = f"{spheres_dir_no_cutoff}/{user_id}.md"
    with open(filename_no_cutoff, 'w', encoding='utf-8') as f:
        f.write(formatted_context)
    print(f"User context without cutoff saved to {filename_no_cutoff}")

    report, token_count, removed_comments = user_context_sphere.generate_user_report_with_cutoff(user_id)

    filename_cutoff = f"{spheres_dir_cutoff}/{user_id}.md"
    with open(filename_cutoff, 'w', encoding='utf-8') as f:
        f.write(report)
    print(f"User context with cutoff saved to {filename_cutoff}")

    print(f"Token count: {token_count}")
    print(f"Removed comments: {removed_comments}")
else:
    error_message = f"# Error\n\nNo data found for user ID {user_id}"

    for dir_path in [spheres_dir_no_cutoff, spheres_dir_cutoff]:
        filename = f"{dir_path}/{user_id}_error.md"
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(error_message)
        print(f"Error message saved to {filename}")

User context without cutoff saved to spheres/no_cutoff/26373.md
User context with cutoff saved to spheres/cutoff/26373.md
Token count: 713856
Removed comments: 13


In [None]:
thread_manager = CommentThreadManager(preprocessor.df)
user_context_sphere = UserContextSphere(preprocessor.df, thread_manager)

# Directory setup
output_dir = "user_reports"
os.makedirs(output_dir, exist_ok=True)

# Find users with few comments
min_comments, max_comments = 2, 5
users_with_few_comments = user_context_sphere.find_users_with_few_comments(min_comments, max_comments)

# Save list of users with few comments
with open(os.path.join(output_dir, "users_with_few_comments.md"), "w", encoding="utf-8") as f:
    f.write(f"# Users with {min_comments}-{max_comments} comments\n\n")
    for user_id in users_with_few_comments:
        f.write(f"- User ID: {user_id}\n")

# Process and save reports for users with few comments
summary = []
for user_id in users_with_few_comments:
    report, token_count, removed_comments = user_context_sphere.generate_user_report_with_cutoff(user_id)

    if report != f"No data found for user ID {user_id}":
        filename = os.path.join(output_dir, f"user_{user_id}_report.md")
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(report)
        summary.append(f"User {user_id}: {token_count} tokens, {removed_comments} comments removed")
    else:
        summary.append(f"User {user_id}: No data found")

# Save summary
with open(os.path.join(output_dir, "summary.md"), "w", encoding="utf-8") as f:
    f.write("# Summary of User Reports\n\n")
    for line in summary:
        f.write(f"- {line}\n")

print(f"Reports and summary saved in {output_dir}")