In [29]:
import pandas as pd
import numpy as np
from datetime import datetime
import pickle
import tiktoken
from typing import Dict, List, Optional, Tuple, Set
import os
import json


class DataPreprocessor:
    def __init__(self, file_path: str):
        """
        Initialize the DataPreprocessor with the file path.

        Args:
            file_path (str): The path to the CSV file containing the data.
        """
        self.file_path = file_path
        self.df = None

    def process(self):
        """
        Process the data by loading it from the CSV file and applying various preprocessing steps.
        """
        self.df = pd.read_csv(self.file_path)
        self._convert_dates()
        self._handle_missing_values()
        self._convert_id_columns()
        self._create_new_features()

    def _convert_dates(self):
        """
        Convert date columns to datetime format.
        """
        date_columns = ['PostingCreatedAt', 'ArticlePublishingDate', 'UserCreatedAt']
        for col in date_columns:
            self.df[col] = pd.to_datetime(self.df[col])

    def _handle_missing_values(self):
        """
        Handle missing values in the data by filling them with appropriate values.
        """
        self.df['PostingHeadline'] = self.df['PostingHeadline'].fillna('No Headline')
        self.df['PostingComment'] = self.df['PostingComment'].fillna('No Comment')
        self.df['UserGender'] = self.df['UserGender'].fillna('Unknown')
        self.df['UserCommunityName'] = self.df['UserCommunityName'].fillna('Unknown')

    def _convert_id_columns(self):
        """
        Convert ID columns to integer type.
        """
        id_columns = ['ID_Posting', 'ID_Posting_Parent', 'ID_CommunityIdentity', 'ID_Article']
        for col in id_columns:
            self.df[col] = self.df[col].fillna(0).astype(int)

    def _create_new_features(self):
        """
        Create new features based on existing columns.
        """
        self.df['CommentLength'] = self.df['PostingComment'].str.len()
        self.df['DaysSinceUserCreation'] = (self.df['PostingCreatedAt'] - self.df['UserCreatedAt']).dt.days
        self.df['IsReply'] = self.df['ID_Posting_Parent'] != 0
        self.df['PostingHour'] = self.df['PostingCreatedAt'].dt.hour
        self.df['PostingDayOfWeek'] = self.df['PostingCreatedAt'].dt.dayofweek

    def save_preprocessed_data(self, output_path: str):
        """
        Save the preprocessed data to a pickle file.

        Args:
            output_path (str): The path where the preprocessed data will be saved.
        """
        with open(output_path, 'wb') as f:
            pickle.dump(self.df, f)
        print(f"Preprocessed data saved to {output_path}")

    @classmethod
    def load_preprocessed_data(cls, input_path: str):
        """
        Load the preprocessed data from a pickle file.

        Args:
            input_path (str): The path to the pickle file containing the preprocessed data.

        Returns:
            DataPreprocessor: An instance of DataPreprocessor with the loaded data.
        """
        with open(input_path, 'rb') as f:
            df = pickle.load(f)
        preprocessor = cls(None)
        preprocessor.df = df
        print(f"Preprocessed data loaded from {input_path}")
        return preprocessor

class CommentThreadManager:
    def __init__(self, df: pd.DataFrame):
        """
        Initialize the CommentThreadManager with the preprocessed data.

        Args:
            df (pd.DataFrame): The preprocessed data containing comment information.
        """
        self.article_comments = {article_id: group for article_id, group in df.groupby('ID_Article')}

    def build_comment_thread(self, comments: pd.DataFrame, parent_id: int, depth: int = 0) -> List[Dict]:
        """
        Build a hierarchical structure of comments and their replies.

        Args:
            comments (pd.DataFrame): The comments data for a specific article.
            parent_id (int): The ID of the parent comment.
            depth (int): The depth of the current comment in the thread.

        Returns:
            List[Dict]: A list of dictionaries representing the comment thread.
        """
        replies = comments[comments['ID_Posting_Parent'] == parent_id]
        return [{
            'id': int(reply['ID_Posting']),
            'parent_id': int(reply['ID_Posting_Parent']) if pd.notnull(reply['ID_Posting_Parent']) else None,
            'user_id': int(reply['ID_CommunityIdentity']),
            'user_name': reply['UserCommunityName'],
            'user_gender': reply['UserGender'],
            'user_created_at': reply['UserCreatedAt'].isoformat() if pd.notnull(reply['UserCreatedAt']) else None,
            'comment_headline': reply['PostingHeadline'],
            'comment_text': reply['PostingComment'],
            'comment_created_at': reply['PostingCreatedAt'].isoformat() if pd.notnull(reply['PostingCreatedAt']) else None,
            'comment_length': int(reply['CommentLength']),
            'depth': depth,
            'replies': self.build_comment_thread(comments, int(reply['ID_Posting']), depth + 1)
        } for _, reply in replies.iterrows()]

    def get_article_threads(self, article_id: int) -> Optional[Dict]:
        """
        Get the comment threads for a specific article.

        Args:
            article_id (int): The ID of the article.

        Returns:
            Optional[Dict]: A dictionary representing the article's comment threads, or None if the article is not found.
        """
        if article_id not in self.article_comments:
            return None

        article_df = self.article_comments[article_id]
        root_comments = article_df[article_df['ID_Posting_Parent'].isnull() | (article_df['ID_Posting_Parent'] == 0)]

        threads = self.build_comment_thread(article_df, 0)
        article_meta = article_df.iloc[0]

        return {
            'article_id': int(article_id),
            'article_title': article_meta['ArticleTitle'],
            'article_publish_date': article_meta['ArticlePublishingDate'].isoformat() if pd.notnull(article_meta['ArticlePublishingDate']) else None,
            'article_channel': article_meta['ArticleChannel'],
            'article_ressort_name': article_meta['ArticleRessortName'],
            'total_comments': len(article_df),
            'root_comments': len(root_comments),
            'comment_threads': threads
        }

    def get_article_ids(self) -> List[int]:
        """
        Get the list of article IDs.

        Returns:
            List[int]: A list of article IDs.
        """
        return list(self.article_comments.keys())

class UserContextSphere:
    def __init__(self, df: pd.DataFrame, thread_manager: CommentThreadManager):
        self.df = df
        self.thread_manager = thread_manager
        self.user_comments = {user_id: group for user_id, group in df.groupby('ID_CommunityIdentity')}


    def get_user_context(self, user_id: int) -> Optional[Dict]:
        """
        Get the context information for a specific user, including only the threads where the user has written a comment.
    
        Args:
            user_id (int): The ID of the user.
    
        Returns:
            Optional[Dict]: A dictionary representing the user's context, or None if the user is not found.
        """
        if user_id not in self.user_comments:
            return None
    
        user_df = self.user_comments[user_id]
        total_comments = len(user_df)
        total_replies = len(user_df[user_df['ID_Posting_Parent'].notnull()])
    
        user_context = {
            'user_id': int(user_id),
            'user_name': user_df['UserCommunityName'].iloc[0],
            'user_gender': user_df['UserGender'].iloc[0],
            'user_created_at': user_df['UserCreatedAt'].iloc[0].isoformat(),
            'total_comments': total_comments,
            'total_replies': total_replies,
            'articles': {}
        }
    
        for article_id, article_comments in user_df.groupby('ID_Article'):
            article_id = int(article_id)
            article_thread = self.thread_manager.get_article_threads(article_id)
    
            if article_thread:
                user_threads = []
                for _, comment in article_comments.iterrows():
                    thread = self.find_thread_for_comment(article_thread['comment_threads'], int(comment['ID_Posting']))
                    if thread:
                        user_threads.append(thread)
    
                if user_threads:
                    user_context['articles'][article_id] = {
                        'article_title': article_thread['article_title'],
                        'article_publish_date': article_thread['article_publish_date'],
                        'user_comments_count': len(article_comments),
                        'user_replies_count': len(article_comments[article_comments['ID_Posting_Parent'].notnull()]),
                        'threads': user_threads
                    }
    
        return user_context

    def find_thread_for_comment(self, threads: List[Dict], comment_id: int) -> Optional[Dict]:
        """
        Find the thread that contains a specific comment.

        Args:
            threads (List[Dict]): A list of comment threads.
            comment_id (int): The ID of the comment to find.

        Returns:
            Optional[Dict]: The thread containing the comment, or None if the comment is not found.
        """
        for thread in threads:
            if thread['id'] == comment_id:
                return thread
            result = self.find_thread_for_comment(thread['replies'], comment_id)
            if result:
                return thread
        return None


    def filter_user_threads(self, user_context: Dict, user_id: int) -> Tuple[Dict, int]:
        removed_threads = 0
        filtered_context = user_context.copy()
        filtered_context['articles'] = {}
    
        for article_id, article_data in user_context['articles'].items():
            user_threads = []
            for thread in article_data['threads']:
                filtered_thread = self.filter_thread_for_user(thread, user_id)
                if filtered_thread:
                    user_threads.append(filtered_thread)
                else:
                    removed_threads += 1
    
            if user_threads:
                filtered_context['articles'][article_id] = article_data.copy()
                filtered_context['articles'][article_id]['threads'] = user_threads
                filtered_context['articles'][article_id]['user_comments_count'] = sum(self.count_user_comments(thread, user_id) for thread in user_threads)
            else:
                removed_threads += 1
    
        return filtered_context, removed_threads

    def filter_thread_for_user(self, thread: Dict, user_id: int) -> Optional[Dict]:
        if thread['user_id'] == user_id:
            filtered_replies = [self.filter_thread_for_user(reply, user_id) for reply in thread['replies']]
            filtered_replies = [reply for reply in filtered_replies if reply]
            filtered_thread = thread.copy()
            filtered_thread['replies'] = filtered_replies
            return filtered_thread
        else:
            for reply in thread['replies']:
                filtered_reply = self.filter_thread_for_user(reply, user_id)
                if filtered_reply:
                    return thread.copy()
        return None

    def count_user_comments(self, thread: Dict, user_id: int) -> int:
        count = 1 if thread['user_id'] == user_id else 0
        for reply in thread['replies']:
            count += self.count_user_comments(reply, user_id)
        return count

    def user_has_comment_in_thread(self, thread: Dict, user_id: int) -> bool:
        """
        Check if the user has a comment in the given thread or its replies.

        Args:
            thread (Dict): The thread to check.
            user_id (int): The ID of the user.

        Returns:
            bool: True if the user has a comment in the thread, False otherwise.
        """
        if thread['user_id'] == user_id:
            return True
        for reply in thread['replies']:
            if self.user_has_comment_in_thread(reply, user_id):
                return True
        return False

    def generate_user_report_with_cutoff(self, user_id: int) -> Tuple[str, int, int]:
        """
        Generate a user report with cutoff, keeping only threads where the user has commented.

        Args:
            user_id (int): The ID of the user.

        Returns:
            Tuple[str, int, int]: A tuple containing the user report, token count, and the number of removed threads.
        """
        user_context = self.get_user_context(user_id)
        if not user_context:
            return f"No data found for user ID {user_id}", 0, 0

        filtered_context, removed_threads = self.filter_user_threads(user_context, user_id)

        report = json.dumps(filtered_context, indent=2)
        token_count = self.count_tokens(report)

        return report, token_count, removed_threads

    def count_tokens(self, text: str) -> int:
        """
        Count the number of tokens in a given text.

        Args:
            text (str): The input text.

        Returns:
            int: The number of tokens in the text.
        """
        encoding = tiktoken.get_encoding("cl100k_base")
        tokens_lst = encoding.encode(text)
        return len(tokens_lst)

In [21]:
import json
import os

# Main execution
preprocessed_file = "../data/preprocessed/preprocessed_data.pkl"

if not os.path.exists(preprocessed_file):
    print("Preprocessed data not found. Preprocessing...")
    preprocessor = DataPreprocessor('./data/raw_csv/Postings_01052019_31052019.csv')
    preprocessor.process()
    preprocessor.save_preprocessed_data(preprocessed_file)
else:
    print("Loading preprocessed data...")
    preprocessor = DataPreprocessor.load_preprocessed_data(preprocessed_file)

Loading preprocessed data...
Preprocessed data loaded from ../data/preprocessed/preprocessed_data.pkl


In [30]:
thread_manager = CommentThreadManager(preprocessor.df)
user_context_sphere = UserContextSphere(preprocessor.df, thread_manager)

# Create the spheres directories if they don't exist
spheres_dir_no_cutoff = "spheres/no_cutoff"
spheres_dir_cutoff = "spheres/cutoff"
os.makedirs(spheres_dir_no_cutoff, exist_ok=True)
os.makedirs(spheres_dir_cutoff, exist_ok=True)

user_id = 22231  # Replace with the desired user ID

# Generate and save context without cutoff
user_context_no_cutoff = user_context_sphere.get_user_context(user_id)
if user_context_no_cutoff:
    token_count_no_cutoff = user_context_sphere.count_tokens(json.dumps(user_context_no_cutoff))
    user_context_no_cutoff['token_count'] = token_count_no_cutoff

    filename_no_cutoff = f"{spheres_dir_no_cutoff}/{user_id}.json"
    with open(filename_no_cutoff, 'w', encoding='utf-8') as f:
        json.dump(user_context_no_cutoff, f, indent=2)
    print(f"User context without cutoff saved to {filename_no_cutoff}")

    # Generate and save context with cutoff
    # Generate and save context with cutoff
    user_report_cutoff, token_count_cutoff, removed_threads = user_context_sphere.generate_user_report_with_cutoff(user_id)
    user_context_cutoff = json.loads(user_report_cutoff)
    user_context_cutoff['token_count'] = token_count_cutoff
    user_context_cutoff['removed_threads'] = removed_threads
    
    filename_cutoff = f"{spheres_dir_cutoff}/{user_id}.json"
    with open(filename_cutoff, 'w', encoding='utf-8') as f:
        json.dump(user_context_cutoff, f, indent=2)
    print(f"User context with cutoff saved to {filename_cutoff}")
    
    print(f"Token count without cutoff: {token_count_no_cutoff}")
    print(f"Token count with cutoff: {token_count_cutoff}")
    print(f"Removed threads: {removed_threads}")
else:
    error_message = {"error": f"No data found for user ID {user_id}"}

    # Save error message to both directories
    for dir_path in [spheres_dir_no_cutoff, spheres_dir_cutoff]:
        filename = f"{dir_path}/{user_id}_error.json"
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(error_message, f, indent=2)
        print(f"Error message saved to {filename}")

User context without cutoff saved to spheres/no_cutoff/22231.json
User context with cutoff saved to spheres/cutoff/22231.json
Token count without cutoff: 17528
Token count with cutoff: 18842
Removed threads: 0
