In [None]:
# Merge csv Posting and Votes
import pandas as pd

# Replace 'your_file.csv' with the path to your CSV file
df = pd.read_csv(r'./data/raw_csv/Postings_01052019_31052019.csv')

# Display the first 10 rows
print(df.head(10).to_string())

In [5]:
import pandas as pd
import numpy as np
from datetime import datetime
import json
import pickle

class DataPreprocessor:
    """
    A class to preprocess the raw comment data.
    """

    def __init__(self, file_path):
        """
        Initialize the DataPreprocessor with the path to the raw data file.
        
        :param file_path: str, path to the raw CSV file
        """
        self.file_path = file_path
        self.df = None

    def load_data(self):
        """Load the raw data from CSV file."""
        self.df = pd.read_csv(self.file_path)

    def preprocess(self):
        """Apply all preprocessing steps to the data."""
        self._convert_dates()
        self._handle_missing_values()
        self._convert_id_columns()
        self._create_new_features()

    def _convert_dates(self):
        """Convert date columns to datetime format."""
        date_columns = ['PostingCreatedAt', 'ArticlePublishingDate', 'UserCreatedAt']
        for col in date_columns:
            self.df[col] = pd.to_datetime(self.df[col])

    def _handle_missing_values(self):
        """Handle missing values in the dataset."""
        self.df['PostingHeadline'] = self.df['PostingHeadline'].fillna('No Headline')
        self.df['PostingComment'] = self.df['PostingComment'].fillna('No Comment')
        self.df['UserGender'] = self.df['UserGender'].fillna('Unknown')
        self.df['UserCommunityName'] = self.df['UserCommunityName'].fillna('Unknown')

    def _convert_id_columns(self):
        """Convert ID columns to integers."""
        id_columns = ['ID_Posting', 'ID_Posting_Parent', 'ID_CommunityIdentity', 'ID_Article']
        for col in id_columns:
            self.df[col] = self.df[col].fillna(0).astype(int)

    def _create_new_features(self):
        """Create new features from existing data."""
        self.df['CommentLength'] = self.df['PostingComment'].str.len()
        self.df['DaysSinceUserCreation'] = (self.df['PostingCreatedAt'] - self.df['UserCreatedAt']).dt.days
        self.df['IsReply'] = self.df['ID_Posting_Parent'] != 0
        self.df['PostingHour'] = self.df['PostingCreatedAt'].dt.hour
        self.df['PostingDayOfWeek'] = self.df['PostingCreatedAt'].dt.dayofweek

    def save_preprocessed_data(self, output_path):
        """
        Save the preprocessed data to a CSV file.
        
        :param output_path: str, path to save the preprocessed CSV file
        """
        self.df.to_csv(output_path, index=False)

        def save_preprocessed_data(self, output_path):
            """
            Save the preprocessed data to a pickle file.
            
            :param output_path: str, path to save the preprocessed pickle file
            """
        with open(output_path, 'wb') as f:
            pickle.dump(self.df, f)
        print(f"Preprocessed data saved to {output_path}")

    @classmethod
    def load_preprocessed_data(cls, input_path):
        """
        Load preprocessed data from a pickle file.
        
        :param input_path: str, path to the preprocessed pickle file
        :return: DataPreprocessor instance with loaded data
        """
        with open(input_path, 'rb') as f:
            df = pickle.load(f)

        preprocessor = cls(None)  # Create instance without file path
        preprocessor.df = df
        print(f"Preprocessed data loaded from {input_path}")
        return preprocessor

class CommentThreadManager:
    """
    A class to manage and structure comment threads for articles.
    """

    def __init__(self, df):
        """
        Initialize the CommentThreadManager with a preprocessed DataFrame.
        
        :param df: pandas DataFrame, preprocessed comment data
        """
        self.article_comments = {article_id: group for article_id, group in df.groupby('ID_Article')}

    def build_comment_thread(self, comments, parent_id, depth=0):
        thread = []
        replies = comments[comments['ID_Posting_Parent'] == parent_id]
        for _, reply in replies.iterrows():
            sub_thread = self.build_comment_thread(comments, int(reply['ID_Posting']), depth + 1)
            thread_item = {
                'id': int(reply['ID_Posting']),
                'parent_id': int(reply['ID_Posting_Parent']) if pd.notnull(reply['ID_Posting_Parent']) else None,
                'user_id': int(reply['ID_CommunityIdentity']),
                'user_name': reply['UserCommunityName'],
                'user_gender': reply['UserGender'],
                'user_created_at': reply['UserCreatedAt'].isoformat() if pd.notnull(reply['UserCreatedAt']) else None,
                'comment_headline': reply['PostingHeadline'],
                'comment_text': reply['PostingComment'],
                'comment_created_at': reply['PostingCreatedAt'].isoformat() if pd.notnull(reply['PostingCreatedAt']) else None,
                'comment_length': int(reply['CommentLength']),
                'depth': depth,
                'replies': sub_thread,
                'thread_stats': {
                    'total_comments': 1 + sum(r['thread_stats']['total_comments'] for r in sub_thread),
                    'max_depth': depth + max([r['thread_stats']['max_depth'] for r in sub_thread] + [0]),
                }
            }
            thread.append(thread_item)
        return thread

    def get_article_threads(self, article_id):
        """
        Get the structured comment threads for a specific article.
        
        :param article_id: int, ID of the article
        :return: dict, structured article data with comment threads
        """
        if article_id not in self.article_comments:
            return None

        article_df = self.article_comments[article_id]
        root_comments = article_df[article_df['ID_Posting_Parent'].isnull() | (article_df['ID_Posting_Parent'] == 0)]

        threads = []
        for _, comment in root_comments.iterrows():
            thread = {
                'id': int(comment['ID_Posting']),
                'parent_id': None,
                'user_id': int(comment['ID_CommunityIdentity']),
                'user_name': comment['UserCommunityName'],
                'user_gender': comment['UserGender'],
                'user_created_at': comment['UserCreatedAt'].isoformat() if pd.notnull(comment['UserCreatedAt']) else None,
                'comment_headline': comment['PostingHeadline'],
                'comment_text': comment['PostingComment'],
                'comment_created_at': comment['PostingCreatedAt'].isoformat() if pd.notnull(comment['PostingCreatedAt']) else None,
                'comment_length': int(comment['CommentLength']),
                'depth': 0,
                'replies': self.build_comment_thread(article_df, int(comment['ID_Posting']), 1)
            }
            threads.append(thread)

        article_meta = article_df.iloc[0]

        return {
            'article_id': int(article_id),
            'article_title': article_meta['ArticleTitle'],
            'article_publish_date': article_meta['ArticlePublishingDate'].isoformat() if pd.notnull(article_meta['ArticlePublishingDate']) else None,
            'article_channel': article_meta['ArticleChannel'],
            'article_ressort_name': article_meta['ArticleRessortName'],
            'total_comments': len(article_df),
            'root_comments': len(root_comments),
            'comment_threads': threads
        }

    def get_article_ids(self):
        """
        Get a list of all article IDs in the dataset.
        
        :return: list of int, article IDs
        """
        return list(self.article_comments.keys())
    
    def get_user_ids(self):
        """
        Get a list of all user IDs in the dataset.
        
        :return: list of int, user IDs
        """
        return self.df['ID_CommunityIdentity'].unique().tolist()



In [25]:
# Usage example
if __name__ == "__main__":
    # Preprocess the data
    preprocessor = DataPreprocessor('./data/raw_csv/Postings_01052019_31052019.csv')
    preprocessor.load_data()
    preprocessor.preprocess()
    preprocessor.save_preprocessed_data('preprocessed_data.csv')

    # Create comment thread manager
    manager = CommentThreadManager(preprocessor.df)

    # Get threads for a specific article
    article_id = 2000102330973  # Example article ID
    result = manager.get_article_threads(article_id)

    # Pretty print the result
    print(json.dumps(result, indent=2))

KeyboardInterrupt: 

# Now lets start building the context sphere
I understand your point. You're looking to create a more comprehensive view of a user's activity and interactions within the comment threads. Instead of just seeing isolated comments made by a user, you want to see the full context of their engagement. This includes:

1. All comments made by the user across different articles.
2. For each comment, you want to see the entire thread it belongs to, not just the user's comment in isolation.
3. This will show what the user was responding to and how their comment fits into the larger conversation.
4. It will also reveal any subsequent responses to the user's comments.

The goal is to build a "context sphere" around each user, showing their complete interaction history within the comment ecosystem. This approach will provide a more nuanced understanding of the user's behavior, opinions, and how they engage with others in discussions.

Is this interpretation correct? If so, I can suggest how to modify your existing code to achieve this goal.

In [11]:
class UserContextSphere:
    def __init__(self, df, thread_manager):
        self.df = df
        self.thread_manager = thread_manager
        self.user_comments = {user_id: group for user_id, group in df.groupby('ID_CommunityIdentity')}

    def get_user_context(self, user_id):
        if user_id not in self.user_comments:
            return None

        user_df = self.user_comments[user_id]

        # Calculate user statistics
        total_comments = len(user_df)
        total_replies = len(user_df[user_df['ID_Posting_Parent'].notnull()])

        user_context = {
            'user_id': int(user_id),
            'user_name': user_df['UserCommunityName'].iloc[0],
            'user_gender': user_df['UserGender'].iloc[0],
            'user_created_at': user_df['UserCreatedAt'].iloc[0].isoformat(),
            'total_comments': total_comments,
            'total_replies': total_replies,
            'articles': {}
        }

        # Group comments by article
        for article_id, article_comments in user_df.groupby('ID_Article'):
            article_id = int(article_id)
            article_thread = self.thread_manager.get_article_threads(article_id)

            if article_thread:
                user_context['articles'][article_id] = {
                    'article_title': article_thread['article_title'],
                    'article_publish_date': article_thread['article_publish_date'],
                    'user_comments_count': len(article_comments),
                    'user_replies_count': len(article_comments[article_comments['ID_Posting_Parent'].notnull()]),
                    'threads': []
                }

                for _, comment in article_comments.iterrows():
                    thread = self.find_thread_for_comment(article_thread['comment_threads'], int(comment['ID_Posting']))
                    if thread:
                        user_context['articles'][article_id]['threads'].append(thread)

        return user_context

    def find_thread_for_comment(self, threads, comment_id):
        for thread in threads:
            if thread['id'] == comment_id:
                return thread
            if thread['replies']:
                result = self.find_thread_for_comment(thread['replies'], comment_id)
                if result:
                    return thread  # Return the whole thread, not just the subthread
        return None

    def generate_user_report(self, user_id):
        user_context = self.get_user_context(user_id)
        if not user_context:
            return f"No data found for user ID {user_id}"
    
        report = f"User Report for ID: {user_context['user_id']}\n"
        report += f"Name: {user_context['user_name']}\n"
        report += f"Gender: {user_context['user_gender']}\n"
        report += f"Created At: {user_context['user_created_at']}\n"
        report += f"Total Comments: {user_context['total_comments']}\n"
        report += f"Total Replies: {user_context['total_replies']}\n\n"
    
        for article_id, article_data in user_context['articles'].items():
            report += f"Article: {article_data['article_title']}\n"
            report += f"Published: {article_data['article_publish_date']}\n"
            report += f"User Comments on this Article: {article_data['user_comments_count']}\n"
            report += f"User Replies on this Article: {article_data['user_replies_count']}\n\n"
    
            for thread in article_data['threads']:
                report += self.format_thread(thread, user_id, 0)
                report += "\n"
    
        return report

    def format_thread(self, thread, user_id, depth):
        indent = "  " * depth
        is_user = thread['user_id'] == user_id
        user_indicator = " (USER)" if is_user else ""
    
        formatted = f"{indent}[{thread['user_name']}]{user_indicator}: {thread['comment_text']}\n"
    
        for reply in thread['replies']:
            formatted += self.format_thread(reply, user_id, depth + 1)
    
        return formatted

In [12]:
import os

if __name__ == "__main__":
    preprocessed_file = "data/preprocessed/preprocessed_data.pkl"

    if not os.path.exists(preprocessed_file):
        print("Preprocessed data not found. Preprocessing...")
        preprocessor = DataPreprocessor('./data/raw_csv/Postings_01052019_31052019.csv')
        preprocessor.load_data()
        preprocessor.preprocess()
        preprocessor.save_preprocessed_data(preprocessed_file)
    else:
        print("Loading preprocessed data...")
        preprocessor = DataPreprocessor.load_preprocessed_data(preprocessed_file)

    # Create CommentThreadManager
    thread_manager = CommentThreadManager(preprocessor.df)

    # Create UserContextSphere
    user_context_sphere = UserContextSphere(preprocessor.df, thread_manager)

    # Example: Generate report for a specific user
    user_id = 671476  # Replace with the desired user ID
    user_report = user_context_sphere.generate_user_report(user_id)

    print(user_report)

Loading preprocessed data...
Preprocessed data loaded from data/preprocessed/preprocessed_data.pkl
User Report for ID: 671476
Name: Ravenspower
Gender: Unknown
Created At: 2018-04-14T13:42:28.470000
Total Comments: 431
Total Replies: 431

Article: Anna Gasser: "Das ist meine erste eigene Wohnung"
Published: 2019-05-08T06:00:00
User Comments on this Article: 1
User Replies on this Article: 1

[Ravenspower] (USER): ....wird nicht drumherumkommen in zukunft wieder das "italienische modell" zu bewerben. also mehrere generationen unter einen dach!

Article: Österreichische Streamerinnen: "Als Frau kämpft man mit Vorurteilen"
Published: 2019-05-05T10:00:00
User Comments on this Article: 6
User Replies on this Article: 6

[AlBundyFan]: ich kann doch selbst speilen - warum soll ich meine zeit damit verplempern anderen beim spielen zuzusehen. ich mein zwischendurch mal....aber mehrere millionen streamer gibt es anscheinend. da könnte man auch 24/7 am tag zusehen und manche sind da vermutlcih ga