In [None]:
# Merge csv Posting and Votes
import pandas as pd

# Load the raw postings export for May 2019 (path is relative to the notebook).
df = pd.read_csv(filepath_or_buffer=r'./data/raw_csv/Postings_01052019_31052019.csv')

# Show the first 10 rows in full, without pandas' column truncation.
print(df.iloc[:10].to_string())

In [13]:
import pandas as pd
import numpy as np
from datetime import datetime
import json

class DataPreprocessor:
    """
    Load the raw postings CSV and turn it into a cleaned, feature-enriched
    DataFrame.

    Typical use is ``load_data()`` -> ``preprocess()`` ->
    ``save_preprocessed_data()``. The working DataFrame is exposed as
    ``self.df`` and is ``None`` until data has been loaded.
    """

    # Columns parsed into datetime64 by _convert_dates().
    DATE_COLUMNS = ('PostingCreatedAt', 'ArticlePublishingDate', 'UserCreatedAt')

    # Columns cast to 64-bit integers by _convert_id_columns().
    ID_COLUMNS = ('ID_Posting', 'ID_Posting_Parent', 'ID_CommunityIdentity', 'ID_Article')

    # Placeholder written into each text column where the value is missing.
    FILL_VALUES = {
        'PostingHeadline': 'No Headline',
        'PostingComment': 'No Comment',
        'UserGender': 'Unknown',
        'UserCommunityName': 'Unknown',
    }

    def __init__(self, file_path):
        """
        Initialize the DataPreprocessor with the path to the raw data file.

        :param file_path: str, path to the raw CSV file (passed unchanged to
            ``pandas.read_csv``)
        """
        self.file_path = file_path
        self.df = None

    def load_data(self):
        """Load the raw data from the CSV file into ``self.df``."""
        self.df = pd.read_csv(self.file_path)

    def preprocess(self):
        """
        Apply all preprocessing steps to the data, in place on ``self.df``.

        If ``load_data()`` has not been called yet, the data is loaded first
        instead of failing on ``self.df`` being ``None``.
        """
        if self.df is None:
            self.load_data()

        # Order matters: the derived features rely on parsed dates, on the
        # filled comment text and on parent IDs having 0 instead of NaN.
        self._convert_dates()
        self._handle_missing_values()
        self._convert_id_columns()
        self._create_new_features()

    def _convert_dates(self):
        """Convert date columns to datetime format."""
        for col in self.DATE_COLUMNS:
            self.df[col] = pd.to_datetime(self.df[col])

    def _handle_missing_values(self):
        """Replace missing headline/comment/user fields with placeholders."""
        for col, placeholder in self.FILL_VALUES.items():
            self.df[col] = self.df[col].fillna(placeholder)

    def _convert_id_columns(self):
        """
        Convert ID columns to 64-bit integers, mapping missing IDs to 0.

        The width is pinned to ``int64`` on purpose: article IDs such as
        2000102330973 do not fit into 32 bits, and a plain ``int`` dtype is
        only 32 bits wide on some platforms (Windows with NumPy < 2).
        """
        for col in self.ID_COLUMNS:
            self.df[col] = self.df[col].fillna(0).astype('int64')

    def _create_new_features(self):
        """Create new features from existing data."""
        posted_at = self.df['PostingCreatedAt']

        # NOTE(review): comments that were missing have already been replaced
        # by the 'No Comment' placeholder, so they get a length of 10 here.
        self.df['CommentLength'] = self.df['PostingComment'].str.len()
        # Whole days between account creation and the posting.
        self.df['DaysSinceUserCreation'] = (posted_at - self.df['UserCreatedAt']).dt.days
        # A parent ID of 0 marks a top-level posting (see _convert_id_columns).
        self.df['IsReply'] = self.df['ID_Posting_Parent'] != 0
        self.df['PostingHour'] = posted_at.dt.hour
        # Monday == 0 ... Sunday == 6.
        self.df['PostingDayOfWeek'] = posted_at.dt.dayofweek

    def save_preprocessed_data(self, output_path):
        """
        Save the preprocessed data to a CSV file (without the index column).

        Data must have been loaded beforehand; ``self.df`` is written as is.

        :param output_path: str, path to save the preprocessed CSV file
        """
        self.df.to_csv(output_path, index=False)

class CommentThreadManager:
    """
    A class to manage and structure comment threads for articles.

    Comments are grouped per article once at construction time; nested
    thread structures are built on demand from those groups.
    """

    def __init__(self, df):
        """
        Initialize the CommentThreadManager with a preprocessed DataFrame.

        :param df: pandas DataFrame, preprocessed comment data
        """
        # Kept for dataset-wide queries such as get_user_ids().
        self.df = df
        self.article_comments = {article_id: group for article_id, group in df.groupby('ID_Article')}

    @staticmethod
    def _iso_or_none(value):
        """Return ``value.isoformat()``, or None when the value is null/NaT."""
        return value.isoformat() if pd.notnull(value) else None

    @staticmethod
    def _index_by_parent(comments):
        """
        Map each parent ID to the DataFrame of its direct replies.

        Rows keep their original relative order inside every group; rows with
        a null parent ID are not part of any group.
        """
        return {
            parent_id: group
            for parent_id, group in comments.groupby('ID_Posting_Parent', sort=False)
        }

    @staticmethod
    def _comment_node(row, parent_id, depth, replies):
        """Build the dict representation of a single comment row."""
        return {
            'id': int(row['ID_Posting']),
            'parent_id': parent_id,
            'user_id': int(row['ID_CommunityIdentity']),
            'user_name': row['UserCommunityName'],
            'user_gender': row['UserGender'],
            'user_created_at': CommentThreadManager._iso_or_none(row['UserCreatedAt']),
            'comment_headline': row['PostingHeadline'],
            'comment_text': row['PostingComment'],
            'comment_created_at': CommentThreadManager._iso_or_none(row['PostingCreatedAt']),
            'comment_length': int(row['CommentLength']),
            'depth': depth,
            'replies': replies,
        }

    def _build_replies(self, children, parent_id, depth):
        """Recursively collect the replies to ``parent_id`` from a parent index."""
        direct_replies = children.get(parent_id)
        if direct_replies is None:
            return []

        thread = []
        for _, reply in direct_replies.iterrows():
            raw_parent = reply['ID_Posting_Parent']
            reply_id = int(reply['ID_Posting'])
            thread.append(self._comment_node(
                reply,
                int(raw_parent) if pd.notnull(raw_parent) else None,
                depth,
                self._build_replies(children, reply_id, depth + 1),
            ))
        return thread

    def build_comment_thread(self, comments, parent_id, depth=0):
        """
        Recursively build a comment thread structure.

        The parent -> replies index is built once per call, so the comments
        DataFrame is not re-filtered for every comment in the thread.

        :param comments: pandas DataFrame, comments for a specific article
        :param parent_id: int, ID of the parent comment
        :param depth: int, current depth in the comment thread
        :return: list of dict, structured comment thread
        """
        return self._build_replies(self._index_by_parent(comments), parent_id, depth)

    def get_article_threads(self, article_id):
        """
        Get the structured comment threads for a specific article.

        :param article_id: int, ID of the article
        :return: dict, structured article data with comment threads, or None
            if the article ID is not present in the dataset
        """
        if article_id not in self.article_comments:
            return None

        article_df = self.article_comments[article_id]
        # A root comment has no parent: either a null or a 0 parent ID.
        parent_ids = article_df['ID_Posting_Parent']
        root_comments = article_df[parent_ids.isnull() | (parent_ids == 0)]

        # One index for the whole article, shared by all root comments.
        children = self._index_by_parent(article_df)
        threads = [
            self._comment_node(
                comment,
                None,
                0,
                self._build_replies(children, int(comment['ID_Posting']), 1),
            )
            for _, comment in root_comments.iterrows()
        ]

        # Article-level fields are read from the first comment row of the group.
        article_meta = article_df.iloc[0]

        return {
            'article_id': int(article_id),
            'article_title': article_meta['ArticleTitle'],
            'article_publish_date': self._iso_or_none(article_meta['ArticlePublishingDate']),
            'article_channel': article_meta['ArticleChannel'],
            'article_ressort_name': article_meta['ArticleRessortName'],
            'total_comments': len(article_df),
            'root_comments': len(root_comments),
            'comment_threads': threads
        }

    def get_article_ids(self):
        """
        Get a list of all article IDs in the dataset.

        :return: list of article IDs (the group keys of ``ID_Article``)
        """
        return list(self.article_comments.keys())

    def get_user_ids(self):
        """
        Get a list of all user IDs in the dataset.

        :return: list of int, the unique ``ID_CommunityIdentity`` values
        """
        return self.df['ID_CommunityIdentity'].unique().tolist()



In [14]:
# Usage example: preprocess the raw postings and dump one article's threads.
if __name__ == "__main__":
    # Preprocess the data and keep a CSV copy of the result
    preprocessor = DataPreprocessor('./data/raw_csv/Postings_01052019_31052019.csv')
    preprocessor.load_data()
    preprocessor.preprocess()
    preprocessor.save_preprocessed_data('preprocessed_data.csv')

    # Create comment thread manager
    manager = CommentThreadManager(preprocessor.df)

    # Get threads for a specific article (None if the ID is not in the data)
    article_id = 2000102330973  # Example article ID
    result = manager.get_article_threads(article_id)

    # Pretty print the result. ensure_ascii=False keeps umlauts and other
    # non-ASCII characters readable instead of emitting \uXXXX escapes.
    print(json.dumps(result, indent=2, ensure_ascii=False))

{
  "article_id": 2000102330973,
  "article_title": "1. Mai in Wien: SP\u00d6 fordert von Strache R\u00fccktritt",
  "article_publish_date": "2019-05-01T10:28:57.490000",
  "article_channel": "Inland",
  "article_ressort_name": "Parteien",
  "total_comments": 2712,
  "root_comments": 628,
  "comment_threads": [
    {
      "id": 1041080828,
      "parent_id": null,
      "user_id": 671476,
      "user_name": "Ravenspower",
      "user_gender": "Unknown",
      "user_created_at": "2018-04-14T13:42:28.470000",
      "comment_headline": "Heute w\u00e4re der perfekte Tag f\u00fcr die SP\u00d6 gewesen....",
      "comment_text": "ihr noch nicht erf\u00fclltes versprechen, den silberstein-vertrag vorzulegen, einzul\u00f6sen! denn immer noch interessiert es den einen oder anderen w\u00e4hler, auch nichtw\u00e4hler, wer den schandvertrag tats\u00e4chlich unterschrieben hat! wer auch immer es war... hier m\u00fcssen noch diverse \"k\u00f6pfe rollen\" und die SP\u00d6 muss ordnung machen im eige

# Now let's start building the context sphere
I understand your point. You're looking to create a more comprehensive view of a user's activity and interactions within the comment threads. Instead of just seeing isolated comments made by a user, you want to see the full context of their engagement. This includes:

1. All comments made by the user across different articles.
2. For each comment, you want to see the entire thread it belongs to, not just the user's comment in isolation.
3. This will show what the user was responding to and how their comment fits into the larger conversation.
4. It will also reveal any subsequent responses to the user's comments.

The goal is to build a "context sphere" around each user, showing their complete interaction history within the comment ecosystem. This approach will provide a more nuanced understanding of the user's behavior, opinions, and how they engage with others in discussions.

Is this interpretation correct? If so, I can suggest how to modify your existing code to achieve this goal.

In [15]:
class UserContextSphere:
    """
    Build a per-user view of comment activity: every comment a user wrote,
    together with the replies that follow it in its thread.
    """

    def __init__(self, df, thread_manager):
        """
        Initialize the UserContextSphere.

        :param df: pandas DataFrame, preprocessed comment data
        :param thread_manager: object providing ``get_article_threads(article_id)``
            (a CommentThreadManager in this file)
        """
        self.df = df
        self.thread_manager = thread_manager
        self.user_comments = {user_id: group for user_id, group in df.groupby('ID_CommunityIdentity')}

    def get_user_context(self, user_id):
        """
        Collect all comments of a user, each with its sub-thread of replies.

        :param user_id: int, ID of the user (``ID_CommunityIdentity``)
        :return: dict with user metadata and a ``comment_threads`` list, or
            None if the user has no comments in the dataset
        """
        if user_id not in self.user_comments:
            return None

        user_df = self.user_comments[user_id]
        # User-level fields are read from the user's first comment row.
        created_at = user_df['UserCreatedAt'].iloc[0]
        user_context = {
            'user_id': int(user_id),
            'user_name': user_df['UserCommunityName'].iloc[0],
            'user_gender': user_df['UserGender'].iloc[0],
            # Same null handling as the thread builder: None instead of 'NaT'.
            'user_created_at': created_at.isoformat() if pd.notnull(created_at) else None,
            'total_comments': len(user_df),
            'comment_threads': []
        }

        # Building an article's thread tree is expensive, so build it at most
        # once per article even if the user commented there many times.
        # Entries for the same article therefore share node dicts.
        threads_by_article = {}

        for _, comment in user_df.iterrows():
            article_id = int(comment['ID_Article'])
            if article_id not in threads_by_article:
                threads_by_article[article_id] = self.thread_manager.get_article_threads(article_id)
            article_thread = threads_by_article[article_id]
            if not article_thread:
                continue

            user_comment = self.find_user_comment(article_thread['comment_threads'], int(comment['ID_Posting']))
            if user_comment:
                user_context['comment_threads'].append({
                    'article_id': article_id,
                    'article_title': article_thread['article_title'],
                    'article_publish_date': article_thread['article_publish_date'],
                    'thread': user_comment
                })

        return user_context

    def find_user_comment(self, threads, comment_id):
        """
        Depth-first search for a comment node in a list of nested threads.

        :param threads: list of dict, comment nodes with ``id`` and ``replies``
        :param comment_id: int, ID of the comment to find
        :return: dict, the matching node (including its replies), or None
        """
        for thread in threads:
            if thread['id'] == comment_id:
                return thread
            if thread['replies']:
                result = self.find_user_comment(thread['replies'], comment_id)
                if result:
                    return result
        return None

In [None]:
if __name__ == "__main__":
    # Full pipeline: load + preprocess the raw postings, then layer the
    # thread manager and the user context sphere on top of the result.
    preprocessor = DataPreprocessor('./data/raw_csv/Postings_01052019_31052019.csv')
    preprocessor.load_data()
    preprocessor.preprocess()
    thread_manager = CommentThreadManager(preprocessor.df)
    user_context_sphere = UserContextSphere(preprocessor.df, thread_manager)

    # Look up the context sphere of one example user.
    user_id = 671476  # Replace with the desired user ID
    user_context = user_context_sphere.get_user_context(user_id)

    if not user_context:
        # Unknown user: nothing to report.
        print(f"No context found for user ID {user_id}")
    else:
        # Header block with the user's summary, one field per line.
        print(
            f"User ID: {user_context['user_id']}",
            f"User Name: {user_context['user_name']}",
            f"Total Comments: {user_context['total_comments']}",
            "\nComment Threads:",
            sep="\n",
        )
        # One entry per comment of the user, with its direct reply count.
        for thread in user_context['comment_threads']:
            print(f"  Article: {thread['article_title']}")
            print(f"  User's Comment: {thread['thread']['comment_text']}")
            print(f"  Replies: {len(thread['thread']['replies'])}")
            print("  ---")