In [1]:
import polars as pl

# Pipeline configuration -- everything a reader might want to tune lives here.
# Input file (newline-delimited JSON); it is scanned lazily below.
DATA_PATH = "../data/reddit/comments/RC_2015-01_parsed.ndjson"
# Per-author token budget applied to the cumulative token count.
TOKENS_PER_USER = 1_000
# Authors with fewer comments than this are not eligible for sampling.
MIN_COMMENTS_THRESHOLD = 20
# How many users to process (adjust as needed).
SAMPLE_USER_COUNT = 100
# NOTE(review): presumably the row count of DATA_PATH, as previously obtained
# with len(df.collect()) -- TODO confirm; not referenced by the cells shown here.
HEIGHT = 53_851_542

# 1. Load the parsed data
#    scan_ndjson builds a LazyFrame; rows are only materialised by a later
#    .collect(). "created_utc" is forced to Datetime instead of being inferred.
df = pl.scan_ndjson(
    DATA_PATH,
    schema_overrides={"created_utc": pl.Datetime},
    infer_schema_length=10_000,
)

In [2]:
import numpy as np

# Seed for the author sample in step 5, so that Restart & Run All picks the
# same users every time.
SAMPLE_SEED = 42

# 2. Calculate user comment counts
#    One row per author: number of comments and summed approximate token count.
user_stats = df.group_by("author").agg(
    pl.len().alias("comment_count"),
    pl.sum("token_count_approx").alias("total_tokens_approx"),
)

# 3. Filter out unwanted users
#    NOTE(review): "(?i)bot" matches the substring anywhere in the name, so it
#    also drops human accounts such as "Abbott" -- confirm this is acceptable.
filtered_users = user_stats.filter(
    ~pl.col("author").is_in(["[deleted]", "AutoModerator"])
).filter(
    ~pl.col("author").str.contains("(?i)bot")  # Case-insensitive bot filter
)

# 4. Filter for users meeting the minimum comment threshold
#    group_by does not guarantee row order, so sort by author before
#    collecting; otherwise a seeded sample would still differ between runs.
active_users = (
    filtered_users
    .filter(pl.col("comment_count") >= MIN_COMMENTS_THRESHOLD)
    .sort("author")
    .collect()
)

# 5. Select a sample of users to process
#    Sampling without replacement raises when n exceeds the number of rows,
#    so clamp n to the number of eligible authors.
sample_size = min(SAMPLE_USER_COUNT, active_users.height)
selected_user_sample = active_users.sample(
    n=sample_size, seed=SAMPLE_SEED
).select("author")

# Collect the authors list for filtering the main dataframe
selected_authors = selected_user_sample.get_column("author").to_list()

# Seed for the per-author shuffle below.
SHUFFLE_SEED = 42
# False (default): spend each author's token budget in scan order, as before.
# True: spend it on a random per-author ordering of the comments instead.
SHUFFLE_BEFORE_TRUNCATION = False

# 6. Filter the original DataFrame for selected authors' comments
user_comments_lazy = df.filter(pl.col("author").is_in(selected_authors))

# Randomly reorder the comments within each author.
# The previous key, pl.lit(np.random.rand()), was a single Python float
# evaluated once, so every row received the same key and nothing was shuffled.
# Here each author's rows get a seeded random permutation of 0..n-1 instead.
user_comments_shuffled = (
    user_comments_lazy
    .with_columns(
        pl.int_range(pl.len())
        .shuffle(seed=SHUFFLE_SEED)
        .over("author")
        .alias("_random_sort_key")
    )
    .sort(["author", "_random_sort_key"])
    .drop("_random_sort_key")
)

# Pick which ordering the token budget is applied to.
budget_source = (
    user_comments_shuffled if SHUFFLE_BEFORE_TRUNCATION else user_comments_lazy
)

# 7. Calculate cumulative tokens for each user
#    With the default setting the order within each author group is the scan
#    order of the input file.
user_comments_with_cumsum = budget_source.with_columns(
    pl.col("token_count_approx").cum_sum().over("author").alias("cumulative_tokens")
)

# 8. Filter comments to stay within the token limit for each user
#    NOTE: the running total only grows, so once a comment pushes an author
#    over the limit every later comment is excluded as well. An author whose
#    first comment alone exceeds TOKENS_PER_USER ends up with no rows at all.
comments_within_limit = user_comments_with_cumsum.filter(
    pl.col("cumulative_tokens") <= TOKENS_PER_USER
)

# 9. Sort the *selected* comments chronologically for each user
#    Do this *after* filtering by token limit
comments_sorted_for_formatting = comments_within_limit.sort(["author", "created_utc"])

# 10. Format each selected comment as "r/<subreddit>: <body>"
#     The date is currently not part of the text. To include it, add a third
#     "{}" placeholder together with pl.col("created_utc").dt.date().
formatted_comments = comments_sorted_for_formatting.with_columns(
    formatted_comment=pl.format(
        "r/{}: {}",
        pl.col("subreddit"),
        pl.col("body"),
    )
)

# 11. Aggregate the formatted, token-limited comments for each user
#     maintain_order=True keeps the groups in the (author-sorted) input order,
#     so the rows of the result come out in the same order on every run.
final_user_texts = formatted_comments.group_by("author", maintain_order=True).agg(
    pl.col("formatted_comment").str.join("\n")  # One comment per line (single newline)
).select("author", "formatted_comment")  # Fix the column order explicitly

# Collect the results
final_df = final_user_texts.collect()

In [3]:
# (rows, columns) of the prepared frame: one row per author that still has at
# least one comment after the token filter, with the two selected columns.
final_df.shape

(100, 2)

In [4]:
# Preview the first rows of the per-author texts.
final_df.head()

author,formatted_comment
str,str
"""Archion""","""r/pitbulls: So it's not just m…"
"""Blarma1""","""r/KerbalSpaceProgram: Winning?…"
"""Buttlet""","""r/DotA2: It's not, it's a net …"
"""ColonelRuffhouse""","""r/wallpapers: 2014 was decided…"
"""ComoTeLamas""","""r/FIFA: Yeah I've been wonderi…"


In [5]:
# Persist the per-author texts as newline-delimited JSON.
PREPARED_PATH = "../data/reddit/comments/RC_2015-01_prepared.ndjson"
final_df.write_ndjson(PREPARED_PATH)
