In [31]:
import polars as pl
# Lazily scan the January 2015 comment dump and parse `created_utc`.
# The column is read as a string of epoch seconds: convert to an integer,
# scale to milliseconds, cast to a millisecond Datetime and tag it as UTC.
df = pl.scan_ndjson(
    "data/reddit/comments/RC_2015-01",
    infer_schema_length=10000,
).with_columns(
    (pl.col("created_utc").str.to_integer() * 1000)
    .cast(pl.Datetime("ms"))
    .dt.replace_time_zone("UTC")
)

In [32]:
# Count the number of comments per author, most active authors first.
# `pl.len()` replaces the deprecated `LazyGroupBy.count()`; the alias keeps the
# "count" column name that the cells below filter and sort on.
user_counts = (
    df.group_by("author")
    .agg(pl.len().alias("count"))
    .collect()
    .sort("count", descending=True)
)



`LazyGroupBy.count` is deprecated. It has been renamed to `len`.



In [33]:
# Drop the placeholder/system accounts and every author whose name contains
# "bot" or "Bot". The optional `(.?)` groups do not constrain the match, so
# the pattern hits the substring anywhere in the name.
filtered_df = user_counts.filter(
    ~pl.col("author").is_in(["[deleted]", "AutoModerator"]),
    ~pl.col("author").str.contains("(.?)bot|(.?)Bot"),
)


In [34]:
# Display the filtered per-author comment counts (the notebook renders a truncated table).
filtered_df

author,count
str,u32
"""TweetPoster""",16325
"""Doctor-Kitten""",13830
"""MTGCardFetcher""",12306
"""imgurtranscriber""",10302
"""Marvelvsdc00""",9090
…,…
"""dasloog""",1
"""klevenleven1""",1
"""grvaity""",1
"""bslkendall""",1


In [35]:
# from dtale import show

# show(filtered_df.to_pandas()).open_browser()

In [36]:
# Number of authors (taken from the unfiltered `user_counts`) with more than 20 comments
user_counts.filter(pl.col("count") > 20).height

460270

In [69]:
# Draw a reproducible random sample of authors who wrote exactly 20 comments.
# NOTE(review): the sample is taken from `user_counts`, not `filtered_df`, so
# bot-named accounts can be drawn — confirm this is intended.
sample_size = 50
sample_seed = 42  # fixed so the same authors are sampled on every run
user_ids = (
    user_counts.filter(pl.col("count") == 20)
    .sample(sample_size, seed=sample_seed)
    .get_column("author")
    .to_list()
)

# get all their comments
comments = (
    df.filter(pl.col("author").is_in(user_ids))
    .select(["body", "created_utc", "author"])
    .collect()
)
comments

body,created_utc,author
str,"datetime[ms, UTC]",str
"""moved in, threw out all my shi…",2015-01-01 02:01:45 UTC,"""yetshi"""
"""get fucked cunt""",2015-01-01 04:54:48 UTC,"""shifty39"""
"""That's hela cool""",2015-01-01 05:13:54 UTC,"""balraj_01"""
"""Lol, why doesn't he work, beca…",2015-01-01 12:20:30 UTC,"""shifty39"""
"""The quality isn't even bad eve…",2015-01-01 15:48:01 UTC,"""RaastaMousee"""
…,…,…
"""In Cooperation With 9to5, Nati…",2015-01-31 22:12:25 UTC,"""HOLIDAY_headcase"""
"""Looks good. Might give it a go…",2015-01-31 22:36:45 UTC,"""asbks"""
"""Do you drive in GA alot? It se…",2015-01-31 22:50:35 UTC,"""happyloaf"""
"""Jesus wants you to stop spammi…",2015-01-31 23:07:13 UTC,"""RounderKatt"""


In [70]:
# Collapse each author's comments into one newline-separated string.
# A timestamp-prefixed variant is kept here for reference but is disabled:
#   pl.format("At {}: {}", pl.col("created_utc").dt.strftime("%Y-%m-%d %H:%M"), pl.col("body"))
autor_comments = (
    comments
    .group_by("author")
    .agg(formatted_comment=pl.col("body").str.join("\n"))
    .select("formatted_comment", "author")
)

In [None]:
import dotenv
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies credentials for the model download below — confirm.
dotenv.load_dotenv()

In [48]:
from transformers import AutoProcessor

# Hugging Face model id whose processor is used by `get_token_count` below.
model_id = "google/gemma-3-27b-it"
# Only the processor is loaded here; it provides `apply_chat_template` for tokenization.
processor = AutoProcessor.from_pretrained(model_id)

True

In [58]:
def get_token_count(s):
    """Return how many token ids the processor produces for `s`.

    The text is wrapped as a single user message and passed through the
    processor's chat template with the generation prompt enabled; the result
    is the size of the last dimension of the returned ``input_ids``.
    NOTE(review): the count presumably includes tokens contributed by the chat
    template itself, not only those of `s` — confirm if raw-text counts are needed.
    """
    chat = [
        {
            "role": "user",
            "content": [{"type": "text", "text": s}],
        }
    ]
    encoded = processor.apply_chat_template(
        chat,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    )
    input_ids = encoded["input_ids"]
    return input_ids.shape[-1]

def get_characters_per_token(s):
    """Return the ratio of `len(s)` to the token count reported by `get_token_count(s)`."""
    n_chars = len(s)
    n_tokens = get_token_count(s)
    return n_chars / n_tokens

def get_words_per_token(s):
    """Return the number of whitespace-separated words in `s` divided by `get_token_count(s)`."""
    words = s.split()
    n_tokens = get_token_count(s)
    return len(words) / n_tokens

In [99]:
# Spot check: token count of one randomly chosen author's concatenated comments
get_token_count(
    autor_comments.sample(1)["formatted_comment"].item()
)

820

In [100]:
# Per-author characters-per-token (cpt) and words-per-token (wpt).
# Both helpers return a Python float, so the result dtype is declared
# explicitly instead of leaving Polars to infer it from the first values
# (which also silences the missing-`return_dtype` warning).
per_token_counts = autor_comments.with_columns(
    cpt=pl.col("formatted_comment").map_elements(
        get_characters_per_token, return_dtype=pl.Float64
    ),
    wpt=pl.col("formatted_comment").map_elements(
        get_words_per_token, return_dtype=pl.Float64
    ),
)







In [103]:
# Mean and median of the per-author ratios, read straight from each column
cpt_avg = per_token_counts["cpt"].mean()
cpt_median = per_token_counts["cpt"].median()
wpt_avg = per_token_counts["wpt"].mean()
wpt_median = per_token_counts["wpt"].median()

print(f"Average characters per token: {cpt_avg}")
print(f"Median characters per token: {cpt_median}")
print(f"Average words per token: {wpt_avg}")
print(f"Median words per token: {wpt_median}")


Average characters per token: 3.8958090773961147
Median characters per token: 3.8969447410111586
Average words per token: 0.7088223991825234
Median words per token: 0.7346745175008178


In [53]:
# plot distribution of counts values (interactive with top authors on hover)
import plotly.express as px
import polars as pl
import pandas as pd
import numpy as np

# Expects `filtered_df`: a Polars DataFrame with 'author' and 'count' columns,
# as built by the filtering cell earlier in the notebook.

# --- Manual Binning and Author Aggregation ---
num_bins = 100  # number of equal-width histogram bins
max_authors_to_show = 10  # cap on author names listed in each hover label

counts_np = filtered_df['count'].to_numpy()

# 1. Histogram frequencies and bin edges (num_bins + 1 edges) using NumPy
frequencies, bin_edges = np.histogram(counts_np, bins=num_bins)

# 2. Bin centers (x positions of the bars)
bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2

# 3. For each bin, collect the most active authors within that count range.
# Sort explicitly (stable) once, so `head()` below really yields the top
# authors instead of depending on how `filtered_df` happened to be ordered.
authors_by_count = filtered_df.sort("count", descending=True, maintain_order=True)

top_authors_per_bin = []
for i, (lower_bound, upper_bound) in enumerate(zip(bin_edges[:-1], bin_edges[1:])):
    # Mirror np.histogram: bins are half-open [lower, upper), except the last
    # bin, which also includes its upper edge.
    if i == num_bins - 1:
        within_upper = pl.col("count") <= upper_bound
    else:
        within_upper = pl.col("count") < upper_bound
    authors_in_bin_df = authors_by_count.filter(
        (pl.col("count") >= lower_bound) & within_upper
    )

    top_authors_list = authors_in_bin_df.get_column("author").head(max_authors_to_show).to_list()

    if not top_authors_list:
        top_authors_per_bin.append("N/A")
    else:
        authors_str = ", ".join(top_authors_list)
        if len(authors_in_bin_df) > max_authors_to_show:
            authors_str += ", ..."  # Indicate that the bin holds more authors
        top_authors_per_bin.append(authors_str)


# 4. Create a Pandas DataFrame for the Plotly bar chart
plot_df = pd.DataFrame({
    'bin_center': bin_centers,
    'frequency': frequencies,
    'bin_start': bin_edges[:-1],
    'bin_end': bin_edges[1:],
    'top_authors': top_authors_per_bin
})

# Drop empty bins: they have no bar to draw on a log-scaled y-axis
plot_df = plot_df[plot_df['frequency'] > 0]

# --- Create the Interactive Bar Chart (acting as Histogram) ---
fig = px.bar(
    plot_df,
    x='bin_center',
    y='frequency',
    log_y=True,  # Use logarithmic y-axis
    title="Interactive Distribution of User Comment Counts (Excluding '[deleted]' & 'AutoModerator')",
    labels={'bin_center': 'Approx. Number of Comments per User', 'frequency': 'Frequency'},  # Customize axis labels
    custom_data=['bin_start', 'bin_end', 'top_authors']  # Pass data needed for hover
)

# Customize the hover template
fig.update_traces(
    hovertemplate="<b>Count Range:</b> [%{customdata[0]:.0f} - %{customdata[1]:.0f})<br>" +
                  "<b>Frequency:</b> %{y}<br>" +
                  "<b>Top Authors:</b> %{customdata[2]}" +
                  "<extra></extra>"  # Removes the default trace info
)


# Update layout for better readability
fig.update_layout(
    xaxis_title="Number of Comments per User (Binned)",
    yaxis_title="Frequency (Log Scale)",
    bargap=0  # Set bargap to 0 for histogram appearance
)

# Show the interactive figure
fig.show()


In [54]:
# plot distribution of counts values (interactive with top authors on hover, normalized frequency)
import plotly.express as px
import polars as pl
import pandas as pd
import numpy as np

# Expects `filtered_df`: a Polars DataFrame with 'author' and 'count' columns,
# as built by the filtering cell earlier in the notebook.

# --- Manual Binning and Author Aggregation ---
num_bins = 100  # number of equal-width histogram bins
max_authors_to_show = 10  # cap on author names listed in each hover label

counts_np = filtered_df['count'].to_numpy()
total_users = len(filtered_df)  # denominator for the normalization

# 1. Histogram frequencies and bin edges (num_bins + 1 edges) using NumPy
frequencies, bin_edges = np.histogram(counts_np, bins=num_bins)

# 1.1 Normalized frequencies: the share of all users that falls in each bin
# (a proportion that sums to 1, not a probability density).
normalized_frequencies = frequencies / total_users

# 2. Bin centers (x positions of the bars)
bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2

# 3. For each bin, collect the most active authors within that count range.
# Sort explicitly (stable) once, so `head()` below really yields the top
# authors instead of depending on how `filtered_df` happened to be ordered.
authors_by_count = filtered_df.sort("count", descending=True, maintain_order=True)

top_authors_per_bin = []
for i, (lower_bound, upper_bound) in enumerate(zip(bin_edges[:-1], bin_edges[1:])):
    # Mirror np.histogram: bins are half-open [lower, upper), except the last
    # bin, which also includes its upper edge.
    if i == num_bins - 1:
        within_upper = pl.col("count") <= upper_bound
    else:
        within_upper = pl.col("count") < upper_bound
    authors_in_bin_df = authors_by_count.filter(
        (pl.col("count") >= lower_bound) & within_upper
    )

    top_authors_list = authors_in_bin_df.get_column("author").head(max_authors_to_show).to_list()

    if not top_authors_list:
        top_authors_per_bin.append("N/A")
    else:
        authors_str = ", ".join(top_authors_list)
        if len(authors_in_bin_df) > max_authors_to_show:
            authors_str += ", ..."  # Indicate that the bin holds more authors
        top_authors_per_bin.append(authors_str)


# 4. Create a Pandas DataFrame for the Plotly bar chart
plot_df = pd.DataFrame({
    'bin_center': bin_centers,
    'frequency': frequencies,  # raw count, shown in the hover text
    'normalized_frequency': normalized_frequencies,  # plotted on the y-axis
    'bin_start': bin_edges[:-1],
    'bin_end': bin_edges[1:],
    'top_authors': top_authors_per_bin
})

# Drop empty bins: they have no bar to draw on a log-scaled y-axis
plot_df = plot_df[plot_df['frequency'] > 0]

# --- Create the Interactive Bar Chart (acting as Histogram) ---
fig = px.bar(
    plot_df,
    x='bin_center',
    y='normalized_frequency',  # Use normalized frequency for the y-axis
    log_y=True,  # Use logarithmic y-axis
    title="Interactive Distribution of User Comment Counts (Normalized Frequency)",
    labels={'bin_center': 'Approx. Number of Comments per User', 'normalized_frequency': 'Normalized Frequency'},  # Customize axis labels
    custom_data=['bin_start', 'bin_end', 'top_authors', 'frequency']  # Pass data needed for hover (incl. original freq)
)

# Customize the hover template
fig.update_traces(
    hovertemplate="<b>Count Range:</b> [%{customdata[0]:.0f} - %{customdata[1]:.0f})<br>" +
                  "<b>Normalized Frequency:</b> %{y:.4f}<br>" +  # Format normalized frequency
                  "<b>Original Frequency:</b> %{customdata[3]}<br>" +  # Show original count too
                  "<b>Top Authors:</b> %{customdata[2]}" +
                  "<extra></extra>"  # Removes the default trace info
)


# Update layout for better readability
fig.update_layout(
    xaxis_title="Number of Comments per User (Binned)",
    yaxis_title="Normalized Frequency (Log Scale)",  # Update y-axis title
    bargap=0  # Set bargap to 0 for histogram appearance
)

# Show the interactive figure
fig.show()

In [16]:
import polars 

# Eagerly load a cached parquet file (RS_2015-08) for inspection.
# NOTE(review): the columns shown below (title, selftext, num_comments) suggest these are submissions, not comments — confirm.
dd = polars.read_parquet("../data/test_cache/RS_2015-08.parquet")

In [17]:
# Preview the first rows; note that `created_utc` is still a raw i64 here, not a parsed datetime.
dd.head()

id,url,permalink,author,created_utc,subreddit,subreddit_id,selftext,title,num_comments,score,is_self,over_18,distinguished,edited,domain,stickied,locked,quarantine,score_hidden,retrieved_on,author_flair_css_class,author_flair_text
str,str,str,str,i64,str,str,str,str,i64,i64,bool,bool,str,str,str,bool,bool,bool,bool,i64,str,str
"""3fcrck""","""http://www.reddit.com/r/AskRed…","""/r/AskReddit/comments/3fcrck/w…","""ZioToons""",1438387200,"""AskReddit""","""t5_2qh1i""","""""","""What job/career can everybody …",6,2,True,False,,"""False""","""self.AskReddit""",False,,False,,1440576395,,
"""3fcrcl""","""http://imgur.com/a/hIDWD""","""/r/Tentai/comments/3fcrcl/kann…","""BillNyeTheHentaiGuy""",1438387200,"""Tentai""","""t5_2vtmc""","""""","""KanNomi""",0,77,False,True,,"""False""","""imgur.com""",False,,False,,1440576395,,
"""3fcrcm""","""https://glassesbeyondglasses.b…","""/r/listentothis/comments/3fcrc…","""Vensuad""",1438387200,"""listentothis""","""t5_2qxzy""","""""","""Glasses Beyond Glasses -- Comp…",1,1,False,False,,"""False""","""glassesbeyondglasses.bandcamp.…",False,,False,,1440576395,"""lastfm""","""lastfm/user/Zakk2Gud"""
"""3fcrcn""","""http://i.imgur.com/OipFzdc.jpg""","""/r/ImGoingToHellForThis/commen…","""[deleted]""",1438387201,"""ImGoingToHellForThis""","""t5_2s7yq""","""""","""All this Cecil the Lion commot…",0,2,False,True,,"""False""","""i.imgur.com""",False,,False,,1440576395,,
"""3fcrco""","""http://www.reddit.com/r/alcoho…","""/r/alcohol/comments/3fcrco/loo…","""awesome-j""",1438387201,"""alcohol""","""t5_2qi6q""","""My 23 birthday is in a month a…","""Looking for some nice liqour""",2,3,True,False,,"""False""","""self.alcohol""",False,,False,,1440576395,,
