In [0]:
%sh pip install polars

In [0]:
import polars as pl

In [0]:
# Lazily scan the two Parquet datasets (every *.parquet file under each directory).
# Nothing is materialised here; the data is only read when a query is collected.
posts = pl.scan_parquet("/dbfs/mnt/mirai/so/posts.parquet/**/*.parquet")
tags = pl.scan_parquet("/dbfs/mnt/mirai/so/tags.parquet/**/*.parquet")

In [0]:
# What are the IDs of the tag wiki posts which are related to Apache Spark?
# Result: an eager single-column ("Id") DataFrame.
spark_tag_wiki_post_ids = (
  posts
  .filter(pl.col("PostTypeId").is_in([4, 5]))  # which posts are wiki entries?
  # Lowercase the body once so both substring checks below are case-insensitive.
  .with_columns(pl.col("Body").str.to_lowercase().alias("Body"))
  .filter(
    # `str.contains` treats its pattern as a regex unless told otherwise; with
    # `literal=True` the dots in the domain only match real dots, not any character.
    pl.col("Body").str.contains("apache spark", literal=True) |
    pl.col("Body").str.contains("spark.apache.org", literal=True)
  )
  .select("Id")
  .collect()
)

# Which tags are related to Apache Spark according to their wiki entry or excerpt?
# `is_in` is documented for expressions, collections and Series, so hand it the "Id"
# column as a Series instead of relying on an implicit DataFrame -> Series coercion.
wiki_post_ids = spark_tag_wiki_post_ids.get_column("Id")

spark_tags = (
  tags
  .filter(
    # The tag's excerpt post or its wiki post is one of the Spark-related wiki posts.
    pl.col("ExcerptPostId").is_in(wiki_post_ids) | pl.col("WikiPostId").is_in(wiki_post_ids),
    # Keep only tags whose Count is at least 10.
    pl.col("Count") >= 10,
    # Explicitly excluded tag (presumably too generic to be Spark-specific -- TODO confirm).
    pl.col("TagName") != "dataframe"
  )
  .select("Id", "TagName", "Count")
  .collect()
)

# Last expression of the cell: render the result as a pandas DataFrame.
spark_tags.to_pandas()

In [0]:
# Questions are the posts whose PostTypeId equals 1 (still a lazy query).
questions = posts.filter(pl.col("PostTypeId").eq(1))

# Plain Python list of the tag names selected in `spark_tags`.
spark_tag_names = spark_tags.get_column("TagName").to_list()

# Split the raw "Tags" string into the runs of characters between angle brackets,
# reduce each list to the names that also appear in `spark_tag_names`, and keep the
# questions for which at least one such name remains.
spark_questions = (
  questions
  .with_columns(
    pl.col("Tags")
    .str.extract_all(r"[^<>]+")
    .list.set_intersection(pl.lit(spark_tag_names))
  )
  .filter(pl.col("Tags").list.len().gt(0))
)

In [0]:
# Display the first five Spark questions when ordered by Score, highest first.
# Only this cell triggers execution of the lazy `spark_questions` query.
(
  spark_questions
  .sort(by="Score", descending=True)
  .head(5)
  .select(["Id", "Title", "Body", "Score", "ViewCount", "Tags"])
  .collect()
  .to_pandas()
)