In [0]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

In [0]:
# Arrow schema applied when reading the tags Parquet file.
# Each row below is (column name, Arrow type, nullable).
tags_schema = pa.schema(
    [
        pa.field(name, dtype, nullable=nullable)
        for name, dtype, nullable in (
            ("Id", pa.int64(), False),
            ("TagName", pa.string(), False),
            ("Count", pa.int32(), False),
            ("ExcerptPostId", pa.int64(), True),
            ("WikiPostId", pa.int64(), True),
        )
    ]
)

# Load every column of the tags file with the schema above and convert to pandas.
tags = pq.read_table(
    "/dbfs/mnt/mirai/so/tags.parquet",
    schema=tags_schema,
).to_pandas()

# Arrow schema applied when reading the posts Parquet file.
# Each row below is (column name, Arrow type, nullable).
posts_schema = pa.schema(
    [
        pa.field(name, dtype, nullable=nullable)
        for name, dtype, nullable in (
            ("Id", pa.int64(), False),
            ("PostTypeId", pa.int16(), False),
            ("AcceptedAnswerId", pa.int64(), True),
            ("ParentId", pa.int64(), True),
            ("CreationDate", pa.timestamp("s"), False),
            ("Score", pa.int32(), False),
            ("ViewCount", pa.int32(), True),
            ("Body", pa.string(), False),
            ("OwnerUserId", pa.int64(), True),
            ("OwnerDisplayName", pa.string(), True),
            ("LastEditorUserId", pa.int64(), True),
            ("LastEditorDisplayName", pa.string(), True),
            ("LastEditDate", pa.timestamp("s"), True),
            ("LastActivityDate", pa.timestamp("s"), False),
            ("Title", pa.string(), True),
            ("Tags", pa.string(), True),
            ("AnswerCount", pa.int32(), True),
            ("CommentCount", pa.int32(), True),
            ("ClosedDate", pa.timestamp("s"), True),
            ("CommunityOwnedDate", pa.timestamp("s"), True),
            ("ContentLicense", pa.string(), False),
        )
    ]
)

# Only a projection of the columns and a subset of the rows is materialised,
# to keep the memory footprint down.
posts = pq.read_table(
    "/dbfs/mnt/mirai/so/posts.parquet",
    schema=posts_schema,
    # pre-filter columns to save memory
    columns=["Id", "PostTypeId", "Score", "ViewCount", "Title", "Body", "Tags"],
    # pre-filter rows to save memory (only questions (1) and wiki entries (4, 5))
    filters=[("PostTypeId", "in", [1, 4, 5])],
).to_pandas()


In [0]:
# IMPORTANT: the selection is done in two separate .loc steps for performance:
# the regex is only evaluated on the wiki-entry subset of the posts, not on all of them.
tag_wiki_posts = posts.loc[posts["PostTypeId"].isin([4, 5])]  # wiki entry posts

# Ids of the wiki entries whose body mentions Apache Spark (case-insensitive match).
# na=False makes a missing Body count as "no match"; without it a missing value
# would put NaN into the mask and the boolean .loc lookup would raise.
spark_tag_wiki_post_ids = tag_wiki_posts.loc[
    tag_wiki_posts["Body"].str.contains(
        "apache spark|spark\\.apache\\.org", case=False, regex=True, na=False
    ),
    "Id",
]

# Tags whose excerpt post or wiki post is one of the Spark-related wiki entries,
# restricted to Count >= 10 and excluding the "dataframe" tag
# (NOTE(review): presumably excluded because it is not Spark-specific; confirm).
spark_tags = tags.loc[
    (tags["ExcerptPostId"].isin(spark_tag_wiki_post_ids) | tags["WikiPostId"].isin(spark_tag_wiki_post_ids))
    & (tags["Count"] >= 10)
    & (tags["TagName"] != "dataframe"),
    ["Id", "TagName", "Count"],
]

spark_tags

In [0]:
# Question posts (PostTypeId == 1), copied so that rewriting the Tags column
# below does not touch `posts`.
questions = posts.loc[posts["PostTypeId"] == 1].copy()
spark_tag_names = set(spark_tags["TagName"])

# Replace each raw tag string with the set of Spark tag names it contains.
# The raw string is split on "<" and ">"; empty fragments produced by the split
# simply fail to match any tag name.
# Missing values are mapped to an empty set via an explicit type check: NaN is
# truthy, so an `x or []` guard would let it through and set(NaN) raises TypeError.
questions["Tags"] = questions["Tags"].str.split(r"[<>]").apply(
    lambda parts: spark_tag_names.intersection(parts) if isinstance(parts, list) else set()
)

# Keep only the questions that carry at least one Spark tag.
spark_questions = questions.loc[questions["Tags"].apply(len) > 0]

In [0]:
# Display the five Spark questions with the highest Score,
# limited to the columns of interest.
spark_questions[
    ["Id", "Title", "Body", "Score", "ViewCount", "Tags"]
].sort_values(by="Score", ascending=False).iloc[:5]