In [1]:
%load_ext dotenv
%dotenv
import os
import warnings

import markdown
import pandas as pd
import wandb
from github import Github
from packaging.version import Version

In [2]:
# Weights & Biases project that receives this notebook's run.
PROJECT_NAME = "semplify"

# GitHub owner and repository queried for release notes.
GH_USER = "wandb"
GH_REPO = "wandb"

In [3]:
# Start the W&B run for this data-collection notebook.
run = wandb.init(
    project=PROJECT_NAME,
    name="data_collection",
)

[34m[1mwandb[0m: Currently logged in as: [33mparambharat[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
# Search query handed to snscrape's twitter-search scraper below.
tweet_query = "pip install wandb from:weights_biases"
# File that receives the scraper's JSON-lines output (relative path).
raw_tweets_file = "../data/raw_tweets.jsonl"

# Shell out to snscrape. IPython substitutes {tweet_query} and
# {raw_tweets_file} before running the command, and stdout is redirected
# into the output file.
# NOTE(review): the redirect presumably needs ../data to exist already — confirm.
!snscrape \
--jsonl \
--progress \
--max-results 100 \
--since 2019-06-01 \
twitter-search "{tweet_query}" > {raw_tweets_file}

Finished, 19 results


In [5]:
# One "dataset" artifact collects the outputs of this notebook; the first
# entry is the scraper's output file, stored under the name "scraped_tweets".
raw_artifacts = wandb.Artifact(
    "raw_dataset",
    type="dataset",
)
raw_artifacts.add_file(
    raw_tweets_file,
    name="scraped_tweets",
)

<ManifestEntry digest: g3zyvMi2nehvc90lFGmE2A==>

### Extract semver

In [6]:

def safe_extract_semvar(x):
    """Parse ``x`` as a version, returning ``None`` when it cannot be parsed.

    Args:
        x: Candidate version string, e.g. ``"0.12.5"``.

    Returns:
        A ``Version`` instance on success, otherwise ``None``.
    """
    try:
        return Version(x)
    except (TypeError, ValueError):
        # ValueError covers an unparseable string; TypeError is raised for
        # non-string input such as None or NaN, which must not escape a
        # function whose contract is "never raise".
        return None

# Matches a dotted numeric version: "MAJOR.MINOR" followed by any number of
# further ".N" components. The optional tail is a non-capturing *group*, so
# the pattern keeps exactly one capture group for `Series.str.extract`.
# A character class such as `[\.\d+]*` must not be used here: it matches any
# run of ".", digits and "+", so the full stop ending a sentence would be
# captured as part of the version ("0.12.5.").
SEMVER_REGEX = r"(\d+\.\d+(?:\.\d+)*)"

def load_tweet_data(fname):
    """Load scraped tweets and keep those that mention a parseable version.

    Args:
        fname: Path to a JSON-lines file whose records contain at least the
            fields ``url``, ``content`` and ``date``.

    Returns:
        A new DataFrame with columns ``url``, ``tweet``, ``date`` and
        ``semver`` (the version string found in the tweet), sorted by
        ``date``. Tweets with no match for ``SEMVER_REGEX``, or whose match
        is rejected by ``safe_extract_semvar``, are dropped.
    """
    raw = pd.read_json(fname, lines=True, orient="records")
    # `assign` builds a new frame, so no column is ever written onto a
    # filtered slice (the pattern behind pandas' SettingWithCopyWarning).
    tweets = raw[["url", "content", "date"]].assign(
        # expand=False returns the single capture group as a Series.
        semver=raw["content"].str.extract(SEMVER_REGEX, expand=False)
    )
    tweets = tweets.dropna(subset=["semver"])
    # Parse only to validate; the string form is what gets stored.
    parseable = tweets["semver"].map(safe_extract_semvar).notna()
    return (tweets[parseable]
            .rename(columns={"content": "tweet"})
            .sort_values("date"))

In [7]:
# Filter the scraped tweets down to those with a version, then store the
# result in the artifact as a W&B table named "enriched_tweets".
df = load_tweet_data(raw_tweets_file)
enriched_tweets_table = wandb.Table(dataframe=df)
raw_artifacts.add(enriched_tweets_table, name="enriched_tweets")

<ManifestEntry digest: 6rxkZcwwb4XMLodZqGUnzw==>

### Fetch release notes

In [8]:
# Authenticated GitHub client; the token is read from the environment.
github_client = Github(os.environ['GH_TOKEN'])


def load_from_tag(tag):
    """Fetch the release notes published for version ``tag``.

    Args:
        tag: Version string without the leading "v" (e.g. ``"0.12.5"``);
            the release is looked up under the tag ``v{tag}`` in the
            ``GH_USER``/``GH_REPO`` repository.

    Returns:
        The release body as a string, or ``None`` when the body is missing
        or blank, or when the lookup fails for any reason.
    """
    try:
        release_data = (github_client
                        .get_user(GH_USER)
                        .get_repo(GH_REPO)
                        .get_release(f"v{tag}"))
    except Exception as err:
        # Best effort: one failed lookup must not abort the whole run. It is
        # reported, though, because otherwise a rejected token or an exhausted
        # rate limit is indistinguishable from "no such release" and silently
        # empties the dataset.
        warnings.warn(
            f"Could not fetch release v{tag} from {GH_USER}/{GH_REPO}: {err!r}"
        )
        return None

    # Guard against a missing (None) as well as a whitespace-only body.
    release_notes = release_data.body
    if release_notes and release_notes.strip():
        return release_notes
    return None

In [10]:
def load_release_notes(df, fetch=None):
    """Attach release notes to each row and drop rows that have none.

    Args:
        df: DataFrame with a ``semver`` column of version strings. It is
            left unmodified.
        fetch: Optional callable mapping a version string to its release
            notes (or ``None`` when there are none). Defaults to
            ``load_from_tag``.

    Returns:
        A new DataFrame with an added ``release_notes`` column, containing
        only the rows for which notes were found, re-indexed from 0.
    """
    if fetch is None:
        fetch = load_from_tag

    # Several tweets can mention the same version; look each one up once
    # instead of repeating the remote call per row.
    notes_by_tag = {}

    def fetch_once(tag):
        if tag not in notes_by_tag:
            notes_by_tag[tag] = fetch(tag)
        return notes_by_tag[tag]

    return (df
            .assign(release_notes=df["semver"].map(fetch_once))
            .dropna(subset=["release_notes"])
            .reset_index(drop=True))

In [11]:
# HTML page template wrapping a tweet in a "twitter-tweet" blockquote and
# loading Twitter's widgets.js. `{embedded_tweet}` looks like a str.format
# placeholder for the tweet markup.
# NOTE(review): no visible cell references this template (the "tweet_html"
# DataFrame column built later is unrelated) — confirm it is still needed.
tweet_html = """<!DOCTYPE html>
<html>
  <head>
    <title>Tweet</title>
  </head>
  <body>
    <blockquote class="twitter-tweet">{embedded_tweet}</blockquote>
    <script
      async
      src="https://platform.twitter.com/widgets.js"
      charset="utf-8"
    ></script>
  </body>
</html>"""

In [12]:
# Keep only the tweets whose version has release notes.
df = load_release_notes(df)

# Render both texts from Markdown to HTML and wrap them as wandb.Html objects.
df["release_html"] = (df["release_notes"]
                      .map(markdown.markdown)
                      .map(wandb.Html))
df["tweet_html"] = (df["tweet"]
                    # Literal (non-regex) replacement of newlines. "<br>" is
                    # the valid line-break tag; "</br>" is a stray end tag.
                    .str.replace("\n", "<br>", regex=False)
                    .map(markdown.markdown)
                    .map(wandb.Html))

In [13]:
# Snapshot the current frame as a table and store it in the artifact under
# the name "changelog_tweets".
changelog_tweets_table = wandb.Table(
    dataframe=df,
)
raw_artifacts.add(
    changelog_tweets_table,
    name="changelog_tweets",
)

<ManifestEntry digest: zpWmuG/l+U/6CEMT2eB9zA==>

In [14]:
# Log the artifact (scraped file plus the two tables added above), then
# finish the W&B run. Order matters: the artifact is logged before finishing.
wandb.log_artifact(raw_artifacts)
wandb.finish()

VBox(children=(Label(value='0.136 MB of 0.136 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…