In [1]:
import base64
from urllib.parse import unquote
import pandas as pd
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


---

In [2]:
def _decode_text(url):
    # given a data portrait url, return the text
    text_encoded = url.split('#')[-1]
    text_decoded = base64.b64decode(text_encoded)
    text = unquote(text_decoded).strip()
    return text

---

In [3]:
# Load the data portrait URLs from the CSV file on disk
injection_csv_path = '../data/injection_data.csv'
df = pd.read_csv(injection_csv_path)

In [4]:
# Preview the first rows of the frame that was just loaded
df.head()

Unnamed: 0,url
0,https://pile.dataportraits.org/#JTIwJTIwaW50ZX...
1,https://pile.dataportraits.org/#JTIwJTIwYW5hbH...
2,https://pile.dataportraits.org/#JTIwJTIwZGVmaW...
3,https://pile.dataportraits.org/#JTIwJTIwc2l4JT...
4,https://pile.dataportraits.org/#JTIwJTIwdG9rZW...


In [5]:
# Take the URL stored in the row labelled 0 as a sample for the decoder
url = df.loc[0, 'url']

In [6]:
# Decode the sample URL to check the helper on a single value
text = _decode_text(url)

In [7]:
# Display the decoded sample text
text

'interactions and transcripts.[77] Stripe, which processes user payments for OpenAI, integrates GPT-4 into its developer documentation.[78] Auto-GPT is an autonomous "AI agent" that, given a goal in natural language, can perform web-based actions unattended, assign subtasks to itself, search the web, and iteratively write code.[79] You.com, an AI Assistant, offers access to GPT-4 enhanced with live web results as part of its "AI Modes."[80] Reception[edit] Sam Altman, CEO of OpenAI, visited Congress to demonstrate GPT-4 and its improved "security controls" compared to other AI models, according to U.S. Representatives Don Beyer and Ted Lieu quoted in the New York Times. [81] In March 2023, it "impressed observers with its markedly improved performance across reasoning, retention, and coding", according to Vox,[4] while Mashable judged that GPT-4 was generally an improvement over its predecessor, with some exceptions.[82] Microsoft researchers with early access to the model wrote that "

In [8]:
# Decode every URL and store the result in a new `text` column
df['text'] = df['url'].map(_decode_text)

In [9]:
# Display the frame, which now carries the decoded `text` column next to `url`
df

Unnamed: 0,url,text
0,https://pile.dataportraits.org/#JTIwJTIwaW50ZX...,"interactions and transcripts.[77] Stripe, whic..."
1,https://pile.dataportraits.org/#JTIwJTIwYW5hbH...,analyses of grbs by ionospheric response: towa...
2,https://pile.dataportraits.org/#JTIwJTIwZGVmaW...,"definition of harmful behavior, such as questi..."
3,https://pile.dataportraits.org/#JTIwJTIwc2l4JT...,six in this stadium last term.\n\nAs it turned...
4,https://pile.dataportraits.org/#JTIwJTIwdG9rZW...,tokens))\n}\n\nfn type_(tokens: Tokens) -> Res...
...,...,...
95,https://pile.dataportraits.org/#U2V0JTIwYmV0d2...,"Set between 14 BBY and 11 BBY,[a] Star Wars Je..."
96,https://pile.dataportraits.org/#QmxhY2tvdXRzJT...,Blackouts is a 2023 historical fiction novel b...
97,https://pile.dataportraits.org/#VGVycmVuY2UlMj...,Terrence Miltner in Booklist notes that the au...
98,https://pile.dataportraits.org/#Q2Fyb2xpbmUlMj...,Caroline (Caro) Soames-Watkins is a neurosurge...


In [10]:
# Build a Hugging Face `Dataset` from the dataframe (the `url` and decoded `text` columns)
dataset = Dataset.from_pandas(df)

In [11]:
# Display the dataset summary (feature names and row count)
dataset

Dataset({
    features: ['url', 'text'],
    num_rows: 100
})

In [13]:
# Display the `text` column of the dataset.
# NOTE(review): this shows the text of every row; consider a slice such as
# dataset[:5]["text"] to keep the cell output short.
dataset["text"]

['interactions and transcripts.[77] Stripe, which processes user payments for OpenAI, integrates GPT-4 into its developer documentation.[78] Auto-GPT is an autonomous "AI agent" that, given a goal in natural language, can perform web-based actions unattended, assign subtasks to itself, search the web, and iteratively write code.[79] You.com, an AI Assistant, offers access to GPT-4 enhanced with live web results as part of its "AI Modes."[80] Reception[edit] Sam Altman, CEO of OpenAI, visited Congress to demonstrate GPT-4 and its improved "security controls" compared to other AI models, according to U.S. Representatives Don Beyer and Ted Lieu quoted in the New York Times. [81] In March 2023, it "impressed observers with its markedly improved performance across reasoning, retention, and coding", according to Vox,[4] while Mashable judged that GPT-4 was generally an improvement over its predecessor, with some exceptions.[82] Microsoft researchers with early access to the model wrote that 

In [14]:
# Upload the dataset to the Hugging Face Hub as a private repository
hub_repo_id = "mmosbach/demystifying-verbatim-mem-injection-data"
dataset.push_to_hub(hub_repo_id, private=True)

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 195.47ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.15s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/mmosbach/demystifying-verbatim-mem-injection-data/commit/eef960b6a36b8287656355f324ba39faebd82bb7', commit_message='Upload dataset', commit_description='', oid='eef960b6a36b8287656355f324ba39faebd82bb7', pr_url=None, pr_revision=None, pr_num=None)