In [None]:
## Run this section if you want to replicate results from the paper for the reddit case
## WARNING: These are large files and so this step can take some time.
## If you want to run experiments on your own dataset, skip this cell and move to the next one

# First, download the reddit file
! curl -L -O https://zenodo.org/record/1043504/files/corpus-webis-tldr-17.zip

# Unzip it
! unzip corpus-webis-tldr-17.zip

# Convert it to the required format
! python scripts/convert_reddit.py --input_path corpus-webis-tldr-17.json --output_path reddit.json

In [3]:
# Path of the input file the DP n-grams are extracted from.
# IMPORTANT: if you skipped the download cell, change this to point at your own dataset.
SOURCE_DATASET = "./reddit.json"

# Top-level folder (relative to the working directory) that receives every pipeline output.
OUTPUT_FOLDER = "output"

# File format/extension handed to the pipeline scripts (e.g. "json").
FILE_EXTENSION = "json"

# Epsilon of the differential-privacy guarantee.
# Kept as a string because it is interpolated into shell command lines.
DP_EPSILON = "4.0"

# Largest N for which N-grams are DP-extracted (also a string, for the same reason).
NGRAM_SIZE_LIMIT = "10"

In [9]:
# run shell scripts to have an easy interface to experiment and get started with DPNE
# This script runs a series of tokenization, ngram extraction, and DP N-gram extraction using the parameters specified as arguments
## NOTE: the --persist-flags argument for the extract DPNE step (third line below) was set to 00 for local running, but you may want to change it back to the default 11 value to persist intermediate results!

! spark-submit dpne/tokenize_text.py -f json --ngrams {NGRAM_SIZE_LIMIT} --max_num_tokens 400 --allow_multiple_ngrams 1 -i {SOURCE_DATASET} -o ./{OUTPUT_FOLDER}/tokenize_text -t {FILE_EXTENSION}

! spark-submit dpne/split_ngrams.py --ngram_size {NGRAM_SIZE_LIMIT} -i ./$OUTPUT_FOLDER/tokenize_text -o ./{OUTPUT_FOLDER}/split_ngrams -f {FILE_EXTENSION} -t {FILE_EXTENSION}

! spark-submit dpne/extract_dpne.py --dp_epsilon {DP_EPSILON} --dp_eta 0.1 --dp_delta 0.5 --contribution_limit 10 --persist_flags 00 --log_flags 00 --top_k 1 --delta_user_count 0 --ngram_size {NGRAM_SIZE_LIMIT} --filter_one_side 0 --budget_distribute 10.0 --estimate_sample_size 0.8 -i ./{OUTPUT_FOLDER}/split_ngrams -o ./{OUTPUT_FOLDER}/dpne_sample -f {FILE_EXTENSION} -t {FILE_EXTENSION}


In [10]:
# load the extracted DP n-grams into pandas DataFrames and display them
import os, sys, pandas as pd

# Folder written by the extract DPNE step; it is expected to hold one sub-folder
# per n-gram size (e.g. "1gram", "2gram", ...).
dpne_output_dir = os.path.join(".", OUTPUT_FOLDER, "dpne_sample")

try:
    ngrams_folder = os.listdir(dpne_output_dir)
except OSError as err:
    # Stop here with a clear message: carrying on without `ngrams_folder` would
    # only fail a few lines further down with an unrelated-looking NameError.
    raise RuntimeError(
        "Something went wrong in writing the ngrams in the previous step. Please double check"
    ) from err


def _ngram_sort_key(folder_name):
    """Sort key that orders folders by their leading number ("2gram" before "10gram").

    Names without a leading number sort after the numbered ones, alphabetically.
    """
    digit_count = len(folder_name) - len(folder_name.lstrip("0123456789"))
    if digit_count:
        return (0, int(folder_name[:digit_count]), folder_name)
    return (1, 0, folder_name)


DPNGRAMS = {} # will map string "Ngram" => pandas DataFrame containing those N-grams
ngrams_folder.sort(key=_ngram_sort_key)
for ngram in ngrams_folder:
    ngram_dir = os.path.join(dpne_output_dir, ngram)
    if not os.path.isdir(ngram_dir):
        # skip stray files sitting next to the per-ngram folders
        continue

    # print stats of each ngram discovered
    print("Stats on", ngram)

    # The output may be split over several part files; read every one of them
    # instead of keeping only the last file listed.
    part_frames = []
    for partfile in sorted(os.listdir(ngram_dir)):
        partfile_split = partfile.split(".")
        # keep only "<name>.json" files; anything with more or fewer dots is ignored
        if len(partfile_split) == 2 and partfile_split[1] == "json":
            with open(os.path.join(ngram_dir, partfile), 'r') as f:
                part_frames.append(pd.read_json(f, orient='records', lines=True))

    if part_frames:
        DPNGRAMS[ngram] = pd.concat(part_frames, ignore_index=True)
        display(DPNGRAMS[ngram])
# Now you can use the appropriate dataframe for further investigation

Stats on 1gram
Stats on 2gram
Stats on 3gram
Stats on 4gram
