In [5]:
import pandas as pd
import hashlib
import json
from datasketch import minhash, MinHashLSH

In [6]:
# import data
filepath = "../../data/raw/mainpipe_data_v1.jsonl"
data = []

with open(filepath, 'r', encoding='utf-8') as file:
    for line in file:
        try:
            data.append(json.loads(line))
        except json.JSONDecodeError:
            continue

df = pd.DataFrame(data)
print(df)

                                                     text  \
0       In the never ending battle to rid Alaska of it...   
1       » Jackpot | Deutsche Online Casinos und Casino...   
2       This really was an unexpected pleasure. When I...   
3       def files(self):\n        """Files in torrent....   
4       Patient engagement in the design and delivery ...   
...                                                   ...   
269373  Our 1 to 1 Karting lessons are ideal to give y...   
269374  function read(model) {\n  var query = argument...   
269375  In a land that is already fragile with earthqu...   
269376                                  Simple, YES on 8!   
269377  <p>How would I be able to get N results for se...   

                                                      url  
0                                                    None  
1         http://www.casinodeutsch.net/stichwort/jackpot/  
2       http://leekat.booklikes.com/post/608842/an-une...  
3       https://github.com/

# Hashing

In [3]:
def hash_text(text):
    return hashlib.md5(text.encode("utf-8")).hexdigest()

df["fingerprint"] = df["text"].apply(hash_text)

In [4]:
def assign_shard(fp, n_shards=8):
    """Assign a fingerprint to a shard."""
    return int(fp, 16) % n_shards

def shard_dataframe(df, n_shards=8):
    # Normalise and fingerprint
    df['shard'] = df['fingerprint'].apply(lambda x: assign_shard(x, n_shards))
    return df

def deduplicate_by_shard(df, n_shards=8):
    deduped_list = []
    dropped_list = []

    for shard_id in range(n_shards):
        shard_df = df[df['shard'] == shard_id].copy()
        before = len(shard_df)

        # Find duplicates by fingerprint; keep first occurrence
        mask_duplicates = shard_df.duplicated(subset=['fingerprint'], keep='first')
        
        # Rows to keep
        shard_dedup = shard_df[~mask_duplicates]
        # Rows that were dropped
        shard_dropped = shard_df[mask_duplicates]

        deduped_list.append(shard_dedup)
        dropped_list.append(shard_dropped)

        after = len(shard_dedup)
        print(f"Shard {shard_id}: removed {before - after} duplicates out of {before}")

    deduped_df = pd.concat(deduped_list, ignore_index=True)
    dropped_df = pd.concat(dropped_list, ignore_index=True)

    return deduped_df, dropped_df

In [20]:
n_shards = 8
df = shard_dataframe(df, n_shards=n_shards)
df_dedup, df_dropped = deduplicate_by_shard(df, n_shards=n_shards)

Shard 0: removed 5867 duplicates out of 33315
Shard 1: removed 6046 duplicates out of 33555
Shard 2: removed 6033 duplicates out of 33562
Shard 3: removed 6347 duplicates out of 34188
Shard 4: removed 6131 duplicates out of 33516
Shard 5: removed 6290 duplicates out of 34011
Shard 6: removed 6009 duplicates out of 33776
Shard 7: removed 5759 duplicates out of 33455


In [21]:
df_dropped

Unnamed: 0,text,url,fingerprint,shard
0,"<html><head><meta charset=""utf-8""><title>News ...",https://sample-company.net/page64.html,fb89460672e2b1dfa19e3517bf61f4b0,0
1,"<!doctype html><html lang=""en""><head><title>Ab...",https://sample-company.net/page46.html,5a30644d5fa944606bdc7c774ea50548,0
2,"<html><head><meta charset=""utf-8""><title>Blog ...",https://demo-page.info/page80.html,3153b95a87b5cf7c2f82873b9d52e280,0
3,"<!doctype html><html lang=""en""><head><title>Pr...",https://example.com/page59.html,562821450d69311ef8e24ef55e3161b8,0
4,<html><head><title>Contact Information</title>...,https://example.com/page59.html,59d0bac7bc09038b762ad52427ae95c8,0
...,...,...,...,...
48477,"function apiUrlFor (shortcode, size) {\n if (...",https://github.com/olizilla/instagrab/blob/fe2...,55578163066ab3ac44247e49e868d5b7,7
48478,"def setBlockValue(self, block, value):\n ...",https://github.com/andreikop/qutepart/blob/109...,c3ae2f2f45d10a6da5b539bee4415a8f,7
48479,"function _setStatus(status, closeReason) {\n ...",https://github.com/adobe/brackets/blob/d5d00d4...,094b16e3973425c1534af342d1bfcfe7,7
48480,"function _sorter (reader, isDescending, compar...",https://github.com/ascartabelli/lamb/blob/d36e...,d304e6dd909ab69612ed0d175f382437,7


In [15]:
one = df[df['shard'] ==1]
one

Unnamed: 0,text,url,fingerprint,shard
5,function fallBack () {\n var programFiles...,https://github.com/mozilla-jetpack/node-fx-run...,5ae674f28147810239404eb463c63df1,1
18,"def friendly_name(self):\n """"""Get frien...",https://github.com/happyleavesaoc/python-snapc...,7f503d17ed232ab9cfdea7a514017859,1
21,function mouseout(inEvent) {\n if (!this.isEv...,https://github.com/openlayers/openlayers/blob/...,72ebe3014fd76a7e5c825ed1189e2699,1
24,<p>Say I have this given XML file:</p>\n\n<pre...,,ac5b48c7ac1881f49559c537f9334d61,1
27,Granma sueña con título en la Serie del Caribe...,http://cubasi.cu/cubasi-noticias-cuba-mundo-ul...,50d2178512b89eb9836364c5cae96f41,1
...,...,...,...,...
269343,"def _lookup_consumer_tag_by_consumer(self, con...",https://github.com/agoragames/haigha/blob/7b00...,ef4101401edf93506cb379b17c6be449,1
269359,<p>I've got an old computer from a friend of m...,,4336ebe853a10382485a297764eaf381,1
269362,Judge dismisses Tennessee lawsuit over refugee...,https://www.taiwannews.com.tw/en/news/3385737,3564a83ba45cc2c5192380305d49b351,1
269372,Engina und Lemmy on Tour\nDie Louis-Custombike...,https://www.bikeundbusiness.de/engina-und-lemm...,cd7f453b680ddb305d7ec4b307a9d3d1,1


# Experimenting with paragraph level segmentation

In [7]:
#subset df to save time
df = df[:70000]

In [8]:
df

Unnamed: 0,text,url
0,In the never ending battle to rid Alaska of it...,
1,» Jackpot | Deutsche Online Casinos und Casino...,http://www.casinodeutsch.net/stichwort/jackpot/
2,This really was an unexpected pleasure. When I...,http://leekat.booklikes.com/post/608842/an-une...
3,"def files(self):\n """"""Files in torrent....",https://github.com/idlesign/torrentool/blob/78...
4,Patient engagement in the design and delivery ...,http://www.nhlc-cnls.ca/sessions/3/
...,...,...
69995,In live production as you add more video compo...,https://www.videotoybox.com/Roland-VC-1-DL-fs-...
69996,"function joinPathWith(prepend, append) {\n\t\t...",https://github.com/sapegin/grunt-bower-concat/...
69997,"07.07.2017, 16:25 #21\n09.07.2017, 22:21 #22\n...",http://extreme.pcgameshardware.de/tagebuecher/...
69998,Unanimidad para desbloquear el conflicto con R...,https://www.heraldo.es/noticias/aragon/zaragoz...


In [11]:
# rows need an original doc ID
df['doc_id'] = df.index

all_paragraphs = []

for idx, row in df.iterrows():
    doc_id = row['doc_id']
    text = row['text']
    url = row['url']

    # Split by double newlines
    paragraphs = text.split("\n\n")

    # Clean and keep only non-empty paragraphs
    paragraphs = [p.strip() for p in paragraphs if p.strip()]

    # Store each paragraph with doc_id and a paragraph index
    for i, para in enumerate(paragraphs):
        all_paragraphs.append({
            'doc_id': doc_id,
            'paragraph_id': i,
            'paragraph_text': para,
            'url': url
        })

df_paragraphs = pd.DataFrame(all_paragraphs)
df_paragraphs

Unnamed: 0,doc_id,paragraph_id,paragraph_text,url
0,0,0,In the never ending battle to rid Alaska of it...,
1,1,0,» Jackpot | Deutsche Online Casinos und Casino...,http://www.casinodeutsch.net/stichwort/jackpot/
2,2,0,This really was an unexpected pleasure. When I...,http://leekat.booklikes.com/post/608842/an-une...
3,3,0,"def files(self):\n """"""Files in torrent.",https://github.com/idlesign/torrentool/blob/78...
4,3,1,"List of namedtuples (filepath, size).",https://github.com/idlesign/torrentool/blob/78...
...,...,...,...,...
242194,69999,1,"<p>In the end, I took these steps:</p>",
242195,69999,2,<ol>\n<li><p>Create /etc/apt/apt.conf file wit...,
242196,69999,3,"<p>APT::Default-Release ""stable"";</p></li>\n<l...",
242197,69999,4,"<p>However, I do still have some issues:</p>",


In [10]:
paragraphs

['<p>I want to pick up Python3 from squeeze, meanwhile I would like to stay with lenny with all other packages.\nIs this possible by just modifying source.list file?</p>',
 '<p>In the end, I took these steps:</p>',
 '<ol>\n<li><p>Create /etc/apt/apt.conf file with this line below:(including the semicolon)</p>',
 '<p>APT::Default-Release "stable";</p></li>\n<li><p>Change all \'lenny\' to \'stable\', and \'squeeze\' to \'testing\' in /etc/apt/source.list file;</p></li>\n<li><p>After \'apt-get update\' command, \'apt-get upgrade\' command reported no upgrades as expected;</p></li>\n<li><p>\'apt-cache search python3\' worked fine now;</p></li>\n<li><p>\'apt-get -t testing install python3\' command installed python3 without any problems.</p></li>\n</ol>',
 '<p>However, I do still have some issues:</p>',
 '<ol>\n<li>Default-Release cannot be set to "lenny";</li>\n<li>Default-Release can be set to \'4.0\' and \'5.0*\', but not \'5.0\';</li>\n<li>These packages \'libncursesw5 libsqlite3-0 libs

In [15]:
from datasketch import MinHash


num_perm = 128
threshold = 0.8
n_shards = 8

df_paragraphs['fingerprint'] = df_paragraphs['paragraph_text'].apply(hash_text)
df_paragraphs = shard_dataframe(df_paragraphs, n_shards=n_shards)

deduped_list = []
dropped_list = []

for shard_id in range(n_shards):
    shard_df = df_paragraphs[df_paragraphs['shard'] == shard_id].copy()
    print(f"Processing shard {shard_id}, {len(shard_df)} paragraphs")

    # Create MinHash objects
    def create_minhash(text):
        m = MinHash(num_perm=num_perm)
        for word in text.lower().split():
            m.update(word.encode('utf-8'))
        return m

    shard_df['minhash'] = shard_df['paragraph_text'].apply(create_minhash)

    # Build LSH index
    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
    for idx, mh in enumerate(shard_df['minhash']):
        lsh.insert(f"p{idx}", mh)

    # Detect duplicates
    to_remove = set()
    for idx, mh in enumerate(shard_df['minhash']):
        result = lsh.query(mh)
        result = [r for r in result if r != f"p{idx}"]
        for dup in result:
            dup_idx = int(dup[1:])
            to_remove.add(dup_idx)

    shard_dropped = shard_df.iloc[list(to_remove)].reset_index(drop=True)
    shard_dedup = shard_df.drop(shard_df.index[list(to_remove)]).reset_index(drop=True)

    deduped_list.append(shard_dedup)
    dropped_list.append(shard_dropped)

    print(f"Shard {shard_id}: removed {len(shard_dropped)} duplicates")

df_deduped = pd.concat(deduped_list, ignore_index=True)
df_dropped = pd.concat(dropped_list, ignore_index=True)

df_docs_cleaned = (
    df_deduped.groupby('doc_id').agg(
        text=('paragraph_text', lambda x: " ".join(x.sort_values().tolist())),
        url=('url', 'first')  # url also needs to be rolled up
    ).reset_index()
)

Processing shard 0, 30350 paragraphs
Shard 0: removed 3933 duplicates
Processing shard 1, 30320 paragraphs
Shard 1: removed 4309 duplicates
Processing shard 2, 30152 paragraphs
Shard 2: removed 4014 duplicates
Processing shard 3, 29889 paragraphs
Shard 3: removed 3847 duplicates
Processing shard 4, 30879 paragraphs
Shard 4: removed 4706 duplicates
Processing shard 5, 30168 paragraphs
Shard 5: removed 3817 duplicates
Processing shard 6, 29843 paragraphs
Shard 6: removed 3817 duplicates
Processing shard 7, 30598 paragraphs
Shard 7: removed 4389 duplicates


In [16]:
df_docs_cleaned

Unnamed: 0,doc_id,text,url
0,0,In the never ending battle to rid Alaska of it...,
1,1,» Jackpot | Deutsche Online Casinos und Casino...,http://www.casinodeutsch.net/stichwort/jackpot/
2,2,This really was an unexpected pleasure. When I...,http://leekat.booklikes.com/post/608842/an-une...
3,4,Patient engagement in the design and delivery ...,http://www.nhlc-cnls.ca/sessions/3/
4,5,function fallBack () {\n var programFiles...,https://github.com/mozilla-jetpack/node-fx-run...
...,...,...,...
61999,69995,In live production as you add more video compo...,https://www.videotoybox.com/Roland-VC-1-DL-fs-...
62000,69996,"function joinPathWith(prepend, append) {\n\t\t...",https://github.com/sapegin/grunt-bower-concat/...
62001,69997,"07.07.2017, 16:25 #21\n09.07.2017, 22:21 #22\n...",http://extreme.pcgameshardware.de/tagebuecher/...
62002,69998,Unanimidad para desbloquear el conflicto con R...,https://www.heraldo.es/noticias/aragon/zaragoz...
