In [1]:
# ═══════════════════════════════════════════════════════════════════════════════
# 1. SETUP AND INSTALLATION
# Installing libraries we need for the project
# ═══════════════════════════════════════════════════════════════════════════════

print("="*80)
print("INSTALLING LIBRARIES")
print("="*80)

# install packages
!pip install -q datasketch pandas matplotlib seaborn kaggle nltk

# check if datasketch works
try:
    from datasketch import MinHash, MinHashLSH
    print("\n✓ datasketch installed")
except:
    print("\n⚠ reinstalling datasketch...")
    !pip install --upgrade datasketch

# download stopwords
import nltk
nltk.download('stopwords', quiet=True)

print("\n✓ setup complete")

INSTALLING LIBRARIES
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.2/89.2 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h
✓ datasketch installed

✓ setup complete


In [2]:
# ═══════════════════════════════════════════════════════════════════════════════
# 2.KAGGLE LOGIN
# Upload your kaggle.json file to download the dataset
# ═══════════════════════════════════════════════════════════════════════════════

from google.colab import files
import os

print("="*80)
print("KAGGLE AUTHENTICATION")
print("="*80)

print("\n📤 please upload your kaggle.json file")
print("get it from: https://www.kaggle.com/settings/account\n")

uploaded = files.upload()

# setup kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

print("\n✓ authentication successful")

KAGGLE AUTHENTICATION

📤 please upload your kaggle.json file
get it from: https://www.kaggle.com/settings/account



Saving kaggle.json to kaggle.json

✓ authentication successful


In [3]:
# ═══════════════════════════════════════════════════════════════════════════════
# 3.DOWNLOAD DATASET
# Getting Amazon book reviews from Kaggle
# ═══════════════════════════════════════════════════════════════════════════════

import os

dataset_name = "mohamedbakhet/amazon-books-reviews"

print("="*80)
print("DOWNLOADING DATASET")
print("="*80)

print(f"\ndataset: {dataset_name}")

# download and extract
!kaggle datasets download -d {dataset_name} -p /content --unzip

# check if file exists
csv_file = "/content/Books_rating.csv"

if os.path.exists(csv_file):
    size_gb = os.path.getsize(csv_file) / (1024**3)
    print(f"\n✓ download complete")
    print(f"  file: {csv_file}")
    print(f"  size: {size_gb:.2f} GB")
else:
    print("⚠ error: file not found")

DOWNLOADING DATASET

dataset: mohamedbakhet/amazon-books-reviews
Dataset URL: https://www.kaggle.com/datasets/mohamedbakhet/amazon-books-reviews
License(s): CC0-1.0
Downloading amazon-books-reviews.zip to /content
 99% 1.05G/1.06G [00:10<00:00, 301MB/s]
100% 1.06G/1.06G [00:10<00:00, 111MB/s]

✓ download complete
  file: /content/Books_rating.csv
  size: 2.66 GB


In [4]:
# ═══════════════════════════════════════════════════════════════════════════════
# 4.LOAD FULL DATASET
# Reading the CSV file with all columns
# ═══════════════════════════════════════════════════════════════════════════════

import pandas as pd
import numpy as np

csv_path = "/content/Books_rating.csv"

print("="*80)
print("LOADING DATASET")
print("="*80)

# load full dataset
df_full = pd.read_csv(csv_path)

print(f"\n✓ loaded dataset")
print(f"  rows: {len(df_full):,}")
print(f"  columns: {len(df_full.columns)}")

# show column names
print("\ncolumns in dataset:")
for i, col in enumerate(df_full.columns, 1):
    print(f"  {i}. {col}")

LOADING DATASET

✓ loaded dataset
  rows: 3,000,000
  columns: 10

columns in dataset:
  1. Id
  2. Title
  3. Price
  4. User_id
  5. profileName
  6. review/helpfulness
  7. review/score
  8. review/time
  9. review/summary
  10. review/text


In [5]:
# ═══════════════════════════════════════════════════════════════════════════════
# 5.DISPLAY FIRST 10 ROWS
# Looking at some examples from the dataset
# ═══════════════════════════════════════════════════════════════════════════════

print("="*80)
print("FIRST 10 ROWS")
print("="*80)
print()

# show all columns
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

print(df_full.head(10))

# reset display
pd.reset_option('display.max_columns')
pd.reset_option('display.width')

FIRST 10 ROWS

           Id                           Title  Price         User_id  \
0  1882931173  Its Only Art If Its Well Hung!    NaN   AVCGYZL8FQQTD   
1  0826414346        Dr. Seuss: American Icon    NaN  A30TK6U7DNS82R   
2  0826414346        Dr. Seuss: American Icon    NaN  A3UH4UZ4RSVO82   
3  0826414346        Dr. Seuss: American Icon    NaN  A2MVUWT453QH61   
4  0826414346        Dr. Seuss: American Icon    NaN  A22X4XUPKF66MR   
5  0826414346        Dr. Seuss: American Icon    NaN  A2F6NONFUDB6UK   
6  0826414346        Dr. Seuss: American Icon    NaN  A14OJS0VWMOSWO   
7  0826414346        Dr. Seuss: American Icon    NaN  A2RSSXTDZDUSH4   
8  0826414346        Dr. Seuss: American Icon    NaN  A25MD5I2GUIW6W   
9  0826414346        Dr. Seuss: American Icon    NaN  A3VA4XFS5WNJO3   

                          profileName review/helpfulness  review/score  \
0               Jim of Oz "jim-of-oz"                7/7           4.0   
1                       Kevin Killian       

In [6]:
# =============================================================================
# 6.NULL VALUES CHECK
# Finding missing data in each column
# =============================================================================

print("="*80)
print("NULL VALUES IN EACH COLUMN")
print("="*80)

# header line (pretty print)
print(f"\n{'column':<30} {'nulls':<10} {'%':<10}")
print("-"*50)

# loop over columns and compute null stats
for col in df_full.columns:
    nulls = df_full[col].isna().sum()       # number of NaNs in this column
    percent = (nulls / len(df_full) * 100) if len(df_full) else 0.0
    print(f"{col:<30} {nulls:<10} {percent:>6.2f}%")

# total missing cells in the whole dataframe
total_nulls = df_full.isna().sum().sum()
print("-"*50)
print(f"{'TOTAL':<30} {total_nulls:<10}")


NULL VALUES IN EACH COLUMN

column                         nulls      %         
--------------------------------------------------
Id                             0            0.00%
Title                          208          0.01%
Price                          2518829     83.96%
User_id                        561787      18.73%
profileName                    561905      18.73%
review/helpfulness             0            0.00%
review/score                   0            0.00%
review/time                    0            0.00%
review/summary                 407          0.01%
review/text                    8            0.00%
--------------------------------------------------
TOTAL                          3643144   


In [7]:
# ═══════════════════════════════════════════════════════════════════════════════
# 7.DUPLICATE VALUES CHECK
# Finding how many duplicate values each column has
# ═══════════════════════════════════════════════════════════════════════════════

print("="*80)
print("DUPLICATE VALUES IN EACH COLUMN")
print("="*80)

print(f"\n{'Column':<30} {'Unique':<12} {'Duplicates':<12}")
print("-"*55)

for col in df_full.columns:
    unique = df_full[col].nunique()
    duplicates = len(df_full) - unique
    print(f"{col:<30} {unique:<12,} {duplicates:<12,}")

DUPLICATE VALUES IN EACH COLUMN

Column                         Unique       Duplicates  
-------------------------------------------------------
Id                             221,998      2,778,002   
Title                          212,403      2,787,597   
Price                          6,004        2,993,996   
User_id                        1,008,972    1,991,028   
profileName                    854,145      2,145,855   
review/helpfulness             12,084       2,987,916   
review/score                   5            2,999,995   
review/time                    6,272        2,993,728   
review/summary                 1,592,314    1,407,686   
review/text                    2,062,648    937,352     


In [8]:
# ═══════════════════════════════════════════════════════════════════════════════
# 8.COMPLETE ROW DUPLICATES
# Finding rows where ALL columns are identical
# ═══════════════════════════════════════════════════════════════════════════════

print("="*80)
print("COMPLETE ROW DUPLICATES")
print("="*80)

# count rows where all columns match
complete_dupes = df_full.duplicated().sum()

print(f"\nrows with all columns identical: {complete_dupes:,}")
print(f"percentage: {(complete_dupes / len(df_full)) * 100:.2f}%")

# show example if there are duplicates
if complete_dupes > 0:
    print("\nexample of duplicate rows:")
    dup_rows = df_full[df_full.duplicated(keep=False)].head(6)
    print(dup_rows)

COMPLETE ROW DUPLICATES

rows with all columns identical: 8,774
percentage: 0.29%

example of duplicate rows:
             Id                               Title  Price         User_id  \
422  0671551345  Night World: Daughters Of Darkness    NaN             NaN   
423  0671551345  Night World: Daughters Of Darkness    NaN             NaN   
428  0671551345  Night World: Daughters Of Darkness    NaN             NaN   
429  0671551345  Night World: Daughters Of Darkness    NaN             NaN   
726  050552421X    The Scarletti Curse (Candleglow)    NaN  A1PURG5ASALH79   
727  050552421X    The Scarletti Curse (Candleglow)    NaN  A1PURG5ASALH79   

     profileName review/helpfulness  review/score  review/time  \
422          NaN                0/0           5.0    895968000   
423          NaN                0/0           5.0    895968000   
428          NaN                0/0           5.0    878601600   
429          NaN                0/0           5.0    878601600   
726  Kelly Ow

In [9]:
# ═══════════════════════════════════════════════════════════════════════════════
# 9.DATA CLEANING AND EXPORT
# Removing complete duplicates and saving 2 columns
# ═══════════════════════════════════════════════════════════════════════════════

print("="*80)
print("CLEANING DATA")
print("="*80)

# remove rows where ALL columns are identical
initial = len(df_full)
df_clean = df_full.drop_duplicates()  # no subset = checks all columns
removed = initial - len(df_clean)

print(f"\n✓ removed {removed:,} complete duplicate rows")

# select only 2 columns we need
df_clean = df_clean[['Id', 'review/text']].copy()
print(f"✓ selected 2 columns: Id, review/text")

# remove nulls in these columns
initial = len(df_clean)
df_clean = df_clean.dropna()
removed = initial - len(df_clean)
print(f"✓ removed {removed:,} rows with null values")

# rename columns
df_clean = df_clean.rename(columns={
    'Id': 'review_id',
    'review/text': 'review_text'
})

df_clean = df_clean.reset_index(drop=True)

print(f"\n✓ final clean dataset: {len(df_clean):,} rows")

# export to CSV
output_file = '/content/clean_reviews.csv'
df_clean.to_csv(output_file, index=False)

print(f"\n✓ exported to: {output_file}")

# show first 10 rows of clean data
print("\n" + "="*80)
print("FIRST 10 ROWS OF CLEAN DATA")
print("="*80)
print()
print(df_clean.head(10))

CLEANING DATA

✓ removed 8,774 complete duplicate rows
✓ selected 2 columns: Id, review/text
✓ removed 8 rows with null values

✓ final clean dataset: 2,991,218 rows

✓ exported to: /content/clean_reviews.csv

FIRST 10 ROWS OF CLEAN DATA

    review_id                                        review_text
0  1882931173  This is only for Julie Strain fans. It's a col...
1  0826414346  I don't care much for Dr. Seuss but after read...
2  0826414346  If people become the books they read and if "t...
3  0826414346  Theodore Seuss Geisel (1904-1991), aka &quot;D...
4  0826414346  Philip Nel - Dr. Seuss: American IconThis is b...
5  0826414346  "Dr. Seuss: American Icon" by Philip Nel is a ...
6  0826414346  Theodor Seuss Giesel was best known as 'Dr. Se...
7  0826414346  When I recieved this book as a gift for Christ...
8  0826414346  Trams (or any public transport) are not usuall...
9  0826414346  As far as I am aware, this is the first book-l...


In [10]:
# ═══════════════════════════════════════════════════════════════════════════════
# 10.SAMPLING
# Taking a small sample to work with (1% of data)
# ═══════════════════════════════════════════════════════════════════════════════

import numpy as np

sample_fraction = 0.01  # 1%
random_seed = 42

print("="*80)
print("SAMPLING DATA")
print("="*80)

# take random sample
np.random.seed(random_seed)
df_sample = df_clean.sample(frac=sample_fraction, random_state=random_seed)
df_sample = df_sample.reset_index(drop=True)

print(f"\n✓ sampled {len(df_sample):,} reviews ({sample_fraction*100}%)")
print(f"  memory: {df_sample.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

# show first few
print("\nfirst 5 reviews in sample:")
print(df_sample.head())

SAMPLING DATA

✓ sampled 29,912 reviews (1.0%)
  memory: 26.7 MB

first 5 reviews in sample:
    review_id                                        review_text
0  B0006E8SE0  In ATOMIC CITY Terry Rosen remembers his years...
1  B000PW7KJW  Authoritative, covers topics Jews and others w...
2  B000FFJRI6  This book almost broke me, it was like 13 mydr...
3  B000K0BJKU  Island of the blue dolphins by Scott O'Dell is...
4  1587243938  I have heard so much about this book but never...


In [11]:
#-------------------------------------------------------------------------------
# 11.TEXT NORMALIZATION
# make lowercase + remove punctuation + collapse spaces
# =============================================================================
import re
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))  # used in shingling (word-level)

def normalize_text(text):
    text = str(text).lower()
    text = re.sub(r'[^a-z0-9\s]', ' ', text)     # keep letters/digits/spaces
    text = re.sub(r'\s+', ' ', text).strip()     # collapse multiple spaces
    return text

df_sample['text_norm'] = df_sample['review_text'].apply(normalize_text)
df_sample = df_sample[df_sample['text_norm'].str.len() > 0].reset_index(drop=True)




In [12]:
#------------------------------------------------------------------------------
# 12.SHINGLING (word-level, k=3)
# build a set of 3-word shingles from the normalized text
# =============================================================================
k = 3

def create_shingles_from_norm(norm_text, k=3):
    words = norm_text.split()                          # already lower+clean
    words = [w for w in words if w not in stop_words]  # remove stopwords
    if len(words) < k:
        return {' '.join(words)} if words else set()
    return {' '.join(words[i:i+k]) for i in range(len(words) - k + 1)}

df_sample['shingles']     = df_sample['text_norm'].apply(create_shingles_from_norm)
df_sample['num_shingles'] = df_sample['shingles'].apply(len)
df_sample = df_sample[df_sample['num_shingles'] > 0].reset_index(drop=True)

df_sample.head(10)   #  first 10 rows


Unnamed: 0,review_id,review_text,text_norm,shingles,num_shingles
0,B0006E8SE0,In ATOMIC CITY Terry Rosen remembers his years...,in atomic city terry rosen remembers his years...,"{edward teller enrico, father works long, like...",88
1,B000PW7KJW,"Authoritative, covers topics Jews and others w...",authoritative covers topics jews and others wo...,"{focus well written, topics jews others, cover...",7
2,B000FFJRI6,"This book almost broke me, it was like 13 mydr...",this book almost broke me it was like 13 mydra...,"{almost broke like, left eyes shell, like loga...",31
3,B000K0BJKU,Island of the blue dolphins by Scott O'Dell is...,island of the blue dolphins by scott o dell is...,"{dell exciting book, wilddogs left alone, woul...",74
4,1587243938,I have heard so much about this book but never...,i have heard so much about this book but never...,"{tipping point perhaps, common tipping point, ...",146
5,B000N5BN4O,This is my favorite comic and I was really exc...,this is my favorite comic and i was really exc...,"{favorite comic really, around 20 pounds, othe...",40
6,B0000DK4HN,"This is not a novel, it is a silly justificati...",this is not a novel it is a silly justificatio...,"{know better next, time last richard, patterso...",17
7,B000MWTM28,I was fascinated to know some details about th...,i was fascinated to know some details about th...,"{pure search play, search 4 different, compare...",74
8,B000OTPE62,Widower Luke Becker had decided that his eleve...,widower luke becker had decided that his eleve...,"{widower luke becker, canon letters however, r...",63
9,0130256684,"It is a book, what do you want from me? Thanks...",it is a book what do you want from me thanks i...,"{book want thanks, book said thanks, want than...",4


In [13]:
# ═══════════════════════════════════════════════════════════════════════════════
# 13.MINHASH SIGNATURES
# Creating compact signatures from shingles
# ═══════════════════════════════════════════════════════════════════════════════

from datasketch import MinHash

num_perm = 128  # signature size

def create_minhash(shingles):
    m = MinHash(num_perm=num_perm)
    for shingle in shingles:
        m.update(shingle.encode('utf-8'))
    return m

print("="*80)
print("CREATING MINHASH SIGNATURES")
print("="*80)


df_sample['minhash'] = df_sample['shingles'].apply(create_minhash)


print(f"  signature size: {num_perm} hash values")
print(f"  compression: {df_sample['num_shingles'].mean() / num_perm:.1f}x")



CREATING MINHASH SIGNATURES
  signature size: 128 hash values
  compression: 0.6x


In [14]:
# ═══════════════════════════════════════════════════════════════════════════════
#  14.LSH INDEX CONSTRUCTION
# ═══════════════════════════════════════════════════════════════════════════════

from datasketch import MinHashLSH

# LSH configuration
THRESHOLD = 0.6  # jaccard similarity threshold for candidate pairs

print("\n" + "="*80)
print(" LSH INDEX CONSTRUCTION")
print("="*80)

print(f"\nconfiguration:")
print(f"  similarity threshold: {THRESHOLD}")
print(f"  signature size: {num_perm}")

# create unique identifiers for lsh indexing
# format: book_id + "_" + row_index
# this ensures each review has a unique key while preserving book information
df_sample['unique_key'] = df_sample['review_id'] + "_" + df_sample.index.astype(str)

print(f"\n✓ created unique keys for {len(df_sample):,} documents")

# verify key uniqueness
duplicate_keys = df_sample['unique_key'].duplicated().sum()
if duplicate_keys > 0:
    print(f"⚠ warning: found {duplicate_keys} duplicate keys")
    df_sample = df_sample.drop_duplicates(subset=['unique_key'], keep='first')
    print(f"  removed duplicates, {len(df_sample):,} documents remain")

# initialize lsh index with similarity threshold
lsh = MinHashLSH(threshold=THRESHOLD, num_perm=num_perm)


# insert all documents into lsh index
for idx, row in df_sample.iterrows():
    lsh.insert(row['unique_key'], row['minhash'])



print(f"\n📊 lsh parameters (auto-calculated):")
print(f"  number of bands (b): {lsh.b}")
print(f"  rows per band (r): {lsh.r}")
print(f"  verification: b × r = {lsh.b} × {lsh.r} (should equal {num_perm})")




 LSH INDEX CONSTRUCTION

configuration:
  similarity threshold: 0.6
  signature size: 128

✓ created unique keys for 29,912 documents

📊 lsh parameters (auto-calculated):
  number of bands (b): 18
  rows per band (r): 7
  verification: b × r = 18 × 7 (should equal 128)


In [15]:
# ═══════════════════════════════════════════════════════════════════════════════
#  15.CANDIDATE PAIR GENERATION
# ═══════════════════════════════════════════════════════════════════════════════

print("\n" + "="*80)
print(" CANDIDATE PAIR GENERATION")
print("="*80)


candidate_pairs = set()

# Query LSH for each document
for idx, row in df_sample.iterrows():
    # Find candidates that hash to same bucket in at least one band
    candidates = lsh.query(row['minhash'])

    for candidate_key in candidates:
        # Exclude self-pairs
        if candidate_key != row['unique_key']:
            # Sort keys to avoid duplicate pairs: (A,B) and (B,A)
            pair = tuple(sorted([row['unique_key'], candidate_key]))
            candidate_pairs.add(pair)

candidate_pairs = list(candidate_pairs)


# Calculate search efficiency improvement
total_possible = len(df_sample) * (len(df_sample) - 1) // 2
reduction_pct = (1 - len(candidate_pairs) / total_possible) * 100

print(f"  Total possible pairs: {total_possible:,}")
print(f"  Candidate pairs (LSH): {len(candidate_pairs):,}")



 CANDIDATE PAIR GENERATION
  Total possible pairs: 447,348,916
  Candidate pairs (LSH): 331


In [16]:
# ═══════════════════════════════════════════════════════════════════════════════
# 16. SETUP FOR SIMILARITY VERIFICATION
# Setting up tools and thresholds
# ═══════════════════════════════════════════════════════════════════════════════


print("="*80)
print("STEP 12: VERIFYING SIMILARITY")
print("="*80)

# Set threshold (reuse from LSH or default to 0.6)
try:
    THRESHOLD = THRESHOLD  # From LSH cell
except NameError:
    THRESHOLD = 0.6

print(f"\nThreshold: {THRESHOLD}")
print(f"Candidate pairs to check: {len(candidate_pairs):,}")

STEP 12: VERIFYING SIMILARITY

Threshold: 0.6
Candidate pairs to check: 331


In [17]:
# ═══════════════════════════════════════════════════════════════════════════════
# 17. JACCARD SIMILARITY FUNCTION
# Calculates exact similarity between two sets
# ═══════════════════════════════════════════════════════════════════════════════

def jaccard_similarity(set1, set2):
    """Calculate Jaccard similarity: |A ∩ B| / |A ∪ B|"""

    # Handle empty sets
    if not set1 and not set2:
        return 1.0

    # Calculate intersection and union
    intersection = len(set1 & set2)
    union = len(set1 | set2)

    # Return ratio
    return intersection / union if union > 0 else 0.0

print("✓ Function ready")

✓ Function ready


In [18]:
# ═══════════════════════════════════════════════════════════════════════════════
# 18. CREATE LOOKUP DICTIONARIES
# Fast access to shingles and text by key
# ═══════════════════════════════════════════════════════════════════════════════

# Build shingle dictionary: key → set of shingles
shingle_dict = df_sample.set_index('unique_key')['shingles'].to_dict()

# Build text dictionary: key → review text
text_dict = df_sample.set_index('unique_key')['review_text'].to_dict()

print(f"✓ Created lookups for {len(shingle_dict):,} documents")

# Show first 10 rows
df_sample.head(10)


✓ Created lookups for 29,912 documents


Unnamed: 0,review_id,review_text,text_norm,shingles,num_shingles,minhash,unique_key
0,B0006E8SE0,In ATOMIC CITY Terry Rosen remembers his years...,in atomic city terry rosen remembers his years...,"{edward teller enrico, father works long, like...",88,<datasketch.minhash.MinHash object at 0x78326e...,B0006E8SE0_0
1,B000PW7KJW,"Authoritative, covers topics Jews and others w...",authoritative covers topics jews and others wo...,"{focus well written, topics jews others, cover...",7,<datasketch.minhash.MinHash object at 0x7832c9...,B000PW7KJW_1
2,B000FFJRI6,"This book almost broke me, it was like 13 mydr...",this book almost broke me it was like 13 mydra...,"{almost broke like, left eyes shell, like loga...",31,<datasketch.minhash.MinHash object at 0x78328d...,B000FFJRI6_2
3,B000K0BJKU,Island of the blue dolphins by Scott O'Dell is...,island of the blue dolphins by scott o dell is...,"{dell exciting book, wilddogs left alone, woul...",74,<datasketch.minhash.MinHash object at 0x78328c...,B000K0BJKU_3
4,1587243938,I have heard so much about this book but never...,i have heard so much about this book but never...,"{tipping point perhaps, common tipping point, ...",146,<datasketch.minhash.MinHash object at 0x783250...,1587243938_4
5,B000N5BN4O,This is my favorite comic and I was really exc...,this is my favorite comic and i was really exc...,"{favorite comic really, around 20 pounds, othe...",40,<datasketch.minhash.MinHash object at 0x783250...,B000N5BN4O_5
6,B0000DK4HN,"This is not a novel, it is a silly justificati...",this is not a novel it is a silly justificatio...,"{know better next, time last richard, patterso...",17,<datasketch.minhash.MinHash object at 0x783250...,B0000DK4HN_6
7,B000MWTM28,I was fascinated to know some details about th...,i was fascinated to know some details about th...,"{pure search play, search 4 different, compare...",74,<datasketch.minhash.MinHash object at 0x783250...,B000MWTM28_7
8,B000OTPE62,Widower Luke Becker had decided that his eleve...,widower luke becker had decided that his eleve...,"{widower luke becker, canon letters however, r...",63,<datasketch.minhash.MinHash object at 0x783250...,B000OTPE62_8
9,0130256684,"It is a book, what do you want from me? Thanks...",it is a book what do you want from me thanks i...,"{book want thanks, book said thanks, want than...",4,<datasketch.minhash.MinHash object at 0x783250...,0130256684_9


In [19]:
# ═══════════════════════════════════════════════════════════════════════════════
# 19. NORMALIZE PAIRS
# Remove duplicates and self-pairs
# ═══════════════════════════════════════════════════════════════════════════════

clean_pairs = set()

for key1, key2 in candidate_pairs:
    # Skip if same document
    if key1 == key2:
        continue

    # Sort keys to avoid duplicates
    pair = tuple(sorted([key1, key2]))
    clean_pairs.add(pair)

# Convert back to list
clean_pairs = list(clean_pairs)

print(f"Before: {len(candidate_pairs):,} pairs")
print(f"After:  {len(clean_pairs):,} unique pairs")
print(f"Removed: {len(candidate_pairs) - len(clean_pairs):,} duplicates")

Before: 331 pairs
After:  331 unique pairs
Removed: 0 duplicates


In [20]:
# ═══════════════════════════════════════════════════════════════════════════════
# 20. VERIFY EACH PAIR
# Calculate true Jaccard and filter by threshold
# ═══════════════════════════════════════════════════════════════════════════════

verified_pairs = []

# Check each candidate pair
for key1, key2 in clean_pairs:

    # Get shingle sets
    shingles1 = shingle_dict.get(key1)
    shingles2 = shingle_dict.get(key2)

    # Skip if missing
    if shingles1 is None or shingles2 is None:
        continue

    # Calculate exact similarity
    sim = jaccard_similarity(shingles1, shingles2)

    # Keep only if above threshold
    if sim >= THRESHOLD:
        verified_pairs.append({
            'key1': key1,
            'key2': key2,
            'similarity': sim,
            'text1': text_dict[key1],
            'text2': text_dict[key2]
        })

print(f"✓ Found {len(verified_pairs):,} similar pairs (≥{THRESHOLD})")

✓ Found 331 similar pairs (≥0.6)


In [21]:
# ═══════════════════════════════════════════════════════════════════════════════
# 21. BUILD RESULTS TABLE
# Organize verified pairs into dataframe
# ═══════════════════════════════════════════════════════════════════════════════

# Create dataframe from list of dictionaries
results_df = pd.DataFrame(verified_pairs)

# Sort by similarity (highest first)
if len(results_df) > 0:
    results_df = results_df.sort_values('similarity', ascending=False)
    results_df = results_df.reset_index(drop=True)

    print(f"\n📊 Similarity Statistics:")
    print(f"  Mean:   {results_df['similarity'].mean():.3f}")
    print(f"  Median: {results_df['similarity'].median():.3f}")
    print(f"  Min:    {results_df['similarity'].min():.3f}")
    print(f"  Max:    {results_df['similarity'].max():.3f}")

else:
    print("\n⚠ No similar pairs found")
    print("  Try: Lower threshold or larger sample")


📊 Similarity Statistics:
  Mean:   0.994
  Median: 1.000
  Min:    0.667
  Max:    1.000


In [22]:
# ═══════════════════════════════════════════════════════════════════════════════
# 22. ANALYZE DISTRIBUTION
# How many pairs in each similarity range?
# ═══════════════════════════════════════════════════════════════════════════════

if len(results_df) > 0:

    print("="*60)
    print("SIMILARITY DISTRIBUTION")
    print("="*60)

    # Define ranges
    ranges = [
        (0.60, 0.65, '0.60-0.65'),
        (0.65, 0.70, '0.65-0.70'),
        (0.70, 0.75, '0.70-0.75'),
        (0.75, 0.80, '0.75-0.80'),
        (0.80, 0.85, '0.80-0.85'),
        (0.85, 0.90, '0.85-0.90'),
        (0.90, 0.95, '0.90-0.95'),
        (0.95, 1.00, '0.95-1.00')
    ]

    print(f"\n{'Range':<15} {'Count':<10} {'%':<10}")
    print("-"*35)

    # Count pairs in each range
    for low, high, label in ranges:
        if high < 1.0:
            mask = (results_df['similarity'] >= low) & (results_df['similarity'] < high)
        else:
            mask = results_df['similarity'] >= low

        count = mask.sum()
        pct = (count / len(results_df)) * 100
        print(f"{label:<15} {count:<10} {pct:>5.1f}%")

    print("-"*35)
    print(f"{'TOTAL':<15} {len(results_df):<10} {'100.0%':>5}")

SIMILARITY DISTRIBUTION

Range           Count      %         
-----------------------------------
0.60-0.65       0            0.0%
0.65-0.70       1            0.3%
0.70-0.75       3            0.9%
0.75-0.80       0            0.0%
0.80-0.85       2            0.6%
0.85-0.90       0            0.0%
0.90-0.95       5            1.5%
0.95-1.00       320         96.7%
-----------------------------------
TOTAL           331        100.0%


In [23]:
# ═══════════════════════════════════════════════════════════════════════════════
# 23.  SHOW TOP 5 MOST SIMILAR PAIRS
# Display the reviews that are most alike
# ═══════════════════════════════════════════════════════════════════════════════

if len(results_df) > 0:

    print("="*80)
    print("TOP 5 MOST SIMILAR PAIRS")
    print("="*80)

    # Show first 5 rows
    for i, row in results_df.head(5).iterrows():

        print(f"\n{'─'*80}")
        print(f"PAIR #{i+1} - Similarity: {row['similarity']:.3f}")
        print(f"{'─'*80}")

        # Show first 150 characters of each review
        print(f"\nReview 1:")
        print(f"  {row['text1'][:150]}...")

        print(f"\nReview 2:")
        print(f"  {row['text2'][:150]}...")

else:
    print("\n⚠ No results to display")

TOP 5 MOST SIMILAR PAIRS

────────────────────────────────────────────────────────────────────────────────
PAIR #1 - Similarity: 1.000
────────────────────────────────────────────────────────────────────────────────

Review 1:
  I enjoyed reading this book in high school and also enjoy watching one of the old B&W; movie versions if it happens to come on TV. So I decided to rea...

Review 2:
  I enjoyed reading this book in high school and also enjoy watching one of the old B&W; movie versions if it happens to come on TV. So I decided to rea...

────────────────────────────────────────────────────────────────────────────────
PAIR #2 - Similarity: 1.000
────────────────────────────────────────────────────────────────────────────────

Review 1:
  First of all: I wish people would stop deconstructing Oscar Wilde before they deconstruct his message. He should be free to contradict himself, becaus...

Review 2:
  First of all: I wish people would stop deconstructing Oscar Wilde before they d

In [24]:
# ═══════════════════════════════════════════════════════════════════════════════
# 24. DETAILED SHINGLE ANALYSIS
# How many shingles match between pairs?
# ═══════════════════════════════════════════════════════════════════════════════

if len(results_df) > 0:

    print("="*80)
    print("SHINGLE OVERLAP DETAILS")
    print("="*80)

    # Analyze first 5 pairs
    for i, row in results_df.head(5).iterrows():

        # Get shingle sets
        s1 = shingle_dict[row['key1']]
        s2 = shingle_dict[row['key2']]

        # Calculate overlap
        common = s1 & s2
        only_1 = s1 - s2
        only_2 = s2 - s1
        total = s1 | s2

        print(f"\nPair #{i+1}:")
        print(f"  Common shingles:    {len(common):>4} ({len(common)/len(total)*100:>5.1f}%)")
        print(f"  Only in review 1:   {len(only_1):>4}")
        print(f"  Only in review 2:   {len(only_2):>4}")
        print(f"  Total unique:       {len(total):>4}")

SHINGLE OVERLAP DETAILS

Pair #1:
  Common shingles:      34 (100.0%)
  Only in review 1:      0
  Only in review 2:      0
  Total unique:         34

Pair #2:
  Common shingles:      37 (100.0%)
  Only in review 1:      0
  Only in review 2:      0
  Total unique:         37

Pair #3:
  Common shingles:      72 (100.0%)
  Only in review 1:      0
  Only in review 2:      0
  Total unique:         72

Pair #4:
  Common shingles:     268 (100.0%)
  Only in review 1:      0
  Only in review 2:      0
  Total unique:        268

Pair #5:
  Common shingles:     293 (100.0%)
  Only in review 1:      0
  Only in review 2:      0
  Total unique:        293


In [25]:
# ═══════════════════════════════════════════════════════════════════════════════
# 25. LOWEST SIMILARITY PAIRS
# Pairs that barely meet the threshold - SHOWING FULL TEXT
# ═══════════════════════════════════════════════════════════════════════════════

if len(results_df) > 0:

    print("="*80)
    print("5 PAIRS WITH LOWEST SIMILARITY")
    print(f"(but still ≥ {THRESHOLD})")
    print("="*80)

    # Get last 5 rows (lowest similarity)
    for i, row in results_df.tail(5).iterrows():

        print(f"\n{'─'*80}")
        print(f"Pair #{i+1} - Similarity: {row['similarity']:.3f}")
        print(f"{'─'*80}")

        # SHOW FULL TEXT - NO CHARACTER LIMIT
        print(f"\nReview 1 (FULL TEXT):")
        print(f"{row['text1']}")

        print(f"\n{'-'*80}")

        print(f"\nReview 2 (FULL TEXT):")
        print(f"{row['text2']}")

        # Why are these barely similar?
        s1 = shingle_dict[row['key1']]
        s2 = shingle_dict[row['key2']]

        print(f"\n{'─'*80}")
        print(f"Analysis:")
        print(f"  Review 1 has: {len(s1)} shingles")
        print(f"  Review 2 has: {len(s2)} shingles")
        print(f"  In common:    {len(s1 & s2)} shingles ({len(s1 & s2)/len(s1 | s2)*100:.1f}%)")
        print(f"{'─'*80}")

        # Optional: Show common shingles
        common_shingles = s1 & s2
        if common_shingles:
            print(f"\nCommon shingles (first 10):")
            for idx, shingle in enumerate(list(common_shingles)[:10], 1):
                print(f"  {idx}. '{shingle}'")

5 PAIRS WITH LOWEST SIMILARITY
(but still ≥ 0.6)

────────────────────────────────────────────────────────────────────────────────
Pair #327 - Similarity: 0.800
────────────────────────────────────────────────────────────────────────────────

Review 1 (FULL TEXT):
Perfect. How many more words must I write to be polite? Seriously, if one is satisfiedstop with the word requirements!

--------------------------------------------------------------------------------

Review 2 (FULL TEXT):
Excellent. How many more words must I write to be polite? Seriously, if one is satisfiedstop with the word requirements!

────────────────────────────────────────────────────────────────────────────────
Analysis:
  Review 1 has: 9 shingles
  Review 2 has: 9 shingles
  In common:    8 shingles (80.0%)
────────────────────────────────────────────────────────────────────────────────

Common shingles (first 10):
  1. 'must write polite'
  2. 'many words must'
  3. 'words must write'
  4. 'write polite seriousl

In [26]:
# ═══════════════════════════════════════════════════════════════════════════════
# 26. DOWNLOAD RESULTS
# Save all similar pairs to files - WITH FULL TEXT
# ═══════════════════════════════════════════════════════════════════════════════

if len(results_df) > 0:

    print("="*80)
    print("SAVING RESULTS WITH FULL TEXT")
    print("="*80)

    # ─────────────────────────────────────────────────────────────────────────
    # FILE 1: Basic results (compact) - with full text
    # ─────────────────────────────────────────────────────────────────────────

    # Create export with key columns
    export_df = results_df[['key1', 'key2', 'similarity']].copy()

    # Add shingle counts
    export_df['shingles1'] = export_df['key1'].map(lambda k: len(shingle_dict[k]))
    export_df['shingles2'] = export_df['key2'].map(lambda k: len(shingle_dict[k]))

    # Save to CSV (without text for compact file)
    export_df.to_csv('similar_pairs.csv', index=False)

    print(f"\n✓ File 1: 'similar_pairs.csv'")
    print(f"  Rows: {len(export_df):,}")
    print(f"  Columns: {list(export_df.columns)}")
    print(f"  (Compact version without text)")

    # ─────────────────────────────────────────────────────────────────────────
    # FILE 2: With FULL TEXT (no truncation)
    # ─────────────────────────────────────────────────────────────────────────

    # Add COMPLETE text - NO CHARACTER LIMIT
    export_full = export_df.copy()
    export_full['text1_full'] = results_df['text1']
    export_full['text2_full'] = results_df['text2']

    # Save with FULL text
    export_full.to_csv('similar_pairs_FULL_TEXT.csv', index=False)

    print(f"\n✓ File 2: 'similar_pairs_FULL_TEXT.csv'")
    print(f"  Rows: {len(export_full):,}")
    print(f"  Includes COMPLETE review text (no truncation)")

    # ─────────────────────────────────────────────────────────────────────────
    # FILE 3: Human-readable text report with FULL TEXT
    # ─────────────────────────────────────────────────────────────────────────

    # Write detailed report to text file
    with open('similar_pairs_FULL_REPORT.txt', 'w', encoding='utf-8') as f:
        f.write("="*80 + "\n")
        f.write("SIMILARITY RESULTS - COMPLETE REPORT\n")
        f.write("="*80 + "\n\n")

        f.write(f"Total documents analyzed: {len(df_sample):,}\n")
        f.write(f"Similar pairs found: {len(results_df):,}\n")
        f.write(f"Similarity threshold: {THRESHOLD}\n\n")

        f.write(f"Similarity Statistics:\n")
        f.write(f"  Min:    {results_df['similarity'].min():.3f}\n")
        f.write(f"  Max:    {results_df['similarity'].max():.3f}\n")
        f.write(f"  Mean:   {results_df['similarity'].mean():.3f}\n")
        f.write(f"  Median: {results_df['similarity'].median():.3f}\n\n")

        f.write("="*80 + "\n")
        f.write("DETAILED PAIRS WITH FULL TEXT\n")
        f.write("="*80 + "\n\n")

        # Write ALL pairs with FULL text
        for i in range(len(results_df)):
            row = results_df.iloc[i]

            f.write("\n" + "─"*80 + "\n")
            f.write(f"PAIR #{i+1}\n")
            f.write("─"*80 + "\n")
            f.write(f"Similarity Score: {row['similarity']:.3f}\n")
            f.write(f"Key 1: {row['key1']}\n")
            f.write(f"Key 2: {row['key2']}\n\n")

            f.write("REVIEW 1 (COMPLETE TEXT):\n")
            f.write("-"*80 + "\n")
            f.write(f"{row['text1']}\n\n")

            f.write("REVIEW 2 (COMPLETE TEXT):\n")
            f.write("-"*80 + "\n")
            f.write(f"{row['text2']}\n\n")

            # Add shingle analysis
            s1 = shingle_dict[row['key1']]
            s2 = shingle_dict[row['key2']]
            common = s1 & s2

            f.write("SIMILARITY ANALYSIS:\n")
            f.write(f"  Review 1 shingles: {len(s1)}\n")
            f.write(f"  Review 2 shingles: {len(s2)}\n")
            f.write(f"  Common shingles: {len(common)} ({len(common)/len(s1|s2)*100:.1f}%)\n")
            f.write(f"  Unique to review 1: {len(s1-s2)}\n")
            f.write(f"  Unique to review 2: {len(s2-s1)}\n\n")

            # Show some common shingles
            if common:
                f.write("Sample common shingles:\n")
                for idx, shingle in enumerate(list(common)[:10], 1):
                    f.write(f"  {idx}. '{shingle}'\n")

            f.write("\n" + "="*80 + "\n")

    print(f"\n✓ File 3: 'similar_pairs_FULL_REPORT.txt'")
    print(f"  Human-readable report with COMPLETE text")
    print(f"  Includes detailed analysis for all pairs")

    # ─────────────────────────────────────────────────────────────────────────
    # FILE 4: Summary statistics
    # ─────────────────────────────────────────────────────────────────────────

    with open('summary_statistics.txt', 'w', encoding='utf-8') as f:
        f.write("SUMMARY STATISTICS\n")
        f.write("="*50 + "\n\n")

        f.write(f"Dataset Information:\n")
        f.write(f"  Total documents: {len(df_sample):,}\n")
        f.write(f"  Candidate pairs checked: {len(clean_pairs):,}\n")
        f.write(f"  Similar pairs found: {len(results_df):,}\n")
        f.write(f"  Threshold: {THRESHOLD}\n\n")

        f.write(f"Similarity Distribution:\n")
        f.write(f"  Minimum:  {results_df['similarity'].min():.3f}\n")
        f.write(f"  Maximum:  {results_df['similarity'].max():.3f}\n")
        f.write(f"  Mean:     {results_df['similarity'].mean():.3f}\n")
        f.write(f"  Median:   {results_df['similarity'].median():.3f}\n")
        f.write(f"  Std Dev:  {results_df['similarity'].std():.3f}\n\n")

        f.write(f"Review Length Statistics:\n")
        f.write(f"  Avg text1 length: {results_df['text1'].str.len().mean():.0f} chars\n")
        f.write(f"  Avg text2 length: {results_df['text2'].str.len().mean():.0f} chars\n")

    print(f"\n✓ File 4: 'summary_statistics.txt'")
    print(f"  Statistical summary")



SAVING RESULTS WITH FULL TEXT

✓ File 1: 'similar_pairs.csv'
  Rows: 331
  Columns: ['key1', 'key2', 'similarity', 'shingles1', 'shingles2']
  (Compact version without text)

✓ File 2: 'similar_pairs_FULL_TEXT.csv'
  Rows: 331
  Includes COMPLETE review text (no truncation)

✓ File 3: 'similar_pairs_FULL_REPORT.txt'
  Human-readable report with COMPLETE text
  Includes detailed analysis for all pairs

✓ File 4: 'summary_statistics.txt'
  Statistical summary
