In [None]:
import pandas as pd
import numpy as np
from rapidfuzz import fuzz
from tqdm import tqdm
import re
import random
from datetime import datetime

# -----------------------------
# Basic cleaning (no normalization)
# -----------------------------
URL_RE = re.compile(r'http\S+')
MENTION_RE = re.compile(r'@\w+')
HASHTAG_RE = re.compile(r'#\w+')

def clean_tweet(text: str) -> str:
    """Lowercase a tweet and strip URLs, @mentions and #hashtags.

    The input is coerced with ``str()`` first, so non-string values
    (e.g. ``None``) are cleaned as their string representation. Runs of
    whitespace are collapsed to single spaces and the ends are trimmed.
    """
    cleaned = str(text).lower()
    # Order matters: URLs go first so '@'/'#' inside a link vanish with it.
    for pattern in (URL_RE, MENTION_RE, HASHTAG_RE):
        cleaned = pattern.sub('', cleaned)
    # split()/join collapses whitespace and leaves no leading/trailing space.
    return ' '.join(cleaned.split())

# -----------------------------
# Robustness helpers
# -----------------------------
def clean_short_tweets(df: pd.DataFrame, min_words: int = 10) -> pd.DataFrame:
    """Return a copy of ``df`` with short tweets reset to non-duplicate.

    Rows whose ``word_count`` is below ``min_words`` get ``group`` and
    ``group_rank`` set to -1 and ``is_duplicate`` set to False. The input
    frame is left untouched; a one-line summary is printed.
    """
    result = df.copy()
    too_short = result['word_count'] < min_words
    resets = (('group', -1), ('group_rank', -1), ('is_duplicate', False))
    for column, reset_value in resets:
        result.loc[too_short, column] = reset_value
    print(f"Marked {too_short.sum()} short tweets (<{min_words} words) as non-duplicates")
    return result

def mark_single_member_groups_as_non_duplicates(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` where groups with exactly one row are dissolved.

    Group id -1 means "no group" and is ignored when counting. Every row
    belonging to a one-member group gets ``group``/``group_rank`` = -1 and
    ``is_duplicate`` = False. The number of dissolved groups is printed.
    """
    result = df.copy()
    assigned = result.loc[result['group'] != -1, 'group']
    sizes = assigned.value_counts()
    lonely_groups = sizes.loc[sizes == 1].index
    if len(lonely_groups) > 0:
        in_lonely = result['group'].isin(lonely_groups)
        for column, reset_value in (('group', -1), ('group_rank', -1), ('is_duplicate', False)):
            result.loc[in_lonely, column] = reset_value
    print(f"Marked {len(lonely_groups)} single-member groups as non-duplicates")
    return result

# -----------------------------
# Duplicate detection (single pass, no chunks)
# -----------------------------
def group_duplicates_single(df_proc: pd.DataFrame, threshold: int) -> pd.Series:
    """
    Greedily cluster near-duplicate tweets in a single pass.

    Each tweet that is not yet grouped acts as a seed and is compared with
    ``fuzz.ratio`` against every later, not-yet-grouped tweet whose cleaned
    length is within ±30% of the seed's length. Candidates scoring at least
    ``threshold`` join the seed's group. Candidates are compared against the
    seed only, never against other members of the group.

    Parameters
    ----------
    df_proc : DataFrame with a ``tweet_clean`` column of strings.
    threshold : minimum ``fuzz.ratio`` score for two tweets to be grouped.

    Returns
    -------
    Series aligned with ``df_proc.index``. Members of multi-member groups
    carry a consecutive non-negative group id; all other rows carry -1.
    """
    tweet_list = df_proc['tweet_clean'].tolist()
    indices = df_proc.index.tolist()
    # Lengths are needed for every pair; compute them once up front.
    lengths = [len(t) for t in tweet_list]
    n_tweets = len(tweet_list)
    groups = {}  # idx -> group_id (use smallest idx in group)

    for i in tqdm(range(n_tweets), desc=f"Scanning @{threshold}", leave=False):
        idx_i = indices[i]
        if idx_i in groups:
            continue

        ti = tweet_list[i]
        li = lengths[i]
        if li == 0:
            continue

        max_len_diff = 0.3 * li  # length prefilter (±30% of the seed length)
        members = [idx_i]

        for j in range(i + 1, n_tweets):
            idx_j = indices[j]
            if idx_j in groups:
                continue
            if abs(lengths[j] - li) > max_len_diff:
                continue

            # score_cutoff lets rapidfuzz abandon hopeless pairs early; it
            # returns 0 for any pair scoring below the cutoff, so the
            # comparison below gives the same answer as the uncapped score.
            score = fuzz.ratio(ti, tweet_list[j], score_cutoff=threshold)
            if score >= threshold:
                members.append(idx_j)

        if len(members) > 1:
            gid = min(members)
            for m in members:
                groups[m] = gid

    # Build group Series (-1 for every index label that was never grouped)
    group_ser = pd.Series([groups.get(idx, -1) for idx in indices],
                          index=df_proc.index, dtype=int)
    if groups:
        # remove singleton groups (defensive)
        counts = group_ser[group_ser != -1].value_counts()
        singletons = counts[counts == 1].index
        group_ser.loc[group_ser.isin(singletons)] = -1

        # relabel consecutive: ids become 0..k-1 in order of the old ids
        valid = group_ser[group_ser != -1].unique()
        mapping = {old: new for new, old in enumerate(sorted(valid))}
        group_ser = group_ser.map(lambda x: mapping.get(x, -1))

    return group_ser

# -----------------------------
# Threshold sweep with saving + robustness
# -----------------------------
def evaluate_thresholds(df_fa: pd.DataFrame,
                        thresholds=range(75, 86),
                        sample_n=10000,
                        seed=42,
                        min_words_for_matching=5,
                        min_words_robust=10,
                        save_path="data/duplicate_threshold_summary.csv"):
    """
    Sweep fuzzy-match thresholds on a random sample and summarise duplicate rates.

    Parameters
    ----------
    df_fa : DataFrame with a ``tweet`` column.
    thresholds : iterable of ``fuzz.ratio`` cut-offs to evaluate.
    sample_n : maximum number of rows to sample from ``df_fa``.
    seed : random state used for sampling.
    min_words_for_matching : tweets with fewer cleaned words are excluded
        from pairwise matching.
    min_words_robust : after matching, tweets with fewer cleaned words are
        forced to non-duplicate.
    save_path : CSV path the summary is written to (overwritten if present;
        the parent directory is created when missing).

    Returns
    -------
    DataFrame with one row per threshold: number of duplicate groups,
    number of duplicate tweets, sample size, and duplicate/non-duplicate
    percentages (relative to the whole sample).
    """
    import os  # local import: only used to prepare the output directory

    # Sample at most `sample_n` rows
    df = df_fa.sample(min(sample_n, len(df_fa)), random_state=seed).copy()
    df['tweet_clean'] = df['tweet'].map(clean_tweet)
    df['word_count'] = df['tweet_clean'].str.split().str.len()

    eligible_mask = df['word_count'] >= min_words_for_matching
    df_proc = df[eligible_mask].copy()
    n_short = int((~eligible_mask).sum())  # only the count is needed
    N_total = len(df)

    print(f"Evaluating on {N_total:,} Persian tweets "
          f"({len(df_proc):,} eligible for matching; {n_short:,} below {min_words_for_matching} words).")

    results = []
    for thr in thresholds:
        if len(df_proc) == 0:
            pct_dup = 0.0
            n_groups = 0
            dup_count = 0
        else:
            # 1. Group duplicates on eligible tweets
            groups = group_duplicates_single(df_proc, threshold=thr)
            work = df.copy()
            work['group'] = -1
            work.loc[groups.index, 'group'] = groups.values
            work['group_rank'] = -1
            work['is_duplicate'] = work['group'] != -1

            # 2. Apply robustness checks (short tweets out, then dissolve
            #    any group that was left with a single member)
            work = clean_short_tweets(work, min_words=min_words_robust)
            work = mark_single_member_groups_as_non_duplicates(work)

            # 3. Compute counts
            dup_mask = work['group'] != -1
            dup_count = dup_mask.sum()
            n_groups = work.loc[dup_mask, 'group'].nunique()
            pct_dup = 100.0 * dup_count / N_total

        pct_non = 100.0 - pct_dup
        results.append((thr, n_groups, dup_count, N_total, pct_dup, pct_non))
        print(f"Threshold {thr}: {n_groups} groups | duplicates = {dup_count:,} ({pct_dup:.2f}%)")

    out = pd.DataFrame(results, columns=[
        "Threshold", "Duplicate_Groups", "Duplicate_Tweets", "Total_Tweets", "%Duplicates", "%Non_Duplicates"
    ])

    # Save results to `save_path` as given (no timestamp suffix), creating
    # the target directory first so a long sweep is not lost at the end.
    out_dir = os.path.dirname(save_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    out.to_csv(save_path, index=False)
    print(f"\n✅ Results saved to: {save_path}")
    print("\nSummary:\n", out.to_string(index=False))
    return out

# -----------------------------
# Main
# -----------------------------
if __name__ == "__main__":
    # Load the processed tweet dataset and keep only Persian ("fa") rows
    # that have tweet text; the index is reset to 0..n-1.
    # NOTE(review): read_pickle can execute code embedded in the file —
    # only load pickles from a trusted source.
    print("Loading dataset...")
    df_all = pd.read_pickle("data/persian_english_tweets_onehashtag_twomonths_processed.pkl")
    df_fa = df_all[df_all["lang"] == "fa"].dropna(subset=["tweet"]).reset_index(drop=True)

    # Run the sweep on a 10k sample; the returned summary frame is
    # discarded because evaluate_thresholds already prints and saves it.
    _ = evaluate_thresholds(
        df_fa,
        thresholds=range(75, 86),     # test thresholds 75–85 inclusive
        sample_n=10000,
        seed=42,
        min_words_for_matching=5,     # eligible for duplicate comparison
        min_words_robust=10,          # force <10 words to non-duplicate
        save_path="data/duplicate_threshold_summary_10k.csv"
    )


Loading dataset...
Evaluating on 10,000 Persian tweets (8,826 eligible for matching; 1,174 below 5 words).


                                                                   

Marked 2841 short tweets (<10 words) as non-duplicates
Marked 5 single-member groups as non-duplicates
Threshold 75: 131 groups | duplicates = 387 (3.87%)


                                                                   

Marked 2841 short tweets (<10 words) as non-duplicates
Marked 5 single-member groups as non-duplicates
Threshold 76: 132 groups | duplicates = 386 (3.86%)


                                                                   

Marked 2841 short tweets (<10 words) as non-duplicates
Marked 5 single-member groups as non-duplicates
Threshold 77: 132 groups | duplicates = 385 (3.85%)


                                                                   

Marked 2841 short tweets (<10 words) as non-duplicates
Marked 5 single-member groups as non-duplicates
Threshold 78: 131 groups | duplicates = 383 (3.83%)


                                                                   

Marked 2841 short tweets (<10 words) as non-duplicates
Marked 4 single-member groups as non-duplicates
Threshold 79: 128 groups | duplicates = 377 (3.77%)


                                                                   

Marked 2841 short tweets (<10 words) as non-duplicates
Marked 3 single-member groups as non-duplicates
Threshold 80: 128 groups | duplicates = 377 (3.77%)


                                                                   

Marked 2841 short tweets (<10 words) as non-duplicates
Marked 3 single-member groups as non-duplicates
Threshold 81: 128 groups | duplicates = 373 (3.73%)


                                                                   

Marked 2841 short tweets (<10 words) as non-duplicates
Marked 2 single-member groups as non-duplicates
Threshold 82: 125 groups | duplicates = 363 (3.63%)


                                                                   

Marked 2841 short tweets (<10 words) as non-duplicates
Marked 1 single-member groups as non-duplicates
Threshold 83: 125 groups | duplicates = 362 (3.62%)


                                                                   

Marked 2841 short tweets (<10 words) as non-duplicates
Marked 0 single-member groups as non-duplicates
Threshold 84: 123 groups | duplicates = 356 (3.56%)


                                                                   

Marked 2841 short tweets (<10 words) as non-duplicates
Marked 0 single-member groups as non-duplicates
Threshold 85: 122 groups | duplicates = 350 (3.50%)

✅ Results saved to: data/duplicate_threshold_summary_10k_20251029_111220.csv

Summary:
  Threshold  Duplicate_Groups  Duplicate_Tweets  Total_Tweets  %Duplicates  %Non_Duplicates
        75               131               387         10000         3.87            96.13
        76               132               386         10000         3.86            96.14
        77               132               385         10000         3.85            96.15
        78               131               383         10000         3.83            96.17
        79               128               377         10000         3.77            96.23
        80               128               377         10000         3.77            96.23
        81               128               373         10000         3.73            96.27
        82               125



In [None]:
import pandas as pd
import numpy as np
from rapidfuzz import fuzz
from tqdm import tqdm
import re
import random
from datetime import datetime

# -----------------------------
# Basic cleaning (no normalization)
# -----------------------------
URL_RE = re.compile(r'http\S+')
MENTION_RE = re.compile(r'@\w+')
HASHTAG_RE = re.compile(r'#\w+')

def clean_tweet(text: str) -> str:
    """Lowercase a tweet and strip URLs, @mentions and #hashtags.

    The input is coerced with ``str()`` first, so non-string values
    (e.g. ``None``) are cleaned as their string representation. Runs of
    whitespace are collapsed to single spaces and the ends are trimmed.
    """
    cleaned = str(text).lower()
    # Order matters: URLs go first so '@'/'#' inside a link vanish with it.
    for pattern in (URL_RE, MENTION_RE, HASHTAG_RE):
        cleaned = pattern.sub('', cleaned)
    # split()/join collapses whitespace and leaves no leading/trailing space.
    return ' '.join(cleaned.split())

# -----------------------------
# Robustness helpers
# -----------------------------
def clean_short_tweets(df: pd.DataFrame, min_words: int = 10) -> pd.DataFrame:
    """Return a copy of ``df`` with short tweets reset to non-duplicate.

    Rows whose ``word_count`` is below ``min_words`` get ``group`` and
    ``group_rank`` set to -1 and ``is_duplicate`` set to False. The input
    frame is left untouched; a one-line summary is printed.
    """
    result = df.copy()
    too_short = result['word_count'] < min_words
    resets = (('group', -1), ('group_rank', -1), ('is_duplicate', False))
    for column, reset_value in resets:
        result.loc[too_short, column] = reset_value
    print(f"Marked {too_short.sum()} short tweets (<{min_words} words) as non-duplicates")
    return result

def mark_single_member_groups_as_non_duplicates(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` where groups with exactly one row are dissolved.

    Group id -1 means "no group" and is ignored when counting. Every row
    belonging to a one-member group gets ``group``/``group_rank`` = -1 and
    ``is_duplicate`` = False. The number of dissolved groups is printed.
    """
    result = df.copy()
    assigned = result.loc[result['group'] != -1, 'group']
    sizes = assigned.value_counts()
    lonely_groups = sizes.loc[sizes == 1].index
    if len(lonely_groups) > 0:
        in_lonely = result['group'].isin(lonely_groups)
        for column, reset_value in (('group', -1), ('group_rank', -1), ('is_duplicate', False)):
            result.loc[in_lonely, column] = reset_value
    print(f"Marked {len(lonely_groups)} single-member groups as non-duplicates")
    return result

# -----------------------------
# Duplicate detection (single pass, no chunks)
# -----------------------------
def group_duplicates_single(df_proc: pd.DataFrame, threshold: int) -> pd.Series:
    """
    Greedily cluster near-duplicate tweets in a single pass.

    Each tweet that is not yet grouped acts as a seed and is compared with
    ``fuzz.ratio`` against every later, not-yet-grouped tweet whose cleaned
    length is within ±30% of the seed's length. Candidates scoring at least
    ``threshold`` join the seed's group. Candidates are compared against the
    seed only, never against other members of the group.

    Parameters
    ----------
    df_proc : DataFrame with a ``tweet_clean`` column of strings.
    threshold : minimum ``fuzz.ratio`` score for two tweets to be grouped.

    Returns
    -------
    Series aligned with ``df_proc.index``. Members of multi-member groups
    carry a consecutive non-negative group id; all other rows carry -1.
    """
    tweet_list = df_proc['tweet_clean'].tolist()
    indices = df_proc.index.tolist()
    # Lengths are needed for every pair; compute them once up front.
    lengths = [len(t) for t in tweet_list]
    n_tweets = len(tweet_list)
    groups = {}  # idx -> group_id (use smallest idx in group)

    for i in tqdm(range(n_tweets), desc=f"Scanning @{threshold}", leave=False):
        idx_i = indices[i]
        if idx_i in groups:
            continue

        ti = tweet_list[i]
        li = lengths[i]
        if li == 0:
            continue

        max_len_diff = 0.3 * li  # length prefilter (±30% of the seed length)
        members = [idx_i]

        for j in range(i + 1, n_tweets):
            idx_j = indices[j]
            if idx_j in groups:
                continue
            if abs(lengths[j] - li) > max_len_diff:
                continue

            # score_cutoff lets rapidfuzz abandon hopeless pairs early; it
            # returns 0 for any pair scoring below the cutoff, so the
            # comparison below gives the same answer as the uncapped score.
            score = fuzz.ratio(ti, tweet_list[j], score_cutoff=threshold)
            if score >= threshold:
                members.append(idx_j)

        if len(members) > 1:
            gid = min(members)
            for m in members:
                groups[m] = gid

    # Build group Series (-1 for every index label that was never grouped)
    group_ser = pd.Series([groups.get(idx, -1) for idx in indices],
                          index=df_proc.index, dtype=int)
    if groups:
        # remove singleton groups (defensive)
        counts = group_ser[group_ser != -1].value_counts()
        singletons = counts[counts == 1].index
        group_ser.loc[group_ser.isin(singletons)] = -1

        # relabel consecutive: ids become 0..k-1 in order of the old ids
        valid = group_ser[group_ser != -1].unique()
        mapping = {old: new for new, old in enumerate(sorted(valid))}
        group_ser = group_ser.map(lambda x: mapping.get(x, -1))

    return group_ser

# -----------------------------
# Threshold sweep with saving + robustness
# -----------------------------
def evaluate_thresholds(df_fa: pd.DataFrame,
                        thresholds=range(75, 86),
                        sample_n=10000,
                        seed=42,
                        min_words_for_matching=5,
                        min_words_robust=10,
                        save_path="data/duplicate_threshold_summary.csv"):
    """
    Sweep fuzzy-match thresholds on a random sample and summarise duplicate rates.

    Parameters
    ----------
    df_fa : DataFrame with a ``tweet`` column.
    thresholds : iterable of ``fuzz.ratio`` cut-offs to evaluate.
    sample_n : maximum number of rows to sample from ``df_fa``.
    seed : random state used for sampling.
    min_words_for_matching : tweets with fewer cleaned words are excluded
        from pairwise matching.
    min_words_robust : after matching, tweets with fewer cleaned words are
        forced to non-duplicate.
    save_path : base CSV path; a ``_YYYYmmdd_HHMMSS`` timestamp is inserted
        before the file extension so earlier runs are not overwritten. The
        parent directory is created when missing.

    Returns
    -------
    DataFrame with one row per threshold: number of duplicate groups,
    number of duplicate tweets, sample size, and duplicate/non-duplicate
    percentages (relative to the whole sample).
    """
    import os  # local import: only used for output-path handling

    # Sample at most `sample_n` rows
    df = df_fa.sample(min(sample_n, len(df_fa)), random_state=seed).copy()
    df['tweet_clean'] = df['tweet'].map(clean_tweet)
    df['word_count'] = df['tweet_clean'].str.split().str.len()

    eligible_mask = df['word_count'] >= min_words_for_matching
    df_proc = df[eligible_mask].copy()
    n_short = int((~eligible_mask).sum())  # only the count is needed
    N_total = len(df)

    print(f"Evaluating on {N_total:,} Persian tweets "
          f"({len(df_proc):,} eligible for matching; {n_short:,} below {min_words_for_matching} words).")

    results = []
    for thr in thresholds:
        if len(df_proc) == 0:
            pct_dup = 0.0
            n_groups = 0
            dup_count = 0
        else:
            # 1. Group duplicates on eligible tweets
            groups = group_duplicates_single(df_proc, threshold=thr)
            work = df.copy()
            work['group'] = -1
            work.loc[groups.index, 'group'] = groups.values
            work['group_rank'] = -1
            work['is_duplicate'] = work['group'] != -1

            # 2. Apply robustness checks (short tweets out, then dissolve
            #    any group that was left with a single member)
            work = clean_short_tweets(work, min_words=min_words_robust)
            work = mark_single_member_groups_as_non_duplicates(work)

            # 3. Compute counts
            dup_mask = work['group'] != -1
            dup_count = dup_mask.sum()
            n_groups = work.loc[dup_mask, 'group'].nunique()
            pct_dup = 100.0 * dup_count / N_total

        pct_non = 100.0 - pct_dup
        results.append((thr, n_groups, dup_count, N_total, pct_dup, pct_non))
        print(f"Threshold {thr}: {n_groups} groups | duplicates = {dup_count:,} ({pct_dup:.2f}%)")

    out = pd.DataFrame(results, columns=[
        "Threshold", "Duplicate_Groups", "Duplicate_Tweets", "Total_Tweets", "%Duplicates", "%Non_Duplicates"
    ])

    # Save results with timestamp. splitext only touches the real file
    # extension, so a ".csv" elsewhere in the path is left alone and a
    # non-.csv name still receives its timestamp.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    root, ext = os.path.splitext(save_path)
    save_file = f"{root}_{timestamp}{ext}"
    out_dir = os.path.dirname(save_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    out.to_csv(save_file, index=False)
    print(f"\n✅ Results saved to: {save_file}")
    print("\nSummary:\n", out.to_string(index=False))
    return out

# -----------------------------
# Main
# -----------------------------
if __name__ == "__main__":
    # NOTE: dataset loading is disabled below, so `df_fa` must already be
    # defined (e.g. by a previously executed notebook cell); run standalone,
    # this block would fail on the undefined name.
    # print("Loading dataset...")
    # df_all = pd.read_pickle("data/persian_english_tweets_onehashtag_twomonths_processed.pkl")
    # df_fa = df_all[df_all["lang"] == "fa"].dropna(subset=["tweet"]).reset_index(drop=True)

    # Run the sweep on a 100k sample; the returned summary frame is
    # discarded because evaluate_thresholds already prints and saves it.
    _ = evaluate_thresholds(
        df_fa,
        thresholds=range(75, 91),     # test thresholds 75–90 inclusive
        sample_n=100000,
        seed=42,
        min_words_for_matching=5,     # eligible for duplicate comparison
        min_words_robust=10,          # force <10 words to non-duplicate
        save_path="data/duplicate_threshold_summary_100k.csv"
    )


Evaluating on 100,000 Persian tweets (88,390 eligible for matching; 11,610 below 5 words).


                                                                     

Marked 28718 short tweets (<10 words) as non-duplicates
Marked 34 single-member groups as non-duplicates
Threshold 86: 2669 groups | duplicates = 10,028 (10.03%)


                                                                     

Marked 28718 short tweets (<10 words) as non-duplicates
Marked 23 single-member groups as non-duplicates
Threshold 87: 2675 groups | duplicates = 9,959 (9.96%)


                                                                     

Marked 28718 short tweets (<10 words) as non-duplicates
Marked 20 single-member groups as non-duplicates
Threshold 88: 2668 groups | duplicates = 9,853 (9.85%)


                                                                     

Marked 28718 short tweets (<10 words) as non-duplicates
Marked 18 single-member groups as non-duplicates
Threshold 89: 2646 groups | duplicates = 9,705 (9.71%)


                                                                     

Marked 28718 short tweets (<10 words) as non-duplicates
Marked 18 single-member groups as non-duplicates
Threshold 90: 2628 groups | duplicates = 9,568 (9.57%)

✅ Results saved to: data/duplicate_threshold_summary_100k_2_20251030_131524.csv

Summary:
  Threshold  Duplicate_Groups  Duplicate_Tweets  Total_Tweets  %Duplicates  %Non_Duplicates
        86              2669             10028        100000       10.028           89.972
        87              2675              9959        100000        9.959           90.041
        88              2668              9853        100000        9.853           90.147
        89              2646              9705        100000        9.705           90.295
        90              2628              9568        100000        9.568           90.432
