# Similaridade de Jaccard

## Configurações iniciais

In [6]:
import os.path
import pandas as pd

# Base directory of the cleaned datasets
base_dir = './data/02-cleaned/'

# Dataset name -> path of its CSV file inside base_dir
datasets_path = {
    name: os.path.join(base_dir, file_name)
    for name, file_name in (
        ('Fortuna', 'fortuna.csv'),
        ('HateBRXplain', 'hatebrxplain.csv'),
        ('OffComBR', 'offcombr3.csv'),
        ('OLID-BR', 'olidbr.csv'),
        ('ToLD-BR', 'toldbr.csv'),
        ('TuPy', 'tupy.csv'),
    )
}

# Dataset name -> DataFrame read from the corresponding CSV (same key order)
datasets_df = {
    name: pd.read_csv(csv_path)
    for name, csv_path in datasets_path.items()
}

In [7]:
def _vocabulary(df, col_text):
    """Return the set of unique lowercase, whitespace-separated tokens in df[col_text]."""
    vocab = set()
    # Drop missing values BEFORE casting to str: astype(str) would otherwise
    # turn them into literal tokens such as 'nan', which would then be counted
    # as vocabulary shared by every dataset that has a missing text.
    for text in df[col_text].dropna().astype(str):
        vocab.update(text.lower().split())
    return vocab


def get_jaccard_index(df1, df2, col_text='text'):
    """Compute the Jaccard similarity between the vocabularies of two DataFrames.

    The vocabulary of a DataFrame is the set of unique tokens obtained by
    lowercasing every non-missing value of ``col_text`` and splitting it on
    whitespace.

    Args:
        df1: First DataFrame; must contain the column ``col_text``.
        df2: Second DataFrame; must contain the column ``col_text``.
        col_text: Name of the text column to tokenise (default ``'text'``).

    Returns:
        A tuple ``(jaccard, intersection_size, union_size)`` where ``jaccard``
        is ``intersection_size / union_size``, or ``0.0`` when both
        vocabularies are empty.

    Raises:
        KeyError: If ``col_text`` is not a column of ``df1`` or ``df2``.
    """
    vocab_a = _vocabulary(df1, col_text)
    vocab_b = _vocabulary(df2, col_text)

    n_intersection = len(vocab_a & vocab_b)
    n_union = len(vocab_a | vocab_b)

    # Guard against division by zero when neither side has any token.
    jaccard = n_intersection / n_union if n_union > 0 else 0.0

    return jaccard, n_intersection, n_union

## Similaridade de Jaccard para o dataset completo

In [8]:
# Dataset names, in the order used for both the rows and the columns
dataset_names = list(datasets_df)

print("Calculating Jaccard Similarity Matrix...")

# Pairwise comparison of every dataset against every dataset (itself included).
# get_jaccard_index returns (score, intersection size, union size); only the
# score at position 0 goes into the matrix.
jaccard_matrix = [
    [
        get_jaccard_index(datasets_df[row_name], datasets_df[col_name])[0]
        for col_name in dataset_names
    ]
    for row_name in dataset_names
]

# Square DataFrame labelled by dataset name on both axes
jaccard_df = pd.DataFrame(jaccard_matrix, index=dataset_names, columns=dataset_names)

print("Done!")

# Render as a heatmap; axis=None applies one colour scale to the whole table
jaccard_df.style.background_gradient(cmap='Greens', axis=None).format("{:.2%}")

Calculating Jaccard Similarity Matrix...
Done!


Unnamed: 0,Fortuna,HateBRXplain,OffComBR,OLID-BR,ToLD-BR,TuPy
Fortuna,100.00%,17.32%,9.00%,18.15%,16.64%,18.46%
HateBRXplain,17.32%,100.00%,8.99%,18.56%,16.02%,17.95%
OffComBR,9.00%,8.99%,100.00%,7.77%,5.94%,8.20%
OLID-BR,18.15%,18.56%,7.77%,100.00%,19.89%,19.82%
ToLD-BR,16.64%,16.02%,5.94%,19.89%,100.00%,20.71%
TuPy,18.46%,17.95%,8.20%,19.82%,20.71%,100.00%


## Similaridade de Jaccard para TOXIC


In [10]:
# 1. Get the list of dataset names (row/column order of the matrix)
dataset_names = list(datasets_df.keys())

print("Calculating Jaccard Similarity Matrix (Toxic Only)...")

# 2. Filter every dataset ONCE, keeping only rows flagged as toxic.
#    The filter does not depend on the pair being compared, so doing it here
#    avoids re-filtering dataset B on every iteration of the inner loop.
#    `== True` is an elementwise comparison and is kept as-is so the selection
#    is unchanged (presumably is_toxic holds 0/1 or booleans - confirm).
toxic_subsets = {
    name: df[df['is_toxic'] == True]
    for name, df in datasets_df.items()
}

# 3. Nested loop to compare every toxic subset against every other one
jaccard_matrix = []
for name_a in dataset_names:
    row_scores = []

    for name_b in dataset_names:
        # Only the score is stored; the intersection/union sizes are unpacked but unused
        score, intersection_count, union_count = get_jaccard_index(
            toxic_subsets[name_a], toxic_subsets[name_b]
        )
        row_scores.append(score)

    jaccard_matrix.append(row_scores)

# 4. Create the final DataFrame
jaccard_df = pd.DataFrame(jaccard_matrix, index=dataset_names, columns=dataset_names)

print("Done!")

# 5. Display with a heatmap for better visualization
jaccard_df.style.background_gradient(cmap='Greens', axis=None).format("{:.2%}")

Calculating Jaccard Similarity Matrix (Toxic Only)...
Done!


Unnamed: 0,Fortuna,HateBRXplain,OffComBR,OLID-BR,ToLD-BR,TuPy
Fortuna,100.00%,15.85%,5.73%,14.07%,16.14%,16.95%
HateBRXplain,15.85%,100.00%,4.16%,17.28%,16.86%,16.86%
OffComBR,5.73%,4.16%,100.00%,2.66%,3.88%,6.98%
OLID-BR,14.07%,17.28%,2.66%,100.00%,19.95%,12.81%
ToLD-BR,16.14%,16.86%,3.88%,19.95%,100.00%,15.45%
TuPy,16.95%,16.86%,6.98%,12.81%,15.45%,100.00%


## Similaridade de Jaccard para NON-TOXIC

In [11]:
import pandas as pd

# 1. Get the list of dataset names (row/column order of the matrix)
dataset_names = list(datasets_df.keys())

print("Calculating Jaccard Similarity Matrix (Non-toxic Only)...")

# 2. Filter every dataset ONCE, keeping only rows flagged as NOT toxic.
#    The filter does not depend on the pair being compared, so doing it here
#    avoids re-filtering dataset B on every iteration of the inner loop.
#    `== False` is an elementwise comparison and is kept as-is so the selection
#    is unchanged (presumably is_toxic holds 0/1 or booleans - confirm).
non_toxic_subsets = {
    name: df[df['is_toxic'] == False]
    for name, df in datasets_df.items()
}

# 3. Nested loop to compare every non-toxic subset against every other one
jaccard_matrix = []
for name_a in dataset_names:
    row_scores = []

    for name_b in dataset_names:
        # Calculate Jaccard index using ONLY the non-toxic subsets;
        # the intersection/union sizes are unpacked but unused
        score, intersection_count, union_count = get_jaccard_index(
            non_toxic_subsets[name_a], non_toxic_subsets[name_b]
        )
        row_scores.append(score)

    jaccard_matrix.append(row_scores)

# 4. Create the final DataFrame
jaccard_df = pd.DataFrame(jaccard_matrix, index=dataset_names, columns=dataset_names)

print("Done!")

# 5. Display with a heatmap for better visualization
jaccard_df.style.background_gradient(cmap='Greens', axis=None).format("{:.2%}")

Calculating Jaccard Similarity Matrix (Non-toxic Only)...
Done!


Unnamed: 0,Fortuna,HateBRXplain,OffComBR,OLID-BR,ToLD-BR,TuPy
Fortuna,100.00%,15.80%,9.52%,15.10%,15.89%,17.24%
HateBRXplain,15.80%,100.00%,10.76%,15.50%,13.21%,14.75%
OffComBR,9.52%,10.76%,100.00%,13.49%,6.22%,8.01%
OLID-BR,15.10%,15.50%,13.49%,100.00%,10.88%,12.86%
ToLD-BR,15.89%,13.21%,6.22%,10.88%,100.00%,20.66%
TuPy,17.24%,14.75%,8.01%,12.86%,20.66%,100.00%
