In [2]:
# Standard library imports
import os
import time

# Local imports
from utils.general import printt, print_sucess, N_PROCESSES
from utils.data_load_extract import (
    ensure_directory_exists_and_is_empty,
    extract_sources_from_suspicious_xml
    )
from utils.preprocessing import preprocessing_data
from utils.filtering import document_filtering, sentence_filtering
from utils.metrics import similarity_computation, evaluate_detection

In [3]:
# --- Run configuration ---------------------------------------------------------
# Cache switches: when a flag is True, the directory-setup cell empties the matching
# working directory and the corresponding stage is executed again; when False the
# stage is skipped and whatever is already on disk is reused.
reload_sources = reload_suspicious = reload_sent_matches = True

# The evaluation cell appends one line per threshold to this file.
results_filename = "data_for_figure_20.txt"

# Limits handed to preprocessing_data for the source / suspicious side respectively.
SOURCE_LIMIT, SUSPICIOUS_LIMIT = 64, 8

# Similarity cut-offs to evaluate. Full sweep: [0.4, 0.45, 0.5, 0.55, 0.60, 0.65, 0.70]
THRESHOLD_LIST = [0.65]

# Forwarded to similarity_computation.
BETA = 0.5

In [4]:
# 1. Initial setup (paths and number of processes for parallelization)
printt("Starting the algorithm...")
# Every path below is resolved against the current working directory, so the
# corpus folder is expected to be reachable from wherever the kernel is running.
current_dir = os.getcwd()
# Root of the external-detection corpus; the raw source and suspicious
# sub-directories are derived from it in the directory-setup cell.
base_data_path = os.path.join(current_dir, "pan-plagiarism-corpus-2011/external-detection-corpus")
# Working directory for the per-document sentence-match files: it is passed to
# sentence_filtering and similarity_computation and read back by the evaluation cell.
sent_matches_dir = os.path.join(current_dir, "sent-matches-dir")
# Print the number of processes that will be used for parallel operations.
# N_PROCESSES is imported from utils.general, not computed in this notebook.
printt(f"N_PROCESSES: {N_PROCESSES}")

[22:39:38] Starting the algorithm...
[22:39:38] N_PROCESSES: 6


In [5]:
# 2. Directory setup
printt("Setting up directories...")

# Raw inputs are read from the corpus folder.
source_path = os.path.join(base_data_path, "source-document")
suspicious_path = os.path.join(base_data_path, "suspicious-document")

# Cleaned copies are kept under the working directory.
cleaned_source_path = os.path.join(current_dir, "cleaned-source-documents")
cleaned_suspicious_path = os.path.join(current_dir, "cleaned-suspicious-documents")

# Reset only the working directories whose reload flag is switched on; the
# others keep their current content so the matching stage can be skipped.
directories_to_reset = (
    (reload_sources, cleaned_source_path),
    (reload_suspicious, cleaned_suspicious_path),
    (reload_sent_matches, sent_matches_dir),
)
for should_reset, directory in directories_to_reset:
    if should_reset:
        ensure_directory_exists_and_is_empty(directory)

[22:39:39] Setting up directories...


In [6]:
# Step 3: Preprocess the raw data to prepare it for the subsequent analysis
printt("Preprocessing data...")

# One entry per corpus side: (log message, reload flag, raw dir, cleaned dir, document limit).
# The log message is printed even when the stage itself is skipped by its flag.
preprocessing_stages = (
    ("Source documents preprocessing...", reload_sources, source_path, cleaned_source_path, SOURCE_LIMIT),
    ("Suspicious documents preprocessing...", reload_suspicious, suspicious_path, cleaned_suspicious_path, SUSPICIOUS_LIMIT),
)
for stage_message, should_run, raw_dir, cleaned_dir, document_limit in preprocessing_stages:
    printt(stage_message)
    if should_run:
        preprocessing_data(raw_dir, cleaned_dir, document_limit)

[22:39:40] Preprocessing data...
[22:39:40] Source documents preprocessing...
[22:39:48] Suspicious documents preprocessing...


In [22]:
# 4. Extract real results from XML files to understand the ground truth of plagiarism references
printt("Extracting reference results...")
# Extract and store the plagiarism references from the suspicious XML files
df_references = extract_sources_from_suspicious_xml(suspicious_path)
# The cleaned documents are listed from disk while the references use the
# original .txt names, so map each listed file back to its .txt name.
# Only the extension is swapped: str.replace("parquet", "txt") would also
# rewrite a "parquet" substring appearing anywhere else in the file name.
suspicious_filenames = [
    os.path.splitext(filename)[0] + ".txt" for filename in os.listdir(cleaned_suspicious_path)
]
source_filenames = [
    os.path.splitext(filename)[0] + ".txt" for filename in os.listdir(cleaned_source_path)
]
# Keep only the reference cases whose suspicious document was actually preprocessed.
# NOTE(review): source_filenames is built but not used as a filter here, so cases
# whose source document is outside the preprocessed subset stay in the ground
# truth and count against recall — confirm this is intended.
df_references = df_references[df_references["suspicious_filename"].isin(suspicious_filenames)].reset_index(drop=True)
df_references

[22:45:35] Extracting reference results...


Unnamed: 0,suspicious_filename,source_filename,suspicious_offset,suspicious_length,source_offset,source_length
0,suspicious-document00078.txt,source-document06404.txt,268903,15039,167919,15017
1,suspicious-document00078.txt,source-document06404.txt,313039,552,7172,521
2,suspicious-document00093.txt,source-document02877.txt,2395,13396,354337,23735
3,suspicious-document00093.txt,source-document02877.txt,22982,15535,138124,15734
4,suspicious-document00093.txt,source-document02877.txt,44528,16805,378789,17017
5,suspicious-document00093.txt,source-document02877.txt,68504,22452,250193,22345
6,suspicious-document00093.txt,source-document02877.txt,96213,13548,534879,24865
7,suspicious-document00093.txt,source-document02877.txt,118528,16600,453690,18224
8,suspicious-document00093.txt,source-document02877.txt,142890,22091,187020,23072
9,suspicious-document00093.txt,source-document02877.txt,172234,18897,411167,19941


In [23]:
# Step 5: Identify potential source documents for each suspicious document
printt("Filtering documents...")
# Works on the cleaned directories only. The result is handed to
# sentence_filtering in step 6; presumably it maps every suspicious document to
# its candidate sources (inferred from the name — confirm in utils.filtering).
suspicious_to_sources = document_filtering(cleaned_suspicious_path, cleaned_source_path)

[22:48:35] Filtering documents...
[22:48:40] Starting common_tokens_matrix multiplication 
[22:48:40] Start filter index location of documents


In [24]:
# Step 6: Filter sentences based on potential plagiarisms
printt("Filtering sentences...")
# Skipped when reload_sent_matches is False; sent_matches_dir is then not emptied
# by the directory-setup cell either, so the later steps reuse the files already in it.
if reload_sent_matches:
    sentence_filtering(cleaned_suspicious_path, cleaned_source_path, suspicious_to_sources, sent_matches_dir)

[22:48:42] Filtering sentences...
[22:48:42] Fitting vectorizer for sentences filtering
[22:48:49] Calculating sparse matrices of sentences from source documents


100%|██████████| 32/32 [00:06<00:00,  4.79it/s]


[22:48:56] Matrix multiplication and matching value index location


100%|██████████| 8/8 [00:20<00:00,  2.59s/it]


In [25]:
# Step 7: Compute and save similarity scores between suspicious and source sentences
printt("Processing similarity calculations...")
# NOTE(review): output_dir is not passed to similarity_computation and no other
# visible cell reads it — confirm whether it is still needed.
output_dir = os.path.join(current_dir, "sentences-detections")
# The evaluation cell later filters the files in sent_matches_dir on a
# "hybrid_similarity" column; presumably this call is what adds it, weighted by
# BETA — confirm in utils.metrics.
similarity_computation(cleaned_suspicious_path, cleaned_source_path, sent_matches_dir, BETA)

[22:49:17] Processing similarity calculations...
[22:49:17] Loading FastText
[22:49:52] Calculating similarities for sentences of suspicious-document00285.parquet, [1/8]


100%|██████████| 20/20 [00:00<00:00, 410.25it/s]


[22:49:52] Calculating similarities for sentences of suspicious-document00050.parquet, [2/8]


100%|██████████| 823/823 [00:01<00:00, 529.90it/s]


[22:49:53] Calculating similarities for sentences of suspicious-document00246.parquet, [3/8]


100%|██████████| 225/225 [00:00<00:00, 458.54it/s]


[22:49:54] Calculating similarities for sentences of suspicious-document00093.parquet, [4/8]


100%|██████████| 1008/1008 [00:03<00:00, 327.84it/s]


[22:49:57] Calculating similarities for sentences of suspicious-document00291.parquet, [5/8]


100%|██████████| 151/151 [00:00<00:00, 1073.09it/s]


[22:49:57] Calculating similarities for sentences of suspicious-document00044.parquet, [6/8]


100%|██████████| 467/467 [00:02<00:00, 216.19it/s]


[22:49:59] Calculating similarities for sentences of suspicious-document00252.parquet, [7/8]


100%|██████████| 2301/2301 [00:06<00:00, 373.63it/s]


[22:50:05] Calculating similarities for sentences of suspicious-document00078.parquet, [8/8]


100%|██████████| 299/299 [00:00<00:00, 673.44it/s]


In [32]:
# Step 8: Evaluate the saved sentence matches against the ground truth, once per threshold
# TODO: refactor step 8 to work better with step 6 refactored
import numpy as np
import pandas as pd
from tqdm import tqdm
from utils.metrics import overlap


def _document_stem(filename):
    """Return ``filename`` without its extension so ``x.txt`` and ``x.parquet`` compare equal."""
    return os.path.splitext(str(filename))[0]


# Columns that locate a passage; the detections and df_references use the same names.
location_columns = [
    "suspicious_filename",
    "source_filename",
    "suspicious_offset",
    "suspicious_length",
    "source_offset",
    "source_length",
]

# Group the reference cases by (suspicious, source) document pair once, instead of
# re-filtering df_references for every single detection row.
# Names are compared without their extension. NOTE(review): the saved run of this
# cell reported 0.0 for every metric, i.e. no detection ever shared a document pair
# with a reference; presumably the detections carry ".parquet" names while the
# references use ".txt" — confirm that this was the cause.
references_by_pair = {}
for ref_idx, (ref_suspicious_name, ref_source_name, *ref_span) in zip(
    df_references.index,
    df_references[location_columns].itertuples(index=False, name=None),
):
    pair = (_document_stem(ref_suspicious_name), _document_stem(ref_source_name))
    references_by_pair.setdefault(pair, []).append((ref_idx, *ref_span))

for THRESHOLD in THRESHOLD_LIST:
    printt(f"{SOURCE_LIMIT=}, {SUSPICIOUS_LIMIT=}, {THRESHOLD=}, {BETA=}")
    sent_matches_filenames = os.listdir(sent_matches_dir)
    printt("Evaluate detections against ground truth")
    TP, FP = 0, 0  # Detection-level counters for this threshold value
    # S_R: labels of the reference cases hit by at least one detection.
    # R_S: the detections that hit a reference. Both hold hashable keys (index
    # labels / plain tuples): a pandas Series cannot be stored in a set.
    S_R, R_S = set(), set()
    for sent_matches_filename in sent_matches_filenames:
        sent_matches_filepath = os.path.join(sent_matches_dir, sent_matches_filename)
        df = pd.read_parquet(sent_matches_filepath)
        df = df[df["hybrid_similarity"] >= THRESHOLD]

        printt(f"Evaluate detections of {sent_matches_filename}")
        detections = df[location_columns].itertuples(index=False, name=None)
        for detection in tqdm(detections, total=len(df), unit="row"):
            (
                suspicious_filename,
                source_filename,
                suspicious_offset,
                suspicious_length,
                source_offset,
                source_length,
            ) = detection

            # Only references for the same document pair can match.
            candidates = references_by_pair.get(
                (_document_stem(suspicious_filename), _document_stem(source_filename)), ()
            )

            # A detection is credited to the first reference it overlaps on BOTH
            # the suspicious side and the source side.
            matched_ref_idx = None
            for ref_idx, ref_susp_offset, ref_susp_length, ref_src_offset, ref_src_length in candidates:
                if overlap(
                    suspicious_offset,
                    suspicious_offset + suspicious_length,
                    ref_susp_offset,
                    ref_susp_offset + ref_susp_length,
                ) and overlap(
                    source_offset,
                    source_offset + source_length,
                    ref_src_offset,
                    ref_src_offset + ref_src_length,
                ):
                    matched_ref_idx = ref_idx
                    break

            if matched_ref_idx is None:
                FP += 1
            else:
                TP += 1
                R_S.add(detection)
                S_R.add(matched_ref_idx)

    # A false negative is a reference case that no detection hit. Counting
    # len(df_references) - TP would mix reference cases with detections and can
    # go negative (pushing recall above 1) when several detections hit one case.
    FN = len(df_references) - len(S_R)
    # Average number of detections per detected reference case (1 when nothing was detected).
    granularity = len(R_S) / len(S_R) if len(S_R) != 0 else 1
    # Calculate precision, recall, f1-score, plagdet.
    # Precision is measured over detections, recall over reference cases.
    precision = TP / (TP + FP) if TP + FP != 0 else 0
    recall = len(S_R) / len(df_references) if len(df_references) != 0 else 0
    f1_score = (
        2 * (precision * recall) / (precision + recall)
        if precision + recall != 0
        else 0
    )
    plagdet = f1_score / np.log2(1 + granularity)

    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1_score:.4f}")
    print(f"Plagdet-Score: {plagdet:.4f}")

    # 9. Save results (appended, so repeated runs accumulate lines in the file)
    printt(f"Saving results to {results_filename}")
    with open(results_filename, "a") as f:
        f.write(f"{SUSPICIOUS_LIMIT}, {THRESHOLD}, {BETA}, {precision}, {recall}, {f1_score}, {plagdet}\n")

[23:04:00] SOURCE_LIMIT=64, SUSPICIOUS_LIMIT=8, THRESHOLD=0.65, BETA=0.5
[23:04:00] Evaluate detections against ground truth
[23:04:00] Evaluate detections of suspicious-document00285.parquet


100%|██████████| 20/20 [00:00<00:00, 3751.61row/s]


[23:04:00] Evaluate detections of suspicious-document00050.parquet


100%|██████████| 811/811 [00:00<00:00, 4377.75row/s]


[23:04:00] Evaluate detections of suspicious-document00246.parquet


100%|██████████| 223/223 [00:00<00:00, 4570.86row/s]


[23:04:00] Evaluate detections of suspicious-document00093.parquet


100%|██████████| 994/994 [00:00<00:00, 4586.50row/s]


[23:04:00] Evaluate detections of suspicious-document00291.parquet


100%|██████████| 144/144 [00:00<00:00, 4516.11row/s]


[23:04:00] Evaluate detections of suspicious-document00044.parquet


100%|██████████| 451/451 [00:00<00:00, 4292.30row/s]


[23:04:00] Evaluate detections of suspicious-document00252.parquet


100%|██████████| 2233/2233 [00:00<00:00, 4528.91row/s]


[23:04:01] Evaluate detections of suspicious-document00078.parquet


100%|██████████| 289/289 [00:00<00:00, 4489.11row/s]

Precision: 0.0000
Recall: 0.0000
F1-Score: 0.0000
Plagdet-Score: 0.0000
[23:04:01] Saving results to data_for_figure_20.txt





In [None]:
def evaluate_detections_v2(threshold=None):
    """Score the saved sentence matches against the ground-truth reference cases.

    Reads every file in ``sent_matches_dir``, keeps the rows whose
    ``hybrid_similarity`` is at least ``threshold`` and checks each remaining
    detection against ``df_references``. A detection counts as a true positive
    when it overlaps a reference for the same document pair on both the
    suspicious side and the source side; it is credited to the first such
    reference.

    Besides its argument, the function reads ``sent_matches_dir``,
    ``df_references``, ``printt``, ``tqdm``, ``overlap``, ``pd``, ``np`` and
    ``os`` from the notebook namespace.

    Args:
        threshold: Minimum ``hybrid_similarity`` a row needs to count as a
            detection. When omitted, the notebook-level ``THRESHOLD`` is used,
            which is what the function always did before this parameter existed.

    Returns:
        Tuple ``(precision, recall, f1_score, plagdet)``. Precision is the share
        of detections that hit a reference, recall is the share of reference
        cases hit by at least one detection, and plagdet is the F1 score divided
        by ``log2(1 + granularity)``, where granularity is the number of matching
        detections per detected reference case.
    """
    if threshold is None:
        # Backward-compatible fallback to the value left behind by the evaluation loop.
        threshold = THRESHOLD

    def stem(filename):
        # Compare document names without their extension so ".txt" and
        # ".parquet" variants of the same document are treated as equal.
        return os.path.splitext(str(filename))[0]

    # Columns that locate a passage; detections and references use the same names.
    location_columns = [
        "suspicious_filename",
        "source_filename",
        "suspicious_offset",
        "suspicious_length",
        "source_offset",
        "source_length",
    ]

    # Group the references by document pair once instead of filtering
    # df_references again for every detection row.
    references_by_pair = {}
    for ref_idx, (ref_suspicious_name, ref_source_name, *ref_span) in zip(
        df_references.index,
        df_references[location_columns].itertuples(index=False, name=None),
    ):
        pair = (stem(ref_suspicious_name), stem(ref_source_name))
        references_by_pair.setdefault(pair, []).append((ref_idx, *ref_span))

    sent_matches_filenames = os.listdir(sent_matches_dir)
    printt("Evaluate detections against ground truth")
    TP, FP = 0, 0  # Detection-level counters
    # S_R: labels of the detected reference cases; R_S: the matching detections.
    # Both hold hashable keys, because a pandas Series cannot be put in a set.
    S_R, R_S = set(), set()
    for sent_matches_filename in sent_matches_filenames:
        sent_matches_filepath = os.path.join(sent_matches_dir, sent_matches_filename)
        df = pd.read_parquet(sent_matches_filepath)
        df = df[df["hybrid_similarity"] >= threshold]

        printt(f"Evaluate detections of {sent_matches_filename}")
        detections = df[location_columns].itertuples(index=False, name=None)
        for detection in tqdm(detections, total=len(df), unit="row"):
            (
                suspicious_filename,
                source_filename,
                suspicious_offset,
                suspicious_length,
                source_offset,
                source_length,
            ) = detection

            candidates = references_by_pair.get(
                (stem(suspicious_filename), stem(source_filename)), ()
            )

            # Search using overlap function; stop at the first reference that matches.
            matched_ref_idx = None
            for ref_idx, ref_susp_offset, ref_susp_length, ref_src_offset, ref_src_length in candidates:
                if overlap(
                    suspicious_offset,
                    suspicious_offset + suspicious_length,
                    ref_susp_offset,
                    ref_susp_offset + ref_susp_length,
                ) and overlap(
                    source_offset,
                    source_offset + source_length,
                    ref_src_offset,
                    ref_src_offset + ref_src_length,
                ):
                    matched_ref_idx = ref_idx
                    break

            if matched_ref_idx is None:
                FP += 1
            else:
                TP += 1
                R_S.add(detection)
                S_R.add(matched_ref_idx)

    n_references = len(df_references)
    granularity = len(R_S) / len(S_R) if len(S_R) != 0 else 1
    # Calculate precision, recall, f1-score, plagdet.
    # Recall is taken over reference cases: using len(df_references) - TP as the
    # false-negative count would mix reference cases with detections and lets
    # recall exceed 1 when several detections hit the same case.
    precision = TP / (TP + FP) if TP + FP != 0 else 0
    recall = len(S_R) / n_references if n_references != 0 else 0
    f1_score = (
        2 * (precision * recall) / (precision + recall)
        if precision + recall != 0
        else 0
    )
    plagdet = f1_score / np.log2(1 + granularity)

    return precision, recall, f1_score, plagdet