In [None]:
# Standard library imports
import os
import time

# Third-party imports
import numpy as np
import pandas as pd
from tqdm import tqdm

# Local imports
from utils.general import printt, print_sucess, N_PROCESSES
from utils.data_load_extract import (
    ensure_directory_exists_and_is_empty,
    extract_sources_from_suspicious_xml
    )
from utils.preprocessing import preprocessing_data
from utils.filtering import document_filtering, sentence_filtering
from utils.metrics import similarity_computation, evaluate_detection, overlap

In [None]:
# Run configuration: everything a reader might want to tune lives in this cell.

# Limits handed to preprocessing_data for the source and the suspicious corpus.
SOURCE_LIMIT = 64
SUSPICIOUS_LIMIT = 8

# Similarity cut-offs looped over in step 8.
# Alternative sweep: [0.4, 0.45, 0.5, 0.55, 0.60, 0.65, 0.70]
THRESHOLD_LIST = [0.65]

# Passed as the last argument of similarity_computation in step 7.
BETA = 0.5

# Stage switches: each flag guards both the ensure_directory_exists_and_is_empty call for
# that stage's working directory and the re-run of the stage itself.
reload_sources = reload_suspicious = reload_sent_matches = True

# Text file opened in append mode when the scores are saved.
results_filename = "data_for_figure_20.txt"

In [None]:
# 1. Initial setup (timer, paths and number of processes for parallelization)
# Start the wall-clock timer: the reporting code at the end of the notebook computes
# `end_time - start_time`, so `start_time` has to exist before any work is done.
start_time = time.time()
printt("Starting the algorithm...")
# Getting the current directory path and setting up the base path for data.
current_dir = os.getcwd()
base_data_path = os.path.join(current_dir, "pan-plagiarism-corpus-2011/external-detection-corpus")
# Directory whose files are listed and read with pd.read_parquet during the evaluation step.
sent_matches_dir = os.path.join(current_dir, "sent-matches-dir")
# Print the number of processes that will be used for parallel operations.
printt(f"N_PROCESSES: {N_PROCESSES}")

In [None]:
# 2. Directory setup
printt("Setting up directories...")

# Raw corpus folders, located under base_data_path.
source_path = os.path.join(base_data_path, "source-document")
suspicious_path = os.path.join(base_data_path, "suspicious-document")

# Folders for the cleaned documents, located under the current working directory.
cleaned_source_path = os.path.join(current_dir, "cleaned-source-documents")
cleaned_suspicious_path = os.path.join(current_dir, "cleaned-suspicious-documents")

# Prepare only the working directories whose stage was requested to be reloaded.
for reset_requested, working_dir in (
    (reload_sources, cleaned_source_path),
    (reload_suspicious, cleaned_suspicious_path),
    (reload_sent_matches, sent_matches_dir),
):
    if reset_requested:
        ensure_directory_exists_and_is_empty(working_dir)

In [None]:
# 3. Extract real results from XML files to understand the ground truth of plagiarism references
printt("Extracting reference results...")
# Extract and store the plagiarism references from the suspicious XML files.
# The evaluation step filters this table on "suspicious_filename" / "source_filename" and reads
# its "suspicious_offset", "suspicious_length", "source_offset" and "source_length" columns.
df_references = extract_sources_from_suspicious_xml(suspicious_path)

In [None]:
# Preview the first rows of the reference table as a quick sanity check.
df_references.head()

In [None]:
# Step 4: Preprocess the raw data to prepare it for the subsequent analysis
printt("Preprocessing data...")

# One entry per corpus: log message, reload switch, raw folder, cleaned folder, document limit.
preprocessing_jobs = (
    ("Source documents preprocessing...", reload_sources,
     source_path, cleaned_source_path, SOURCE_LIMIT),
    ("Suspicious documents preprocessing...", reload_suspicious,
     suspicious_path, cleaned_suspicious_path, SUSPICIOUS_LIMIT),
)
for message, reload_requested, raw_dir, cleaned_dir, limit in preprocessing_jobs:
    # The message is logged even when the stage itself is skipped.
    printt(message)
    if reload_requested:
        preprocessing_data(raw_dir, cleaned_dir, limit)

In [None]:
# Step 5: Identify potential source documents for each suspicious document
printt("Filtering documents...")
# The result is handed to sentence_filtering in step 6.
# NOTE(review): presumably maps each suspicious document to its candidate sources — confirm
# against utils.filtering.
suspicious_to_sources = document_filtering(cleaned_suspicious_path, cleaned_source_path)

In [None]:
# Step 6: Filter sentences based on potential plagiarisms
printt("Filtering sentences...")
# Only run when reload_sent_matches is set (the same flag that guards the preparation of
# sent_matches_dir in step 2); otherwise whatever is already in sent_matches_dir is reused.
# NOTE(review): presumably writes its matches into sent_matches_dir — confirm against
# utils.filtering.
if reload_sent_matches:
    sentence_filtering(cleaned_suspicious_path, cleaned_source_path, suspicious_to_sources, sent_matches_dir)

In [None]:
# Step 7: Compute and save similarity scores between suspicious and source sentences
printt("Processing similarity calculations...")
# NOTE(review): output_dir is not passed to similarity_computation and is not referenced again
# in the visible notebook — confirm whether it is still needed.
output_dir = os.path.join(current_dir, "sentences-detections")
# No return value is captured here. The evaluation step later reads a "hybrid_similarity"
# column from the files in sent_matches_dir; presumably this call writes it — TODO confirm.
similarity_computation(cleaned_suspicious_path, cleaned_source_path, sent_matches_dir, BETA)

In [None]:
# Step 8: Compare the stored sentence matches with the reference annotations.
# TODO: refactor step 8 to work better with step 6 refactored
#
# For every threshold, each sentence-match file is read, reduced to the rows whose
# "hybrid_similarity" reaches the threshold, and every remaining row is counted as
#   - TP when both its suspicious span and its source span overlap a reference of the same
#     (suspicious_filename, source_filename) pair,
#   - FP otherwise.
# NOTE(review): after the loop, TP / FP / detected_cases_count / THRESHOLD hold the values of
# the LAST threshold in THRESHOLD_LIST only.
for THRESHOLD in THRESHOLD_LIST:
    printt(f"{SOURCE_LIMIT=}, {SUSPICIOUS_LIMIT=}, {THRESHOLD=}, {BETA=}")

    # Counters span all sentence-match files of this threshold, so they are initialised once
    # per threshold (initialising them per file would keep only the last file's counts).
    TP, FP, FN = 0, 0, 0
    detected_cases_count = dict()

    # 8. Evaluate detections
    printt("Analysis of detections")
    sent_matches_filenames = os.listdir(sent_matches_dir)
    for sent_matches_filename in sent_matches_filenames:
        # os.listdir returns bare entry names; join them with the directory before reading.
        df = pd.read_parquet(os.path.join(sent_matches_dir, sent_matches_filename))
        df = df[df["hybrid_similarity"] >= THRESHOLD]

        printt(f"Evaluation of detections of {sent_matches_filename}")
        for idx in tqdm(range(len(df)), unit="row"):
            row = df.iloc[idx]

            # Scalars of the current detection (not whole columns): the comparisons below
            # and the dictionary key need single, hashable values.
            suspicious_filename = row["suspicious_filename"]
            suspicious_offset = row["suspicious_offset"]
            suspicious_length = row["suspicious_length"]

            source_filename = row["source_filename"]
            source_offset = row["source_offset"]
            source_length = row["source_length"]

            # Filter by filenames
            matching_rows = df_references[
                (df_references["suspicious_filename"] == suspicious_filename)
                & (df_references["source_filename"] == source_filename)
            ]

            # Search using overlap function: the detection is correct as soon as one
            # reference overlaps it on both the suspicious and the source side.
            overlap_found = any(
                overlap(
                    suspicious_offset,
                    suspicious_offset + suspicious_length,
                    reference["suspicious_offset"],
                    reference["suspicious_offset"] + reference["suspicious_length"],
                )
                and overlap(
                    source_offset,
                    source_offset + source_length,
                    reference["source_offset"],
                    reference["source_offset"] + reference["source_length"],
                )
                for _, reference in matching_rows.iterrows()
            )

            if overlap_found:
                TP += 1
                ref_key = (suspicious_filename, source_filename)  # Use file pair as a unique key
                detected_cases_count[ref_key] = detected_cases_count.get(ref_key, 0) + 1
            else:
                FP += 1
S_R would be the set of all references that were detected.
R_S would be the set of all detections that correctly match a reference.


In [None]:
    # NOTE(review): the statements that used to be here were an indented copy of the
    # overlap search / TP-FP counting that already runs inside the step-8 evaluation loop.
    # Starting a cell at an indented column made the cell unparseable, and executing the copy
    # after the loop would have counted the last detection a second time, so it was removed.
# Step 8 (continued): turn the counts of the evaluation loop into scores.
# Inputs: TP, FP and detected_cases_count from the evaluation loop, df_references from step 3.

# References without a matching detection are false negatives. TP counts detections, and
# several detections can match the same reference, so the difference is clamped at 0.
FN = max(len(df_references) - TP, 0)

# Granularity: average number of correct detections per detected (suspicious, source) pair.
sum_Rs = sum(detected_cases_count.values())
abs_SR = len(detected_cases_count)
# Every detected pair holds at least one detection, so granularity is >= 1 whenever it is
# defined. Falling back to 1 (instead of 0) when nothing was detected keeps
# log2(1 + granularity) away from 0 and therefore avoids a division by zero below.
granularity = sum_Rs / abs_SR if abs_SR != 0 else 1

# Calculate precision, recall, f1-score, plagdet
precision = TP / (TP + FP) if TP + FP != 0 else 0
recall = TP / (TP + FN) if TP + FN != 0 else 0
f1_score = (
    2 * (precision * recall) / (precision + recall)
    if precision + recall != 0
    else 0
)
# plagdet = F1 discounted by the granularity; equals F1 when granularity is exactly 1.
plagdet = f1_score / np.log2(1 + granularity)


In [None]:
# 8. Analysis of detections
# Reports the precision / recall / f1_score / plagdet values computed from TP, FP and FN
# earlier in the notebook. evaluate_detection is not called here: it needs an in-memory
# detections DataFrame, and no such frame is defined anywhere in this notebook (the detections
# are read file by file from sent_matches_dir).
printt("Evaluate detections...")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1_score:.4f}")
print(f"Plagdet-Score: {plagdet:.4f}")

# 9. Save results
# Appends one comma-separated line per run; THRESHOLD is the loop variable left over from the
# evaluation loop, i.e. the last entry of THRESHOLD_LIST.
printt(f"Saving results to {results_filename}")
with open(results_filename, "a") as f:
    f.write(
        f"{SUSPICIOUS_LIMIT}, {THRESHOLD}, {BETA}, {precision}, {recall}, {f1_score}, {plagdet}\n"
    )

# NOTE(review): start_time must have been assigned with time.time() before step 1 ran;
# this line raises NameError otherwise.
end_time = time.time()
diff_time = end_time - start_time
print(f"Time taken for Figure 20 replication: {diff_time:.2f} seconds")

print_sucess()


In [None]:
        # NOTE(review): this cell held a verbatim copy of the "8. Analysis of detections /
        # 9. Save results / timing" statements of the cell directly above. It began at an
        # indented column, so it could not be executed on its own, and running it would have
        # appended a second, identical row to `results_filename`. The copy was removed;
        # this cell can be deleted.