In [1]:
# Standard library imports
import os
import time

# Local imports
from utils.general import printt, print_sucess, N_PROCESSES
from utils.data_load_extract import (
    ensure_directory_exists_and_is_empty,
    extract_sources_from_suspicious_xml,
    divide_df_sentences
    )
from utils.preprocessing import preprocessing_data
from utils.filtering import document_filtering, sentence_filtering
from utils.metrics import similarity_computation, evaluate_detection

In [2]:
reload_sources = False
reload_suspicious = False
results_filename = "data_for_figure_20.txt"
SOURCE_LIMIT = 64
SUSPICIOUS_LIMIT = 8
THRESHOLD_LIST = [0.65] # [0.4, 0.45, 0.5, 0.55, 0.60, 0.65, 0.70]
BETA = 0.5

In [3]:
# 1. Initial setup (paths and number of processes for parallelization)
printt("Starting the algorithm...")
# Getting the current directory path and setting up the base path for data.
current_dir = os.getcwd()
base_data_path = os.path.join(
    current_dir, "pan-plagiarism-corpus-2011/external-detection-corpus"
)
# Print the number of processes that will be used for parallel operations.
printt(f"N_PROCESSES: {N_PROCESSES}")


[17:37:18] Starting the algorithm...
[17:37:18] N_PROCESSES: 6


In [4]:
# 2. Directory setup
printt("Setting up directories...")
# Initialize the directories for raw and cleaned versions of both source and suspicious documents.
# Set up paths for raw and cleaned source documents.
cleaned_source_path = os.path.join(current_dir, "cleaned-source-documents")
source_path = os.path.join(base_data_path, "source-document")
# Set up paths for raw and cleaned suspicious documents.
cleaned_suspicious_path = os.path.join(current_dir, "cleaned-suspicious-documents")
suspicious_path = os.path.join(base_data_path, "suspicious-document")
if reload_sources:
    ensure_directory_exists_and_is_empty(cleaned_source_path)
if reload_suspicious:
    ensure_directory_exists_and_is_empty(cleaned_suspicious_path)

[17:37:19] Setting up directories...


In [5]:
# 3. Extract real results from XML files to understand the ground truth of plagiarism references
printt("Extracting reference results...")
# Extract and store the plagiarism references from the suspicious XML files
df_plagiarism_references = extract_sources_from_suspicious_xml(suspicious_path)

[17:37:21] Extracting reference results...


In [6]:
df_plagiarism_references.head()

Unnamed: 0,suspicious_filename,source_filename,suspicious_offset,suspicious_length,source_offset,source_length
0,suspicious-document00185.txt,source-document03291.txt,2361,3423,23508,3525
1,suspicious-document00185.txt,source-document03291.txt,47733,1967,2554,1976
2,suspicious-document00185.txt,source-document03291.txt,131081,12809,28727,19497
3,suspicious-document00185.txt,source-document03291.txt,252530,3956,8883,3975
4,suspicious-document00185.txt,source-document03291.txt,288473,801,1587,820


In [7]:
# Step 4: Preprocess the raw data to prepare it for the subsequent analysis
printt("Preprocessing data...")
printt("Source documents preprocessing...")
if reload_sources:
    preprocessing_data(source_path, cleaned_source_path, SOURCE_LIMIT)
printt("Suspicious documents preprocessing...")
if reload_suspicious:
    preprocessing_data(suspicious_path, cleaned_suspicious_path, SUSPICIOUS_LIMIT)

[17:37:25] Preprocessing data...
[17:37:25] Source documents preprocessing...
[17:37:25] Suspicious documents preprocessing...


In [8]:
import pandas as pd
df_temp = pd.read_parquet("/Users/oliverg271828/Documents/paper-replication-reloaded/hybrid-plagiarism-detector/cleaned-source-documents/source-document00008.parquet").head(5)
df_temp.head()

Unnamed: 0,sentence,length,offset,cleaned_sentence
0,﻿Hier und dort waren die Stümpfe frisch abgesä...,135,0,hier und dort waren die stümpfe frisch abgesäg...
1,Bald schlossen sich die Stämme dichter zusamme...,82,136,bald schlossen sich die stämme dichter zusamme...
2,"Es schien auch, als ob die Bäume höher würden ...",104,219,e schien auch al ob die bäume höher würden und...
3,"Spinnfäden, die sich\nvon Stamm zu Stamm spann...",164,353,spinnfäden die sichvon stamm zu stamm spannten...
4,Erschöpft legte sich der Knabe unter einigen\n...,68,518,erschöpft legte sich der knabe unter einigenta...


In [9]:
# Step 5: Identify potential source documents for each suspicious document
printt("Filtering documents...")
suspicious_to_sources = document_filtering(cleaned_suspicious_path, cleaned_source_path)

[17:37:29] Filtering documents...
[17:37:35] Starting common_tokens_matrix multiplication 
[17:37:35] Start filter index location of documents


In [11]:
print(suspicious_to_sources)

{'suspicious-document00285.parquet': ['source-document00182.parquet', 'source-document00357.parquet', 'source-document00140.parquet', 'source-document00395.parquet', 'source-document00141.parquet', 'source-document00394.parquet', 'source-document00419.parquet', 'source-document00426.parquet', 'source-document00022.parquet', 'source-document00355.parquet', 'source-document00180.parquet', 'source-document00425.parquet', 'source-document00234.parquet', 'source-document00168.parquet', 'source-document00382.parquet', 'source-document00157.parquet', 'source-document00009.parquet', 'source-document00036.parquet', 'source-document00195.parquet', 'source-document00431.parquet', 'source-document00037.parquet', 'source-document00194.parquet', 'source-document00220.parquet', 'source-document00433.parquet', 'source-document00196.parquet', 'source-document00035.parquet', 'source-document00154.parquet', 'source-document00222.parquet', 'source-document00381.parquet', 'source-document00432.parquet', 's

In [None]:
# Step 6: Filter sentences based on potential plagiarisms
printt("Filtering sentences...")
df_sentences_detections = sentence_filtering(cleaned_suspicious_path, cleaned_source_path, suspicious_to_sources)
df_sentences_detections.to_parquet("df_sentences_detections.parquet")
printt("Divide Dataframe Sentences...")
divide_df_sentences(current_dir)


In [None]:
# Step 7: Compute and save similarity scores between suspicious and source sentences
printt("Processing similarity calculations...")
output_dir = os.path.join(current_dir, "sentences-detections")
df_similarity = similarity_computation(
    cleaned_suspicious_path, cleaned_source_path, output_dir, BETA
)
df_similarity.to_parquet("df_similarity.parquet")
for THRESHOLD in THRESHOLD_LIST:
    printt(
        f"SOURCE_LIMIT: {SOURCE_LIMIT}, SUSPICIOUS_LIMIT: {SUSPICIOUS_LIMIT}, THRESHOLD: {THRESHOLD}, BETA: {BETA}"
    )
    df_plagiarism_detections = df_similarity[
        df_similarity["hybrid_similarity"] > THRESHOLD
    ]
    # 8. Analysis of detections
    printt("Evaluate detections...")
    precision, recall, f1_score, plagdet = evaluate_detection(
        df_plagiarism_references, df_plagiarism_detections
    )
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1_score:.4f}")
    print(f"Plagdet-Score: {plagdet:.4f}")
    # 9. Save results
    printt(f"Saving results to {results_filename}")
    with open(results_filename, "a") as f:
        f.write(
            f"{SUSPICIOUS_LIMIT}, {THRESHOLD}, {BETA}, {precision}, {recall}, {f1_score}, {plagdet}\n"
        )



In [None]:
if __name__ == "__main__":
    # Replication of Figure 20
    print("Replication of Figure 20")
    start_time = time.time()

    reload_sources = True
    reload_suspicious = True

    results_filename = "data_for_figure_20.txt"
    SOURCE_LIMIT = 64
    SUSPICIOUS_LIMIT = 8
    THRESHOLD_LIST = [0.65] # [0.4, 0.45, 0.5, 0.55, 0.60, 0.65, 0.70]
    BETA = 0.5

    # 1. Initial setup (paths and number of processes for parallelization)
    printt("Starting the algorithm...")

    # Getting the current directory path and setting up the base path for data.
    current_dir = os.getcwd()
    base_data_path = os.path.join(
        current_dir, "pan-plagiarism-corpus-2011/external-detection-corpus"
    )

    # Print the number of processes that will be used for parallel operations.
    printt("N_PROCESSES:", N_PROCESSES)

    # 2. Directory setup
    printt("Setting up directories...")

    # Initialize the directories for raw and cleaned versions of both source and suspicious documents.
    # Set up paths for raw and cleaned source documents.
    cleaned_source_path = os.path.join(current_dir, "cleaned-source-documents")
    source_path = os.path.join(base_data_path, "source-document")

    # Set up paths for raw and cleaned suspicious documents.
    cleaned_suspicious_path = os.path.join(current_dir, "cleaned-suspicious-documents")
    suspicious_path = os.path.join(base_data_path, "suspicious-document")

    if reload_sources:
        ensure_directory_exists_and_is_empty(cleaned_source_path)
    if reload_suspicious:
        ensure_directory_exists_and_is_empty(cleaned_suspicious_path)

    # 3. Extract real results from XML files to understand the ground truth of plagiarism references
    printt("Extracting reference results...")

    # Extract and store the plagiarism references from the suspicious XML files
    df_plagiarism_references = extract_sources_from_suspicious_xml(suspicious_path)

    # Step 4: Preprocess the raw data to prepare it for the subsequent analysis
    printt("Preprocessing data...")
    printt("Source documents preprocessing...")
    if reload_sources:
        preprocessing_data(source_path, cleaned_source_path, SOURCE_LIMIT)

    printt("Suspicious documents preprocessing...")
    if reload_suspicious:
        preprocessing_data(suspicious_path, cleaned_suspicious_path, SUSPICIOUS_LIMIT)

    # Step 5: Identify potential source documents for each suspicious document
    printt("Filtering documents...")
    suspicious_to_sources = document_filtering(cleaned_suspicious_path, cleaned_source_path)

    # Step 6: Filter sentences based on potential plagiarisms
    printt("Filtering sentences...")
    df_sentences_detections = sentence_filtering(cleaned_suspicious_path, cleaned_source_path, suspicious_to_sources)
    df_sentences_detections.to_parquet("df_sentences_detections.parquet")

    printt("Divide Dataframe Sentences...")
    divide_df_sentences(current_dir)

    # Step 7: Compute and save similarity scores between suspicious and source sentences
    printt("Processing similarity calculations...")
    output_dir = os.path.join(current_dir, "sentences-detections")
    df_similarity = similarity_computation(
        cleaned_suspicious_path, cleaned_source_path, output_dir, BETA
    )

    df_similarity.to_parquet("df_similarity.parquet")

    for THRESHOLD in THRESHOLD_LIST:
        printt(
            f"SOURCE_LIMIT: {SOURCE_LIMIT}, SUSPICIOUS_LIMIT: {SUSPICIOUS_LIMIT}, THRESHOLD: {THRESHOLD}, BETA: {BETA}"
        )

        df_plagiarism_detections = df_similarity[
            df_similarity["hybrid_similarity"] > THRESHOLD
        ]

        # 8. Analysis of detections
        printt("Evaluate detections...")
        precision, recall, f1_score, plagdet = evaluate_detection(
            df_plagiarism_references, df_plagiarism_detections
        )
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1-Score: {f1_score:.4f}")
        print(f"Plagdet-Score: {plagdet:.4f}")

        # 9. Save results
        printt(f"Saving results to {results_filename}")
        with open(results_filename, "a") as f:
            f.write(
                f"{SUSPICIOUS_LIMIT}, {THRESHOLD}, {BETA}, {precision}, {recall}, {f1_score}, {plagdet}\n"
            )

    end_time = time.time()
    diff_time = end_time - start_time
    print(f"Time taken for Figure 20 replication: {diff_time:.2f} seconds")

    print_sucess()


In [None]:
    # Replication of Figure 20
    print("Replication of Figure 20")
    start_time = time.time()

    reload_sources = True
    reload_suspicious = True

    results_filename = "data_for_figure_20.txt"
    SOURCE_LIMIT = 64
    SUSPICIOUS_LIMIT = 8
    THRESHOLD_LIST = [0.65] # [0.4, 0.45, 0.5, 0.55, 0.60, 0.65, 0.70]
    BETA = 0.5

    # 1. Initial setup (paths and number of processes for parallelization)
    printt("Starting the algorithm...")

    # Getting the current directory path and setting up the base path for data.
    current_dir = os.getcwd()
    base_data_path = os.path.join(
        current_dir, "pan-plagiarism-corpus-2011/external-detection-corpus"
    )

    # Print the number of processes that will be used for parallel operations.
    printt("N_PROCESSES:", N_PROCESSES)

    # 2. Directory setup
    printt("Setting up directories...")

    # Initialize the directories for raw and cleaned versions of both source and suspicious documents.
    # Set up paths for raw and cleaned source documents.
    cleaned_source_path = os.path.join(current_dir, "cleaned-source-documents")
    source_path = os.path.join(base_data_path, "source-document")

    # Set up paths for raw and cleaned suspicious documents.
    cleaned_suspicious_path = os.path.join(current_dir, "cleaned-suspicious-documents")
    suspicious_path = os.path.join(base_data_path, "suspicious-document")

    if reload_sources:
        ensure_directory_exists_and_is_empty(cleaned_source_path)
    if reload_suspicious:
        ensure_directory_exists_and_is_empty(cleaned_suspicious_path)

    # 3. Extract real results from XML files to understand the ground truth of plagiarism references
    printt("Extracting reference results...")

    # Extract and store the plagiarism references from the suspicious XML files
    df_plagiarism_references = extract_sources_from_suspicious_xml(suspicious_path)

    # Step 4: Preprocess the raw data to prepare it for the subsequent analysis
    printt("Preprocessing data...")
    printt("Source documents preprocessing...")
    if reload_sources:
        preprocessing_data(source_path, cleaned_source_path, SOURCE_LIMIT)

    printt("Suspicious documents preprocessing...")
    if reload_suspicious:
        preprocessing_data(suspicious_path, cleaned_suspicious_path, SUSPICIOUS_LIMIT)

    # Step 5: Identify potential source documents for each suspicious document
    printt("Filtering documents...")
    suspicious_to_sources = document_filtering(cleaned_suspicious_path, cleaned_source_path)

    # Step 6: Filter sentences based on potential plagiarisms
    printt("Filtering sentences...")
    df_sentences_detections = sentence_filtering(cleaned_suspicious_path, cleaned_source_path, suspicious_to_sources)
    df_sentences_detections.to_parquet("df_sentences_detections.parquet")

    printt("Divide Dataframe Sentences...")
    divide_df_sentences(current_dir)

    # Step 7: Compute and save similarity scores between suspicious and source sentences
    printt("Processing similarity calculations...")
    output_dir = os.path.join(current_dir, "sentences-detections")
    df_similarity = similarity_computation(
        cleaned_suspicious_path, cleaned_source_path, output_dir, BETA
    )

    df_similarity.to_parquet("df_similarity.parquet")

    for THRESHOLD in THRESHOLD_LIST:
        printt(
            f"SOURCE_LIMIT: {SOURCE_LIMIT}, SUSPICIOUS_LIMIT: {SUSPICIOUS_LIMIT}, THRESHOLD: {THRESHOLD}, BETA: {BETA}"
        )

        df_plagiarism_detections = df_similarity[
            df_similarity["hybrid_similarity"] > THRESHOLD
        ]

        # 8. Analysis of detections
        printt("Evaluate detections...")
        precision, recall, f1_score, plagdet = evaluate_detection(
            df_plagiarism_references, df_plagiarism_detections
        )
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1-Score: {f1_score:.4f}")
        print(f"Plagdet-Score: {plagdet:.4f}")

        # 9. Save results
        printt(f"Saving results to {results_filename}")
        with open(results_filename, "a") as f:
            f.write(
                f"{SUSPICIOUS_LIMIT}, {THRESHOLD}, {BETA}, {precision}, {recall}, {f1_score}, {plagdet}\n"
            )

    end_time = time.time()
    diff_time = end_time - start_time
    print(f"Time taken for Figure 20 replication: {diff_time:.2f} seconds")

    print_sucess()