In [1]:
# Standard library imports
import os
import time

# Local imports
from utils.general import printt, print_sucess, N_PROCESSES
from utils.data_load_extract import (
    ensure_directory_exists_and_is_empty,
    extract_sources_from_suspicious_xml
    )
from utils.preprocessing import preprocessing_data
from utils.filtering import document_filtering, sentence_filtering
from utils.metrics import similarity_computation, evaluate_detection

In [2]:
# --- Cache control ---------------------------------------------------------
# Each flag decides whether the matching working directory is emptied in the
# directory-setup cell; the two document flags additionally gate the
# corresponding preprocessing call.
reload_sent_matches = True
reload_suspicious = False
reload_sources = False

# --- Experiment parameters -------------------------------------------------
# Limits handed to preprocessing_data for each corpus.
SUSPICIOUS_LIMIT = 8
SOURCE_LIMIT = 64
# Weight of the semantic score in the hybrid similarity; the syntactic score
# gets (1 - BETA).
BETA = 0.5
# A sentence pair is kept as a detection when its hybrid similarity is
# strictly greater than the threshold.
THRESHOLD_LIST = [0.65] # [0.4, 0.45, 0.5, 0.55, 0.60, 0.65, 0.70]

# --- Output ----------------------------------------------------------------
# One line of metrics is appended to this file per evaluated threshold.
results_filename = "data_for_figure_20.txt"

In [3]:
# Step 1: Initial setup - resolve the working paths and report the parallelism level
printt("Starting the algorithm...")
# Every location used by the notebook is anchored at the current working directory.
current_dir = os.getcwd()
sent_matches_dir = os.path.join(current_dir, "sent-matches-dir")
base_data_path = os.path.join(
    current_dir, "pan-plagiarism-corpus-2011/external-detection-corpus"
)
# Log how many processes the parallel helpers are configured to use.
printt(f"N_PROCESSES: {N_PROCESSES}")

[13:40:37] Starting the algorithm...
[13:40:37] N_PROCESSES: 6


In [4]:
# Step 2: Directory setup
printt("Setting up directories...")
# Raw corpus folders for source and suspicious documents.
source_path = os.path.join(base_data_path, "source-document")
suspicious_path = os.path.join(base_data_path, "suspicious-document")
# Working folders for the cleaned versions of those documents.
cleaned_source_path = os.path.join(current_dir, "cleaned-source-documents")
cleaned_suspicious_path = os.path.join(current_dir, "cleaned-suspicious-documents")
# Reset only the working folders whose reload flag is switched on
# (same order as before: sources, suspicious, sentence matches).
for should_reset, working_dir in (
    (reload_sources, cleaned_source_path),
    (reload_suspicious, cleaned_suspicious_path),
    (reload_sent_matches, sent_matches_dir),
):
    if should_reset:
        ensure_directory_exists_and_is_empty(working_dir)

[13:40:37] Setting up directories...


In [5]:
# Step 3: Ground truth - read the plagiarism references from the suspicious-document XML files
printt("Extracting reference results...")
# This frame is what the detections are evaluated against in the final step.
df_plagiarism_references = extract_sources_from_suspicious_xml(
    suspicious_path
)

[13:40:37] Extracting reference results...


In [6]:
# Preview the first rows of the ground-truth references (5 is the default row count).
df_plagiarism_references.head(5)

Unnamed: 0,suspicious_filename,source_filename,suspicious_offset,suspicious_length,source_offset,source_length
0,suspicious-document00185.txt,source-document03291.txt,2361,3423,23508,3525
1,suspicious-document00185.txt,source-document03291.txt,47733,1967,2554,1976
2,suspicious-document00185.txt,source-document03291.txt,131081,12809,28727,19497
3,suspicious-document00185.txt,source-document03291.txt,252530,3956,8883,3975
4,suspicious-document00185.txt,source-document03291.txt,288473,801,1587,820


In [7]:
# Step 4: Preprocess the raw documents; each corpus is processed only when its reload flag is set
printt("Preprocessing data...")
for stage_message, should_run, raw_dir, cleaned_dir, doc_limit in (
    ("Source documents preprocessing...", reload_sources, source_path, cleaned_source_path, SOURCE_LIMIT),
    ("Suspicious documents preprocessing...", reload_suspicious, suspicious_path, cleaned_suspicious_path, SUSPICIOUS_LIMIT),
):
    # The stage message is logged even when the stage itself is skipped.
    printt(stage_message)
    if should_run:
        preprocessing_data(raw_dir, cleaned_dir, doc_limit)

[13:40:40] Preprocessing data...
[13:40:40] Source documents preprocessing...
[13:40:40] Suspicious documents preprocessing...


In [8]:
# Step 5: For each suspicious document, identify the source documents that are potential matches
printt("Filtering documents...")
suspicious_to_sources = document_filtering(
    cleaned_suspicious_path,
    cleaned_source_path,
)

[13:40:40] Filtering documents...
[13:40:44] Starting common_tokens_matrix multiplication 
[13:40:44] Start filter index location of documents


In [9]:
# Step 6: Sentence-level filtering, restricted to the document pairs found in step 5
printt("Filtering sentences...")
# sent_matches_dir is handed over as the fourth argument
# (presumably the output folder - confirm in utils.filtering).
sentence_filtering(
    cleaned_suspicious_path,
    cleaned_source_path,
    suspicious_to_sources,
    sent_matches_dir,
)

[13:40:46] Filtering sentences...
[13:40:46] Fitting vectorizer for sentences filtering
[13:40:53] Calculating sparse matrices of sentences from source documents


100%|██████████| 32/32 [00:06<00:00,  4.82it/s]


[13:40:59] Matrix multiplication and matching value index location


100%|██████████| 8/8 [00:20<00:00,  2.61s/it]


In [None]:
def similarity_computation(cleaned_suspicious_path, cleaned_source_path, output_dir, BETA):
    """Add a hybrid similarity score to every sentence-pair file in ``output_dir``.

    Each file in ``output_dir`` is read with ``pd.read_parquet`` and must provide
    the columns ``suspicious_cleaned_sentence`` and
    ``detected_source_cleaned_sentence``. For every row a syntactic and a
    semantic similarity are computed and blended as
    ``BETA * semantic + (1 - BETA) * syntactic``; the result is stored in a new
    ``hybrid_similarity`` column and the file is overwritten in place.

    Args:
        cleaned_suspicious_path: Directory of cleaned suspicious documents,
            passed to ``load_data_and_models``.
        cleaned_source_path: Directory of cleaned source documents, passed to
            ``load_data_and_models``.
        output_dir: Directory whose files are scored and rewritten.
        BETA: Weight of the semantic similarity in the hybrid score.

    Returns:
        A single DataFrame: the concatenation of all scored files, in sorted
        filename order.

    Raises:
        FileNotFoundError: If ``output_dir`` does not exist.
        ValueError: If ``output_dir`` contains no files.

    NOTE(review): this cell-level definition shadows the
    ``similarity_computation`` imported from ``utils.metrics`` in the import
    cell, and the names it relies on (``load_data_and_models``,
    ``train_tfidf_vectorizer``, ``get_word_weights_and_vectors``,
    ``CountVectorizer``, ``word_tokenize``, ``tqdm``, ``pd``,
    ``transform_in_parallel``, ``similarity_syntactic``,
    ``similarity_semantic``) are not imported there - confirm they are in
    scope before relying on this version.
    """
    # List the work first so a missing or empty directory fails immediately,
    # before the expensive model load. Sorting makes the row order of the
    # returned frame deterministic (os.listdir order is arbitrary).
    df_filepaths = [
        os.path.join(output_dir, df_filename)
        for df_filename in sorted(os.listdir(output_dir))
    ]
    if not df_filepaths:
        raise ValueError(f"No sentence-match files found in {output_dir!r}")

    # Load data and pre-trained models
    printt("Loading FastText and data")
    ft, suspicious_docs, source_docs = load_data_and_models(
        cleaned_suspicious_path, cleaned_source_path
    )

    # Train the TF-IDF vectorizer and retrieve the common vocabulary
    printt("Train the TF-IDF vectorizer and retrieve the common vocabulary")
    tfidf_vectorizer, common_vocabulary = train_tfidf_vectorizer(
        suspicious_docs, source_docs
    )

    # Get TF-IDF weights and word vectors for the common vocabulary
    printt("Get TF-IDF weights and word vectors for the common vocabulary")
    tfidf_weights, sum_word_vectors = get_word_weights_and_vectors(ft, tfidf_vectorizer)

    # Initialize the binary count vectorizer with the shared vocabulary
    count_vectorizer = CountVectorizer(
        binary=True, vocabulary=common_vocabulary, tokenizer=word_tokenize
    )

    # Drop the references that are no longer needed before the per-file loop.
    del ft, suspicious_docs, source_docs, tfidf_vectorizer, common_vocabulary

    scored_frames = []
    for df_filepath in tqdm(df_filepaths, unit="file"):
        df = pd.read_parquet(df_filepath)

        suspicious_sentences = df["suspicious_cleaned_sentence"]
        source_sentences = df["detected_source_cleaned_sentence"]

        # Binary bag-of-words over the shared vocabulary for both sides.
        suspicious_count = transform_in_parallel(
            suspicious_sentences, count_vectorizer, N_PROCESSES
        )
        source_count = transform_in_parallel(
            source_sentences, count_vectorizer, N_PROCESSES
        )

        # Syntactic view: word presence weighted by TF-IDF.
        S1_syntactic = suspicious_count.multiply(tfidf_weights)
        S2_syntactic = source_count.multiply(tfidf_weights)

        # Semantic view: word presence weighted by the word-vector term.
        S1_semantic = suspicious_count.multiply(sum_word_vectors)
        S2_semantic = source_count.multiply(sum_word_vectors)

        sim_syntactic = similarity_syntactic(S1_syntactic, S2_syntactic)
        sim_semantic = similarity_semantic(S1_semantic, S2_semantic)

        df["hybrid_similarity"] = BETA * sim_semantic + (1 - BETA) * sim_syntactic

        # Persist the scored file and keep the frame, so the files do not have
        # to be read from disk a second time to build the result.
        df.to_parquet(df_filepath)
        scored_frames.append(df)

    return pd.concat(scored_frames)


In [None]:
# TODO: refactor steps 7 and 8 to work better with step 6
# Step 7 reloaded: compute similarity scores for each of the parquet files in sent_matches_dir


In [None]:
# Step 7: Compute and save similarity scores between suspicious and source sentences
printt("Processing similarity calculations...")
# Score the files in the directory step 6 was pointed at (sent_matches_dir).
# The previously hard-coded "sentences-detections" folder is not populated by
# steps 1-6 of this notebook, so reading it would fail or pick up stale data.
output_dir = sent_matches_dir
df_similarity = similarity_computation(cleaned_suspicious_path, cleaned_source_path, output_dir, BETA)

# Keep the full, unthresholded scores on disk.
df_similarity.to_parquet("df_similarity.parquet")
for THRESHOLD in THRESHOLD_LIST:
    printt(
        f"SOURCE_LIMIT: {SOURCE_LIMIT}, SUSPICIOUS_LIMIT: {SUSPICIOUS_LIMIT}, THRESHOLD: {THRESHOLD}, BETA: {BETA}"
    )
    # A pair counts as a detection only when its score is strictly above the threshold.
    df_plagiarism_detections = df_similarity[
        df_similarity["hybrid_similarity"] > THRESHOLD
    ]
    # Step 8: Analysis of detections against the ground-truth references
    printt("Evaluate detections...")
    precision, recall, f1_score, plagdet = evaluate_detection(
        df_plagiarism_references, df_plagiarism_detections
    )
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1_score:.4f}")
    print(f"Plagdet-Score: {plagdet:.4f}")
    # Step 9: Save results (append mode, so earlier runs in the file are kept)
    printt(f"Saving results to {results_filename}")
    with open(results_filename, "a") as f:
        f.write(
            f"{SUSPICIOUS_LIMIT}, {THRESHOLD}, {BETA}, {precision}, {recall}, {f1_score}, {plagdet}\n"
        )

In [None]:
if __name__ == "__main__":
    # End-to-end replication of Figure 20: runs steps 1-9 in one go with all
    # working directories rebuilt from scratch, and reports the elapsed time.
    print("Replication of Figure 20")
    start_time = time.time()

    reload_sources = True
    reload_suspicious = True
    reload_sent_matches = True

    results_filename = "data_for_figure_20.txt"
    SOURCE_LIMIT = 64
    SUSPICIOUS_LIMIT = 8
    THRESHOLD_LIST = [0.65] # [0.4, 0.45, 0.5, 0.55, 0.60, 0.65, 0.70]
    BETA = 0.5

    # 1. Initial setup (paths and number of processes for parallelization)
    printt("Starting the algorithm...")

    # Getting the current directory path and setting up the base path for data.
    current_dir = os.getcwd()
    base_data_path = os.path.join(
        current_dir, "pan-plagiarism-corpus-2011/external-detection-corpus"
    )
    # Directory handed to step 6 and scored in step 7.
    sent_matches_dir = os.path.join(current_dir, "sent-matches-dir")

    # Print the number of processes that will be used for parallel operations.
    # A single f-string is used, matching the printt call in the setup cell.
    printt(f"N_PROCESSES: {N_PROCESSES}")

    # 2. Directory setup
    printt("Setting up directories...")

    # Set up paths for raw and cleaned source documents.
    cleaned_source_path = os.path.join(current_dir, "cleaned-source-documents")
    source_path = os.path.join(base_data_path, "source-document")

    # Set up paths for raw and cleaned suspicious documents.
    cleaned_suspicious_path = os.path.join(current_dir, "cleaned-suspicious-documents")
    suspicious_path = os.path.join(base_data_path, "suspicious-document")

    if reload_sources:
        ensure_directory_exists_and_is_empty(cleaned_source_path)
    if reload_suspicious:
        ensure_directory_exists_and_is_empty(cleaned_suspicious_path)
    if reload_sent_matches:
        ensure_directory_exists_and_is_empty(sent_matches_dir)

    # 3. Extract real results from XML files to understand the ground truth of plagiarism references
    printt("Extracting reference results...")

    # Extract and store the plagiarism references from the suspicious XML files
    df_plagiarism_references = extract_sources_from_suspicious_xml(suspicious_path)

    # Step 4: Preprocess the raw data to prepare it for the subsequent analysis
    printt("Preprocessing data...")
    printt("Source documents preprocessing...")
    if reload_sources:
        preprocessing_data(source_path, cleaned_source_path, SOURCE_LIMIT)

    printt("Suspicious documents preprocessing...")
    if reload_suspicious:
        preprocessing_data(suspicious_path, cleaned_suspicious_path, SUSPICIOUS_LIMIT)

    # Step 5: Identify potential source documents for each suspicious document
    printt("Filtering documents...")
    suspicious_to_sources = document_filtering(cleaned_suspicious_path, cleaned_source_path)

    # Step 6: Filter sentences based on potential plagiarisms.
    # Uses the same four-argument call as the step-6 cell. The former
    # three-argument call, the dump of its return value and the follow-up
    # divide_df_sentences() call are gone: divide_df_sentences is not imported
    # in this notebook.
    printt("Filtering sentences...")
    sentence_filtering(
        cleaned_suspicious_path, cleaned_source_path, suspicious_to_sources, sent_matches_dir
    )

    # Step 7: Compute and save similarity scores between suspicious and source sentences
    printt("Processing similarity calculations...")
    output_dir = sent_matches_dir
    df_similarity = similarity_computation(
        cleaned_suspicious_path, cleaned_source_path, output_dir, BETA
    )

    df_similarity.to_parquet("df_similarity.parquet")

    for THRESHOLD in THRESHOLD_LIST:
        printt(
            f"SOURCE_LIMIT: {SOURCE_LIMIT}, SUSPICIOUS_LIMIT: {SUSPICIOUS_LIMIT}, THRESHOLD: {THRESHOLD}, BETA: {BETA}"
        )

        # A pair counts as a detection only when its score is strictly above the threshold.
        df_plagiarism_detections = df_similarity[
            df_similarity["hybrid_similarity"] > THRESHOLD
        ]

        # 8. Analysis of detections
        printt("Evaluate detections...")
        precision, recall, f1_score, plagdet = evaluate_detection(
            df_plagiarism_references, df_plagiarism_detections
        )
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1-Score: {f1_score:.4f}")
        print(f"Plagdet-Score: {plagdet:.4f}")

        # 9. Save results (append mode, so earlier runs in the file are kept)
        printt(f"Saving results to {results_filename}")
        with open(results_filename, "a") as f:
            f.write(
                f"{SUSPICIOUS_LIMIT}, {THRESHOLD}, {BETA}, {precision}, {recall}, {f1_score}, {plagdet}\n"
            )

    end_time = time.time()
    diff_time = end_time - start_time
    print(f"Time taken for Figure 20 replication: {diff_time:.2f} seconds")

    print_sucess()


In [None]:
    # NOTE(review): this cell appears to be a leftover copy of the
    # `if __name__ == "__main__":` pipeline with its `if` header missing. Every
    # statement is indented without an enclosing block, so run as its own cell
    # it would presumably fail with an IndentationError - consider deleting the
    # cell or restoring the guard.
    # Replication of Figure 20
    print("Replication of Figure 20")
    start_time = time.time()

    reload_sources = True
    reload_suspicious = True

    results_filename = "data_for_figure_20.txt"
    SOURCE_LIMIT = 64
    SUSPICIOUS_LIMIT = 8
    THRESHOLD_LIST = [0.65] # [0.4, 0.45, 0.5, 0.55, 0.60, 0.65, 0.70]
    BETA = 0.5

    # 1. Initial setup (paths and number of processes for parallelization)
    printt("Starting the algorithm...")

    # Getting the current directory path and setting up the base path for data.
    current_dir = os.getcwd()
    base_data_path = os.path.join(
        current_dir, "pan-plagiarism-corpus-2011/external-detection-corpus"
    )

    # Print the number of processes that will be used for parallel operations.
    # NOTE(review): two positional arguments here, whereas the setup cell calls
    # printt with a single f-string - confirm printt accepts extra arguments.
    printt("N_PROCESSES:", N_PROCESSES)

    # 2. Directory setup
    printt("Setting up directories...")

    # Initialize the directories for raw and cleaned versions of both source and suspicious documents.
    # Set up paths for raw and cleaned source documents.
    cleaned_source_path = os.path.join(current_dir, "cleaned-source-documents")
    source_path = os.path.join(base_data_path, "source-document")

    # Set up paths for raw and cleaned suspicious documents.
    cleaned_suspicious_path = os.path.join(current_dir, "cleaned-suspicious-documents")
    suspicious_path = os.path.join(base_data_path, "suspicious-document")

    # Both flags are set to True above, so both working directories are reset here.
    if reload_sources:
        ensure_directory_exists_and_is_empty(cleaned_source_path)
    if reload_suspicious:
        ensure_directory_exists_and_is_empty(cleaned_suspicious_path)

    # 3. Extract real results from XML files to understand the ground truth of plagiarism references
    printt("Extracting reference results...")

    # Extract and store the plagiarism references from the suspicious XML files
    df_plagiarism_references = extract_sources_from_suspicious_xml(suspicious_path)

    # Step 4: Preprocess the raw data to prepare it for the subsequent analysis
    printt("Preprocessing data...")
    printt("Source documents preprocessing...")
    if reload_sources:
        preprocessing_data(source_path, cleaned_source_path, SOURCE_LIMIT)

    printt("Suspicious documents preprocessing...")
    if reload_suspicious:
        preprocessing_data(suspicious_path, cleaned_suspicious_path, SUSPICIOUS_LIMIT)

    # Step 5: Identify potential source documents for each suspicious document
    printt("Filtering documents...")
    suspicious_to_sources = document_filtering(cleaned_suspicious_path, cleaned_source_path)

    # Step 6: Filter sentences based on potential plagiarisms
    # NOTE(review): called with three arguments and its return value is used,
    # while the step-6 cell passes sent_matches_dir as a fourth argument and
    # ignores the return value - verify against utils.filtering.
    printt("Filtering sentences...")
    df_sentences_detections = sentence_filtering(cleaned_suspicious_path, cleaned_source_path, suspicious_to_sources)
    df_sentences_detections.to_parquet("df_sentences_detections.parquet")

    # NOTE(review): divide_df_sentences is not among the names imported in the
    # notebook's import cell - confirm where it is expected to come from.
    printt("Divide Dataframe Sentences...")
    divide_df_sentences(current_dir)

    # Step 7: Compute and save similarity scores between suspicious and source sentences
    printt("Processing similarity calculations...")
    output_dir = os.path.join(current_dir, "sentences-detections")
    df_similarity = similarity_computation(
        cleaned_suspicious_path, cleaned_source_path, output_dir, BETA
    )

    df_similarity.to_parquet("df_similarity.parquet")

    for THRESHOLD in THRESHOLD_LIST:
        printt(
            f"SOURCE_LIMIT: {SOURCE_LIMIT}, SUSPICIOUS_LIMIT: {SUSPICIOUS_LIMIT}, THRESHOLD: {THRESHOLD}, BETA: {BETA}"
        )

        # Keep only the pairs whose score is strictly above the threshold.
        df_plagiarism_detections = df_similarity[
            df_similarity["hybrid_similarity"] > THRESHOLD
        ]

        # 8. Analysis of detections
        printt("Evaluate detections...")
        precision, recall, f1_score, plagdet = evaluate_detection(
            df_plagiarism_references, df_plagiarism_detections
        )
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1-Score: {f1_score:.4f}")
        print(f"Plagdet-Score: {plagdet:.4f}")

        # 9. Save results
        # Opened in append mode: one line is added per threshold.
        printt(f"Saving results to {results_filename}")
        with open(results_filename, "a") as f:
            f.write(
                f"{SUSPICIOUS_LIMIT}, {THRESHOLD}, {BETA}, {precision}, {recall}, {f1_score}, {plagdet}\n"
            )

    end_time = time.time()
    diff_time = end_time - start_time
    print(f"Time taken for Figure 20 replication: {diff_time:.2f} seconds")

    print_sucess()