# **Installing the packages**

In [None]:
! pip install pandas textdistance plotly



# **Importing libraries**

In [None]:
# Mount Google Drive at /content/drive/ (if not mounted already) so that the dataset
# stored under MyDrive can be read by the cells below.
# NOTE: `google.colab` is only available inside the Google Colab runtime.
from google.colab import drive
drive.mount("/content/drive/")

import random
from concurrent.futures import ThreadPoolExecutor

import pandas as pd
import textdistance

import plotly.graph_objects as go

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
# Load the Agora listings, decoding the CSV as ISO-8859-1.
df = pd.read_csv("/content/drive/MyDrive/AA-Tutorial/data/Agora.csv", encoding='ISO-8859-1')
# Strip leading/trailing whitespace from every column name.
df = df.rename(columns=str.strip)
# Build a single TEXT field per listing: "<Item> [SEP] <Item Description>".
separator = ' [SEP] '
df['TEXT'] = [
    f"{item}{separator}{description}"
    for item, description in zip(df['Item'], df['Item Description'])
]
# The two source columns are now redundant, so drop them.
df = df.drop(columns=["Item", "Item Description"])

In [None]:
# Vendor handles are compared case-insensitively: "Amsterdam100" and "amsterdam100"
# are assumed to be the same vendor. The vectorized .str accessor lower-cases the whole
# column at once and leaves missing values as NaN instead of raising on them.
df['Vendor'] = df['Vendor'].str.lower()

In [None]:
# Collect every distinct vendor handle found in the 'Vendor' column.
unique_vendors = df['Vendor'].unique()

# Number the handles 1..N by pairing each one with a running counter.
# Every handle occurs exactly once in `unique_vendors`, so no "already seen" check is needed.
vendor_to_idx_dict = dict(zip(unique_vendors, range(1, len(unique_vendors) + 1)))

# Swap each handle in the 'Vendor' column for its numeric ID via a dictionary lookup.
df['Vendor'] = df['Vendor'].map(vendor_to_idx_dict)

In [None]:
# Preview the processed DataFrame: 'Vendor' now holds numeric IDs and 'TEXT' the merged item text.
df

Unnamed: 0,Vendor,Category,Price,Origin,Destination,Rating,Remarks,TEXT
0,1,Services/Hacking,0.05027025666666667 BTC,Torland,,4.96/5,,12 Month HuluPlus gift Code [SEP] 12-Month Hul...
1,1,Services/Hacking,0.152419585 BTC,Torland,,4.96/5,,Pay TV Sky UK Sky Germany HD TV and much mor...
2,2,Services/Hacking,0.007000000000000005 BTC,Torland,,4.93/5,,OFFICIAL Account Creator Extreme 4.2 [SEP] Tag...
3,3,Services/Hacking,0.019016783532494728 BTC,,,4.89/5,,VPN > TOR > SOCK TUTORIAL [SEP] How to setup a...
4,4,Services/Hacking,0.062018073963963936 BTC,Torland,,4.88/5,,Facebook hacking guide [SEP] . This guide wil...
...,...,...,...,...,...,...,...,...
109684,832,Drugs/Opioids/Opium,0.14363729 BTC,Germany,,4.91/5,,1 gr purified Opium [SEP] This Listing is for ...
109685,1268,Weapons/Fireworks,0.08680555 BTC,USA,,[0 deals],,Shipping Ticket [SEP] in order for me to ship ...
109686,1866,Drugs/Opioids/Opium,0.33641201 BTC,Canada,Worldwide,[0 deals],,0.50 GRAMS #4 White Afghani Heroin - FULL ESC...
109687,1866,Drugs/Opioids/Opium,0.61165820 BTC,Canada,Worldwide,[0 deals],,1.0 GRAMS #4 White Afghani Heroin - FULL ESCRO...


# **Computing similarity between the advertisements using the traditional stylometric metrics**

We define the overall similarity as the average of three metrics:
1. [Damerau-Levenshtein distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance): Computes Edit-based similarities.
2. [Jaccard index](https://en.wikipedia.org/wiki/Jaccard_index): Computes Token-based similarities.
3. [Ratcliff-Obershelp similarity](https://en.wikipedia.org/wiki/Gestalt_Pattern_Matching): Computes Sequence-based similarities.

In [None]:
def normalized_similarity(sentences, i, j):
    """
    Computes a normalized similarity score between two sentences identified by their indices in a list.

    Parameters:
    - sentences (list of str): The list of sentences from which two sentences are compared.
    - i (int): Index of the first sentence in the list to be compared.
    - j (int): Index of the second sentence in the list to be compared.

    Returns:
    - tuple: A tuple (i, j, average_metric), where `i` and `j` are the indices of the compared sentences,
      and `average_metric` is the mean of three similarity scores: the Damerau-Levenshtein similarity
      (1 minus the edit distance normalized by the longer sentence's length), the Jaccard similarity,
      and the Ratcliff-Obershelp similarity. Higher values mean more similar sentences; two empty
      sentences score 1.0.
    """
    first, second = sentences[i], sentences[j]

    # Length of the longer sentence, used to normalize the edit distance.
    max_len = max(len(first), len(second))

    # Two empty sentences are identical, so they receive the maximum similarity.
    # (This also avoids dividing by zero below.)
    if max_len == 0:
        return (i, j, 1.0)

    # The edit distance grows as sentences diverge, so it must be turned into a similarity
    # before it is averaged with the two similarity measures below; otherwise it would
    # pull the average in the opposite direction.
    normalized_damerau = textdistance.damerau_levenshtein.distance(first, second) / max_len
    damerau_similarity = 1 - normalized_damerau

    # Jaccard similarity between the two sentences.
    jaccard = textdistance.jaccard.similarity(first, second)

    # Ratcliff-Obershelp similarity between the two sentences.
    ratcliff = textdistance.ratcliff_obershelp.similarity(first, second)

    # Mean of the three similarity scores.
    average_metric = (damerau_similarity + jaccard + ratcliff) / 3

    return (i, j, average_metric)


def compute_normalized_average_similarity_parallel(df, vendor_name):
    """
    Scores every unique pair of texts belonging to one vendor, running the comparisons on a thread pool.

    Parameters:
    - df (pandas.DataFrame): Frame with a 'Vendor' column identifying the vendor of each row and a
      'TEXT' column holding the sentence to compare.
    - vendor_name: Value matched (with `==`) against the 'Vendor' column to select the vendor's rows.

    Returns:
    - dict: Maps each index pair (i, j), with i < j, to the `average_metric` returned by
      `normalized_similarity` for that pair. The indices are positions within the vendor's own
      list of sentences, not row labels of `df`. A vendor with fewer than two sentences yields
      an empty dictionary.
    """
    # Sentences of the requested vendor, in DataFrame order.
    sentences = df.loc[df.Vendor == vendor_name, "TEXT"].tolist()
    total = len(sentences)

    # Enumerate every unordered pair exactly once (first < second).
    index_pairs = []
    for first in range(total):
        for second in range(first + 1, total):
            index_pairs.append((first, second))

    with ThreadPoolExecutor() as executor:
        # Queue one comparison per pair ...
        pending = []
        for first, second in index_pairs:
            pending.append(executor.submit(normalized_similarity, sentences, first, second))

        # ... then gather the results in submission order.
        outcomes = [task.result() for task in pending]

    # Key each score by the pair of indices it belongs to.
    return {(first, second): score for first, second, score in outcomes}


Our analysis indicates that some vendors have as many as 880 advertisements. Traditional metrics are computationally expensive because every advertisement must be compared against every other one. To demonstrate the average similarity, we therefore focus on a sample of 10 randomly selected vendors, chosen from those who have posted between 5 and 10 advertisements (inclusive).

In [None]:
# Calculate the frequency of each vendor.
vendor_counts = df['Vendor'].value_counts()

# Filter vendors with frequency between 5 and 10 (inclusive).
vendors_with_5_to_10_ads = vendor_counts[(vendor_counts >= 5) & (vendor_counts <= 10)].index.tolist()

In [None]:
# Randomly select up to 10 of the eligible vendors. A dedicated, seeded generator keeps the
# selection reproducible across kernel restarts without touching the global `random` state,
# and capping the sample size avoids a ValueError when fewer than 10 vendors qualify.
rng = random.Random(42)
sample_size = min(10, len(vendors_with_5_to_10_ads))
random_vendors = rng.sample(vendors_with_5_to_10_ads, sample_size)

# For each sampled vendor, collect the pairwise similarity scores of its advertisements.
avg_similarity = {
    vendor: list(compute_normalized_average_similarity_parallel(df, vendor).values())
    for vendor in random_vendors
}

In [None]:
# Create a box plot for each key in the dictionary
fig = go.Figure()
for key, values in avg_similarity.items():
    fig.add_trace(go.Box(y=values, name=str(key)))

# Customize the layout
fig.update_layout(title_text='Violin plot with average-similarity score between the advertisements',
                  yaxis_title='Similarity Score')

# Show the plot
fig.show()
