In [172]:
import time
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from tqdm import tqdm
from collections import Counter
from datasketch import MinHash, MinHashLSH
import ast

In [173]:
tqdm.pandas()

In [174]:
nol = pd.read_csv(r'C:\Users\Bill\Desktop\texnikes analisis\test_without_labels.csv')

In [175]:
train = pd.read_csv(r'C:\Users\Bill\Desktop\texnikes analisis\train.csv')

In [176]:
train = train.head(1000)

In [177]:
nol = nol.head(100)

In [178]:
def preprocess_text(title, content):
    """Combine title and content, then tokenize into a set of words."""
    combined_text = f"{title} {content}".lower()  # Combine and lowercase
    tokens = word_tokenize(combined_text)         # Tokenize into words
    return set(tokens)                            # Convert to a set (unique words)

def jaccard_similarity(set1, set2):
    intersection = len(set1 & set2)  # Elements common to both sets
    union = len(set1 | set2)         # Unique elements in both sets
    return intersection / union if union != 0 else 0

In [179]:
def brute_force_knn(train_df, test_df, k=7):
    # Create copies of the input DataFrames
    train_df_copy = train_df.copy()
    test_df_copy = test_df.copy()

    # Tokenization step
    train_df_copy["tokenized"] = train_df_copy.apply(
        lambda row: preprocess_text(row["Title"], row["Content"]), axis=1
    )
    test_df_copy["tokenized"] = test_df_copy.apply(
        lambda row: preprocess_text(row["Title"], row["Content"]), axis=1
    )

    predicted_labels = []
    neighbor_ids_list = []

    start_query_time = time.time()

    for _, test_row in tqdm(test_df_copy.iterrows(), total=len(test_df_copy)):
        test_tokens = test_row["tokenized"]
        similarities = []

        for _, train_row in train_df_copy.iterrows():
            train_tokens = train_row["tokenized"]
            similarity = jaccard_similarity(test_tokens, train_tokens)
            similarities.append((train_row["Id"], train_row["Label"], similarity))

        top_k_neighbors = sorted(similarities, key=lambda x: x[2], reverse=True)[:k]

        neighbor_ids = {neighbor[0] for neighbor in top_k_neighbors}
        neighbor_ids_list.append(neighbor_ids)

        neighbor_labels = [neighbor[1] for neighbor in top_k_neighbors]

        most_common_label = Counter(neighbor_labels).most_common(1)[0][0]

        predicted_labels.append(most_common_label)

    query_time = time.time() - start_query_time

    test_df_copy["PredictedLabel"] = predicted_labels
    test_df_copy["NeighborIds"] = neighbor_ids_list

    print(f"Query time: {query_time:.2f} seconds")

    return test_df_copy


In [183]:
def knn_with_minhash_lsh(train_df, test_df, k=7, threshold=0.9, num_perm=16):
    # Create copies of the input DataFrames
    train_df_copy = train_df.copy()
    test_df_copy = test_df.copy()

    # Step 1: Preprocess and tokenize the text data
    train_df_copy["tokenized"] = train_df_copy.apply(
        lambda row: preprocess_text(row["Title"], row["Content"]), axis=1
    )
    test_df_copy["tokenized"] = test_df_copy.apply(
        lambda row: preprocess_text(row["Title"], row["Content"]), axis=1
    )

    # Step 2: Create MinHash objects for the training data
    start_build_time = time.time()
    train_df_copy["minhash"] = train_df_copy["tokenized"].apply(
        lambda tokens: create_minhash(tokens, num_perm=num_perm)
    )

    # Step 3: Initialize LSH
    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
    for idx, minhash in enumerate(train_df_copy["minhash"]):
        lsh.insert(train_df_copy.iloc[idx]["Id"], minhash)  # Use the "Id" column as a unique key
    build_time = time.time() - start_build_time

    predicted_labels = []
    neighbor_ids_list = []

    # Step 4: Query the LSH for each test instance
    start_query_time = time.time()
    for _, test_row in tqdm(test_df_copy.iterrows(), total=len(test_df_copy)):
        test_tokens = test_row["tokenized"]
        test_minhash = create_minhash(test_tokens, num_perm=num_perm)
        approximate_neighbors = lsh.query(test_minhash)  # Retrieve approximate neighbors

        neighbor_ids_list.append(set(map(int, approximate_neighbors)))
        
        if approximate_neighbors:
            # Find corresponding labels for the neighbors
            neighbor_labels = train_df_copy[
                train_df_copy["Id"].isin(approximate_neighbors)
            ]["Label"]

            # Take the top-k most common labels
            most_common_label = Counter(neighbor_labels).most_common(1)[0][0]
        else:
            # If no neighbors are found, assign a default or random label
            most_common_label = "default_label"  # Replace with an actual default strategy

        predicted_labels.append(most_common_label)
    query_time = time.time() - start_query_time

    # Add predictions and neighbors to the test DataFrame
    test_df_copy["PredictedLabel"] = predicted_labels
    test_df_copy["NeighborIds"] = neighbor_ids_list

    print(f"Build time: {build_time:.2f} seconds")
    print(f"Query time: {query_time:.2f} seconds")

    return test_df_copy


In [181]:
result_df = brute_force_knn(train, nol, k=7)

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:03<00:00, 25.56it/s]

Query time: 3.92 seconds





In [184]:
lsh_result_df = knn_with_minhash_lsh(train, nol, k=7, threshold=0.3, num_perm=1000)

100%|███████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 147.97it/s]

Build time: 6.90 seconds
Query time: 0.68 seconds





In [185]:
joint = result_df.merge(lsh_result_df, on = 'Id')

In [186]:
joint["CommonNeighborCount"] = joint.apply(
    lambda row: len(row["NeighborIds_x"].intersection(row["NeighborIds_y"])),
    axis=1
)

In [197]:
print((joint.CommonNeighborCount.sum()/(7*len(joint)))*100,'''%''')

2.571428571428571 %
