In [4]:
import pandas as pd
import re
import numpy as np
import nltk
import matplotlib.pyplot as plt
import os
import sys
import time
import scipy.sparse
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import pairwise_distances
from datasketch import MinHash, MinHashLSH

# Load dataset
train_df = pd.read_csv("train.csv")

# Load the test dataset
test_unlabeled_df = pd.read_csv("test_without_labels.csv")

print(f"Training Data: {train_df.shape} rows")
print(f"Test Data: {test_unlabeled_df.shape} rows")

Training Data: (111795, 4) rows
Test Data: (47912, 3) rows


In [5]:
# Text Preprocessing Function
def preprocess_text(text):
    if pd.isnull(text):  # Handle NaN values
        return ""
    
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = re.sub(r'#\w+', '', text)  # Remove hashtags
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabetic characters
    
    tokens = word_tokenize(text)  # Tokenize text
    tokens = [word for word in tokens if word.isalpha()]  # Remove punctuation and numbers
    
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]  # Lemmatization
    
    return ' '.join(tokens)

# Function to preprocess entire dataset
def preprocess_dataset(df, text_columns):
    for col in text_columns:
        df[col] = df[col].astype(str).apply(preprocess_text)
    return df

print("Preprocessing functions are ready!")

Preprocessing functions are ready!


In [6]:
# Apply preprocessing
text_columns = ['Title', 'Content']
print("Preprocessing text data...")

for col in text_columns:
    print(f"\nProcessing column: {col}")
    
    total_rows = len(train_df) + len(test_unlabeled_df)  # Total rows to process
    processed_rows = 0  # Track processed rows

    # Process training data
    for i in range(len(train_df)):
        train_df.at[i, col] = preprocess_text(str(train_df.at[i, col]))
        processed_rows += 1
        percentage = (processed_rows / total_rows) * 100
        sys.stdout.write(f"\rProgress: {percentage:.2f}%")
        sys.stdout.flush()

    # Process test data
    for i in range(len(test_unlabeled_df)):
        test_unlabeled_df.at[i, col] = preprocess_text(str(test_unlabeled_df.at[i, col]))
        processed_rows += 1
        percentage = (processed_rows / total_rows) * 100
        sys.stdout.write(f"\rProgress: {percentage:.2f}%")
        sys.stdout.flush()

print("\nPreprocessing complete!")

# Combine 'Title' and 'Content' into a single field
print("Combining Title and Content into a single field...")
train_df['Combined'] = train_df['Title'] + ' ' + train_df['Content']
test_unlabeled_df['Combined'] = test_unlabeled_df['Title'] + ' ' + test_unlabeled_df['Content']

print("Combining complete!")

# Save preprocessed data
train_df.to_csv("preprocessed_train_2nd.csv", index=False)
test_unlabeled_df.to_csv("preprocessed_test_2nd.csv", index=False)

print("Preprocessed data saved: preprocessed_train_2nd.csv, preprocessed_test_2nd.csv")

Preprocessing text data...

Processing column: Title
Progress: 100.00%
Processing column: Content
Progress: 100.00%
Preprocessing complete!
Combining Title and Content into a single field...
Combining complete!
Preprocessed data saved: preprocessed_train_2nd.csv, preprocessed_test_2nd.csv


In [7]:
import pandas as pd
import os
import scipy.sparse
from sklearn.feature_extraction.text import CountVectorizer

# Load Preprocessed Data
print("Loading preprocessed datasets...")
train_df = pd.read_csv("preprocessed_train_2nd.csv")
test_df = pd.read_csv("preprocessed_test_2nd.csv")
#print(f"Train set size: {len(train_df)}, Test set size: {len(test_df)}")
print(f"Train set size: {len(train_df)}, Columns: {list(train_df.columns)}")
print(f"Test set size: {len(test_df)}, Columns: {list(test_df.columns)}")


# Reduce dataset size for faster processing
subset_size = 1  # Use % of the dataset
train_df_reduced = train_df.sample(frac=subset_size, random_state=42)
test_df_reduced = test_df.sample(frac=subset_size, random_state=42)
print(f"{subset_size * 100:.0f}% of the dataset is used.")
print(f"Reduced Train set size: {len(train_df_reduced)}, Reduced Test set size: {len(test_df_reduced)}")

# Check if vectorized files exist and ask user for input
if os.path.exists("train_vectors.npz") and os.path.exists("test_vectors.npz"):
    user_input = input("Vectorized data files found. Do you want to use them? (yes/no): ").strip().lower()
    
    if user_input == "yes":
        print("Loading precomputed vectorized data...")
        train_vectors = scipy.sparse.load_npz("train_vectors.npz")
        test_vectors = scipy.sparse.load_npz("test_vectors.npz")
    else:
        print("Recomputing vectorized data...")
        vectorizer = CountVectorizer(binary=True, analyzer='word', ngram_range=(1, 1), max_features=5000)
        train_vectors = vectorizer.fit_transform(train_df_reduced['Combined'])
        test_vectors = vectorizer.transform(test_df_reduced['Combined'])
        
        # Save vectorized data for future use
        scipy.sparse.save_npz("train_vectors.npz", train_vectors)
        scipy.sparse.save_npz("test_vectors.npz", test_vectors)
        print("Vectorized data saved.")
else:
    print("Vectorized data files not found. Computing new vectorized data...")
    # Use of CountVectorizer with binary=True for Jaccard similarity
    vectorizer = CountVectorizer(
        analyzer='word', 
        ngram_range=(1, 1),  # Use only unigrams
        max_features=5000,   # Reduce dimensionality
        binary=True          # Convert to binary (1 if word appears, 0 if not)
    )    
    
    train_vectors = vectorizer.fit_transform(train_df_reduced['Combined'])
    test_vectors = vectorizer.transform(test_df_reduced['Combined'])
    
    # Save vectorized data for future use
    scipy.sparse.save_npz("train_vectors.npz", train_vectors)
    scipy.sparse.save_npz("test_vectors.npz", test_vectors)
    print("Vectorized data saved.")

print("Preprocessing completed.")

Loading preprocessed datasets...
Train set size: 111795, Columns: ['Id', 'Title', 'Content', 'Label', 'Combined']
Test set size: 47912, Columns: ['Id', 'Title', 'Content', 'Combined']
100% of the dataset is used.
Reduced Train set size: 111795, Reduced Test set size: 47912
Vectorized data files found. Do you want to use them? (yes/no): yes
Loading precomputed vectorized data...
Preprocessing completed.


In [9]:
import time
import numpy as np
import os
import scipy.sparse
from sklearn.metrics.pairwise import pairwise_distances
from tqdm import tqdm  # Progress bar

# Load Vectorized Data
print("Loading vectorized data...")
train_vectors = scipy.sparse.load_npz("train_vectors.npz").toarray().astype(bool) 
test_vectors = scipy.sparse.load_npz("test_vectors.npz").toarray().astype(bool) 
print("Vectorized data loaded.")

# Brute-Force K-NN with Jaccard similarity
def brute_force_knn_optimized(train_vectors, test_vectors, k=7):
    """Optimized Brute-Force K-NN using Jaccard similarity."""
    print("Starting optimized brute-force evaluation...")

    true_knn = np.zeros((test_vectors.shape[0], k), dtype=int)
    brute_start_time = time.time()

    # Compute distances row by row
    for i in tqdm(range(test_vectors.shape[0]), desc="Computing K-NN", unit="doc"):
        distances = pairwise_distances(test_vectors[i].reshape(1, -1), train_vectors, metric="jaccard")
        knn_indices = np.argsort(distances, axis=1)[:, :k]
        true_knn[i] = knn_indices

    brute_force_time = time.time() - brute_start_time
    print(f"\nBrute-Force K-NN completed in {brute_force_time:.2f} seconds.")

    return true_knn, brute_force_time

# Ask the User if They Want to Use Existing Brute-Force Results
if os.path.exists("true_knn.npy") and os.path.exists("brute_time.npy"):
    user_input = input("Brute-Force K-NN results found. Do you want to use them? (yes/no): ").strip().lower()
    
    if user_input == "yes":
        print("Loading precomputed Brute-Force results...")
        true_knn = np.load("true_knn.npy", allow_pickle=True)
        brute_force_time = np.load("brute_time.npy")
    else:
        print("Recomputing Brute-Force K-NN...")
        true_knn, brute_force_time = brute_force_knn_optimized(train_vectors, test_vectors, k=7)
        np.save("true_knn.npy", true_knn)
        np.save("brute_time.npy", brute_force_time)
        print("Brute-Force results saved for future use.")
else:
    print("Brute-Force results not found. Running computation...")
    true_knn, brute_force_time = brute_force_knn_optimized(train_vectors, test_vectors, k=7)
    np.save("true_knn.npy", true_knn)
    np.save("brute_time.npy", brute_force_time)
    print("Brute-Force results saved for future use.")

# Debug: Print first 10 rows of true KNN
print("Sample of True K-NN Results (First 10 test samples):")
for i in range(10):
    print(f"Test sample {i} K-NN: {true_knn[i]}")


Loading vectorized data...
Vectorized data loaded.
Brute-Force K-NN results found. Do you want to use them? (yes/no): yes
Loading precomputed Brute-Force results...
Sample of True K-NN Results (First 10 test samples):
Test sample 0 K-NN: [  8974  36912  29772 105602  77486  93430  71663]
Test sample 1 K-NN: [50059 42849 68395 17059 49499 64408 27661]
Test sample 2 K-NN: [ 35822  88254  80655   9421  20744  43597 101570]
Test sample 3 K-NN: [ 81865 106753  11904  18248    603  23201   6025]
Test sample 4 K-NN: [55241 23134 75246  8489 61535 45757  7065]
Test sample 5 K-NN: [ 69811  26604 110408  93141  34844  87105  72534]
Test sample 6 K-NN: [ 54059 100699  29993 110148 108542  14328  90384]
Test sample 7 K-NN: [58272  5892 18340 83212 96691 29713 71955]
Test sample 8 K-NN: [ 71262 101775  74176  22345  86448  60058 103796]
Test sample 9 K-NN: [ 70466  95604  26830 111625  39114  67278 100622]


In [10]:
import time
import numpy as np
import pandas as pd
import os
import scipy.sparse
from datasketch import MinHash, MinHashLSH

# Load Preprocessed Data
print("Loading vectorized data and Brute-Force K-NN results...")
train_vectors = scipy.sparse.load_npz("train_vectors.npz").toarray().astype(bool)
test_vectors = scipy.sparse.load_npz("test_vectors.npz").toarray().astype(bool)
true_knn = np.load("true_knn.npy", allow_pickle=True)
brute_force_time = np.load("brute_time.npy")
print("All required data loaded.")

# Helper Functions
def create_minhash(text_vector, num_permutations):
    """Creates a MinHash for a text vector, handling both 1D and 2D formats."""
    minhash = MinHash(num_perm=num_permutations)
    
    # Check correct handling of sparse row extraction
    indices = text_vector.nonzero()  # Get nonzero features

    if len(indices) == 1:  # Handle 1D case
        indices = indices[0]
    elif len(indices) > 1:  # Handle 2D case
        indices = indices[1]
    else:
        indices = []

    if len(indices) == 0:
        print("Warning: Empty vector, inserting placeholder hash.")
        minhash.update(b'empty')  # Avoid zero hash issues
    else:
        for index in indices:
            minhash.update(str(index).encode('utf8'))
    
    return minhash


def compute_fraction_retrieved(true_knn, lsh_results):
    """Compute the fraction of true K-NN retrieved by LSH."""
    print("Computing fraction of true K-NN retrieved...")
    fractions = []
    for true, retrieved in zip(true_knn, lsh_results):
        if retrieved:
            true_set = set(int(idx) for idx in np.ravel(true))
            retrieved_set = set(map(int, retrieved))
            overlap = len(true_set.intersection(retrieved_set))
            fractions.append(overlap / len(true_set))
        else:
            fractions.append(0)
    return np.mean(fractions)

# LSH Implementation
def lsh_knn_with_fraction(train_vectors, test_vectors, num_permutations, true_knn, k=7, threshold=0.2):
    """Run LSH, compute K-NN for the test set, and calculate fraction retrieved."""
    print(f"Starting LSH with {num_permutations} permutations...")
    
    print(f"Using threshold={threshold} for num_permutations={num_permutations}")

    # Build LSH
    lsh_start_time = time.time()
    lsh = MinHashLSH(threshold=threshold, num_perm=num_permutations)
    train_minhashes = [create_minhash(vector, num_permutations) for vector in train_vectors]
    for idx, minhash in enumerate(train_minhashes):
        lsh.insert(str(idx), minhash)
    build_time = time.time() - lsh_start_time
    print(f"LSH index built in {build_time:.2f} seconds.")

    print(f"train_vectors shape: {train_vectors.shape}")  # Should be (num_samples, num_features)
    #print(f"First row shape: {train_vectors[0].shape}")  # Should be (num_features,)

    # Precompute MinHashes for test vectors
    test_minhashes = [create_minhash(vector, num_permutations) for vector in test_vectors]

    # Query LSH
    query_start_time = time.time()
    lsh_results = [lsh.query(test_minhash) for test_minhash in test_minhashes]
    #lsh_results = [sorted(lsh.query(test_minhash), key=int)[:100] for test_minhash in test_minhashes]
    query_time = time.time() - query_start_time
    print(f"LSH querying completed in {query_time:.2f} seconds.")

    # Debug: Print how many neighbors are retrieved
    # print("\nChecking LSH Query Results (First 5 test samples)...")
    # for i in range(5):
    #     print(f"Test sample {i} retrieved {len(lsh_results[i])}")

    # Calculate the fraction of true K-NN retrieved
    fraction_retrieved = compute_fraction_retrieved(true_knn, lsh_results)

    return build_time, query_time, build_time + query_time, fraction_retrieved

# Evaluate LSH
print("Evaluating LSH...")
permutations_list = [16, 32, 64]
results = []

# Add Brute-Force results
results.append({
    "Type": "Brute-Force-Jaccard",
    "BuildTime": 0,
    "QueryTime": brute_force_time,
    "TotalTime": brute_force_time,
    "Fraction": "100%",
    "Parameters": "-"
})

thresholds = {16: 0.85, 32: 0.9, 64: 0.9}

for perm in permutations_list:
    threshold = thresholds[perm]    
    build_time, query_time, total_time, fraction = lsh_knn_with_fraction(
        train_vectors, test_vectors, num_permutations=perm, true_knn=true_knn, k=7, threshold=threshold
    )
    results.append({
        "Type": "LSH-Jaccard",
        "BuildTime": round(build_time, 2),
        "QueryTime": round(query_time, 2),
        "TotalTime": round(total_time, 2),
        "Fraction": f"{round(fraction * 100, 2)}%",
        "Parameters": f"Perm={perm}, Threshold={threshold}"
    })

# Save Results
results_df = pd.DataFrame(results)

# Print table
results_df = results_df[["Type", "BuildTime", "QueryTime", "TotalTime", "Fraction", "Parameters"]]

print("\nResults Table:")
print(results_df.to_string(index=False).replace('Parameters', '\nParameters'))
results_df.to_csv("lsh_brute_force_comparison.csv", index=False)
print("Results saved to 'lsh_brute_force_comparison.csv'")

Loading vectorized data and Brute-Force K-NN results...
All required data loaded.
Evaluating LSH...
Starting LSH with 16 permutations...
Using threshold=0.85 for num_permutations=16
LSH index built in 189.20 seconds.
train_vectors shape: (111795, 5000)
LSH querying completed in 0.47 seconds.
Computing fraction of true K-NN retrieved...
Starting LSH with 32 permutations...
Using threshold=0.9 for num_permutations=32
LSH index built in 201.91 seconds.
train_vectors shape: (111795, 5000)
LSH querying completed in 0.21 seconds.
Computing fraction of true K-NN retrieved...
Starting LSH with 64 permutations...
Using threshold=0.9 for num_permutations=64
LSH index built in 233.95 seconds.
train_vectors shape: (111795, 5000)
LSH querying completed in 0.29 seconds.
Computing fraction of true K-NN retrieved...

Results Table:
               Type  BuildTime          QueryTime          TotalTime Fraction              
Parameters
Brute-Force-Jaccard       0.00 133483.86831116676 133483.86831116676 