In [11]:
from src.backend.retrieval.file_reader import read_csv
from src.backend.retrieval.pipeline import preprocess_documents
from src.backend.retrieval.bm25 import BM25Retriever
from src.backend.retrieval.transformer import TransformerRetrieverANN
from sklearn.model_selection import train_test_split
import re

In [2]:
file_path = 'data/processed/all_data_cleaned.csv'
data = read_csv(file_path)
X = data['cleaned_text']
y = data['class_label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [3]:
X_train_embeddings, transformer_model = preprocess_documents(X_train.tolist())
X_test_embeddings, _ = preprocess_documents(X_test.tolist())

Using device: cuda
Generating embeddings...


Batches:   0%|          | 0/5317 [00:00<?, ?it/s]

Using device: cuda
Generating embeddings...


Batches:   0%|          | 0/1330 [00:00<?, ?it/s]

In [13]:
def preprocess_text(text):
    # Example cleaning steps
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # Remove special characters
    text = re.sub(r"\s+", " ", text).strip()  # Normalize whitespace
    return text

# Apply preprocessing to training and test sets
X_train = X_train.apply(preprocess_text)
X_test = X_test.apply(preprocess_text)

print(X_train.iloc[0])  # First training document
print(y_train.iloc[0])  # Corresponding label


so please skip a few pages amp lets end this with a happy ending
not_humanitarian


In [4]:
print("\nBM25 Retriever Results:")
bm25_retriever = BM25Retriever(X_train)
bm25_results = bm25_retriever.query(X_test[:10], top_n=1)
print("BM25 Results:", bm25_results)


BM25 Retriever Results:


  bm25_results = bm25_retriever.query(X_test[:10], top_n=1)


BM25 Results: [[54718], [89327], [160185], [164499], [110444], [150452], [97162], [153058], [125044], [156877]]


In [10]:
def evaluate_retriever(true_labels, predicted_labels):
    """
    Evaluate the retriever using accuracy, classification report, and confusion matrix.
    """
    print("\nAccuracy:", accuracy_score(true_labels, predicted_labels))
    print("\nClassification Report:\n", classification_report(true_labels, predicted_labels, zero_division=0))
    print("\nConfusion Matrix:\n", confusion_matrix(true_labels, predicted_labels))


# Initialize BM25 Retriever
print("\nBM25 Retriever Evaluation:")
bm25_retriever = BM25Retriever(X_train.tolist())  # Pass training data

# Reduce the test set for debugging
subset_size = 100  # Adjust to a smaller number for faster runs
X_test_subset = X_test.iloc[:subset_size]
y_test_subset = y_test.iloc[:subset_size]

# Generate predictions for BM25
bm25_predicted_labels = []
for query in X_test_subset:
    results = bm25_retriever.query([query], top_n=1)
    top_result_idx = results[0][0]
    bm25_predicted_labels.append(y_train.iloc[top_result_idx])

# Evaluate BM25 on the subset
evaluate_retriever(y_test_subset.tolist(), bm25_predicted_labels)



BM25 Retriever Evaluation:

Accuracy: 0.18

Classification Report:
                                      precision    recall  f1-score   support

                affected_individual       0.00      0.00      0.00         1
                 caution_and_advice       0.00      0.00      0.00         1
          displaced_and_evacuations       0.00      0.00      0.00         0
          donation_and_volunteering       0.00      0.00      0.00         2
                        informative       0.45      0.39      0.42        38
infrastructure_and_utilities_damage       0.00      0.00      0.00         2
             injured_or_dead_people       1.00      1.00      1.00         1
                   not_humanitarian       0.07      0.10      0.08        21
                    not_informative       0.00      0.00      0.00        29
                  requests_or_needs       0.00      0.00      0.00         2
               sympathy_and_support       0.00      0.00      0.00         3

     

In [5]:
print("\nTransformer ANN Retriever Results:")
dimension = X_train_embeddings.shape[1]
transformer_retriever = TransformerRetrieverANN('sentence-transformers/all-MiniLM-L6-v2', n_neighbors=5)
transformer_retriever.fit(X_train.tolist())


Transformer ANN Retriever Results:
Generating embeddings for the corpus...


Batches:   0%|          | 0/5317 [00:00<?, ?it/s]

In [6]:
query = X_test.iloc[0]  # Example query
results, distances = transformer_retriever.query(query, top_n=3)
print("Query:", query)
print("Results:", results)
print("Distances:", distances)

Query: NAIROBI, 1 February (IRIN) - A recent survey by the aid agency International Rescue Committee has shown widespread reluctance among residents of Goma, in the eastern Democratic Republic of Congo, to relocate outside the volcano stricken town, despite the danger of further eruptions in the region.
Results: ['NAIROBI, 1 February (IRIN) - A recent survey by the aid agency International Rescue Committee has shown widespread reluctance among residents of Goma, in the eastern Democratic Republic of Congo, to relocate outside the volcano stricken town, despite the danger of further eruptions in the region.', 'I gather "IRMA" hit some parts of Kenya?', "ISIOLO, 17 October 2012 (IRIN) - Amid rising insecurity, a senior official in Kenya's North Eastern Province has ordered all refugees and unregistered migrants from neighbouring Somalia to move to the under-resourced Dadaab refugee complex by 20 October or face forced relocation."]
Distances: [0.         0.38501334 0.45417732]


In [7]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np

# Define a helper function for evaluation
def evaluate_retriever(y_test, predicted_labels):
    """
    Evaluates the retriever using accuracy and confusion matrix.
    """
    print("\nAccuracy:", accuracy_score(y_test, predicted_labels))
    print("\nClassification Report:\n", classification_report(
        y_test, predicted_labels, zero_division=0))  # Handle undefined metrics
    print("\nConfusion Matrix:\n", confusion_matrix(y_test, predicted_labels))

# Step 4: BM25 Retriever Evaluation
print("\nBM25 Retriever Evaluation:")
bm25_predicted_labels = []

# Debugging Loop
for query in X_test.iloc[:10]:  # Adjust the range as needed
    # Retrieve the top index
    results = bm25_retriever.query([query], top_n=1)
    top_result_idx = results[0][0]

    # Debug: Print the retrieved index and corresponding label
    print(f"Query: {query}")
    print(f"Retrieved Index: {top_result_idx}")
    print(f"Retrieved Label: {y_train.iloc[top_result_idx]}")

    # Append the predicted label
    bm25_predicted_labels.append(y_train.iloc[top_result_idx])

# Evaluate BM25
evaluate_retriever(y_test.iloc[:10].tolist(), bm25_predicted_labels)



BM25 Retriever Evaluation:
Query: NAIROBI, 1 February (IRIN) - A recent survey by the aid agency International Rescue Committee has shown widespread reluctance among residents of Goma, in the eastern Democratic Republic of Congo, to relocate outside the volcano stricken town, despite the danger of further eruptions in the region.
Retrieved Index: 54718
Retrieved Label: requests_or_needs
Query: ��� I love you, but enough of
Retrieved Index: 89327
Retrieved Label: informative
Query: COINCIDENCE????? … … … Damage due to typhoon Pablo reaches P14 billion - NDRRMC
Retrieved Index: 160185
Retrieved Label: infrastructure_and_utilities_damage
Retrieved Index: 164499
Retrieved Label: informative
Query: we want video tweet\nU HV promised us \n\nSRK 13 Million
Retrieved Index: 110444
Retrieved Label: not_humanitarian
Query: Qatar using migrant labor, modern day slavery, &amp; hundreds dead building their World Cup stadiums in the unbearable heat.
Retrieved Index: 150452
Retrieved Label: informat

In [8]:
# Step 5: Transformer ANN Retriever Evaluation
print("\nTransformer ANN Retriever Evaluation:")
transformer_predicted_labels = []

# Loop through the test queries
for query in X_test.iloc[:10]:  # Correctly use .iloc for positional indexing
    indices, _ = transformer_retriever.query(query, top_n=1)

    # Debugging: Print the structure of indices
    print("Query:", query)
    print("Indices:", indices)
    print("Type of indices:", type(indices))

    # Handle indexing based on structure
    if isinstance(indices, np.ndarray):  # If indices is a NumPy array
        index = indices[0][0]  # Extract the first index
    elif isinstance(indices, list):  # If indices is a list
        index = indices[0][0]
    else:
        raise ValueError("Unexpected type for indices:", type(indices))

    # Append the corresponding label
    transformer_predicted_labels.append(y_train.iloc[index])

# Evaluate Transformer ANN
evaluate_retriever(y_test.iloc[:10].tolist(), transformer_predicted_labels)


Transformer ANN Retriever Evaluation:
Query: NAIROBI, 1 February (IRIN) - A recent survey by the aid agency International Rescue Committee has shown widespread reluctance among residents of Goma, in the eastern Democratic Republic of Congo, to relocate outside the volcano stricken town, despite the danger of further eruptions in the region.
Indices: ['NAIROBI, 1 February (IRIN) - A recent survey by the aid agency International Rescue Committee has shown widespread reluctance among residents of Goma, in the eastern Democratic Republic of Congo, to relocate outside the volcano stricken town, despite the danger of further eruptions in the region.']
Type of indices: <class 'list'>


TypeError: Cannot index by location index with a non-integer key