In [153]:
import pandas as pd

df=pd.read_csv("all_data_cleaned.csv")

In [65]:
pip install rank_bm25 torch torchvision transformers



In [66]:
print(df['cleaned_text'].head())  # Check the first few rows
print(df['cleaned_text'].dtype)  # Ensure it's a string type
print(df['cleaned_text'].isnull().sum())  # Check for missing values
df['cleaned_text'] = df['cleaned_text'].fillna("").astype(str)



0    approximately 100km long firebreaks have been ...
1                                        god bless you
2    rt cracked wine casks damaged historical build...
3    im really just excited for new undies and pink...
4    rescue effort expands in india pakistan as flo...
Name: cleaned_text, dtype: object
object
0


In [155]:
from transformers import DistilBertTokenizer, DistilBertModel
import torch
import numpy as np

# Load the tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertModel.from_pretrained("distilbert-base-uncased")

In [154]:
# This part remains the same as your code, where embeddings are computed for semantic similarity-based retrieval:


import numpy as np

def compute_embeddings_batchwise(texts, batch_size=32):
    """Compute dense embeddings in batches to avoid memory issues."""
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]
        inputs = tokenizer(batch_texts, return_tensors="pt", truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        batch_embeddings = outputs.last_hidden_state[:, 0, :].numpy()
        embeddings.append(batch_embeddings)
    return np.vstack(embeddings)

# Compute embeddings
texts = df['cleaned_text'].fillna("").tolist()
embeddings = compute_embeddings_batchwise(texts)

# Save embeddings to a file
np.save("embeddings.npy", embeddings)  # Save as .npy file
print("Embeddings saved successfully!")


KeyboardInterrupt: 

In [189]:
import numpy as np

# Load precomputed embeddings
embeddings = np.load("embeddings.npy")
print("Embeddings loaded successfully!")
print(f"Shape of embeddings: {embeddings.shape}")


Embeddings loaded successfully!
Shape of embeddings: (212766, 768)


In [190]:
from rank_bm25 import BM25Okapi
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Tokenize the corpus for BM25
tokenized_corpus = [text.split() for text in df['cleaned_text'].fillna("")]
bm25 = BM25Okapi(tokenized_corpus)

def retrieve_top_events(query, bm25, embeddings, df, top_k=5, alpha=0.5):
    """
    Retrieve top-k events combining BM25 and Dense Embedding scores.

    Parameters:
    - query (str): Query text.
    - bm25 (BM25Okapi): BM25 model.
    - embeddings (ndarray): Dense embeddings for documents.
    - df (DataFrame): DataFrame containing the dataset.
    - top_k (int): Number of top results to retrieve.
    - alpha (float): Weight for BM25 scores.

    Returns:
    - List of top-k events ranked by relevance.
    """
    # Step 1: BM25 retrieval
    tokenized_query = query.split()
    bm25_scores = bm25.get_scores(tokenized_query)
    bm25_scores = bm25_scores / np.max(bm25_scores)  # Normalize BM25 scores

    # Step 2: Dense retrieval
    query_embedding = compute_query_embedding(query)
    dense_scores = cosine_similarity(query_embedding, embeddings).flatten()
    dense_scores = dense_scores / np.max(dense_scores)  # Normalize Dense scores

    # Step 3: Combine BM25 and Dense scores
    combined_scores = alpha * bm25_scores + (1 - alpha) * dense_scores

    # Step 4: Rank and retrieve top-k events
    top_indices = np.argsort(combined_scores)[::-1][:top_k]
    top_events = df.iloc[top_indices]['event'].unique()
    return top_events


In [191]:
def filter_by_class_label(df, event, class_label):
    """Filter rows by event and class label."""
    return df[(df['event'] == event) & (df['class_label'].str.contains(class_label, case=False, na=False))]


In [192]:
import re

def clean_text(text):
    """Remove unnecessary tokens, URLs, and mentions from text."""
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = re.sub(r'rt ', '', text, flags=re.IGNORECASE)  # Remove "RT" for retweets
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

def summarize_top_relevant_texts(results, max_rows=3):
    """
    Summarize the top N most relevant rows, dynamically adjusting max_length.
    """
    top_texts = " ".join(clean_text(text) for text in results['cleaned_text'].head(max_rows))
    input_length = len(top_texts.split())
    max_length = min(100, input_length - 10)  # Dynamically adjust max_length
    min_length = min(30, max_length // 2)  # Adjust min_length proportionally

    try:
        summary = summarizer(top_texts, max_length=max_length, min_length=min_length, do_sample=False)
        return summary[0]['summary_text']
    except Exception as e:
        return f"Error during summarization: {e}"



In [193]:
from transformers import pipeline

# Load the summarizer
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def summarize_texts(filtered_df, max_length=100, min_length=30):
    """Summarize filtered texts."""
    combined_text = " ".join(filtered_df['cleaned_text'].tolist())
    combined_text = combined_text[:2000]  # Truncate for summarization
    try:
        summary = summarizer(combined_text, max_length=max_length, min_length=min_length, do_sample=False)
        return summary[0]['summary_text']
    except Exception as e:
        return f"Error during summarization: {e}"


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [194]:
def interactive_hybrid_summarization(df, bm25, embeddings):
    print("Welcome to the summarization system!")
    query = input("What event are you interested in? (e.g., 'earthquake relief', 'hurricane rescue')\n> ").strip()

    # Step 1: Retrieve top events using BM25 + Dense Retrieval
    top_events = retrieve_top_events(query, bm25, embeddings, df, top_k=5, alpha=0.5)
    print("\nTop events related to your query:")
    for idx, event in enumerate(top_events, 1):
        print(f"{idx}. {event}")

    event_idx = int(input("\nSelect the number corresponding to your event: ").strip()) - 1
    selected_event = top_events[event_idx]
    print(f"\nYou selected: {selected_event}")

    # Step 2: Retrieve available class labels for the selected event
    available_labels = df[df['event'] == selected_event]['class_label'].unique()
    print("\nWhat label are you interested in? Here are the available options:")
    for idx, label in enumerate(available_labels, 1):
        print(f"{idx}. {label}")

    label_idx = int(input("\nSelect the number corresponding to your label: ").strip()) - 1
    selected_label = available_labels[label_idx]
    print(f"\nYou selected: {selected_label}")

    # Step 3: Filter rows based on event and class label
    filtered_df = filter_by_class_label(df, selected_event, selected_label)
    if not filtered_df.empty:
        print("\nGenerating summary...")
        summary = summarize_texts(filtered_df)
        print(f"\nSummary for '{selected_event}' and label '{selected_label}':\n{summary}")
    else:
        print(f"No relevant data found for event: '{selected_event}' and label: '{selected_label}'.")


In [195]:
def filter_relevant_rows(results, query):
    """
    Filter rows that are directly relevant to the query.
    Prioritize `event` matches, then `class_label`, and finally text-based matches.
    """
    normalized_query = escape_special_chars(query.lower().replace(' ', '_'))
    event_matches = results[results['event'].str.contains(normalized_query, case=False, na=False)]
    if not event_matches.empty:
        return event_matches  # Return rows with event matches

    # Fall back to class_label and text matches if no event matches
    text_matches = results[
        results['cleaned_text'].str.contains('|'.join(normalized_query.split('_')), case=False, na=False)
    ]
    return text_matches



In [196]:
import re

def escape_special_chars(text):
    """Escape special characters for regex matching."""
    return re.escape(text)

def filter_relevant_rows(results, query):
    """
    Filter rows for relevance by prioritizing event matches.
    """
    normalized_query = escape_special_chars(query)
    # Exact match with `event` column
    event_matches = results[results['event'].str.contains(normalized_query, case=False, na=False)]
    if not event_matches.empty:
        return event_matches

    # Fallback to broader text-based relevance
    text_matches = results[
        results['cleaned_text'].str.contains('|'.join(normalized_query.split('_')), case=False, na=False)
    ]
    return text_matches



In [197]:
def custom_query_summary(df, bm25, embeddings, query, alpha=0.5, top_k=10):
    """
    Generate a summary for a custom query by searching across all events and class labels.
    """
    # Normalize the query
    normalized_query = normalize_query(query)

    # Step 1: BM25 scores
    tokenized_query = normalized_query.split()
    bm25_scores = bm25.get_scores(tokenized_query)
    bm25_scores = bm25_scores / np.max(bm25_scores) if np.max(bm25_scores) > 0 else bm25_scores  # Normalize BM25 scores

    # Step 2: Dense scores
    query_embedding = compute_query_embedding(query)
    dense_scores = cosine_similarity(query_embedding, embeddings).flatten()
    dense_scores = dense_scores / np.max(dense_scores) if np.max(dense_scores) > 0 else dense_scores  # Normalize Dense scores

    # Step 3: Combine scores
    combined_scores = alpha * bm25_scores + (1 - alpha) * dense_scores
    top_indices = np.argsort(combined_scores)[::-1][:top_k]

    # Step 4: Retrieve top rows
    top_results = df.iloc[top_indices]
    print(f"\nTop results based on BM25 and dense scores:")
    print(top_results[['event', 'class_label', 'cleaned_text']].head(5))

    # Step 5: Filter for relevant rows
    filtered_results = filter_relevant_rows(top_results, normalized_query)

    if not filtered_results.empty:
        print(f"\nFiltered relevant rows for query '{query}':")
        print(filtered_results[['event', 'class_label', 'cleaned_text']].head(5))
        summary = summarize_top_relevant_texts(filtered_results, max_rows=3)
        return summary
    else:
        # Fallback: Match directly with the event column
        matched_event_rows = df[df['event'].str.contains(escape_special_chars(normalized_query), case=False, na=False)]
        if not matched_event_rows.empty:
            print(f"\nRows found for event match '{query}':")
            print(matched_event_rows[['event', 'class_label', 'cleaned_text']].head(5))
            summary = summarize_top_relevant_texts(matched_event_rows, max_rows=3)
            return summary
        else:
            return f"No relevant data found for query: '{query}'."


In [187]:
def humanize_labels(label):
    """Convert snake_case class labels to human-readable format."""
    return label.replace('_', ' ').title()

def humanize_events(event):
    """Convert snake_case events to human-readable format with specific disaster handling."""
    event = event.replace('_earthquake', ' Earthquake')
    event = event.replace('_hurricane', ' Hurricane')
    event = event.replace('_floods', ' Floods')
    event = event.replace('_typhoon', ' Typhoon')
    event = event.replace('_cyclone', ' Cyclone')
    event = event.replace('_bombings', ' Bombings')
    event = event.replace('_explosion', ' Explosion')
    event = event.replace('_train-crash', ' Train Crash')
    event = event.replace('_wildfires', ' Wildfires')
    event = event.replace('_volcano', ' Volcano')
    event = event.replace('_shootings', ' Shootings')
    event = event.replace('_syndrome', ' Syndrome')
    event = event.replace('_fire', ' Fire')
    event = event.replace('_building-collapse', ' Building Collapse')
    event = event.replace('_airport', ' Airport')
    event = event.replace('_refinery-explosion', ' Refinery Explosion')
    event = event.replace('_respiratory', ' Respiratory')
    event = event.replace('_', ' ').title()  # Convert remaining snake_case to Title Case
    return event



def normalize_input(user_input):
    """Normalize user input to match dataset format."""
    return user_input.lower().replace(' ', '_')


def normalize_query(query):
    """Normalize the query to match the dataset format."""
    # Define filler words and phrases to remove
    fillers = [
        "what about ", "tell me about ", "show me ", "can you find ",
        "could you find ", "is there any ", "do you have ", "please find ",
        "any information on ", "anything about ", "how about ",
        "do you know ", "can I see ", "give me details on", "can you find information about",
        "what does"
    ]

    # Remove filler words
    query = query.lower()
    for filler in fillers:
        query = query.replace(filler, "")

    # Remove multiple question marks and extra spaces
    query = query.replace("?", "").strip()
    query = query.replace("????", "").strip()
    query = query.replace("!", "").strip()
    # Replace spaces with underscores
    query = query.replace(" ", "_")

    return query






In [198]:
events = [
    '2015_nepal_earthquake', '2014_california_earthquake', '2013_boston_bombings-ontopic',
    '2014_pakistan_floods', '2014_chile_earthquake', '2014_philippines_typhoon-hagupit',
    '2012_sandy_hurricane-ontopic', '2013_west_texas', 'hurricane_irma', 'hurricane_maria',
    '2013_queensland_floods-ontopic', '2013_pakistan_earthquake', 'hurricane_harvey',
    '2014-2015_worldwide_landslides', '2013_alberta_floods', '2013_oklahoma_tornado-ontopic',
    '2013_alberta_floods-ontopic', '2015_vanuatu_cyclone', '2013_colorado_floods',
    '2013_australia_bushfire', '2014_india_floods', '2013_ny_train-crash', 'srilanka_floods',
    '2012_philipinnes_floods', '2014_mexico_hurricane-odile', '2013_glasgow_helicopter-crash',
    '2011_joplin_tornado-a121571', '2013_bangladesh_savar-building-collapse',
    '2013_brazil_nightclub-fire', 'mexico_earthquake', '2013_manila_floods',
    '2013_phillipines_typhoon-yolanda', '2013_singapore_haze', '2013_russia_meteor_en-mixed',
    '2014_iceland_volcano', 'iraq_iran_earthquake', '2013_queensland_floods',
    '2013_canada_lac-megantic-train-crash', '2012_philippines_typhoon-pablo',
    'california_wildfires', '2015_vanuatu_cyclone-pam', '2012_us_sandy-hurricane-a143145',
    '2014_philippines_typhoon', '2013_bohol_earthquake', '2013_west-texas_explosion',
    '2012_costa-rica_earthquake', '2013_spain_train-crash_en-mixed', '2013_boston_bombings',
    '2012_colorado_wildfires', '2014_worldwide_ebola', '2013_la_airport-shootings',
    '2012_us_sandy-hurricane-a144267', '2011_joplin_tornado-a131709', '2014_malaysia_airline',
    '2012_venezuela_refinery-explosion', '2012_guatemala_earthquake',
    '2014_middle-east_respiratory-syndrome', '2013_italy_sardinia', '2012_italy_earthquakes',
    '2014_chile_earthquake_esp'
]

human_readable_events = [humanize_events(event) for event in events]
print(human_readable_events)


['2015 Nepal Earthquake', '2014 California Earthquake', '2013 Boston Bombings-Ontopic', '2014 Pakistan Floods', '2014 Chile Earthquake', '2014 Philippines Typhoon-Hagupit', '2012 Sandy Hurricane-Ontopic', '2013 West Texas', 'Hurricane Irma', 'Hurricane Maria', '2013 Queensland Floods-Ontopic', '2013 Pakistan Earthquake', 'Hurricane Harvey', '2014-2015 Worldwide Landslides', '2013 Alberta Floods', '2013 Oklahoma Tornado-Ontopic', '2013 Alberta Floods-Ontopic', '2015 Vanuatu Cyclone', '2013 Colorado Floods', '2013 Australia Bushfire', '2014 India Floods', '2013 Ny Train Crash', 'Srilanka Floods', '2012 Philipinnes Floods', '2014 Mexico Hurricane-Odile', '2013 Glasgow Helicopter-Crash', '2011 Joplin Tornado-A121571', '2013 Bangladesh Savar-Building-Collapse', '2013 Brazil Nightclub-Fire', 'Mexico Earthquake', '2013 Manila Floods', '2013 Phillipines Typhoon-Yolanda', '2013 Singapore Haze', '2013 Russia Meteor En-Mixed', '2014 Iceland Volcano', 'Iraq Iran Earthquake', '2013 Queensland Flood

In [199]:
# Original class labels
class_labels = [
    'infrastructure_and_utilities_damage', 'not_humanitarian', 'injured_or_dead_people',
    'sympathy_and_support', 'donation_and_volunteering', 'response_efforts',
    'caution_and_advice', 'requests_or_needs', 'affected_individual',
    'displaced_and_evacuations', 'missing_and_found_people', 'not_informative',
    'informative'
]

# Transform class labels
human_readable_labels = [humanize_labels(label) for label in class_labels]
print(human_readable_labels)


['Infrastructure And Utilities Damage', 'Not Humanitarian', 'Injured Or Dead People', 'Sympathy And Support', 'Donation And Volunteering', 'Response Efforts', 'Caution And Advice', 'Requests Or Needs', 'Affected Individual', 'Displaced And Evacuations', 'Missing And Found People', 'Not Informative', 'Informative']


In [200]:
def interactive_hybrid_summarization_with_improvements(df, bm25, embeddings):
    print("Welcome to the summarization system!")
    print("\nOptions:")
    print("1. Summarize based on specific event and class label")
    print("2. Generate a summary for a custom query across the dataset")

    option = input("Choose an option (1 or 2): ").strip()

    if option == "1":
        query = input("What event are you interested in? (e.g., 'earthquake relief', 'hurricane rescue')\n> ").strip()
        normalized_query = normalize_input(query)
        top_events = retrieve_top_events(normalized_query, bm25, embeddings, df, top_k=5, alpha=0.5)

        # Display human-readable events
        print("\nTop events related to your query:")
        human_readable_events = [humanize_events(event) for event in top_events]
        for idx, event in enumerate(human_readable_events, 1):
            print(f"{idx}. {event}")
        try:
            event_idx = int(input("\nSelect the number corresponding to your event: ").strip()) - 1
            if event_idx < 0 or event_idx >= len(top_events):
                raise ValueError("Invalid selection. Please choose a valid number.")
        except ValueError as e:
            print(e)
            return
        selected_event = top_events[event_idx]
        print(f"\nYou selected: {humanize_events(selected_event)}")

        # Retrieve available class labels
        available_labels = df[df['event'] == selected_event]['class_label'].unique()
        human_readable_labels = [humanize_labels(label) for label in available_labels]
        print("\nWhat label are you interested in? Here are the available options:")
        for idx, label in enumerate(human_readable_labels, 1):
            print(f"{idx}. {label}")
        try:
            label_idx = int(input("\nSelect the number corresponding to your label: ").strip()) - 1
            if label_idx < 0 or label_idx >= len(available_labels):
                raise ValueError("Invalid selection. Please choose a valid number.")
        except ValueError as e:
            print(e)
            return
        selected_label = available_labels[label_idx]
        print(f"\nYou selected: {humanize_labels(selected_label)}")

        # Summarize and save
        filtered_df = filter_by_class_label(df, selected_event, selected_label)
        if not filtered_df.empty:
            print("\nGenerating summary...")
            summary = summarize_texts(filtered_df)
            print(f"\nSummary for '{humanize_events(selected_event)}' and label '{humanize_labels(selected_label)}':\n{summary}")
            save_option = input("\nWould you like to save this summary? (yes/no): ").strip().lower()
            if save_option == "yes":
                save_summary(humanize_events(selected_event), humanize_labels(selected_label), summary)
        else:
            print(f"No relevant data found for event: '{humanize_events(selected_event)}' and label: '{humanize_labels(selected_label)}'.")

    elif option == "2":
        query = input("Enter your custom query (e.g., 'earthquake damage'):\n> ").strip()
        normalized_query = normalize_input(query)
        summary = custom_query_summary(df, bm25, embeddings, normalized_query, alpha=0.5, top_k=10)
        print(f"\nSummary for custom query '{query}':\n{summary}")
    else:
        print("Invalid option. Please restart the system.")



In [201]:
def summarize_texts(filtered_df, max_length=100, min_length=30):
    combined_text = " ".join(filtered_df['cleaned_text'].tolist())
    if len(combined_text) > 2000:  # Truncate if too long
        combined_text = combined_text[:2000]
    try:
        summary = summarizer(combined_text, max_length=max_length, min_length=min_length, do_sample=False)
        return summary[0]['summary_text']
    except Exception as e:
        return f"Error during summarization: {e}"

def save_summary(event, label, summary):
    """Save the generated summary to a file."""
    filename = f"summary_{event.replace(' ', '_')}_{label.replace(' ', '_')}.txt"
    with open(filename, "w") as file:
        file.write(f"Event: {event}\n")
        file.write(f"Label: {label}\n")
        file.write(f"Summary:\n{summary}")
    print(f"Summary saved to {filename}.")

In [203]:
# Ensure BM25 and embeddings are initialized
interactive_hybrid_summarization_with_improvements(df, bm25, embeddings)


Welcome to the summarization system!

Options:
1. Summarize based on specific event and class label
2. Generate a summary for a custom query across the dataset
Choose an option (1 or 2): 1
What event are you interested in? (e.g., 'earthquake relief', 'hurricane rescue')
> earthquake relief


  bm25_scores = bm25_scores / np.max(bm25_scores)  # Normalize BM25 scores



Top events related to your query:
1. 2015 Vanuatu Cyclone
2. 2013 Alberta Floods-Ontopic
3. 2013 Oklahoma Tornado-Ontopic
4. 2015 Nepal Earthquake
5. 2013 West Texas

Select the number corresponding to your event: 5

You selected: 2013 West Texas

What label are you interested in? Here are the available options:
1. Not Humanitarian
2. Informative
3. Not Informative

Select the number corresponding to your label: 2

You selected: Informative

Generating summary...

Summary for '2013 West Texas' and label 'Informative':
This has been a horrible week filled with tragedy why i know its not right but ima be so angry and hateful when i see fathers hugging their daughters graduation day rt chill another explosion in texas either jesus is coming back or the government is sending a message lol no mam but i always feel nauseated every night at the same time its been like that for like 3 days its okay i can forgive you i havent seen that nigga since i was in the 10

Would you like to save this s

In [91]:
print(df[df['event'].str.contains("mexico_earthquake", case=False)])


                        id              event     source  \
159     911750544728891392  mexico_earthquake  crisismmd   
183     910547655625003009  mexico_earthquake  crisismmd   
354     910809662383149058  mexico_earthquake  crisismmd   
532     911640840908476416  mexico_earthquake  crisismmd   
542     913491401320521728  mexico_earthquake  crisismmd   
...                    ...                ...        ...   
212586  910739720065384449  mexico_earthquake  crisismmd   
212604  910756608371822594  mexico_earthquake  crisismmd   
212692  910525106329329664  mexico_earthquake  crisismmd   
212702  912398081500565508  mexico_earthquake  crisismmd   
212734  910743867951058944  mexico_earthquake  crisismmd   

                                                     text lang  lang_conf  \
159     RT @WingsChronicles: á¼³FDrop &amp; Donate to ...   en        NaN   
183     What You Can Do To Help Mexico City Earthquake...   en        NaN   
354     Rescuers struggle to save trapped girl a