###### The BM25 algorithm is designed to rank documents based on the relevance of terms in a query, considering factors like term frequency and document length. However, if your query doesn't match any documents exactly, you might need to adjust your approach to improve the similarity detection. Here are some strategies to enhance the effectiveness of BM25 in finding more relevant matches:
######  Implemnted in this new script
###### Synonyms and Stemming: Use techniques like stemming or lemmatization to reduce words to their base forms, and consider expanding your query with synonyms to capture more variations of the terms. 
###### Query Expansion: Manually or automatically expand your query with related terms. This can be done using a thesaurus or word embeddings like Word2Vec or GloVe to find semantically similar words.
###### Preprocessing Enhancements: Improve your preprocessing steps by removing noise, handling typos, and ensuring consistent formatting across your dataset.
###### Custom Scoring: Consider implementing a custom scoring function that combines BM25 with other metrics, such as semantic similarity using embeddings.

In [3]:
import pandas as pd
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.stem import PorterStemmer
from nltk.corpus import wordnet
import nltk

# Ensure NLTK resources are available
nltk.download('wordnet')

# Load the Excel file
file_path = 'C:/Users/oscarahe/OneDrive - Intel Corporation/Desktop/Exceles/query2.csv'  # Replace with your file path
df = pd.read_csv(file_path)

# Select the column to compare against
column_to_compare = 'description'  # Replace with your column name

# Initialize stemmer. This helps to reduce words to their base forms. New feature
stemmer = PorterStemmer()

# Preprocess the text data
def preprocess_text(text):
    # Convert to lowercase
    text = str(text).lower()
    # Tokenize, stem, and remove stop words
    tokens = [stemmer.stem(word) for word in text.split() if word not in ENGLISH_STOP_WORDS]
    return tokens

# Expand query with synonyms. This is a new feature. 
def expand_query(query):
    expanded_query = set(query)
    for word in query:
        for syn in wordnet.synsets(word):
            for lemma in syn.lemmas():
                expanded_query.add(stemmer.stem(lemma.name()))
    return list(expanded_query)

# Preprocess the data in the selected column
documents = df[column_to_compare].apply(preprocess_text).tolist()

# Initialize BM25
bm25 = BM25Okapi(documents)

# Statement to compare
statement = "BIOS"  # Replace with your statement
query = preprocess_text(statement)

# Expand the query
expanded_query = expand_query(query)

# Get BM25 scores
scores = bm25.get_scores(expanded_query)

# Find the indices of the top 5 scores
top_n = 5
top_indices = scores.argsort()[-top_n:][::-1]

# Get the top 5 most similar sightings
top_sightings = df.iloc[top_indices]

#print("Top 5 most similar sightings:")
print(top_sightings[column_to_compare])

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\oscarahe\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


365    <p></p><p><b><span style="font-size: 14px;">&n...
291    <p>link_mca_ctl.mask_ecc_error , when set to 1...
114    <p class="MsoNormal">&nbsp;<span style="font-s...
128    <p style="margin:0in"><span style="font-size: ...
123    <p style="margin:0in;font-family:Calibri;font-...
Name: description, dtype: object


In [5]:
import pandas as pd
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.stem import PorterStemmer
from nltk.corpus import wordnet
import nltk
from bs4 import BeautifulSoup
import logging

# Ensure NLTK resources are available
nltk.download('wordnet')

# Initialize stemmer
stemmer = PorterStemmer()

def remove_html(text):
    """Remove HTML tags from a string."""
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

def preprocess_text(text):
    """Preprocess text by removing HTML, converting to lowercase, tokenizing, stemming, and removing stop words."""
    # Remove HTML tags
    text = remove_html(text)
    # Convert to lowercase
    text = str(text).lower()
    # Tokenize, stem, and remove stop words
    tokens = [stemmer.stem(word) for word in text.split() if word not in ENGLISH_STOP_WORDS]
    return tokens

def expand_query(query):
    """Expand query with synonyms."""
    expanded_query = set(query)
    for word in query:
        for syn in wordnet.synsets(word):
            for lemma in syn.lemmas():
                expanded_query.add(stemmer.stem(lemma.name()))
    return list(expanded_query)

def load_data(file_path, column_name):
    """Load data from a CSV file and preprocess the specified column."""
    try:
        df = pd.read_csv(file_path)
        documents = df[column_name].apply(preprocess_text).tolist()
        return df, documents
    except FileNotFoundError:
        logging.error(f"File not found: {file_path}")
        return None, None

def get_top_sightings(df, documents, statement, top_n=5):
    """Retrieve top N most similar sightings based on BM25 scores."""
    query = preprocess_text(statement)
    expanded_query = expand_query(query)
    bm25 = BM25Okapi(documents)
    scores = bm25.get_scores(expanded_query)
    top_indices = scores.argsort()[-top_n:][::-1]
    return df.iloc[top_indices]

def main(file_path, column_name, statement):
    """Main function to execute the retrieval process."""
    df, documents = load_data(file_path, column_name)
    if df is not None:
        top_sightings = get_top_sightings(df, documents, statement)
        print("Top 5 most similar sightings:")
        print(top_sightings[column_name])

if __name__ == "__main__":
    file_path = 'C:/Users/oscarahe/OneDrive - Intel Corporation/Desktop/Exceles/query2.csv'
    column_name = 'description'
    statement = "BIOS"
    main(file_path, column_name, statement)


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\oscarahe\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Top 5 most similar sightings:
114    <p class="MsoNormal">&nbsp;<span style="font-s...
365    <p></p><p><b><span style="font-size: 14px;">&n...
286    <p>During early BIOS boots we're seeing interm...
227    <br /><p>Tested on EMR A0 IFWI 94D13 and BIOS ...
257    <p><span style="font-weight: bolder;"></span><...
Name: description, dtype: object


In [12]:
from bs4 import BeautifulSoup

def remove_html_malformed(text):
    """Remove HTML tags from a malformed HTML string using BeautifulSoup."""
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

# Example usage with malformed HTML
malformed_html_content = "<div>Observing completely closed eye once xtalk is enabled and not able to reach 0 BER showing fast unit bathtub plot as a reference, fast unit eye is small but able to reach 0 BER and perform JTOL&nbsp;</div><div><img src=https://hsdes.intel.com/rest/binary/14019102588 data-filename=image.png style=width: 531px; />&nbsp;<img"
clean_text = remove_html_malformed(malformed_html_content)
print(clean_text)  # Output: This is a sample text with HTML tags.

Observing completely closed eye once xtalk is enabled and not able to reach 0 BER showing fast unit bathtub plot as a reference, fast unit eye is small but able to reach 0 BER and perform JTOL  <img
