In [8]:
from keybert import KeyBERT
import json
import pandas as pd
import concurrent.futures
from sentence_transformers import SentenceTransformer

# Keyword-list sizes to produce; each value ends up in its own topics CSV.
top_n_vals = [
    5, 10, 50,
    100, 500, 1000,
]

# Read the Cranfield collection from disk (starts out as an empty list).
cranfield = []
with open("cranfield/cran_docs.json", "r") as f:
    cranfield = json.load(f)

# Pull the 'body' field of every record, then glue all bodies together with
# single spaces so the whole collection can be treated as one document.
body_list = [record['body'] for record in cranfield]
combined_text = " ".join(body_list)

# Instantiate the sentence-embedding model once, by name, so the same object
# can be handed to every keyword-extraction call.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(model_name)

def get_keywords(text, top_n, model):
    '''
    Extract the top-N single-word keywords from a text and save them as CSV.

    The keywords are written to "topics/topics_top_{top_n}.csv" with the
    columns 'Keyword' and 'Score'. The output directory is created when it
    does not exist yet.

    Args:
        text (str): The document to extract keywords from.
        top_n (int): Number of keywords to request from KeyBERT.
        model: Embedding model handed to KeyBERT (the caller passes a
            SentenceTransformer instance).
    Returns:
        None: The result is only persisted to disk.
    '''
    # Local import: the cell that defines this function does not import `os`.
    import os

    # Create the output folder before the expensive extraction so a missing
    # directory cannot make to_csv fail after all the work has been done.
    os.makedirs("topics", exist_ok=True)

    # Initialize KeyBERT with the already-loaded model
    kw_model = KeyBERT(model=model)

    # Unigram keywords, English stop words removed, MMR-based selection.
    keywords = kw_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 1),
        stop_words="english",
        use_mmr=True,
        top_n=top_n
    )

    # Save keywords to a CSV file
    df = pd.DataFrame(keywords, columns=['Keyword', 'Score'])
    df.to_csv(f"topics/topics_top_{top_n}.csv", index=False)
    return None


# Fan the extraction out over a thread pool: one task per requested list size,
# every task sharing the same combined text and embedding model.
with concurrent.futures.ThreadPoolExecutor() as executor:
    keyword_results = list(
        executor.map(
            lambda size: get_keywords(combined_text, size, model),
            top_n_vals,
        )
    )



In [9]:
from nltk.stem import WordNetLemmatizer
import nltk
# Fetch the two NLTK resources this cell relies on, in the original order.
for resource in ('wordnet', 'omw-1.4'):
    nltk.download(resource)
from nltk.corpus import wordnet
import requests
import logging

# Set up logging
# force=True: basicConfig silently does nothing when the root logger already
# has handlers (e.g. this cell is re-run, or the environment installed one),
# in which case wiki_summary.log would never be written.
logging.basicConfig(level=logging.INFO, filename='wiki_summary.log', filemode='w',
                    format='%(asctime)s - %(levelname)s - %(message)s', force=True)

def get_wikipedia_summary(keyword, max_attempts=3):
    '''
    Look up a short plain-text description for a keyword.

    The keyword is searched on English Wikipedia and the intro extract of the
    first hit that is not a disambiguation page is returned. When no usable
    page is found, or the API cannot be reached, the definition of the first
    WordNet synset is used instead.

    Args:
        keyword (str): The term to describe.
        max_attempts (int): Maximum number of search hits to try.
    Returns:
        str or None: The summary or definition, or None when neither source
        yields one.
    '''
    # Blank or non-string keywords can neither be searched nor looked up.
    if not isinstance(keyword, str) or not keyword.strip():
        return None

    headers = {'User-Agent': 'NLP IR agent'}
    search_url = "https://en.wikipedia.org/w/api.php"
    request_timeout = 10  # seconds; without it a stalled connection blocks forever

    def _fetch_json(params):
        # One API round trip. Any network, HTTP or decoding problem is logged
        # and reported as an empty payload so the caller can fall through to
        # the WordNet fallback instead of aborting a whole batch of lookups.
        try:
            response = requests.get(search_url, params=params, headers=headers,
                                    timeout=request_timeout)
            response.raise_for_status()
            payload = response.json()
        except (requests.RequestException, ValueError) as exc:
            logging.warning(f"Wikipedia request failed for '{keyword}': {exc}")
            return {}
        return payload if isinstance(payload, dict) else {}

    # Search Wikipedia for the keyword
    search_data = _fetch_json({
        'action': 'query',
        'list': 'search',
        'srsearch': keyword,
        'format': 'json'
    })
    search_hits = search_data.get('query', {}).get('search', [])

    # Try multiple hits (up to max_attempts)
    for hit in search_hits[:max_attempts]:
        title = hit['title']

        # Get the intro extract and the categories of the hit
        summary_data = _fetch_json({
            'action': 'query',
            'prop': 'extracts|categories',
            'exintro': True,
            'explaintext': True,
            'titles': title,
            'format': 'json'
        })

        pages = summary_data.get('query', {}).get('pages', {})
        if not pages:
            continue

        # A single title was requested, so only the first page entry is used.
        page = next(iter(pages.values()))
        summary = page.get('extract')
        if not summary:
            continue

        categories = [cat['title'] for cat in page.get('categories', [])]

        # Check for disambiguation indicators
        lowered_summary = summary.lower()
        is_disambig = ("disambiguation pages" in " ".join(categories).lower() or
                       "may refer to" in lowered_summary or
                       "may stand for" in lowered_summary)

        if not is_disambig:
            # Good summary found
            logging.info(f"Found summary for '{keyword}'")
            return summary

        logging.info(f"Disambiguation detected for '{title}', trying next result...")

    # Fallback to dictionary definition
    synsets = wordnet.synsets(keyword)
    if synsets:
        return synsets[0].definition()

    return None

lemmatizer = WordNetLemmatizer()

# For every keyword-list size: lemmatize the extracted keywords, save the
# lemmatized CSV, fetch a description per keyword and store them as JSON.
for top_n in top_n_vals:

    # keep_default_na=False / dtype=str: keywords such as "null" or "nan"
    # would otherwise be parsed as float NaN and break the lemmatizer.
    df = pd.read_csv(f"topics/topics_top_{top_n}.csv",
                     keep_default_na=False, dtype={'Keyword': str})
    df['Keyword'] = df['Keyword'].apply(lemmatizer.lemmatize)
    df.to_csv(f"topics/topics_top_{top_n}_lem.csv", index=False)

    # Lemmatization can merge several keywords into one lemma; look each lemma
    # up only once (first-occurrence order is preserved).
    unique_keywords = list(dict.fromkeys(df['Keyword']))

    # Get Wikipedia summaries in parallel; results come back in input order
    with concurrent.futures.ThreadPoolExecutor() as executor:
        wiki_results = list(executor.map(get_wikipedia_summary, unique_keywords))

    # Save Wikipedia summaries to a json, skipping keywords without a result
    wiki_summaries = {
        keyword: summary
        for keyword, summary in zip(unique_keywords, wiki_results)
        if summary is not None
    }
    with open(f"topics/topics_top_{top_n}_lem_wiki.json", "w") as f:
        json.dump(wiki_summaries, f, indent=4)





[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/aaditmahajan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/aaditmahajan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [11]:
import json
import logging
import pandas as pd
import numpy as np
import re
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
import wikipediaapi
from nltk.corpus import stopwords

# force=True: an earlier cell already configured the root logger, and
# basicConfig is a silent no-op when handlers exist, so without it nothing
# would ever be written to debug.log.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", filename="debug.log", force=True)
logger = logging.getLogger(__name__)

def preprocess_text(text):

    '''
    Preprocess the text by tokenizing, removing stop words, and lemmatizing.

    The text is lower-cased and tokenized; only purely alphanumeric tokens that
    are not English stop words are kept, and each survivor is lemmatized.

    Args:
        text (str): The input text to preprocess.
    Returns:
        str: The preprocessed tokens joined by single spaces, or "" when the
        input is empty or None.
    '''

    # Nothing to do for empty input; skip the NLTK setup entirely.
    if not text:
        return ""

    # Imports stay inside the function so that nltk is available when the
    # function is called by the Spark job later.
    import nltk
    from nltk.tokenize import word_tokenize
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer

    # Register the local data directory once instead of on every call.
    if "./nltk_data" not in nltk.data.path:
        nltk.data.path.append("./nltk_data")

    # Load the stop-word list a single time and use a set for O(1) lookups;
    # the list was previously re-read for every token.
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()

    tokens = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text.lower())
        if word.isalnum() and word not in stop_words
    ]

    return " ".join(tokens)  # Return as a string

def clean_text(text):
    '''
    Normalize a string for matching (helper function).

    Lower-cases the input, turns every run of non-word characters into one
    space, collapses whitespace runs and trims both ends.
    '''
    non_word_run = re.compile(r"\W+")
    whitespace_run = re.compile(r"\s+")

    lowered = text.lower()
    spaced = non_word_run.sub(" ", lowered)
    collapsed = whitespace_run.sub(" ", spaced)
    return collapsed.strip()


def load_corpus(corpus_file="topics/topics_top_1000_lem_wiki.json"):
    '''
    Read the corpus stored as JSON at the given path.

    Any failure while opening or parsing the file is logged and swallowed.

    Args:
        corpus_file (str): Location of the JSON corpus file.
    Returns:
        dict: The parsed JSON content, or an empty dict when loading failed.
    '''
    try:
        with open(corpus_file, "r") as handle:
            loaded = json.loads(handle.read())
        logger.info(f"Corpus successfully loaded from {corpus_file}.")
    except Exception as exc:
        logger.error(f"Failed to load corpus from {corpus_file}: {exc}")
        return {}
    return loaded


def generate_esa_vectors(text, corpus_file="topics/topics_top_1000_lem_wiki.json"):
    '''
    Generate the ESA vector for the given text against the concept corpus.

    The text is split into sentences, each sentence is preprocessed, and a
    TF-IDF model is fitted on the sentences together with the corpus texts.
    Every sentence is compared with every corpus text by cosine similarity,
    and the per-sentence similarity rows are averaged into one vector.

    Args:
        text (str): The input text to generate the ESA vector for.
        corpus_file (str): Path to the JSON corpus (concept -> description).
    Returns:
        list: One averaged similarity per corpus entry, in corpus order, or an
        empty list when the corpus cannot be loaded or the text has no sentences.
    '''

    logger.info("Generating ESA vectors for input text.")

    corpus = load_corpus(corpus_file)  # Load the corpus
    if not corpus:
        logger.error("Corpus is empty or could not be loaded.")
        # Always return a plain list: a tuple of two empty lists is truthy and
        # would make callers that test `if esa_vectors:` report success.
        return []

    # preprocessing
    sentences = sent_tokenize(text) if text else []
    processed_sentences = [preprocess_text(s) for s in sentences]
    if not processed_sentences:
        logger.error("No ESA vectors generated.")
        return []

    # NOTE(review): the corpus texts are used as stored, while the query
    # sentences go through preprocess_text (lemmatized) — confirm that this
    # asymmetry is intended.
    processed_corpus = list(corpus.values())
    all_documents = processed_sentences + processed_corpus

    vectorizer = TfidfVectorizer(stop_words="english")          # Create a TF-IDF vectorizer
    tfidf_matrix = vectorizer.fit_transform(all_documents)      # Fit and transform the documents

    # Rows [0, n_sentences) are the query sentences, the rest is the corpus.
    # One batched call yields a (sentences x corpus) similarity matrix.
    n_sentences = len(processed_sentences)
    similarities = cosine_similarity(tfidf_matrix[:n_sentences], tfidf_matrix[n_sentences:])

    # Average over sentences to get a single vector for the whole text.
    return np.mean(similarities, axis=0).tolist()

def main():
    '''
    Demo entry point: build the ESA vector for one sample query and print it.
    '''
    # Sample query used for the demonstration
    sample_query = "what are the structural and aeroelastic problems associated with flight of high speed aircraft ."

    vector = generate_esa_vectors(sample_query)

    # An empty (falsy) result is reported as a failure.
    if not vector:
        print("Failed to generate ESA vectors.")
        return

    print("ESA Vectors generated successfully.")
    print(vector)

# Run the demo only when this code is executed directly (not when imported).
if __name__ == "__main__":
    main()

ESA Vectors generated successfully.
[0.044378063000222345, 0.044378063000222345, 0.044378063000222345, 0.04755062787844759, 0.0, 0.019375754127319955, 0.1466344915846601, 0.023407483290908786, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.11793889753811097, 0.03967212815146216, 0.0, 0.05674916589948491, 0.023875933906285948, 0.0, 0.03044256118476253, 0.01785744351093431, 0.08401778914142174, 0.05942736317751768, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.020608076026420286, 0.0, 0.0, 0.0, 0.11793889753811097, 0.0, 0.0, 0.07904616146568252, 0.11865105196729897, 0.0, 0.0, 0.061367357100295346, 0.0, 0.0, 0.0, 0.12229784775631222, 0.013108107746545953, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.009983066887084828, 0.00773732684617084, 0.0, 0.0, 0.0, 0.0, 0.023465704878583304, 0.01946214875561731, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.029595535270625764, 0.020687972263834998, 0.0, 0.0, 0.0, 0.0, 0.0073753404852729375, 0.06504833299603477, 0.0, 0.020583992794123842, 0.008097141700602583, 0.0, 0.1441566580575947, 