In [13]:
import concurrent.futures
import json
import os

import pandas as pd
import requests
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer


# Read the Cranfield documents. JSON is UTF-8 by specification, so decode it
# explicitly instead of relying on the platform's default text encoding.
with open("cranfield/cran_docs.json", "r", encoding="utf-8") as f:
    cranfield = json.load(f)

# Join every document's 'body' field into one string so the keyword
# extraction below sees the corpus as a single document.
body_list = [data['body'] for data in cranfield]
combined_text = " ".join(body_list)

# Load the sentence-embedding model once; it is passed to every extraction run.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(model_name)

def get_keywords(text, top_n, model):
    """Extract single-word keywords from ``text`` and save them as a CSV file.

    Args:
        text: Document to extract keywords from.
        top_n: Number of keywords to request; also used in the output file name.
        model: Embedding model handed to KeyBERT.

    Returns:
        None. The (keyword, score) pairs are written to
        ``topics/topics_top_{top_n}.csv`` with columns ``Keyword`` and ``Score``.
    """
    # Initialize KeyBERT with the loaded model
    kw_model = KeyBERT(model=model)

    # Unigrams only, English stop words removed, MMR enabled for selection
    keywords = kw_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 1),
        stop_words="english",
        use_mmr=True,
        top_n=top_n
    )

    # DataFrame.to_csv does not create missing directories, so make sure the
    # output folder exists rather than failing after the extraction has run.
    os.makedirs("topics", exist_ok=True)

    # Save keywords to a CSV file
    df = pd.DataFrame(keywords, columns=['Keyword', 'Score'])
    df.to_csv(f"topics/topics_top_{top_n}.csv", index=False)
    return None

top_n_vals = [5, 10, 50, 100, 500, 1000]

# One extraction task per requested keyword count, all sharing the same corpus
# text and embedding model. get_keywords writes its own CSV and returns None,
# so keyword_results only records that every task finished.
with concurrent.futures.ThreadPoolExecutor() as executor:
    keyword_results = list(
        executor.map(
            lambda n: get_keywords(combined_text, n, model),
            top_n_vals,
        )
    )



In [None]:
from nltk.stem import WordNetLemmatizer

def get_wikipedia_summary(keyword):
    """Return the intro text of the top Wikipedia hit for an aerospace keyword.

    Searches English Wikipedia for "<keyword> aerospace", takes the first
    result, and returns its plain-text introduction only if the page belongs
    to at least one category named in AEROSPACE_CATEGORIES.

    Args:
        keyword: Term to look up.

    Returns:
        The page's intro extract, or None when the search has no hits, the
        page has no matching category, or the API returns no extract.
    """
    headers = {'User-Agent': 'NLP IR agent'}
    search_url = "https://en.wikipedia.org/w/api.php"

    # Category names (without the "Category:" prefix) that mark a page as
    # relevant; matching is exact. Every entry needs its own comma: adjacent
    # string literals are silently concatenated, which previously merged
    # "Technology" and "Space exploration" into one name that never matched.
    # The former empty-string entry is dropped; it could not equal a real
    # category name.
    AEROSPACE_CATEGORIES = {
        "Aerodynamics", "Aircraft", "Aerospace engineering",
        "Aviation", "Spacecraft", "Rocketry", "Engineering", "Technology",
        "Space exploration", "Aerospace",
    }

    def _query(params):
        # All lookups hit the same endpoint; return the 'query' part of the
        # JSON reply, or an empty dict when it is absent.
        response = requests.get(search_url, params=params, headers=headers)
        return response.json().get('query', {})

    # Step 1: Search with aerospace context
    hits = _query({
        'action': 'query',
        'list': 'search',
        'srsearch': f"{keyword} aerospace",
        'format': 'json'
    }).get('search')
    if not hits:
        return None
    first_hit_title = hits[0]['title']

    # Step 2: Check categories for aerospace relevance
    pages = _query({
        'action': 'query',
        'prop': 'categories',
        'titles': first_hit_title,
        # The API returns only 10 categories per page by default; request the
        # maximum so relevant categories are not cut off.
        'cllimit': 'max',
        'format': 'json'
    }).get('pages', {})
    if not pages:
        return None

    page = next(iter(pages.values()))
    # Extract clean category titles
    page_categories = {
        cat['title'].replace("Category:", "")
        for cat in page.get('categories', [])
    }

    if AEROSPACE_CATEGORIES.isdisjoint(page_categories):
        print(f"No aerospace-relevant categories found for '{first_hit_title}'")
        return None

    # Step 3: Fetch the plain-text intro of the relevant page
    pages = _query({
        'action': 'query',
        'prop': 'extracts',
        'exintro': True,
        'explaintext': True,
        'titles': first_hit_title,
        'format': 'json'
    }).get('pages', {})
    if not pages:
        return None

    return next(iter(pages.values())).get('extract')


lemmatizer = WordNetLemmatizer()
for top_n in top_n_vals:
    # Replace each extracted keyword with its lemma and save the result
    # next to the original CSV.
    df = pd.read_csv(f"topics/topics_top_{top_n}.csv")
    df['Keyword'] = df['Keyword'].apply(lemmatizer.lemmatize)
    df.to_csv(f"topics/topics_top_{top_n}_lem.csv", index=False)

    # Look up all keywords concurrently; map() yields results in keyword order.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        wiki_results = list(executor.map(get_wikipedia_summary, df['Keyword']))

    # Pair every keyword with its summary (None when the lookup returned
    # nothing) and save the mapping as JSON. A keyword that appears more than
    # once keeps the summary of its last occurrence.
    wiki_summaries = dict(zip(df['Keyword'], wiki_results))
    with open(f"topics/topics_top_{top_n}_lem_wiki.json", "w") as f:
        json.dump(wiki_summaries, f, indent=4)





No aerospace-relevant categories found for 'Mechanical engineering'No aerospace-relevant categories found for 'Firefly Aerospace'

No aerospace-relevant categories found for 'Fluid Components International'
No aerospace-relevant categories found for 'Lightcraft'
No aerospace-relevant categories found for 'Firefly Aerospace'
No aerospace-relevant categories found for 'Council of Scientific and Industrial Research'
No aerospace-relevant categories found for 'Aerospace'No aerospace-relevant categories found for 'Lightcraft'
No aerospace-relevant categories found for 'Discontinuity (geotechnical engineering)'

No aerospace-relevant categories found for 'Mechanical engineering'
No aerospace-relevant categories found for 'Fluid Components International'
No aerospace-relevant categories found for 'Firefly Aerospace'
No aerospace-relevant categories found for 'Lightcraft'No aerospace-relevant categories found for 'Pioneer Aerospace Corporation'

No aerospace-relevant categories found for 'Fire