In [282]:
import spacy
import re
import string
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Load scientific/biomedical model if needed
# nlp = spacy.load("en_core_sci_sm")  # if using scispaCy
nlp = spacy.load("en_core_web_sm") #, disable=["ner", "parser"])  # faster for our use


# Combine NLTK and sklearn stopwords
STOPWORDS = set(stopwords.words("english")).union(ENGLISH_STOP_WORDS)

# Optional: custom stopwords for astrophysics
CUSTOM_STOPWORDS = set([
    "et", "al", "figure", "using", "based", "data", "datum", "analysis",
    "result", "results", "show", "use", "used", "paper", "new", "present", "study", "scientific",
    "tool", "dataset", "mass"
])
STOPWORDS.update(CUSTOM_STOPWORDS)

def preprocess_text(text):
    
    # Convert the abstract to lowercase
    text = text.lower()

    # Remove URLs, LaTeX math, inline citations, and other nonp-ascii texts
    text= re.sub(r"\d+", "", text)
    text = re.sub(r"\$.*?\$", "", text)  # remove LaTeX math and numbers
    text = re.sub(r"\[.*?\]|\(.*?et al\.\)", "", text)  # remove inline citations

    # Replace hyphens with spaces
    text = text.replace('-', ' ')

    # Tokenize + lemmatize with SpaCy
    doc = nlp(text)
    tokens = []
    for token in doc:
        if token.is_stop or token.is_punct or token.like_num:
            continue
        if token.lemma_ in STOPWORDS or token.lemma_ in string.punctuation:
            continue
        if len(token.lemma_) < 4:
            continue
        tokens.append(token.lemma_)

    return " ".join(tokens)

In [253]:
import sqlite3
import pandas as pd
import multiprocessing 

In [254]:
%cd ../arXiv_MetadataExtraction

/Users/nakulgangolli/Desktop/arXiv_Reader/arXiv_MetadataExtraction


In [255]:
# Connect to an existing database or create a new one if it doesn't exist
con = sqlite3.connect("arxiv_astro.db")

# Create a cursor object
cur = con.cursor()
cur.execute("PRAGMA database_list;")
databases = cur.fetchall()
for db in databases:
    # The second element in the tuple is the database name (main, temp, or attached databases)
    # The third element is the path/filename
    print(db)
    print(f"Database Name: {db[1]}, Path: {db[2]}")

query = "SELECT * FROM arxiv_papers"  # Replace 'your_table_name' with the actual table name
# df = pd.read_sql_query(query, con)
# con.close()

(0, 'main', '/Users/nakulgangolli/Desktop/arXiv_Reader/arXiv_MetadataExtraction/arxiv_astro.db')
Database Name: main, Path: /Users/nakulgangolli/Desktop/arXiv_Reader/arXiv_MetadataExtraction/arxiv_astro.db


In [256]:
query = "SELECT *, STRFTIME('%Y', published) AS Year, STRFTIME('%m', published) AS Month, "+\
        "LENGTH(authors) - LENGTH(REPLACE(authors, ',', ''))+1 as NumAuthors FROM arxiv_papers"
df = pd.read_sql_query(query, con)
df.drop(columns=['published', 'updated'], inplace=True)

In [283]:
ABSTRACT_LIST = df['abstract'].values

In [None]:
CLEANED_ABSTRACT_LIST = list(map(lambda _ABSTRACT: preprocess_text(_ABSTRACT), ABSTRACT_LIST))

In [274]:
print(CLEANED_ABSTRACT_LIST[2:10])

['investigate radio property complete sample nearby massive bright elliptical galaxy sample contain galaxy rosat survey flux distance galaxy complete radio chandra coverage nuclear radio emission detect galaxy galaxy exhibit extend radio emission exhibit clear evidence interaction radio source surround emit galaxy unresolved radio source clear small cavity like feature chandra image disturbed morphology radio luminosity limit equivalent calculate radio loud fraction miss majority radio detect galaxy sample determine integrate radio flux ratio galaxy span large range factor calculate weighted cool time hint anticorrelation radio luminosity calculate limit ratio total particle energy relativistic electron radiate range volume factor plasma cavity distribution broad reflect previous large galaxy cluster lower flux limit expense complete chandra coverage increase size sample galaxy nuclear radio activity detect extended sample', 'luminous spatially resolve binary quasar clearly inhabit ong

In [275]:
# from bertopic import BERTopic
# from sentence_transformers import SentenceTransformer

# embedding_model = SentenceTransformer("allenai-specter")

# # Train BERTopic model
# topic_model = BERTopic(embedding_model=embedding_model)
# topics, probs = topic_model.fit_transform(CLEANED_ABSTRACT_LIST[:4000])

# # View top topics
# topic_model.get_topic_info()

In [276]:
# topic_model.visualize_topics()

In [277]:
import gensim
from gensim import corpora
from gensim.models import Phrases 
from gensim.models.phrases import Phrases, ENGLISH_CONNECTOR_WORDS


TOKENIZED_ABSTRACT_LIST = []
for ABSTRACT in CLEANED_ABSTRACT_LIST: 
    TOKENIZED_ABSTRACT_LIST.append(ABSTRACT.split(' '))

print(len(TOKENIZED_ABSTRACT_LIST))

ARXIV_CONNECTOR_WORDS = list(ENGLISH_CONNECTOR_WORDS)# +['dark', 'black', 'physical', 
                                                     #   galactic', 'stellar', 'primordial']
print(ARXIV_CONNECTOR_WORDS)

bigrams = Phrases(TOKENIZED_ABSTRACT_LIST, min_count=5, threshold=10., max_vocab_size=40000, connector_words=ARXIV_CONNECTOR_WORDS)
bigram_model = gensim.models.phrases.Phraser(bigrams)

BIGRAM_ABSTRACT_LIST = [bigram_model[ABSTRACT] for ABSTRACT in TOKENIZED_ABSTRACT_LIST]
print(BIGRAM_ABSTRACT_LIST[0])
# Create Dictionary and Corpus

# print(LDA_ABSTRACT_LIST[:10])
dictionary = corpora.Dictionary(BIGRAM_ABSTRACT_LIST)
dictionary.filter_extremes(no_below=5, no_above=0.95)
corpus = [dictionary.doc2bow(ABSTRACT) for ABSTRACT in BIGRAM_ABSTRACT_LIST]


5610
['to', 'an', 'or', 'with', 'on', 'from', 'by', 'and', 'a', 'of', 'the', 'at', 'without', 'in', 'for']
['venga', 'large', 'scale', 'extragalactic', 'survey', 'bulge', 'large', 'outer', 'disk', 'nearby', 'normal', 'spiral', 'galaxy', 'target', 'choose', 'span', 'wide', 'range', 'hubble', 'type', 'star', 'formation', 'activity', 'morphology', 'inclination', 'time', 'vast', 'available', 'multi', 'wavelength', 'coverage', 'available', 'mapping', 'venga', 'provide', 'stellar', 'kinematic', 'chemical', 'abundance', 'density', 'ionization', 'state', 'dust', 'extinction', 'stellar', 'population', 'galaxy', 'uniqueness', 'virus', 'large', 'field', 'view', 'permit', 'large', 'scale', 'mapping', 'perform', 'venga', 'allow', 'correlate', 'important', 'quantity', 'different', 'environment', 'galactic', 'disk', 'allow', 'conduction', 'large', 'number', 'star', 'formation', 'structure', 'assembly', 'galactic', 'feedback', 'galaxy']


In [278]:
from gensim.models import LdaModel

lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=12,         # Change to desired number of topics
    random_state=1008,
    passes=10,
    alpha='auto',
    per_word_topics=True
)

In [279]:
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

import warnings
warnings.filterwarnings("ignore")

pyLDAvis.enable_notebook()
vis_data = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis_data)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [280]:
topics = lda_model.print_topics(num_words=10)
for topic_num, topic in topics:
    print(f"Topic {topic_num}: {topic}")

def get_dominant_topic(ldamodel, bow):
    topics = ldamodel.get_document_topics(bow)
    topics = sorted(topics, key=lambda x: -x[1])
    print(topics)
    return topics[0] if topics else (None, 0.0)

# Assign dominant topic to each abstract
for idx, abstract in enumerate(ABSTRACT_LIST):
    bow = corpus[idx]
    topic_id, prob = get_dominant_topic(lda_model, bow)
    print(f"Abstract {idx} => Topic {topic_id} (confidence: {prob:.2f})")

Topic 0: 0.101*"galaxy" + 0.077*"cluster" + 0.027*"halo" + 0.014*"scale" + 0.013*"group" + 0.011*"large" + 0.011*"redshift" + 0.011*"profile" + 0.011*"structure" + 0.010*"core"
Topic 1: 0.038*"energy" + 0.028*"dark_matter" + 0.025*"gamma" + 0.022*"cosmic" + 0.019*"particle" + 0.016*"neutrino" + 0.012*"detector" + 0.012*"high" + 0.011*"shower" + 0.011*"background"
Topic 2: 0.022*"telescope" + 0.019*"survey" + 0.017*"resolution" + 0.014*"high" + 0.014*"field" + 0.013*"instrument" + 0.010*"calibration" + 0.010*"mission" + 0.010*"large" + 0.009*"image"
Topic 3: 0.025*"star" + 0.018*"source" + 0.017*"gaia" + 0.017*"survey" + 0.014*"sample" + 0.012*"photometric" + 0.011*"spectra" + 0.011*"candidate" + 0.010*"object" + 0.010*"catalog"
Topic 4: 0.052*"star" + 0.025*"binary" + 0.022*"stellar" + 0.014*"black_hole" + 0.013*"galaxy" + 0.012*"massive" + 0.012*"model" + 0.011*"evolution" + 0.011*"rate" + 0.011*"formation"
Topic 5: 0.018*"simulation" + 0.018*"star" + 0.018*"disk" + 0.016*"accretion" 