In [1]:
pip install requests

Note: you may need to restart the kernel to use updated packages.


In [1]:
import requests

# Replace with your actual API key
API_KEY = 'bgJyXuHdGkrBKt4VsCvR0LeiwE8x39WZ'

# Set the base URL
BASE_URL = 'https://api.core.ac.uk/v3/search/works'

# Define the query parameters
params = {
    'q': 'machine learning',
    'page': 5,
    'pageSize': 5  # Number of results per page
}

# Set the authorization header
headers = {
    'Authorization': f'Bearer {API_KEY}'
}

# Make the GET request
response = requests.get(BASE_URL, headers=headers, params=params)

# Check response
if response.status_code == 200:
    data = response.json()
    print("Total Results:", data.get('totalHits', 'N/A'))
    for paper in data.get('results', []):
        print("\nTitle:", paper.get('title'))
        print("Authors:", paper.get('authors'))
        print("DOI:", paper.get('doi'))
        print("Published:", paper.get('publishedDate'))
        print("Full Text URL:", paper.get('fullTextLink'))
else:
    print("Request failed with status code:", response.status_code)
    print(response.json())


Total Results: 1748270

Title: A Hybrid Approach to Audio-to-Score Alignment
Authors: [{'name': 'Agrawal, R'}, {'name': 'Dixon, S'}, {'name': 'Machine Learning for Music Discovery Workshop at International Conference on Machine Learning (ICML)'}]
DOI: None
Published: 2019-05-30T00:00:00
Full Text URL: None

Title: The scientific evaluation of music content analysis systems: Valid empirical foundations for future real-world impact
Authors: [{'name': 'Grossmann, H'}, {'name': 'International Conference on Machine Learning'}, {'name': 'Maruri-Aguilar, H'}, {'name': 'Parker, B'}, {'name': 'STURM, BLT'}]
DOI: None
Published: 2016-02-29T12:04:33
Full Text URL: None

Title: Machine Learning for Software Engineering: Models, Methods, and Applications
Authors: [{'name': 'Bennaceur, Amel'}, {'name': 'Meinke, Karl'}]
DOI: 10.1145/3183440.3183461
Published: 2018-01-01T00:00:00
Full Text URL: None

Title: Replica conditional sequential monte carlo
Authors: [{'name': 'Doucet, A'}, {'name': 'Proceedin

In [13]:
# NLP Project: Idea is All We Need
# ----------------------------------
import requests
import json
import re
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import nltk
import time

nltk.download('punkt')
nltk.download('stopwords')

# -------------------------
# Step 1: Load Data from CORE API or Fallback
# -------------------------
API_KEY = 'bgJyXuHdGkrBKt4VsCvR0LeiwE8x39WZ'
headers = {'Authorization': f'Bearer {API_KEY}'}
params = {'q': 'machine learning', 'pageSize': 5}

# Retry logic with fallback
try:
    for attempt in range(3):
        response = requests.get('https://api.core.ac.uk/v3/search/works', headers=headers, params=params)
        if response.status_code == 200:
            break
        print(f"Attempt {attempt+1} failed with status {response.status_code}. Retrying in 3s...")
        time.sleep(3)

    response_json = response.json()
    if 'results' in response_json:
        data = response_json['results']
    else:
        print(" CORE API did not return 'results'. Using fallback mock data.\n")
        data = []
except Exception as e:
    print(" API failed. Reason:", str(e))
    data = []

# Fallback data
if not data:
    data = [
        {
            "title": "Sample Paper on BERT",
            "abstract": "This paper introduces BERT, a model for NLP.",
            "fullText": "BERT improves the state-of-the-art in many NLP tasks. Future work includes multilingual support.",
            "doi": "10.0001/sample.doi",
            "authors": "John Doe"
        },
        {
            "title": "Transformers in NLP",
            "abstract": "Transformers have revolutionized NLP tasks.",
            "fullText": "Transformers enable parallel processing and long context understanding. Limitations include compute cost.",
            "doi": "10.0002/sample.doi",
            "authors": "Jane Smith"
        }
    ]

# Create DataFrame
papers = pd.DataFrame([{ 
    'title': d['title'], 
    'abstract': d.get('abstract', ''), 
    'fullText': d.get('fullText', ''), 
    'doi': d.get('doi', ''),
    'authors': d.get('authors', '')
} for d in data])

# -------------------------
# Step 2: Clean and Preprocess Text
# -------------------------
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess(text):
    if not isinstance(text, str):
        return []
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^a-zA-Z ]', '', text)
    tokens = word_tokenize(text.lower())
    return [stemmer.stem(w) for w in tokens if w not in stop_words and len(w) > 2]

papers['tokens'] = papers['abstract'].apply(preprocess)

# -------------------------
# Step 3: Topic Modeling with LDA
# -------------------------
from gensim import corpora, models

texts = papers['tokens'].tolist()
dictionary = corpora.Dictionary(texts)
corpus_bow = [dictionary.doc2bow(text) for text in texts]

lda = models.LdaModel(corpus_bow, num_topics=3, id2word=dictionary, passes=15)

print("\n Topics from LDA:")
for idx, topic in lda.print_topics(-1):
    print(f"Topic {idx}: {topic}\n")

# -------------------------
# Step 4: Semantic Search
# -------------------------
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-MiniLM-L6-v2')
abstracts = papers['abstract'].tolist()
embeddings = model.encode(abstracts, convert_to_tensor=True)

query = "research using BERT in NLP"
query_embedding = model.encode(query, convert_to_tensor=True)
cos_scores = util.pytorch_cos_sim(query_embedding, embeddings)[0]
top_results = cos_scores.topk(3)

print("🔍 Top Matching Papers:")
for score, idx in zip(top_results[0], top_results[1]):
    print(f"Score: {score.item():.4f} | Title: {papers.iloc[idx.item()]['title']}\n")


# -------------------------
# Step 5: Question Answering with BERT
# -------------------------
from transformers import pipeline

qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
context = papers.iloc[0]['fullText']
question = "What is the future work discussed?"

answer = qa_pipeline(question=question, context=context)
print("❓ QA Answer:", answer['answer'])

# -------------------------
# Final Notes
# -------------------------
# Save to CSV
# papers.to_csv("papers_processed.csv", index=False)

# Add enhancements: citation graph, Streamlit UI, section-specific QA, etc.


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/srujana_chintala/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/srujana_chintala/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



 Topics from LDA:
Topic 0: 0.041*"metric" + 0.041*"fair" + 0.021*"propos" + 0.017*"adhoc" + 0.015*"model" + 0.013*"alloc" + 0.013*"notion" + 0.013*"resourc" + 0.013*"measur" + 0.013*"human"

Topic 1: 0.036*"method" + 0.036*"softwar" + 0.036*"engin" + 0.019*"model" + 0.019*"applic" + 0.013*"machin" + 0.013*"infer" + 0.013*"learn" + 0.013*"area" + 0.013*"benefit"

Topic 2: 0.031*"model" + 0.025*"gener" + 0.025*"train" + 0.014*"resourc" + 0.014*"studi" + 0.014*"develop" + 0.014*"effici" + 0.014*"profil" + 0.014*"advanc" + 0.014*"gpu"



Device set to use mps:0


🔍 Top Matching Papers:
Score: 0.1829 | Title: Machine Learning for Software Engineering: Models, Methods, and Applications

Score: 0.1051 | Title: The scientific evaluation of music content analysis systems: Valid empirical foundations for future real-world impact

Score: 0.0726 | Title: Replica conditional sequential monte carlo

❓ QA Answer: Matching networks for one shot learning
