In [None]:
import os
from dotenv import load_dotenv
load_dotenv()

# Read the Groq key loaded by load_dotenv() and fail fast with a readable
# message: assigning None into os.environ raises an opaque TypeError.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    raise RuntimeError(
        "GROQ_API_KEY is not set; export it or add it to the .env file before running this notebook."
    )
os.environ["GROQ_API_KEY"] = GROQ_API_KEY

from langchain_groq import ChatGroq

# Shared chat model used by the summarisation and justification chains below.
llm = ChatGroq(model_name="Gemma2-9b-It")

from langchain.chains import LLMChain
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
import re
from PyPDF2 import PdfReader

In [None]:
def extract_text_excluding_tables(pdf_path):
    """Return the text of a PDF with table-like rows filtered out.

    A row is dropped when it is made up only of digits, whitespace and the
    punctuation characters listed in the pattern below (so whitespace-only
    rows are dropped as well, while completely empty rows are kept).

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The surviving rows joined with newline characters.
    """
    numeric_row = re.compile(r"^\s*[\d\s\.,:;|/\\%+-]+\s*$")

    with open(pdf_path, 'rb') as handle:
        # Every page contributes its text followed by a newline.
        raw_text = "".join(
            page.extract_text() + "\n" for page in PdfReader(handle).pages
        )

    kept_rows = [row for row in raw_text.split("\n") if not numeric_row.match(row)]
    return "\n".join(kept_rows)

In [None]:
def extract_title_abstract_text(filtered_text):
    """Split paper text into title, abstract and body.

    The text is expected to contain a line reading exactly ``Abstract`` and,
    later, a line reading exactly ``1 Introduction``. Everything before the
    first is the title, everything between them is the abstract, and
    everything after the second is the body.

    Args:
        filtered_text: Full text of the paper.

    Returns:
        ``(title, abstract, remaining_text)`` with surrounding whitespace
        stripped, or ``(None, None, None)`` (after printing a notice) when
        the expected layout is not present.
    """
    layout = re.compile(r"^(.*?)\nAbstract\n(.*?)\n1 Introduction\n(.*)", re.DOTALL)
    found = layout.match(filtered_text)

    # Guard clause: nothing to unpack when the headings are missing.
    if found is None:
        print("Pattern not found.")
        return None, None, None

    title, abstract, remaining_text = (part.strip() for part in found.groups())
    return title, abstract, remaining_text

def prepare_final_documents(remaining_text, chunk_size=2000, chunk_overlap=100):
    """Wrap the paper body in a LangChain ``Document`` and split it into chunks.

    Args:
        remaining_text: Body text of the paper.
        chunk_size: ``chunk_size`` forwarded to ``RecursiveCharacterTextSplitter``
            (defaults to the previously hard-coded 2000).
        chunk_overlap: ``chunk_overlap`` forwarded to the splitter
            (defaults to the previously hard-coded 100).

    Returns:
        The list produced by ``splitter.split_documents``.

    Raises:
        ValueError: If ``remaining_text`` is ``None``, empty, or whitespace only.
    """
    # Whitespace-only text would yield no chunks and fail later in the
    # summarisation chain, so reject it here with the same error as empty text.
    if not remaining_text or not remaining_text.strip():
        raise ValueError("The remaining text is empty or None.")

    docs = [Document(page_content=remaining_text)]
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return splitter.split_documents(docs)

In [None]:
def get_research_paper_summary(llm, final_documents):
    """Summarise a chunked research paper with a LangChain "refine" chain.

    Args:
        llm: Language model handed to ``load_summarize_chain``.
        final_documents: Document chunks passed to the chain's ``run`` call.

    Returns:
        The value returned by ``chain.run`` for the supplied chunks.
    """
    # Prompt used to draft the first version of the summary.
    first_pass_template = """
        You are an expert in Machine Learning (ML) and Deep Learning (DL). Analyze the following research paper and summarize its key aspects, focusing specifically on:

        1. **Machine Learning and Deep Learning Techniques**: Describe the ML/DL methods used in the research.
        2. **Algorithms and Architectures**: Highlight any significant algorithms, models, or architectures introduced or applied in the paper.
        3. **Datasets and Training Details**: Include information about the datasets used, their relevance, and how they were utilized in the research.
        4. **Applications and Results**: Summarize the practical applications or use cases and the key outcomes or results.

        Be concise but ensure that each aspect is clearly addressed.

        **Research Paper Content:**
        {text}
        """

    # Prompt used to fold further content into the existing summary.
    follow_up_template = """
        You have an initial summary of the paper 

        **Initial Summary:**
        {existing_answer}

        Refine this summary to make it more focused and detailed on the following aspects:

        1. **Machine Learning and Deep Learning Techniques**: Ensure that the ML/DL methods used in the paper are clearly described.
        2. **Algorithms and Architectures**: Add details about any significant algorithms, models, or architectures introduced or applied.
        3. **Datasets and Training Details**: Provide specifics about the datasets, their role in the research, and any preprocessing or augmentation techniques.
        4. **Applications and Results**: Highlight the practical implications or outcomes of the research.

        Ensure the refined summary is structured, clear, and concise.

        Consider the additional context provided below
        **Research paper content**
        {text}
        """

    first_pass_prompt = PromptTemplate(
        input_variables=["text"],
        template=first_pass_template,
    )
    follow_up_prompt = PromptTemplate(
        input_variables=["existing_answer", "text"],
        template=follow_up_template,
    )

    summarizer = load_summarize_chain(
        llm=llm,
        chain_type="refine",
        verbose=True,
        question_prompt=first_pass_prompt,
        refine_prompt=follow_up_prompt,
    )
    return summarizer.run(final_documents)


def extract_keywords_from_summary(llm, summary_text):
    """Ask the model to list methodologies, techniques, datasets and topics.

    Args:
        llm: Language model used by the ``LLMChain``.
        summary_text: Summary text substituted into the prompt as ``text``.

    Returns:
        The value returned by the chain's ``run`` call.
    """
    keyword_prompt = PromptTemplate(
        input_variables=["text"],
        template="""
    You are analyzing the summary of a research paper. Extract only the names of the methodologies, key techniques, algorithms, datasets and other keywords and topics related to the field of AI and ML used in the provided text. Do not include any explanations or descriptions.

    Text: {text}

    Please provide a concise list of:
    1. Methodologies
    2. Key Techniques and Algorithms
    3. Datasets
    4. Other keywords
    5. Topics in the AI field
    """,
    )

    keyword_chain = LLMChain(llm=llm, prompt=keyword_prompt)
    return keyword_chain.run(text=summary_text)

In [None]:
def process_pdf_for_summary_and_keywords(pdf_path, llm):
    """Run the PDF -> body text -> summary -> keywords pipeline for one paper.

    Args:
        pdf_path: Path of the PDF to process.
        llm: Language model passed to the summary and keyword steps.

    Returns:
        ``(summary_text, keywords)``.
    """
    # The title and abstract are parsed out, but only the body is summarised.
    _title, _abstract, body_text = extract_title_abstract_text(
        extract_text_excluding_tables(pdf_path)
    )

    body_chunks = prepare_final_documents(body_text)
    summary_text = get_research_paper_summary(llm, body_chunks)

    return summary_text, extract_keywords_from_summary(llm, summary_text)

In [None]:
import pinecone
from sentence_transformers import SentenceTransformer
from langchain_community.retrievers import PineconeHybridSearchRetriever

import os
from pinecone import Pinecone,ServerlessSpec

# SECURITY: the Pinecone key used to be hardcoded in this cell. It is now read
# from the environment (populated by load_dotenv() in the first cell). The key
# that was committed should be treated as leaked and rotated.
api_key = os.getenv("PINECONE_API_KEY")
if not api_key:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; export it or add it to the .env file before running this cell."
    )

index_name="hybrid-search-langchain-pinecone"
## initialize the Pinecone client
pc=Pinecone(api_key=api_key)

# Create the index only on first run so re-executing the cell is harmless.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # dimensionality of dense model
        metric="dotproduct",  # sparse values supported only for dotproduct
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

index=pc.Index(index_name)

# Only re-export HF_TOKEN when it is present: assigning None into os.environ
# raises TypeError.
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    os.environ["HF_TOKEN"] = hf_token

from langchain_huggingface import HuggingFaceEmbeddings
embeddings=HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
embeddings

In [None]:
from pinecone_text.sparse import BM25Encoder
from sentence_transformers import SentenceTransformer

# Sparse encoder: `default()` is a factory that returns a ready-to-use encoder,
# so call it on the class rather than building a throwaway instance first.
bm25_encoder = BM25Encoder.default()
# Dense encoder used for the "values" side of each hybrid vector.
dense_encoder = SentenceTransformer('all-MiniLM-L6-v2')

# Free-text profile per conference. Each profile is encoded and upserted into
# the Pinecone index, and is also inserted into the justification prompt.
# Every value is ONE string: adjacent literals inside the parentheses are
# concatenated by Python, so no newline characters end up in the text
# (a double-quoted literal cannot itself span several physical lines).
conference_profiles = {
    "NeurIPS": (
        "The NeurIPS conference focuses on advancing the understanding of machine learning, artificial intelligence, and neural information processing systems. "
        "It spans a broad range of innovative methodologies, including reinforcement learning (RL), generative modeling, safe predictors, constrained neural networks, multi-task learning (MTL), transfer learning (TL), statistical learning theory, deep reinforcement learning, knowledge graph embedding (KGE), and neural ordinary differential equations (ODEs). "
        "NeurIPS also covers research on adversarial robustness, safe AI, multi-modal integration, optimization theory, and game theory applications. "
        "Core key techniques and algorithms include Graph Attention Networks (GATs), generative adversarial networks (GANs), deep Q-networks (DQN), Wasserstein bounds, multi-armed bandit algorithms, Empirical Risk Minimization (ERM), knowledge graph embedding (TransE, TransH, RotatE), and sophisticated optimization methods like gradient descent and convex approximation. "
        "Additionally, techniques like attention mechanisms, bit-strings, reinforcement learning algorithms (policy gradient, value iteration), and self-supervised learning are widely explored. "
        "Mathematical innovations include VC-dimension, Rademacher complexity, chaining theory for stochastic processes, Lipschitz constants, and non-convex optimization. "
        "Research at NeurIPS spans a diverse array of datasets, such as the ACAS-Xu dataset (aircraft collision avoidance), WikiKG90Mv2 (knowledge graphs), MNIST, CIFAR-10, Atari 2600 games, and other benchmark datasets for reinforcement learning, computer vision, and natural language processing tasks. "
        "Datasets such as Fares (2016), WMT translation corpora, and NomBank are commonly used for applications in machine translation, knowledge graph completion, and semantic role labeling. "
        "Key keywords explored across papers include safety constraints, generalization, constraint representation, data efficiency, adversarial robustness, non-convex optimization, deep reinforcement learning, generative modeling, optimization algorithms, textual reasoning, reinforcement learning agent performance, and knowledge graph completion. "
        "Other focus areas include game theory, optimization convergence, predictive modeling, autonomous systems, neural architecture search, and uncertainty modeling. "
        "The primary topics in the AI field addressed at NeurIPS include deep learning, reinforcement learning, generative modeling, machine learning theory, optimization, neural networks, multi-task learning, game theory, explainable AI, knowledge graph embedding, adversarial robustness, and predictive modeling. "
        "The conference serves as a premier venue for cutting-edge interdisciplinary research in neural information processing, spanning applications from autonomous systems to probabilistic models and large-scale optimization problems."
    ),
    "CVPR": (
        "The CVPR Conference focuses on cutting-edge advancements in computer vision, machine learning, and artificial intelligence, with a strong emphasis on topics such as deep learning in computer vision, large-scale image recognition, CNN performance evaluation, feature hierarchies in visual recognition, object localization, image segmentation, generative models and image synthesis, real-time computer vision, multi-class classification, geometric reasoning, 3D modeling, sports analytics, action recognition, temporal modeling, automated image labeling, and creativity in AI. "
        "The methodologies employed in the research presented at CVPR include large-scale visual recognition, image classification, transfer learning, object detection, instance segmentation, residual learning, generative modeling, real-time object detection, region-based CNNs, feature extraction, super-resolution, dense reconstruction, neural implicit surface representation, structure from motion (SfM), keyframe selection, multi-label classification, sub-event learning, super-event representation, segmentation as machine translation (SAM), and temporal modeling. "
        "The algorithms and techniques used in the research include CNN architecture, fine-tuning, dropout, region proposal networks (RPN), mask branch, ROIAlign, residual connections, identity mapping, backpropagation, generative adversarial networks (GANs), discriminator, generator, minimax game, bounding box prediction, non-maximum suppression, fast R-CNN, SIFT features, keypoint detection, descriptor matching, truncated signed distance function (TSDF), NeuS2, 2D Gaussian splatting, PixSfM, XMem++, ICP (iterative closest point) registration, SuperPoint, SuperGlue, COLMAP, temporal convolutional kernels, two-stream CNNs, LSTMs, and temporal pooling. "
        "Datasets commonly used in CVPR research include ImageNet, ImageNet Large Scale Visual Recognition Challenge (ILSVRC), MS COCO, Pascal VOC, ADE20K, CIFAR-10, CIFAR-100, MNIST, CelebA, Set5, Set14, BSD100, Oxford, HPatches, Caltech-101, MetaFood3D, and MLB-YouTube Dataset. "
        "Important keywords in the research include image classification, object recognition, data augmentation, deep learning, transfer learning, convolutional layers, feature extraction, instance segmentation, semantic segmentation, multi-task learning, mask generation, real-time detection, object localization, bounding box regression, class prediction, image super-resolution, CNN-based enhancement, high-resolution image reconstruction, pixel-level prediction, feature matching, local features, scale-invariance, robust matching, image stitching, interest points, 3D food reconstruction, dietary analysis, image-based dietary assessment, portion size estimation, real-world size calibration, checkerboard patterns, optical flow, pitch speed prediction, pitch type classification, continuous video activity detection, game statistics, and video understanding."
    ),
    "EMNLP": (
        "The EMNLP conference focuses on advancing the state of natural language processing (NLP), with a strong emphasis on innovative methodologies, key techniques, datasets, and diverse applications. "
        "Methodologies prominently explored include transformer-based architectures, sequence-to-sequence modeling, transfer learning (TL), multi-task learning (MTL), crowd-sourcing forecasting, argumentation mining, neural language models, attention mechanisms, generative modeling, denoising probabilistic methods, and optimization frameworks. "
        "Research also highlights reinforcement learning (RL), pre-training and fine-tuning paradigms, multi-modal integration, and computational linguistics approaches. "
        "Core key techniques and algorithms include scaled dot-product attention, multi-head attention, TLE (Transferring Embedding Layer), TLH (Transferring Hidden Layer), TLEH (Transferring Embedding and Hidden Layers), MTLE (Multi-Task Learning with Shared Embedding Layers), and BERT-based pre-trained models. "
        "Advances in neural architectures such as Long Short-Term Memory (LSTM) networks, feed-forward networks, RNN encoder-decoder frameworks, and dense representation learning using word2vec, GloVe, and XLNet also play a pivotal role. "
        "Techniques like Wasserstein bounds, KL divergence minimization, argumentation mining, text classification, and bold extrapolation for optimization are emphasized. "
        "Mathematical innovations include ν-weak monotonicity, Lipschitz constants, and probabilistic loss functions like negative log-likelihood. "
        "Research at EMNLP spans a diverse set of datasets, such as WMT translation benchmarks, BooksCorpus, Wikipedia, PCEDT, NomBank, the Good Judgment Open dataset, and domain-specific corpora like Amazon-book and Yelp2018. "
        "Additional datasets include GAN training sets, RL benchmarks, Freebase, and graph-based datasets like Cora and Citeseer. "
        "These are used to address challenges in machine translation, semantic role labeling, text analysis, and predictive modeling. "
        "Key keywords featured across papers include natural language understanding (NLU), collective intelligence, superforecasters, skewed datasets, justification quality, popularity bias, temporal dependencies, textual reasoning, argument mining, noun-noun compounds, interpretability, regularization, and generalization. "
        "Other focus areas include fairness in AI, uncertainty modeling, explainable AI (via textual justification), time-series analysis, and computational efficiency. "
        "The primary topics in the AI field addressed at EMNLP encompass natural language processing (NLP), machine learning (ML), deep learning, transfer learning, multi-task learning, optimization, explainable AI, predictive modeling, text generation, computational linguistics, and robust model evaluation. "
        "The conference represents a hub for interdisciplinary breakthroughs that combine statistical approaches with advanced neural models to tackle real-world NLP challenges and achieve state-of-the-art results in language understanding and processing."
    ),
    "KDD": (
        "The conference focuses on cutting-edge advancements in methodologies, techniques, and applications across diverse domains of data mining, machine learning, deep learning, and artificial intelligence. "
        "The research presented integrates methodologies such as density-based clustering, semi-supervised clustering, contrastive learning, deep learning frameworks, multi-task learning, gradient boosting, sentiment analysis, graph mining, temporal graph analysis, personalized medicine, and recommender systems. "
        "Techniques and algorithms featured include DBSCAN, XGBoost, multihead self-attention, dual convolutional layers, Viterbi algorithm, recommendation loss (LREC), supervised alignment loss (LSA), re-weighting contrast loss (LCL), InfoNCE, Multi-Layer Perceptrons (MLPs), recurrent neural networks (RNNs), decision trees, Gaussian mixture models (GMMs), and frequent itemset mining. "
        "Datasets utilized span multiple domains, including Amazon-book, Yelp2018, Gowalla, retail transactional data, customer reviews, real-world free-living datasets from smart home environments, social network data from platforms like Facebook and Twitter, and temporal graph datasets. "
        "Research topics encompass diverse fields such as machine learning, deep learning, supervised and unsupervised learning, representation learning, recommender systems, graph mining, social network analysis, personalized medicine, predictive modeling, fairness in AI, and bias mitigation. "
        "Keywords include popularity bias, fairness, diversity, item representations, L2 regularization, Wi-Fi signal strength (RSSI), accelerometer data, gait patterns, room localization, medication state prediction, sentiment analysis, clustering, temporal modeling, gradient boosting, graph densification, shrinking diameters, graph evolution, network dynamics, association rule mining, and pattern summarization. "
        "This conference continues to serve as a premier platform for groundbreaking advancements in AI, focusing on methodologies, real-world applications, and ethical challenges in the evolving field of data mining and artificial intelligence."
    ),
    "TMLR": (
        "The TMLR conference emphasizes cutting-edge research in machine learning and artificial intelligence, featuring diverse methodologies, innovative techniques, and theoretical advances. "
        "Key methodologies explored include extragradient methods, min-max optimization, generative adversarial networks (GANs), reinforcement learning (RL), hierarchical modeling, score-based generative models, denoising diffusion probabilistic models (DDPMs), variational autoencoders (VAEs), neural-symbolic integration, self-supervised learning, sparse learning, Bayesian optimization, and robust multi-agent reinforcement learning. "
        "Researchers also delve into federated learning, multi-task learning, contrastive learning, and disentangled representations for multi-modal domains. "
        "Significant key techniques and algorithms presented at TMLR include extragradient (EG), optimistic gradient descent ascent (OGDA), adaptive step-size EG+, discrete-time DDPM, forward and backward processes for denoising, reparameterization tricks, minimizing reconstruction loss, Wasserstein bounds, harmonic representations, symbolic logic integration, TransE, pruning strategies, policy gradient methods, contrastive augmentations, Bayesian hyperparameter tuning, and adversarial training. "
        "Mathematical innovations include saddle point analysis, ν-weak monotonicity, Lipschitz constants, score functions, KL divergence, and convergence analysis frameworks. "
        "The conference features research leveraging diverse datasets, including GAN training datasets, RL benchmarks, Amazon-book, Yelp2018, Gowalla, Freebase, WordNet, Cora, Citeseer, FEMNIST, ImageNet, CIFAR-10, MNIST, PubMed, Open Graph Benchmark (OGB), and Visual Genome, while also emphasizing dataset-agnostic techniques. "
        "These datasets span domains such as graph-based learning, recommender systems, healthcare, image generation, and multi-modal tasks. "
        "Prominent keywords include saddle points, weak Minty solutions, bold extrapolation, cautious updates, von Neumann’s ratio game, forsaken solutions, latent disentanglement, symbolic reasoning, federated heterogeneity, popularity bias, and adversarial robustness. "
        "Research also addresses interpretability, privacy preservation, fairness in AI, sparse representations, geometry-adaptive modeling, time-series analysis, and efficiency optimization. "
        "The TMLR conference spans a wide array of topics in AI, including machine learning, deep learning, optimization, game theory, probabilistic modeling, graph neural networks, federated learning, reinforcement learning, generative modeling, representation learning, symbolic AI, fairness and bias mitigation, resource-efficient AI, and theoretical machine learning. "
        "This diversity highlights TMLR's commitment to advancing AI research across both theoretical and applied dimensions, fostering breakthroughs in scalable, interpretable, and robust AI systems."
    ),
}

# Upsert data into Pinecone.
# Build one hybrid record (dense + sparse + metadata) per conference and send
# them in a single batched request instead of one network call per profile.
conference_vectors = [
    {
        "id": name,
        # Dense embedding of the profile text.
        "values": dense_encoder.encode(profile).tolist(),
        # Sparse encoding; the profile is passed as a single-document list.
        "sparse_values": bm25_encoder.encode_documents([profile])[0],
        "metadata": {"name": name, "profile": profile},
    }
    for name, profile in conference_profiles.items()
]

index.upsert(vectors=conference_vectors)

In [None]:
def get_top_matched_conference(pdf_path):
    """Pick the best-matching conference for a paper and justify the choice.

    The paper is summarised, the summary is used as a hybrid (dense + sparse)
    query against the Pinecone index, and the model is asked to explain why
    the highest-scoring conference fits.

    Relies on the module-level ``llm``, ``bm25_encoder``, ``dense_encoder``,
    ``index`` and ``conference_profiles`` objects.

    Args:
        pdf_path: Path of the PDF to analyse.

    Returns:
        ``(top_conference_name, justification)``.

    Raises:
        ValueError: If no body text can be extracted from the PDF, or if the
            index query returns no matches.
    """
    # Step 1: Extract text from PDF excluding tables
    filtered_text = extract_text_excluding_tables(pdf_path)

    # Step 2: Extract title, abstract, and remaining text (only the body is used)
    _title, _abstract, remaining_text = extract_title_abstract_text(filtered_text)

    # Check if text was properly extracted
    if not remaining_text:
        raise ValueError("Unable to extract meaningful text from the PDF.")

    # Step 3: Split the remaining text into documents
    final_documents = prepare_final_documents(remaining_text)

    # Step 4: Get the summary of the research paper
    summary_text = get_research_paper_summary(llm, final_documents)

    # Step 5: Perform hybrid search for the top conference match.
    # The stored profiles were encoded with encode_documents; the query side
    # must use encode_queries so the sparse score is computed as intended.
    sparse_query = bm25_encoder.encode_queries(summary_text)
    dense_query = dense_encoder.encode(summary_text).tolist()

    results = index.query(
        vector=dense_query,
        sparse_vector=sparse_query,
        top_k=3,
        include_metadata=True
    )

    if not results['matches']:
        raise ValueError("No matching conferences found for the paper.")

    top_result = max(results['matches'], key=lambda match: match['score'])
    top_conference_name = top_result['metadata']['name']

    # Step 6: Generate justification for the top match
    conference_profile = conference_profiles[top_conference_name]

    template = """
    Given the summary of a paper and the profile of a conference, explain why this paper is a good fit for the conference within 100 words(follow this strictly). 
    Do not include any numerical scores, just focus on the content alignment.

    Paper Summary: {paper_summary}

    Conference Profile: {conference_profile}

    Justification:
    """
    prompt = PromptTemplate(input_variables=["paper_summary", "conference_profile"], template=template)
    chain = LLMChain(llm=llm, prompt=prompt)
    justification = chain.run({"paper_summary": summary_text, "conference_profile": conference_profile})

    return top_conference_name, justification

In [None]:
import pandas as pd
import os

def process_publishable_papers(csv_path, pdf_dir, output_csv_path, llm):
    """Assign a conference and rationale to every publishable paper in a CSV.

    Args:
        csv_path: Input CSV containing 'Paper ID' and 'Publishable' columns.
        pdf_dir: Directory holding one ``<Paper ID>.pdf`` file per paper.
        output_csv_path: Where the CSV with the added 'Conference' and
            'Rationale' columns is written.
        llm: Kept for interface compatibility. It is not used here, because
            ``get_top_matched_conference`` is called with the PDF path only.

    Raises:
        ValueError: If a required column is missing from the input CSV.

    Rows whose 'Publishable' value is not 1 are left with empty 'Conference'
    and 'Rationale' cells. A failure on one paper is printed and the loop
    continues with the next paper.
    """
    # Read the CSV file
    df = pd.read_csv(csv_path)

    # Ensure required columns are present
    if 'Paper ID' not in df.columns or 'Publishable' not in df.columns:
        raise ValueError("The input CSV must contain 'Paper ID' and 'Publishable' columns.")

    # Add new columns for conference and rationale
    df['Conference'] = ""
    df['Rationale'] = ""

    for row_idx, row in df.iterrows():
        paper_id = row['Paper ID']

        # Process only rows explicitly marked publishable. Any other value
        # (0, NaN, unexpected labels) is skipped, so a path left over from a
        # previous iteration can never be reused for the wrong row.
        if row['Publishable'] != 1:
            continue

        pdf_path = os.path.join(pdf_dir, f"{paper_id}.pdf")

        try:
            # The matcher extracts and summarises the paper itself, so no
            # separate summary/keyword pass is run here (its results were
            # never used and it doubled the number of model calls).
            top_conference_name, justification = get_top_matched_conference(pdf_path)

            # Update the DataFrame
            df.at[row_idx, 'Conference'] = top_conference_name
            df.at[row_idx, 'Rationale'] = justification
        except Exception as e:
            # Best effort: report the failure and move on to the next paper.
            print(f"Error processing Paper ID {paper_id}: {e}")

    # Save the updated DataFrame to a new CSV file
    df.to_csv(output_csv_path, index=False)
    print(f"Updated CSV saved to {output_csv_path}")

# Example usage
# The defaults are the original local paths; set the environment variables
# below to run the notebook on another machine without editing this cell.
csv_path = os.getenv("PAPERS_CSV_PATH", r"C:\Users\OMEN\Documents\GENAI\task 1 results.csv")  # Path to the input CSV file
pdf_dir = os.getenv("PAPERS_PDF_DIR", r"C:\Users\OMEN\Documents\GENAI\Papers-20250113T145151Z-001\Papers")  # Directory containing the PDF files
output_csv_path = os.getenv("PAPERS_OUTPUT_CSV", "task 2 results.csv")  # Path to save the updated CSV

process_publishable_papers(csv_path, pdf_dir, output_csv_path, llm)
