In [None]:
!pip install python-dotenv langchain pypdf transformers pymupdf requests beautifulsoup4 PyPDF2

In [62]:
import os
import re
import statistics
import time

import groq
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from groq import Groq
from langchain.chains import LLMChain
from langchain.document_loaders import PyPDFLoader
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter

load_dotenv()

In [64]:
# Set up the shared Groq client used by the evaluation helpers below.
# An unset *or empty* key is rejected here, so a misconfigured environment is
# reported up front instead of surfacing later on the first API call.
api_key = os.getenv("GROQ_API_KEY")
if not api_key:
    raise ValueError("GROQ_API_KEY environment variable is not set")
client = Groq(api_key=api_key)

In [59]:
from PyPDF2 import PdfReader

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path``, concatenated.

    Pages are read in document order with ``PdfReader`` and their extracted
    text is joined without any separator.
    """
    pdf = PdfReader(pdf_path)
    return "".join(page.extract_text() for page in pdf.pages)
# text = extract_text_from_pdf(pdf_path)

In [6]:
def extract_title_from_text(text):
    """Return whatever precedes the first "Abstract" marker, trimmed.

    If the marker does not occur, the whole (trimmed) text is returned.
    """
    before_abstract, _, _ = text.partition("Abstract")
    return before_abstract.strip()
# title = extract_title_from_text(text)

In [7]:
def process_pdf_and_extract_headings(file_path):
    """Extract a cleaned list of section headings from a PDF with the Groq LLM.

    The PDF is loaded with ``PyPDFLoader`` and split into chunks of at most
    1000 characters. Each chunk is sent to the model, which is asked to list the
    headings it contains. The raw answers are joined and passed through a
    second clean-up prompt, and the model's bracketed, comma-separated reply
    is parsed into a Python list.

    Args:
        file_path: Path of the PDF file to analyse.

    Returns:
        list[str]: The cleaned headings. All spaces are removed while parsing
        (e.g. ``"LiteratureReview"``); ``split_by_two_capitals`` is the
        companion step that re-inserts them.
    """
    # Step 1: Load PDF
    loader = PyPDFLoader(file_path)
    documents = loader.load()

    # Step 2: Split Text into Chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    chunks = text_splitter.split_documents(documents)

    # Step 3: Extract and clean Headings using Custom API
    def clean_headings(raw_headings):
        """Ask the model to de-duplicate and tidy the raw heading list."""
        # Backslashes are doubled so the prompt shows a literal "\" and "\n";
        # unescaped, "\)" is an invalid escape sequence and "\n" would put a
        # real line break in the middle of the instruction.
        prompt_template = """
        Given the following list of extracted headings, clean and rewrite them to ensure clarity and correctness:
         - Remove unwanted characters such as square brackets ([ ]), backslashes (\\), newlines (\\n), and extra spaces (but retain spaces between words).
         - Split any combined headings into separate headings if they represent distinct topics.
         - Eliminate duplicate or repeated headings.
         - Only return clear and valid headings, and ensure they are concise.
         - remove general heading , or basic heading which does not make any sense in the research paper
          Input Headings: {raw_headings}
          Return Format:
          ["Heading 1", "Heading 2", "Heading 3", ...]
          Output should strictly be a clean list of valid headings without any extra formatting, body text, or explanations.
        """
        prompt = PromptTemplate(input_variables=["raw_headings"], template=prompt_template)
        chat_completion = client.chat.completions.create(
            messages=[{"role": "user", "content": prompt.format(raw_headings=raw_headings)}],
            model="gemma2-9b-it",
        )
        return chat_completion.choices[0].message.content.strip()

    def extract_headings(chunks):
        """Collect the model's heading list for each chunk, then clean the union."""
        prompt_template = """
        Extract only the broad headings and subheadings from the following text that are typically found in research papers. 
        The headings should be large and general , not detailed.
        Only return the headings, do not include body text or explanations. 
        Text: {text}
        Conditions:
        1. If none of these headings or similar ones are found, return a blank space (" ").
        2. important to note that return a heading of maximum 3 words 
        3. Two consecutive headings must be at least a paragraph apart.
        4. Return the headings in the format "heading" only, without any extra characters or formatting.  
           If no headings are present, return a blank space (" ").
        Common headings to look for: 
         Abstract,Introduction, Methodology, Results, Discussion, Conclusion, Literature Review, References, 
        Acknowledgments, Background, Scope, Objectives, Problem Statement, Hypothesis, Significance, 
        Data Collection, Analysis, Findings, Future Work, Recommendations, Limitations, Theoretical Framework, 
        Ethical Considerations, Appendix, Table of Contents, List of Figures, Index, Contributions
        """
        prompt = PromptTemplate(input_variables=["text"], template=prompt_template)
        raw_headings = []
        for chunk in chunks:
            chat_completion = client.chat.completions.create(
                messages=[{"role": "user", "content": prompt.format(text=chunk.page_content)}],
                model="gemma2-9b-it",
            )
            raw_headings.append(chat_completion.choices[0].message.content.strip())
        return clean_headings("\n".join(raw_headings))

    cleaned_headings = extract_headings(chunks)

    # Step 4: Format the cleaned headings into a list format
    def format_headings_to_list(cleaned_headings):
        """Parse the '["A", "B"]'-style reply into a list of non-empty strings."""
        # Quotes and every space are dropped before splitting on commas.
        cleaned_headings = cleaned_headings.strip("[]").replace('"', '').replace(" ", "").split(",")
        return [heading.strip() for heading in cleaned_headings if heading]

    return format_headings_to_list(cleaned_headings)
# formatted_headings_list = process_pdf_and_extract_headings(pdf_path)
# print("Formatted Headings List:", formatted_headings_list)

In [51]:
import re
def split_by_two_capitals(word_list):
    """Insert a space at every lowercase-to-uppercase boundary of each word.

    For example ``"LiteratureReview"`` becomes ``"Literature Review"``.

    Args:
        word_list: Iterable of strings to process.

    Returns:
        list[str]: A new list with the spaced-out words, in the input order.
        The list is returned rather than only printed, because the callers
        assign the result and pass it on to the next step.
    """
    return [re.sub(r'([a-z])([A-Z])', r'\1 \2', word) for word in word_list]
# formatted_headings_list=split_by_two_capitals(formatted_headings_list)

In [10]:
def pair_headings_with_text(headings, text):
    """Split ``text`` at the given headings and pair each heading with its body.

    Args:
        headings: Heading strings to look for (matched literally).
        text: Full document text.

    Returns:
        list[tuple[str, str]]: ``(heading, paragraph)`` pairs in document order.
        A pair is kept only if its paragraph has at least 100 characters. Once a
        heading has produced a kept pair, later occurrences of the same heading
        are treated as ordinary body text.
    """
    # Drop blank headings and duplicates: an empty alternative would make the
    # pattern match the empty string and split the text at every character.
    candidates = list(dict.fromkeys(h for h in headings if h))
    if not candidates:
        return []

    # Try longer headings first so that e.g. "Results and Discussion" is not
    # pre-empted by its prefix "Results" in the alternation.
    candidates.sort(key=len, reverse=True)
    heading_set = set(candidates)
    pattern = '|'.join(re.escape(h) for h in candidates)
    matches = re.split(f'({pattern})', text)

    pairs = []
    used_headings = set()
    current_heading = None
    current_paragraph = []

    def commit_current():
        """Store the section collected so far if it is long enough."""
        if not current_heading:
            return
        paragraph = ' '.join(current_paragraph)
        if len(paragraph) >= 100:  # Exclude paragraphs with fewer than 100 characters
            pairs.append((current_heading, paragraph))
            used_headings.add(current_heading)

    for part in matches:
        part = part.strip()

        if part in heading_set and part not in used_headings:
            commit_current()
            current_heading = part
            current_paragraph = []
        elif part:
            current_paragraph.append(part)

    commit_current()
    return pairs
# paired_text = pair_headings_with_text(formatted_headings_list, text)

In [11]:
def rewrite_math_expressions_in_paired_text(paired_text):
    """Return new ``(heading, paragraph)`` pairs with math spelled out in words.

    Each paragraph is sent to the model with a request to rewrite mathematical
    expressions in plain language; headings are passed through unchanged.
    """
    def _verbalise(paragraph):
        # One model call per paragraph; the stripped reply replaces the paragraph.
        prompt = f"""
        text: {paragraph}
        The text contains mathematical terms and expressions.   
        For example:
        Input: p1 = (a^3)*5 + 8
        Output: p1 is equal to a raised to the power of 3, multiplied by 5, plus 8.
        Please rewrite the text, converting any mathematical expressions into plain language. If there are no mathematical expressions, keep the text unchanged.
        Output should be clean and should not include body text or explanations.
        """
        reply = client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model="gemma2-9b-it",
        )
        return reply.choices[0].message.content.strip()

    return [(heading, _verbalise(paragraph)) for heading, paragraph in paired_text]
# paired_text = rewrite_math_expressions_in_paired_text(paired_text)

In [12]:
def find_paragraph_for_heading(paired_text, target_heading):
    """Return the paragraph of the first pair whose heading equals
    ``target_heading`` ignoring case, or ``None`` when no heading matches."""
    return next(
        (
            paragraph
            for heading, paragraph in paired_text
            if heading.lower() == target_heading.lower()
        ),
        None,
    )
# abstract = find_paragraph_for_heading(paired_text, "Abstract")
# introduction = find_paragraph_for_heading(paired_text, "Introduction")

In [13]:
import re

def evaluate_title_against_abstract(title, abstract):
    """Ask the model how well the title reflects the abstract.

    Args:
        title: Paper title.
        abstract: Abstract text.

    Returns:
        int: The first integer found in the model's reply (the prompt asks for
        a score between 0 and 100).

    Raises:
        ValueError: If the reply contains no integer at all.
    """
    prompt = f"""
    Title: {title}
    Abstract: {abstract}
    Evaluate if the Title accurately reflects the main claims and content of the Abstract. Check for alignment in terms of key topics, scope, and focus.
    Provide a score between 0 and 100, where 100 indicates perfect alignment.
    Return only the score as an integer.
    """

    response = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model="gemma2-9b-it",
    )

    reply = response.choices[0].message.content.strip()
    # The reply may wrap the number in extra text (e.g. "Score: 85"), which a
    # bare int() would reject; take the first run of digits instead.
    match = re.search(r"\d+", reply)
    if match is None:
        raise ValueError(f"No integer score found in model reply: {reply!r}")
    return int(match.group())
# title_alignment_score = evaluate_title_against_abstract(title, abstract)

# title_alignment_score


In [43]:
def evaluate_paragraph_flow(paired_text):
    """Score the logical flow between each pair of consecutive paragraphs.

    Args:
        paired_text: List of ``(heading, paragraph)`` pairs in document order.

    Returns:
        The mean of the per-transition scores (the prompt asks for 0-100), or
        ``None`` when there is no transition to rate or no reply yielded a
        score.
    """
    paragraph_scores = []
    for i in range(len(paired_text) - 1):
        prompt = f"""
        Paragraph 1: {paired_text[i][1]}
        Paragraph 2: {paired_text[i+1][1]}
        Evaluate the logical flow between these two paragraphs. Assess if the explanation is clear, and the overall paper flow is maintained. Give a score between 0 and 100, where 100 represents perfect flow and 0 represents no logical connection.
        Return only the score as an integer.
        """

        response = client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model="gemma2-9b-it",
        )
        reply = response.choices[0].message.content.strip()
        # Take the first integer in the reply; a reply without any number is
        # skipped so that one malformed answer does not abort the evaluation.
        match = re.search(r"\d+", reply)
        if match is not None:
            paragraph_scores.append(int(match.group()))
    if not paragraph_scores:
        return None
    return statistics.mean(paragraph_scores)
# scores = evaluate_paragraph_flow(paired_text)
# print(scores)

In [15]:
import re
def evaluate_claims_against_results(paired_text):
    """Score how well the results validate the abstract/introduction claims.

    Args:
        paired_text: List of ``(heading, paragraph)`` pairs.

    Returns:
        int | None: The first integer in the model's reply (the prompt asks for
        0-100). ``None`` is returned when there is no "results" section, when
        neither "abstract" nor "introduction" is present, or when the model
        answers "None" / gives no number.
    """
    sections = {heading.lower(): paragraph for heading, paragraph in paired_text}
    if 'results' not in sections:
        return None
    claim_parts = [sections[key] for key in ('abstract', 'introduction') if key in sections]
    if not claim_parts:
        return None
    combined_text = "\n\n".join(claim_parts)
    prompt = f"""
    abstract/introduction: {combined_text}
    Results: {sections['results']}
    Only use this , if the Results part actually contain results of the research paper , otherwise if it contain other thing , return "None"
    Evaluate the claims made in the abstract/introduction against the results. Check if the claims are clearly validated by the results.
    Provide a score between 0 and 100, where 100 indicates perfect alignment.
    Return only the score as an integer.
    """
    response = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model="gemma2-9b-it",
    )
    reply = response.choices[0].message.content.strip()
    # Compare the reply *text* with "none". The previous test
    # `response == "None" or "none"` was always truthy, so no score was ever
    # returned.
    if reply.strip("\"'. ").lower() == "none":
        return None
    match = re.search(r"\d+", reply)
    return int(match.group()) if match else None
# consistency_score = evaluate_claims_against_results(paired_text)
# consistency_score

In [16]:
def evaluate_conclusion_clarity(paired_text):
    """Score the clarity and accuracy of the paper's conclusion section.

    Args:
        paired_text: List of ``(heading, paragraph)`` pairs.

    Returns:
        int | None: The first integer in the model's reply (the prompt asks for
        0-100). ``None`` is returned when there is no "conclusion" section or
        when the model answers "None" / gives no number.
    """
    sections = {heading.lower(): paragraph for heading, paragraph in paired_text}
    if 'conclusion' not in sections:
        return None
    prompt = f"""
    Conclusion: {sections['conclusion']}
    Only use this , if the conclusion part actually contain conclusion for the research paper , otherwise if it contain other thing , return "None"
    Evaluate the clarity and accuracy of the conclusion. Check if the conclusion is concise, up to the point, and does not attempt to obscure or hide the outcome of the paper.
    Provide a score between 0 and 100, where 100 indicates perfect clarity and transparency.
    Return only the score as an integer 
    """

    response = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model="gemma2-9b-it",
    )
    reply = response.choices[0].message.content.strip()
    # Check for the "None" answer before parsing, and compare the reply text:
    # `response == "None" or "none"` was always truthy and int() ran first, so
    # the function could only raise or return None.
    if reply.strip("\"'. ").lower() == "none":
        return None
    match = re.search(r"\d+", reply)
    return int(match.group()) if match else None
# conclusion_score = evaluate_conclusion_clarity(paired_text)
# print(conclusion_score)

In [36]:
def evaluate_paragraph_grammar(paired_text):
    """Return the mean fluency score over all paragraphs, or ``None`` if there
    are no paragraphs.

    Each paragraph is rated by the model; a reply that is not a plain integer
    counts as a score of 0.
    """
    fluency_scores = []

    for pair in paired_text:
        text = pair[1]

        # Ask for a single 0-100 rating covering grammar, coherence and clarity.
        prompt = f"""
        Paragraph: {text}
        Evaluate the fluency of the English, which includes grammar, coherence, and clarity.
        The fluency should be rated between 0 and 100, where 100 represents perfect fluency and 0 represents no coherence or clarity.
        Return only the score as an integer and should not include body text or explanations.
        """
        completion = client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model="gemma2-9b-it",
        )
        reply = completion.choices[0].message.content.strip()
        try:
            fluency_scores.append(int(reply))
        except ValueError:
            fluency_scores.append(0)

    if not fluency_scores:
        return None
    return statistics.mean(fluency_scores)
# scores_grammar = evaluate_paragraph_grammar(paired_text)
# print(scores_grammar)

In [18]:
def evaluate_paragraph_spelling(paired_text):
    """Return the mean number of spelling mistakes per paragraph.

    Args:
        paired_text: List of ``(heading, paragraph)`` pairs.

    Returns:
        The mean of the per-paragraph mistake counts reported by the model, or
        ``None`` when ``paired_text`` is empty (``statistics.mean`` would
        otherwise raise on an empty list). A reply that is not a plain integer
        counts as 0 mistakes.
    """
    paragraph_scores = []
    for i in range(len(paired_text)):
        text = paired_text[i][1]
        prompt = f"""
        Paragraph: {text}
        Count the spelling mistakes in the text (ignore mathematical expressions like 'x^2 + y = 10').
        Return only the score as an integer and should not include body text or explanations.
        Do not consider mathematical part during testing .
        """
        response = client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model="gemma2-9b-it",
        )
        result = response.choices[0].message.content.strip()
        try:
            num_spelling_mistakes = int(result)
        except ValueError:
            num_spelling_mistakes = 0  # Default in case of parsing error

        paragraph_scores.append(num_spelling_mistakes)

    # Mirror evaluate_paragraph_grammar: no paragraphs means no score.
    if not paragraph_scores:
        return None
    return statistics.mean(paragraph_scores)
# scores_spelling = evaluate_paragraph_spelling(paired_text)
# print(scores_spelling)

In [19]:
def search_google(query, domain_filter=None):
    """Run a Google search through the ScrapingDog API.

    NOTE: a function with the same name is defined again in a later cell of
    this notebook; once that cell has run, the later definition is the one in
    effect.

    Args:
        query: Search terms.
        domain_filter: Optional extra text appended to the query (the callers
            pass e.g. ``"site:arxiv.org"``).

    Returns:
        The decoded JSON response, or ``None`` if the HTTP request fails.

    Raises:
        ValueError: If the ``SCRAPINGDOG_API_KEY`` environment variable is not
            set. The key is read from the environment, like ``GROQ_API_KEY``,
            instead of being hardcoded in the notebook.
    """
    api_key = os.getenv("SCRAPINGDOG_API_KEY")
    if not api_key:
        raise ValueError("SCRAPINGDOG_API_KEY environment variable is not set")
    url = "https://api.scrapingdog.com/google"
    if domain_filter:
        query += f" {domain_filter}"
    params = {
        "api_key": api_key,
        "query": query,
        "results": 10,
        "country": "us",
        "page": 0,
        "advance_search": "false"
    }
    try:
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException:
        return None

def get_arxiv_links_from_search(query, max_results=20, sort_by_date=False):
    """Search Google (restricted to arxiv.org) and return unique abstract links.

    ``/pdf/`` and ``/html/`` links are rewritten to their ``/abs/`` form.
    Duplicates are removed keeping first-seen order; with ``sort_by_date`` the
    links are ordered by the first four characters after ``/abs/``.
    """
    search_results = search_google(query, domain_filter="site:arxiv.org")

    if search_results and "organic_results" in search_results:
        organic = search_results["organic_results"][:max_results]
    else:
        organic = []

    seen = {}
    for entry in organic:
        url = entry.get("link", "")
        if "arxiv.org" not in url:
            continue
        if "/pdf/" in url:
            url = url.replace("/pdf/", "/abs/").split(".pdf")[0]
        if "/html/" in url:
            url = url.replace("/html/", "/abs/").split(".html")[0]
        # A dict keeps insertion order, so this de-duplicates without reordering.
        seen.setdefault(url, None)

    links = list(seen)

    if sort_by_date:
        try:
            links.sort(key=lambda url: url.split('/abs/')[-1][:4])
        except Exception:
            pass
    return links

def get_best_abstract_date(paper_title, original_abstract, max_retries=3, retry_delay=2, sort_by_date=False):
    """Look up the paper on arXiv and return its publication date as text.

    The title is searched via ``get_arxiv_links_from_search``. Each hit is
    scraped and the model is asked to return the publication date if the page's
    abstract matches ``original_abstract``.

    Args:
        paper_title: Title used as the search query.
        original_abstract: Abstract to compare the scraped pages against.
        max_retries: Number of retries after a failed page download.
        retry_delay: Seconds to wait between retries.
        sort_by_date: Accepted for compatibility; not used by this function.

    Returns:
        str: The first date reported by the model that differs from the
        fallback, otherwise the fallback date ``"January 14, 2025"``. The
        format of a model-reported date is not enforced here.
    """
    # Single fallback value used both in the prompt and in the comparisons
    # below. The prompt used to ask for 'January 2025', which never equalled
    # the sentinel and was therefore returned as if it were a real date.
    fallback_date = "January 14, 2025"
    client = groq.Client(api_key=os.getenv("GROQ_API_KEY"))

    def scrape_website(url):
        """Download ``url`` and return its visible text, or "" after all retries fail."""
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
        # One initial attempt plus ``max_retries`` retries.
        for attempt in range(max_retries + 1):
            try:
                response = requests.get(url, headers=headers, timeout=30)
                response.raise_for_status()
                soup = BeautifulSoup(response.text, 'html.parser')
                return soup.get_text()
            except requests.exceptions.RequestException:
                if attempt < max_retries:
                    time.sleep(retry_delay)
        return ""

    def extract_info_from_text(original_abstract, scraped_text):
        """Ask the model for the publication date of a matching page."""
        if not scraped_text.strip():
            return fallback_date

        prompt = f"""
        Original Abstract: {original_abstract}
        Scraped Text from website: {scraped_text}

        1. Compare the scraped text with the original abstract.
        2. If the abstract in the scraped text is similar to the original abstract, extract the publication date.
        3. If no abstract is found or the scraped text does not match the original, return '{fallback_date}'.
        4. If the abstract matches and a valid publication date is found, return the date only.

        Please return only the publication date.
        """
        response = client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model="gemma2-9b-it"
        )

        if response and response.choices:
            # Drop surrounding quotes or a trailing period so a quoted fallback
            # is still recognised as the fallback.
            return response.choices[0].message.content.strip().strip("'\".").strip()
        return fallback_date

    arxiv_links = get_arxiv_links_from_search(paper_title)

    for link in arxiv_links:
        scraped_text = scrape_website(link)
        extracted_date = extract_info_from_text(original_abstract, scraped_text)
        if extracted_date and extracted_date != fallback_date:
            return extracted_date
    return fallback_date

In [48]:
import os
import requests
import arxiv
from datetime import datetime
import time
import statistics
import groq

# Function to search for Google and filter for arXiv links
def search_google(query, domain_filter=None):
    """Run a Google search through the ScrapingDog API.

    Args:
        query: Search terms.
        domain_filter: Optional extra text appended to the query (the callers
            pass e.g. ``"site:arxiv.org"``).

    Returns:
        The decoded JSON response, or ``None`` if the HTTP request fails.

    Raises:
        ValueError: If the ``SCRAPINGDOG_API_KEY`` environment variable is not
            set. The key is read from the environment, like ``GROQ_API_KEY``,
            instead of being hardcoded in the notebook.
    """
    api_key = os.getenv("SCRAPINGDOG_API_KEY")
    if not api_key:
        raise ValueError("SCRAPINGDOG_API_KEY environment variable is not set")
    url = "https://api.scrapingdog.com/google"
    if domain_filter:
        query += f" {domain_filter}"
    params = {
        "api_key": api_key,
        "query": query,
        "results": 10,
        "country": "us",
        "page": 0,
        "advance_search": "false"
    }
    try:
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException:
        return None

# Function to get arXiv links from search results
def get_arxiv_links_from_search(query, max_results=20, sort_by_date=False):
    """Search Google (restricted to arxiv.org) and return unique abstract links.

    Args:
        query: Search terms.
        max_results: Maximum number of organic results to inspect.
        sort_by_date: When true, order the links by the first four characters
            after ``/abs/``. Accepted here so this definition stays
            call-compatible with the earlier definition of the same name that
            it replaces.

    Returns:
        list[str]: Unique links in first-seen order (deterministic, unlike a
        ``set`` round-trip), with ``/pdf/`` and ``/html/`` links rewritten to
        their ``/abs/`` form.
    """
    search_results = search_google(query, domain_filter="site:arxiv.org")
    links = []
    if search_results and "organic_results" in search_results:
        for result in search_results["organic_results"][:max_results]:
            link = result.get("link", "")
            if "arxiv.org" in link:
                if "/pdf/" in link:
                    link = link.replace("/pdf/", "/abs/").split(".pdf")[0]  # Convert PDF to abs link
                if "/html/" in link:
                    link = link.replace("/html/", "/abs/").split(".html")[0]  # Convert HTML to abs link
                links.append(link)

    # dict.fromkeys removes duplicates while keeping the search ranking order.
    links = list(dict.fromkeys(links))
    if sort_by_date:
        links.sort(key=lambda link: link.split('/abs/')[-1][:4])
    return links

# Function to filter arXiv links by publication date
def filter_arxiv_links_with_date_difference(links, date_cutoff, min_age_days=250):
    """Keep only arXiv links whose paper is sufficiently older than a cutoff.

    The paper's date is derived from the identifier at the end of the link: the
    part before the first "." is read as ``YYMM`` (first two characters are the
    year after 2000, the rest is the month) and the day is taken as the 1st.

    Args:
        links: arXiv links to filter.
        date_cutoff: Cutoff date as text in ``'%B %d, %Y'`` format
            (e.g. ``"January 14, 2025"``).
        min_age_days: Minimum number of days the paper's date must precede the
            cutoff. Defaults to 250, the value that was previously hardcoded.

    Returns:
        list[str]: The links that pass the filter, in input order. An empty
        list is returned if ``date_cutoff`` cannot be parsed. Links whose
        identifier cannot be turned into a date are reported and skipped.
    """
    try:
        cutoff_date = datetime.strptime(date_cutoff, '%B %d, %Y').date()
    except ValueError:
        return []

    filtered_links = []
    for link in links:
        try:
            paper_id = link.split("/")[-1]
            year_month = paper_id.split(".")[0]
            year = int("20" + year_month[:2])
            month = int(year_month[2:])
            paper_date = datetime(year, month, 1).date()
        except Exception as e:
            print(f"Error processing date for link {link}: {e}")
            continue

        if (cutoff_date - paper_date).days >= min_age_days:
            filtered_links.append(link)

    # The message states the threshold actually applied; it used to say
    # "1-year" while the code compared against 250 days.
    print(f"{len(filtered_links)} links passed the {min_age_days}-day minimum age filter.")
    return filtered_links

# Function to fetch metadata from arXiv
def fetch_arxiv_metadata(arxiv_links, max_retries=3, retry_delay=2):
    """Fetch title, abstract and publication date for each arXiv link.

    The paper id is the last path segment of the link. Each lookup is tried up
    to ``max_retries`` times, sleeping ``retry_delay`` seconds after an
    exception. A link that never yields a result gets a placeholder record with
    ``None`` fields and an ``"error"`` message.

    Returns:
        list[dict]: One record per input link, in input order.
    """
    records = []
    for link in arxiv_links:
        arxiv_id = link.split("/")[-1]  # last path segment is the paper id
        fetched = False
        attempts_left = max_retries
        while attempts_left > 0 and not fetched:
            attempts_left -= 1
            try:
                lookup = arxiv.Search(id_list=[arxiv_id])
                first_hit = next(iter(arxiv.Client().results(lookup)), None)
                if first_hit is not None:
                    records.append({
                        "title": first_hit.title,
                        "abstract": first_hit.summary,
                        "published": first_hit.published.strftime('%B %d, %Y'),  # same format the date filter parses
                        "link": link,
                    })
                    fetched = True
            except Exception:
                time.sleep(retry_delay)

        if fetched:
            continue

        print(f"Failed to fetch metadata for {link} after {max_retries} attempts.")
        records.append({
            "title": None,
            "abstract": None,
            "published": None,
            "link": link,
            "error": f"Failed after {max_retries} attempts."
        })
    print(f"Fetched metadata for {len(records)} papers.")
    return records
def extract_topics_using_llm(abstract, title ,introduction):
    """Ask the model for a few short alternative titles to use as search topics.

    Args:
        abstract: Abstract (or summary) of the paper.
        title: Paper title.
        introduction: Introduction text.

    Returns:
        list[str]: The topics parsed from the model's ``[a,b,c]``-style reply,
        with the surrounding brackets, quotes and blank entries removed. An
        empty list is returned when the model gives no usable reply.
    """
    client = groq.Client(api_key=os.getenv("GROQ_API_KEY"))
    prompt = f"""Analyze the given title and abstract and introduction of a research paper. Based on the key research topics, generate 4-5 alternative titles for the paper. Each title should be concise, consisting of 3-4 words, and should not be an exact copy of the original title. These new titles should reflect the major areas of focus in the paper and serve as helpful keywords for related studies.
            Title: {title}
            Abstract:{abstract}
            Introduction:{introduction}
            do not consider very general topic or any common words in the outut titles . 
            note then , output should be in the following format [title1,title2,title3] , do not return any explanation or body text , return list of titles only"""
    response = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}], 
        model="gemma2-9b-it",
        temperature=0
    )
    if not (response and response.choices):
        return []

    raw_reply = (response.choices[0].message.content or "").strip()
    # The prompt requests "[title1,title2,title3]". Strip the list brackets and
    # any quotes so they do not end up inside the first/last topic and leak
    # into the search queries built from them.
    inner = raw_reply.strip("[]")
    topics = [topic.strip().strip("\"'").strip() for topic in inner.split(",")]
    return [topic for topic in topics if topic]

# Function to summarize the title, abstract, and introduction
def summarize_paper_content(title, abstract, introduction):
    """Return a one-paragraph model-written summary of title, abstract and
    introduction, or an empty string if no usable reply is available."""
    llm = groq.Client(api_key=os.getenv("GROQ_API_KEY"))
    prompt = f"""You are given the title, abstract, and introduction of a research paper. Summarize the key points into a concise paragraph, preserving the major concepts, research question, methods, and findings, while removing redundant information. The summarized paragraph should combine elements from the title, abstract, and introduction without losing the essence of the research.
            Title: {title}
            Abstract: {abstract}
            Introduction: {introduction}
            Please return a well-structured paragraph summary that combines the key aspects of the paper, which can be used as the input for further analysis.
    """
    reply = llm.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model="gemma2-9b-it",
        temperature=0
    )
    # Guard clause first: nothing to summarise without at least one choice.
    if not (reply and reply.choices):
        return ""
    try:
        return reply.choices[0].message.content.strip()
    except Exception:
        return ""
# Function to compute novelty scores using Groq's LLM
def compute_novelty_scores_for_new_paper(input_abstract, existing_papers_abstracts):
    """Rate the similarity of ``input_abstract`` to each existing abstract.

    Returns one integer per existing abstract, in input order. The prompt asks
    for 0-100, where 100 means the two abstracts are exactly the same. A
    missing or non-integer reply is recorded as 0.
    """
    llm = groq.Client(api_key=os.getenv("GROQ_API_KEY"))

    def _similarity(paper_abstract):
        # One model call per existing paper.
        prompt = f"""Given two abstracts, evaluate the similarity between the new paper and the existing one. Rate the similarity on a scale of 0 to 100, where 0 indicates no similarity at all, and 100 indicates they are exactly the same.\n\nExisting Paper: {paper_abstract}\nNew Paper: {input_abstract}\nReturn only the score as an integer."""

        reply = llm.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model="gemma2-9b-it",
            temperature=0
        )
        if not (reply and reply.choices):
            return 0
        try:
            return int(reply.choices[0].message.content.strip())
        except ValueError:
            return 0

    return [_similarity(existing) for existing in existing_papers_abstracts]

# Main process
def main(title, abstract, introduction, date_cutoff):
    """Compute the similarity-based novelty score of a paper against arXiv.

    The paper is summarised, search topics are derived from the summary, arXiv
    links are collected for every topic and filtered by age relative to
    ``date_cutoff``. The abstracts of the remaining papers are then compared
    with the summary.

    Args:
        title: Paper title.
        abstract: Paper abstract.
        introduction: Paper introduction.
        date_cutoff: Publication date of the paper as text. ``'%B %d, %Y'``
            (e.g. ``"January 14, 2025"``) is the canonical format. A few other
            common spellings (``"Jan 14, 2025"``, ``"14 January 2025"``,
            ``"2025-01-14"``, ``"January 2025"``) are accepted and normalised to
            it, because the date comes from free-form model output upstream.

    Returns:
        The mean of the five highest similarity scores (0 when there are no
        comparison papers), or ``None`` if ``date_cutoff`` cannot be parsed.
    """
    accepted_formats = (
        '%B %d, %Y', '%b %d, %Y', '%d %B %Y', '%d %b %Y', '%Y-%m-%d', '%B %Y', '%b %Y',
    )
    if not isinstance(date_cutoff, str):
        return None
    parsed_cutoff = None
    for date_format in accepted_formats:
        try:
            parsed_cutoff = datetime.strptime(date_cutoff.strip(), date_format)
            break
        except ValueError:
            continue
    if parsed_cutoff is None:
        return None
    # Normalise to the format filter_arxiv_links_with_date_difference parses.
    date_cutoff = parsed_cutoff.strftime('%B %d, %Y')

    summarized_abstract = summarize_paper_content(title, abstract, introduction)

    topics = extract_topics_using_llm(summarized_abstract, title, introduction)

    all_arxiv_links = []
    for topic in topics:
        if not topic:
            continue  # a blank topic would search for the site filter alone
        all_arxiv_links.extend(get_arxiv_links_from_search(topic))

    # De-duplicate across topics while keeping a deterministic order.
    all_arxiv_links = list(dict.fromkeys(all_arxiv_links))

    filtered_links = filter_arxiv_links_with_date_difference(all_arxiv_links, date_cutoff)

    paper_data = fetch_arxiv_metadata(filtered_links)

    existing_papers_abstracts = [paper["abstract"] for paper in paper_data if paper["abstract"]]
    novelty_scores = compute_novelty_scores_for_new_paper(summarized_abstract, existing_papers_abstracts)
    top_5_novelty_scores = sorted(novelty_scores, reverse=True)[:5]
    novelty_score = statistics.mean(top_5_novelty_scores) if top_5_novelty_scores else 0
    return novelty_score


**---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------**

**---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------**

**---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------**

**Function to find the threshold value for each checkpoint, for testing PDFs, using the given reference PDF**

In [None]:
def process_pdf(pdf_path):
    """Run the full evaluation pipeline on one PDF and return the raw scores.

    Returns a dict keyed by the name of each evaluation step, plus
    ``'Novalty_score'`` for the value returned by ``main``. A progress marker
    ("done_N") is printed after each stage.
    """
    # Stage 1: raw text and title
    full_text = extract_text_from_pdf(pdf_path)
    paper_title = extract_title_from_text(full_text)
    print("done_1")

    # Stage 2-3: headings from the model, then re-spaced
    headings = process_pdf_and_extract_headings(pdf_path)
    print("done_2")
    headings = split_by_two_capitals(headings)
    print("done_3")

    # Stage 4-5: (heading, paragraph) pairs, with math verbalised
    sections = pair_headings_with_text(headings, full_text)
    print("done_4")
    sections = rewrite_math_expressions_in_paired_text(sections)
    print("done_5")

    # Stage 6: the two sections the later steps need by name
    abstract = find_paragraph_for_heading(sections, "Abstract")
    introduction = find_paragraph_for_heading(sections, "Introduction")
    print("done_6")

    # Stage 7-9: individual checks, recorded in call order
    scores = {}
    scores['evaluate_title_against_abstract'] = evaluate_title_against_abstract(paper_title, abstract)
    scores['evaluate_conclusion_clarity'] = evaluate_conclusion_clarity(sections)
    print("done_7")
    scores['evaluate_paragraph_flow'] = evaluate_paragraph_flow(sections)
    scores['evaluate_claims_against_results'] = evaluate_claims_against_results(sections)
    print("done_8")
    scores['evaluate_paragraph_grammar'] = evaluate_paragraph_grammar(sections)
    scores['evaluate_paragraph_spelling'] = evaluate_paragraph_spelling(sections)
    print("done_9")

    # Stage 10: publication date lookup, then the arXiv comparison
    publication_date = get_best_abstract_date(paper_title, abstract)
    print("done_10")
    scores['Novalty_score'] = main(paper_title, abstract, introduction, publication_date)
    return scores

# Example usage: set REFERENCE_PDF_PATH to a real PDF before running this cell.
# The run is skipped while the placeholder is still in place, so that
# "Run All" can continue to the cells below instead of stopping here.
REFERENCE_PDF_PATH = r"enter the path of the pdf file"
if os.path.isfile(REFERENCE_PDF_PATH):
    results = process_pdf(REFERENCE_PDF_PATH)
    print(results)
else:
    print(f"Skipping example run: no PDF found at {REFERENCE_PDF_PATH!r}")

**---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------**

**---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------**

**---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------**

***Function to detect whether a research paper is publishable or non-publishable***

In [53]:
def process_pdf_with_threshold(pdf_path):
    """Classify a PDF as publishable (1) or not (0) using fixed thresholds.

    The pipeline stops and returns 0 as soon as a stage yields nothing or a
    check fails. Checks, in order: title/abstract alignment >= 85, conclusion
    clarity >= 75, paragraph flow >= 70, claims vs. results >= 70, grammar
    >= 75, mean spelling mistakes <= 5, and the value returned by ``main``
    <= 80. A check that yields no score (``None``) counts as failed.

    Args:
        pdf_path: Path of the PDF to classify.

    Returns:
        int: 1 if every check passes, otherwise 0. Any exception raised along
        the way is printed and also results in 0.
    """
    def below(score, minimum):
        # A missing score fails the check explicitly, instead of relying on the
        # TypeError that comparing None with an int would raise.
        return score is None or score < minimum

    def above(score, maximum):
        return score is None or score > maximum

    try:
        # Extract text from PDF
        text = extract_text_from_pdf(pdf_path)
        if not text:
            return 0

        title = extract_title_from_text(text)
        print("done_1")
        if not title:
            return 0

        # Extract headings
        formatted_headings_list = process_pdf_and_extract_headings(pdf_path)
        print("done_2")
        if not formatted_headings_list:
            return 0

        # Format headings
        formatted_headings_list = split_by_two_capitals(formatted_headings_list)
        print("done_3")
        if not formatted_headings_list:
            return 0

        # Pair headings with text
        paired_text = pair_headings_with_text(formatted_headings_list, text)
        print("done_4")
        if not paired_text:
            return 0

        # Rewrite math expressions
        paired_text = rewrite_math_expressions_in_paired_text(paired_text)
        print("done_5")
        if not paired_text:
            return 0

        # Extract key sections
        abstract = find_paragraph_for_heading(paired_text, "Abstract")
        introduction = find_paragraph_for_heading(paired_text, "Introduction")
        print("done_6")
        if not abstract or not introduction:
            return 0

        # Evaluate various aspects
        if below(evaluate_title_against_abstract(title, abstract), 85):
            return 0

        if below(evaluate_conclusion_clarity(paired_text), 75):
            return 0

        print("done_7")
        if below(evaluate_paragraph_flow(paired_text), 70):
            return 0

        if below(evaluate_claims_against_results(paired_text), 70):
            return 0

        print("done_8")
        if below(evaluate_paragraph_grammar(paired_text), 75):
            return 0

        if above(evaluate_paragraph_spelling(paired_text), 5):
            return 0

        print("done_9")
        # Get best abstract date
        date = get_best_abstract_date(title, abstract)
        print("done_10")
        if not date:
            return 0

        # Compute final score: a value above 80 rejects the paper.
        if above(main(title, abstract, introduction, date), 80):
            return 0

        return 1  # All checks passed
    except Exception as e:
        print(f"Error processing PDF: {e}")
        return 0


In [None]:
import os
import csv

input_directory = r"enter the path of the directory containing PDF files"
output_csv = "enter the path of the output CSV file"

results = []

# Process each PDF in the directory. The listing is sorted so the CSV rows come
# out in the same order on every run (os.listdir order is arbitrary), and the
# extension check ignores case so "paper.PDF" is not silently skipped.
for file_name in sorted(os.listdir(input_directory)):
    if file_name.lower().endswith(".pdf"):
        pdf_path = os.path.join(input_directory, file_name)
        score = process_pdf_with_threshold(pdf_path)
        results.append({"PDF Name": file_name, "Score": score})

# Write results to CSV. An explicit encoding keeps file names with non-ASCII
# characters from depending on the platform's default encoding.
with open(output_csv, mode="w", newline="", encoding="utf-8") as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=["PDF Name", "Score"])
    writer.writeheader()
    writer.writerows(results)

print(f"Scores written to {output_csv}")