## Required Libraries and Installation
This project utilizes several key Python libraries. To ensure proper execution, please install them using the following commands:

In [23]:
##Exact Pip Command for Installation:

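# pip install spacy sentence-transformers scikit-learn pandas numpy PyPDF2 textstat seaborn matplotlib langchain transformers torch pypdf
## Installs core libraries for NLP, ML, data handling, and PDF processing (transformers and torch are imported directly below; pypdf is needed by langchain's PyPDFLoader).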

##Downloads the small English spaCy language model for text analysis.
# python (or python3) -m spacy download en_core_web_sm

## Imports and Initialization
This section imports the libraries used throughout the paper classification pipeline:

In [24]:
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import re
import os

## Load the Model
This function loads the pre-trained SPECTER model and its tokenizer, which are used for generating paper embeddings.

In [25]:
def load_model():
    """Load the pre-trained SPECTER tokenizer and model.

    Returns:
        A ``(tokenizer, model)`` tuple, both loaded from the
        ``allenai/specter`` checkpoint.
    """
    checkpoint = 'allenai/specter'
    return (
        AutoTokenizer.from_pretrained(checkpoint),
        AutoModel.from_pretrained(checkpoint),
    )

## Define Labeled Papers
This list contains reference papers and their corresponding conferences. They are not used to train a model; they serve as labeled examples from which conference-specific embeddings and keyword features are built.

In [26]:
# Reference papers as (filename, conference) pairs, listed per conference.
labeled_papers = [
    (filename, conference)
    for conference, filenames in (
        ('CVPR', ('R006.pdf', 'R007.pdf')),
        ('EMNLP', ('R008.pdf', 'R009.pdf')),
        ('KDD', ('R010.pdf', 'R011.pdf')),
        ('NeurIPS', ('R012.pdf', 'R013.pdf')),
        ('TMLR', ('R014.pdf', 'R015.pdf')),
    )
    for filename in filenames
]

## Generate Embeddings for Papers
This function takes the text of a paper and generates an embedding using the SPECTER model.

In [27]:
def get_embedding(text, tokenizer, model):
    """Embed a piece of text with the supplied tokenizer/model pair.

    The text is first cut to its leading 512 whitespace-separated words, then
    tokenized (with truncation to 512 tokens) and passed through the model
    with gradient tracking disabled. The model's ``last_hidden_state`` is
    averaged over dimension 1, squeezed, and returned as a NumPy array.
    """
    word_and_token_limit = 512
    leading_words = text.split()[:word_and_token_limit]

    encoded = tokenizer(
        ' '.join(leading_words),
        padding=True,
        truncation=True,
        max_length=word_and_token_limit,
        return_tensors="pt",
    )

    # Inference only: no gradients are needed for the forward pass.
    with torch.no_grad():
        model_output = model(**encoded)

    pooled = model_output.last_hidden_state.mean(dim=1)
    return pooled.squeeze().numpy()

## Extract Features from Papers
This function extracts various features from the paper’s content by counting the occurrences of specific keywords related to different research areas.

In [28]:
def extract_paper_features(text):
    """Count keyword hits per research area in a paper's text.

    Matching is case-insensitive: the text is lower-cased once and every
    pattern is written in lower case. (Patterns containing upper-case
    alternatives such as ``CNN`` can never match lower-cased text, so the
    acronyms are spelled ``cnn|rnn|lstm`` here.)

    Args:
        text: Raw paper text.

    Returns:
        A dict mapping each feature name ('deep_learning', 'computer_vision',
        'nlp', 'data_mining', 'theory', 'methodology', 'results') to the
        number of whole-word keyword matches found in the text.
    """
    keyword_patterns = {
        'deep_learning': r'\b(deep learning|neural network|cnn|rnn|lstm|deep neural|artificial neural|convolutional|recurrent neural)\b',
        'computer_vision': r'\b(computer vision|image processing|object detection|segmentation|visual recognition|image classification|feature detection|opencv)\b',
        'nlp': r'\b(natural language|nlp|text mining|language model|transformer|bert|gpt|word embedding|tokenization|semantic analysis)\b',
        'data_mining': r'\b(data mining|clustering|pattern recognition|kdd|knowledge discovery|association rules|anomaly detection|predictive analytics)\b',
        'theory': r'\b(theorem|proof|lemma|theoretical|convergence|mathematical model|algorithm complexity|optimization theory)\b',
        'methodology': r'\b(methodology|experimental design|ablation study|comparative analysis|empirical study|statistical analysis)\b',
        'results': r'\b(results|performance|accuracy|precision|recall|f1 score|benchmark|evaluation metrics)\b',
    }

    # Lower-case once instead of once per pattern.
    lowered_text = text.lower()
    return {
        feature_name: len(re.findall(pattern, lowered_text))
        for feature_name, pattern in keyword_patterns.items()
    }

## Create Conference Embeddings
This function reads labeled reference papers, extracts content, generates embeddings, and stores features for each conference.

In [29]:
def create_conference_embeddings(labeled_papers, tokenizer, model):
    """Build per-conference embeddings and keyword features from reference PDFs.

    Each ``(filename, conference)`` pair is resolved against the current
    working directory. For every PDF that exists and loads, the text of its
    first three pages is collected, and keyword features are extracted from it.
    Missing or unreadable files are reported with a message and skipped.

    Returns:
        ``(conference_embeddings, conference_features)``: two dicts keyed by
        conference, holding one embedding / one feature dict per loaded paper.
        A conference whose papers all failed to load maps to an empty list.
    """
    texts_by_conference = {}
    features_by_conference = {}
    reference_dir = os.getcwd()  # reference PDFs are looked up in the working directory

    for pdf_name, venue in labeled_papers:
        # Register the conference up front so it appears even if loading fails.
        venue_texts = texts_by_conference.setdefault(venue, [])
        venue_features = features_by_conference.setdefault(venue, [])

        try:
            pdf_path = os.path.join(reference_dir, pdf_name)
            if os.path.exists(pdf_path):
                first_pages = PyPDFLoader(pdf_path).load()[:3]
                paper_text = ' '.join(page.page_content for page in first_pages)

                venue_texts.append(paper_text)
                venue_features.append(extract_paper_features(paper_text))
            else:
                print(f"Reference paper not found: {pdf_path}")
        except Exception as err:
            print(f"Error loading reference paper {pdf_name}: {str(err)}")

    embeddings_by_conference = {
        venue: [get_embedding(paper_text, tokenizer, model) for paper_text in venue_texts]
        for venue, venue_texts in texts_by_conference.items()
    }

    return embeddings_by_conference, features_by_conference

## Recommend a Conference
This function takes a new paper, compares it with reference papers, and recommends the most suitable conference based on similarity scores.

In [30]:
def recommend_conference(new_paper_content, conference_embeddings, conference_features, tokenizer, model):
    """Recommend a conference from embedding similarity and keyword overlap.

    For every conference that has at least one reference embedding, two
    scores are computed:

    * embedding score: mean of the (up to) two highest cosine similarities
      between the new paper and that conference's reference papers;
    * feature score: mean, over the conference's reference papers, of
      ``2 * sum(min(count_ref, count_new)) / (sum(ref) + sum(new))`` on the
      keyword counts (0 when neither paper has any keyword hit).

    The final score is ``0.7 * embedding + 0.3 * feature``.

    Conferences with no reference embeddings are skipped: averaging an empty
    list yields NaN, and a NaN score makes the ``max`` selection unreliable.
    A conference with embeddings but no feature dicts gets a feature score
    of 0.0 for the same reason.

    Args:
        new_paper_content: Text of the paper to classify.
        conference_embeddings: Dict of conference -> list of embeddings.
        conference_features: Dict of conference -> list of feature dicts.
        tokenizer, model: Passed through to ``get_embedding``.

    Returns:
        ``((best_conference, best_score), final_scores)`` where
        ``final_scores`` maps each scored conference to its combined score.

    Raises:
        ValueError: If no conference has any reference embedding.
    """
    new_paper_embedding = get_embedding(new_paper_content, tokenizer, model)
    new_paper_features = extract_paper_features(new_paper_content)
    new_feature_total = sum(new_paper_features.values())  # loop-invariant

    conference_scores = {}
    feature_similarities = {}

    for conference, paper_embeddings in conference_embeddings.items():
        # No reference papers were loaded for this conference: nothing to compare.
        if len(paper_embeddings) == 0:
            continue

        # Embedding similarity: average of the two closest reference papers.
        similarities = [
            cosine_similarity([new_paper_embedding], [paper_embedding])[0][0]
            for paper_embedding in paper_embeddings
        ]
        top_similarities = sorted(similarities, reverse=True)[:2]
        conference_scores[conference] = np.mean(top_similarities)

        # Keyword-count overlap with each reference paper.
        feature_matches = []
        for paper_feat in conference_features.get(conference, []):
            total_features = sum(paper_feat.values()) + new_feature_total
            if total_features == 0:
                match_score = 0
            else:
                common_features = sum(min(paper_feat[k], new_paper_features[k]) for k in paper_feat)
                match_score = 2 * common_features / total_features
            feature_matches.append(match_score)
        feature_similarities[conference] = np.mean(feature_matches) if feature_matches else 0.0

    if not conference_scores:
        raise ValueError("No reference embeddings available to recommend a conference")

    # Combine scores (70% embedding similarity, 30% feature similarity)
    final_scores = {
        conf: 0.7 * emb_score + 0.3 * feature_similarities[conf]
        for conf, emb_score in conference_scores.items()
    }

    return max(final_scores.items(), key=lambda x: x[1]), final_scores

## Justification Generation Function

This function generates detailed justifications for conference recommendations based on paper features and similarity scores. It analyzes various aspects like computer vision, NLP, data mining, and theoretical content to provide a comprehensive explanation.

In [31]:
def generate_justification(paper_features, conference, similarity_score):
    """Generate a textual justification for a conference classification.

    Args:
        paper_features: Dict of keyword counts with the keys
            'deep_learning', 'computer_vision', 'nlp', 'data_mining',
            'theory', 'methodology' and 'results'.
        conference: Recommended conference name. 'CVPR', 'EMNLP', 'KDD',
            'NeurIPS' and 'TMLR' get a tailored text; any other name gets a
            generic one instead of failing on an unassigned variable.
        similarity_score: Combined similarity score, shown with 2 decimals.

    Returns:
        The justification string, capped at 100 words. Texts shorter than
        50 words get one closing sentence appended (which does not by itself
        guarantee reaching 50 words).
    """
    total_features = sum(paper_features.values())

    # Share of each feature among all keyword hits. Computed for every
    # feature (not just the most frequent ones) so that a conference's own
    # feature is never reported as 0% while its count is non-zero.
    if total_features > 0:
        feature_percentages = {k: (v / total_features) * 100 for k, v in paper_features.items()}
    else:
        feature_percentages = {}

    # Build detailed justification based on conference and actual content
    if conference == 'CVPR':
        cv_focus = paper_features['computer_vision']
        dl_focus = paper_features['deep_learning']
        justification = (
            f"Paper demonstrates {cv_focus} computer vision concepts and {dl_focus} deep learning applications. "
            f"Content analysis shows {feature_percentages.get('computer_vision', 0):.1f}% computer vision focus. "
            f"Methodology includes {paper_features['methodology']} experimental components. "
            f"Similarity score with CVPR papers: {similarity_score:.2f}. "
            f"Strong alignment with computer vision research scope."
        )

    elif conference == 'EMNLP':
        nlp_focus = paper_features['nlp']
        method_focus = paper_features['methodology']
        justification = (
            f"Contains {nlp_focus} natural language processing elements and {method_focus} methodological components. "
            f"NLP content comprises {feature_percentages.get('nlp', 0):.1f}% of technical content. "
            f"Includes {paper_features['results']} results-related discussions. "
            f"Shows {similarity_score:.2f} similarity with EMNLP papers. "
            f"Well-aligned with computational linguistics scope."
        )

    elif conference == 'KDD':
        dm_focus = paper_features['data_mining']
        results_focus = paper_features['results']
        justification = (
            f"Exhibits {dm_focus} data mining concepts and {results_focus} experimental results. "
            f"Data mining comprises {feature_percentages.get('data_mining', 0):.1f}% of content. "
            f"Includes {paper_features['methodology']} methodology discussions. "
            f"Similarity score with KDD papers: {similarity_score:.2f}. "
            f"Strong focus on knowledge discovery and data mining."
        )

    elif conference == 'NeurIPS':
        theory_focus = paper_features['theory']
        dl_focus = paper_features['deep_learning']
        justification = (
            f"Contains {theory_focus} theoretical concepts and {dl_focus} deep learning elements. "
            f"Theoretical content makes up {feature_percentages.get('theory', 0):.1f}% of the paper. "
            f"Includes {paper_features['methodology']} methodology components. "
            f"Shows {similarity_score:.2f} similarity with NeurIPS papers. "
            f"Strong theoretical machine learning focus."
        )

    elif conference == 'TMLR':
        dl_focus = paper_features['deep_learning']
        method_focus = paper_features['methodology']
        justification = (
            f"Demonstrates {dl_focus} machine learning concepts and {method_focus} methodological elements. "
            f"Technical content comprises {feature_percentages.get('deep_learning', 0):.1f}% ML focus. "
            f"Contains {paper_features['results']} results-related discussions. "
            f"Similarity score with TMLR papers: {similarity_score:.2f}. "
            f"Well-suited for machine learning research scope."
        )

    else:
        # Unknown conference: fall back to a generic text rather than leaving
        # `justification` unassigned.
        justification = (
            f"Paper contains {total_features} matched technical keywords across the tracked research areas. "
            f"Similarity score with {conference} papers: {similarity_score:.2f}."
        )

    # Cap at 100 words; pad short texts with one closing sentence.
    words = justification.split()
    if len(words) > 100:
        justification = ' '.join(words[:100])
    elif len(words) < 50:
        justification += f" The paper's technical depth and methodology align well with {conference}'s standards."

    return justification

## Main Processing Function

This function orchestrates the entire paper processing pipeline:
1. Loads the necessary models
2. Creates conference embeddings
3. Processes each paper
4. Generates recommendations and justifications
5. Saves results to CSV

In [32]:
def process_papers():
    """Run the whole classification pipeline and write ``final_results.csv``.

    Steps:
      1. Load the tokenizer and model.
      2. Build reference embeddings/features from ``labeled_papers``.
      3. Read ``results.csv`` and add 'conference' ('NA') and
         'justification' ('') columns.
      4. For each row whose 'publishable' value equals 1, load the first
         three pages of the PDF named by 'paper_id' from the ``Papers``
         folder, recommend a conference and generate a justification.
         Missing files and per-paper errors are reported and skipped.
      5. Save the table to ``final_results.csv`` without the index.
    """
    print("Loading model and tokenizer...")
    tokenizer, model = load_model()

    print("Creating conference embeddings...")
    conference_embeddings, conference_features = create_conference_embeddings(
        labeled_papers, tokenizer, model
    )

    print("Reading results.csv...")
    df = pd.read_csv('results.csv')

    # Defaults that remain in place for rows that are skipped or fail.
    df['conference'] = 'NA'
    df['justification'] = ''

    papers_dir = os.path.join(os.getcwd(), "Papers")
    paper_count = len(df)

    print("Processing papers...")
    for idx, row in df.iterrows():
        paper_id = row['paper_id']
        print(f"Processing paper {idx+1}/{paper_count}: {paper_id}")

        # Only papers marked publishable are assigned a conference.
        if row['publishable'] != 1:
            continue

        try:
            pdf_path = os.path.join(papers_dir, paper_id)

            if not os.path.exists(pdf_path):
                print(f"File not found: {pdf_path}")
                continue

            first_pages = PyPDFLoader(pdf_path).load()[:3]
            paper_text = ' '.join(page.page_content for page in first_pages)

            best_match, _ = recommend_conference(
                paper_text,
                conference_embeddings,
                conference_features,
                tokenizer,
                model,
            )
            recommended_conference, score = best_match

            paper_features = extract_paper_features(paper_text)

            df.at[idx, 'conference'] = recommended_conference
            df.at[idx, 'justification'] = generate_justification(
                paper_features, recommended_conference, score
            )

            print(f"Successfully classified as: {recommended_conference}")

        except Exception as err:
            print(f"Error processing {paper_id}: {str(err)}")

    print("Saving results...")
    df.to_csv('final_results.csv', index=False)
    print("Processing complete. Results saved to final_results.csv")

## Main Execution Block

Entry point of the script that initiates the paper processing pipeline.

In [33]:
# Run the full pipeline only when this code executes as the main module.
if __name__ == "__main__":
    process_papers()

Loading model and tokenizer...
Creating conference embeddings...
Reading results.csv...
Processing papers...
Processing paper 1/135: P005.pdf
Successfully classified as: CVPR
Processing paper 2/135: P011.pdf
Successfully classified as: KDD
Processing paper 3/135: P039.pdf
Processing paper 4/135: P038.pdf
Successfully classified as: EMNLP
Processing paper 5/135: P010.pdf
Successfully classified as: KDD
Processing paper 6/135: P004.pdf
Successfully classified as: NeurIPS
Processing paper 7/135: P012.pdf
Successfully classified as: EMNLP
Processing paper 8/135: P006.pdf
Processing paper 9/135: P007.pdf
Successfully classified as: CVPR
Processing paper 10/135: P013.pdf
Successfully classified as: EMNLP
Processing paper 11/135: P017.pdf
Successfully classified as: EMNLP
Processing paper 12/135: P003.pdf
Successfully classified as: EMNLP
Processing paper 13/135: P002.pdf
Processing paper 14/135: P016.pdf
Processing paper 15/135: P028.pdf
Successfully classified as: EMNLP
Processing paper 16/