In [1]:
pip install ipywidgets


Note: you may need to restart the kernel to use updated packages.


In [2]:
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


2024-11-28 12:41:02.882 python[8209:189268] +[IMKClient subclass]: chose IMKClient_Modern
2024-11-28 12:41:02.882 python[8209:189268] +[IMKInputSession subclass]: chose IMKInputSession_Modern


True

In [3]:
import os
import json
import random
import requests
import logging
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
from nltk.translate.meteor_score import meteor_score
import sacrebleu
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
#import Moverscore_v2 as mv
from dotenv import load_dotenv
import nltk
from tqdm import TqdmWarning
import warnings
warnings.filterwarnings("ignore", category=TqdmWarning)
load_dotenv()

logging.basicConfig(level=logging.INFO)

In [4]:
# DeepSeek API configuration
DEEPSEEK_API_ENDPOINT = "https://api.deepseek.com/beta/v1/completions"
API_KEY = os.getenv('DEEPSEEK_API_KEY')

In [5]:
def extract_data(data_split_path):
    reviews_path = os.path.join(data_split_path, 'reviews')
    parsed_pdfs_path = os.path.join(data_split_path, 'parsed_pdfs')
    data_list = []

    if not os.path.exists(reviews_path) or not os.path.exists(parsed_pdfs_path):
        logging.error("Reviews path or parsed PDFs path does not exist.")
        return data_list

    for review_filename in os.listdir(reviews_path):
        if review_filename.endswith('.json'):
            review_file_path = os.path.join(reviews_path, review_filename)
            with open(review_file_path, 'r', encoding='utf-8') as f:
                paper_data = json.load(f)

            paper_id = os.path.splitext(review_filename)[0]
            parsed_pdf_file = os.path.join(parsed_pdfs_path, f"{paper_id}.pdf.json")

            if not os.path.exists(parsed_pdf_file):
                continue

            with open(parsed_pdf_file, 'r', encoding='utf-8') as f_pdf:
                parsed_pdf = json.load(f_pdf)
                metadata = parsed_pdf.get('metadata', {})
                title = metadata.get('title', 'No title')
                abstract_text = metadata.get('abstractText', '')
                sections = parsed_pdf.get('pdf_parse', {}).get('body_text', [])

                section_texts = ' '.join(
                    f"{section.get('section', '')}: {section.get('text', '')}"
                    for section in sections
                )
                full_body = f"{title} {abstract_text} {section_texts}".strip()

            reviews = paper_data.get('reviews', [])
            review_texts = [
                review.get('comments', '').strip()
                for review in reviews if review.get('comments', '').strip()
            ]

            if review_texts:
                data_list.append({
                    'title': title,
                    'abstract': abstract_text,
                    'paper_content': full_body,
                    'reviews': review_texts
                })

    return data_list


In [6]:
def generate_response_with_deepseek(prompt, model="deepseek-chat", max_tokens=1000):
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json"
    }
    data = {
        "model": model,
        "prompt": prompt,
        "max_tokens": max_tokens
    }
    try:
        response = requests.post(DEEPSEEK_API_ENDPOINT, headers=headers, json=data)
        response.raise_for_status()
        return response.json().get("choices", [{}])[0].get("text", "")
    except requests.exceptions.RequestException as e:
        logging.error(f"Request failed: {e}")
        return None

In [7]:
def generate_few_shot_feedback(paper_text, example_essays, max_examples=3):
    selected_examples = example_essays[:max_examples]
    prompt = "You are an expert reviewer. Provide detailed feedback for the essays.\n\n"
    for idx, example in enumerate(selected_examples):
        prompt += (
            f"Essay {idx + 1}:\n{example['essay']}\n"
            f"Feedback {idx + 1}:\n{example['feedback']}\n\n"
        )
    prompt += f"Now, here is a new essay:\n{paper_text}\nProvide detailed feedback for this essay."
    feedback = generate_response_with_deepseek(prompt)
    return feedback

In [8]:
def create_ground_truth_file(reviews_path, output_file):
    ground_truths = []
    if not os.path.exists(reviews_path):
        print(f"Reviews path {reviews_path} does not exist.")
        return

    for review_filename in os.listdir(reviews_path):
        if review_filename.endswith('.json'):
            review_file_path = os.path.join(reviews_path, review_filename)
            with open(review_file_path, 'r', encoding='utf-8') as f:
                paper_data = json.load(f)

            reviews = paper_data.get('reviews', [])
            for review in reviews:
                if 'comments' in review:
                    ground_truths.append({
                        'true_feedback': review['comments'].strip()
                    })

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(ground_truths, f, indent=4, ensure_ascii=False)
    print(f"Ground truth file saved as '{output_file}'")

In [9]:
# Add these imports at the top of your script
from nltk.translate.gleu_score import sentence_gleu
from nltk.tokenize import word_tokenize

In [10]:
def evaluate_metrics(predictions_file, ground_truth_file):
    with open(predictions_file, 'r', encoding='utf-8') as f:
        predictions = json.load(f)

    with open(ground_truth_file, 'r', encoding='utf-8') as f:
        ground_truths = json.load(f)

    bleu_scores = []
    rouge_scores = []
    meteor_scores = []
    gleu_scores = []
    # Moverscore_scores = []

    # Initialize the ROUGE scorer
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

    for pred, truth in zip(predictions, ground_truths):
        pred_feedback = pred['predicted_feedback']
        true_feedback = truth['true_feedback']
        if not pred_feedback or not true_feedback:
            continue  # Skip empty cases

        from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
        smoothie = SmoothingFunction().method1  # You can choose other methods as well

        bleu_score = sentence_bleu(
            [true_feedback.split()], 
            pred_feedback.split(), 
            smoothing_function=smoothie)

        # Compute BLEU score
        #bleu_score = sentence_bleu([true_feedback.split()], pred_feedback.split())
        bleu_scores.append(bleu_score)

        # Compute ROUGE score
        rouge_score = scorer.score(true_feedback, pred_feedback)
        rouge_scores.append(rouge_score)

        # Tokenize before using METEOR
        true_tokens = word_tokenize(true_feedback.lower())
        pred_tokens = word_tokenize(pred_feedback.lower())

        meteor = meteor_score([true_tokens], pred_tokens)
        meteor_scores.append(meteor)

        # Compute GLEU score using NLTK
        reference_tokens = [true_tokens]  # NLTK expects a list of reference token lists
        hypothesis_tokens = pred_tokens
        gleu_score = sentence_gleu(reference_tokens, hypothesis_tokens)
        gleu_scores.append(gleu_score)

    # Calculate averages as before...
    avg_bleu = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0
    avg_rouge = {
        key: sum(score[key].fmeasure for score in rouge_scores) / len(rouge_scores) if rouge_scores else 0
        for key in rouge_scores[0]
    }
    avg_meteor = sum(meteor_scores) / len(meteor_scores) if meteor_scores else 0
    avg_gleu = sum(gleu_scores) / len(gleu_scores) if gleu_scores else 0

    print(f"Evaluation for {predictions_file}:")
    print(f"Average BLEU Score: {avg_bleu}")
    print(f"Average ROUGE Scores: {avg_rouge}")
    print(f"Average METEOR Score: {avg_meteor}")
    print(f"Average GLEU Score: {avg_gleu}")



In [None]:
'''
def load_bleurt_model():
    tokenizer = AutoTokenizer.from_pretrained("bleurt-base-128", use_auth_token="paste hugging face token")
    model = AutoModelForSequenceClassification.from_pretrained("bleurt-base-128", use_auth_token="paste hugging face token")
    bleurt_scorer = pipeline("text-classification", model=model, tokenizer=tokenizer, return_all_scores=True)
    return bleurt_scorer
'''

if __name__ == "__main__":
    #folders = ['iclr_2017', 'conll_2016', 'acl_2017']
    folders = ['conll_2016', 'acl_2017','iclr_2017']
    base_path = 'PeerRead/data'

    for folder_name in folders:
        test_data_path = os.path.join(base_path, folder_name, 'test')
        reviews_path = os.path.join(test_data_path, 'reviews')
        ground_truth_file = f'ground_truths_{folder_name}.json'
        create_ground_truth_file(reviews_path, ground_truth_file)

        train_data_path = os.path.join(base_path, folder_name, 'train')
        train_data = extract_data(train_data_path)
        if not train_data:
            logging.error(f"No training data extracted for {folder_name}.")
            continue

        example_essays = []
        for data_item in train_data:
            if 'paper_content' in data_item and 'reviews' in data_item:
                for review in data_item['reviews']:
                    example_essays.append({
                        "essay": data_item['paper_content'],
                        "feedback": review
                    })
        random.shuffle(example_essays)
        example_essays = example_essays[:3]

        test_data = extract_data(test_data_path)
        if not test_data:
            logging.error(f"No test data extracted for {folder_name}.")
            continue

        predictions = []
        for data_item in test_data:
            if 'paper_content' in data_item:
                new_essay = data_item['paper_content']
                feedback = generate_few_shot_feedback(new_essay, example_essays)
                predictions.append({
                    'paper_content': new_essay,
                    'predicted_feedback': feedback
                })

        predictions_file = f'predictions_{folder_name}.json'
        with open(predictions_file, 'w', encoding='utf-8') as f:
            json.dump(predictions, f, indent=4, ensure_ascii=False)

        evaluate_metrics(predictions_file, ground_truth_file)


Ground truth file saved as 'ground_truths_conll_2016.json'


INFO:absl:Using default tokenizer.


Evaluation for predictions_conll_2016.json:
Average BLEU Score: 0.006195286224538256
Average ROUGE Scores: {'rouge1': 0.2726837883582309, 'rougeL': 0.12571442588368043}
Average METEOR Score: 0.24545376522767928
Average GLEU Score: 0.06414602078967911
Ground truth file saved as 'ground_truths_acl_2017.json'


INFO:absl:Using default tokenizer.


Evaluation for predictions_acl_2017.json:
Average BLEU Score: 0.008030556143367505
Average ROUGE Scores: {'rouge1': 0.3871730133105085, 'rougeL': 0.16380530317289382}
Average METEOR Score: 0.24615929728705915
Average GLEU Score: 0.09851655413118575
