In [9]:
import requests
import json
import os
import random
import logging
from dotenv import load_dotenv

# Let python-dotenv populate the environment before any variable is read below.
load_dotenv()

# Root logger reports INFO and above.
logging.basicConfig(level=logging.INFO)

# Step 2: DeepSeek API settings (endpoint URL and key taken from the environment)
DEEPSEEK_API_ENDPOINT = "https://api.deepseek.com/beta/v1/completions"
API_KEY = os.environ.get('DEEPSEEK_API_KEY')

# Step 3: Function to extract data from the given data_split_path
def _join_sections(parsed_pdf, metadata):
    """Return every section of a parsed PDF joined as ``"<name>: <text>"``.

    Sections are read from ``pdf_parse.body_text`` (name key ``section``).
    When that list is missing or empty, fall back to ``metadata.sections``
    (name key ``heading``).
    NOTE(review): the fallback layout is assumed to match the parsed PDFs
    shipped with this dataset -- confirm against an actual file.
    """
    sections = (parsed_pdf.get('pdf_parse') or {}).get('body_text')
    name_key = 'section'
    if not sections:
        sections = metadata.get('sections') or []
        name_key = 'heading'

    # `or ''` keeps JSON nulls from being rendered as the string "None".
    return ' '.join(
        f"{section.get(name_key) or ''}: {section.get('text') or ''}"
        for section in sections
    )


def extract_data(data_split_path):
    """Collect paper text and reviewer comments from a dataset split directory.

    For each ``<data_split_path>/reviews/<paper_id>.json`` the matching
    ``<data_split_path>/parsed_pdfs/<paper_id>.pdf.json`` is loaded. Review
    files without a parsed PDF, and papers without any non-empty review
    comment, are skipped.

    Args:
        data_split_path: Directory holding the ``reviews`` and
            ``parsed_pdfs`` sub-directories.

    Returns:
        A list of dicts with the keys ``title``, ``abstract``,
        ``paper_content`` (title, abstract and section texts joined by
        spaces) and ``reviews`` (stripped, non-empty comment strings).
        The list is empty when either sub-directory is missing. Entries are
        ordered by review file name.
    """
    reviews_path = os.path.join(data_split_path, 'reviews')
    parsed_pdfs_path = os.path.join(data_split_path, 'parsed_pdfs')
    data_list = []

    if not os.path.exists(reviews_path) or not os.path.exists(parsed_pdfs_path):
        logging.error("Reviews path or parsed PDFs path does not exist.")
        return data_list

    # os.listdir order is arbitrary; sorting makes the result reproducible.
    for review_filename in sorted(os.listdir(reviews_path)):
        if not review_filename.endswith('.json'):
            continue

        paper_id = os.path.splitext(review_filename)[0]
        parsed_pdf_file = os.path.join(parsed_pdfs_path, f"{paper_id}.pdf.json")
        if not os.path.exists(parsed_pdf_file):
            continue

        review_file_path = os.path.join(reviews_path, review_filename)
        with open(review_file_path, 'r', encoding='utf-8') as f:
            paper_data = json.load(f)
        with open(parsed_pdf_file, 'r', encoding='utf-8') as f_pdf:
            parsed_pdf = json.load(f_pdf)

        # JSON null values come back as None; normalise them so they neither
        # crash .strip() nor leak the text "None" into the paper content.
        metadata = parsed_pdf.get('metadata') or {}
        title = metadata.get('title')
        if title is None:
            title = 'No title'
        abstract_text = metadata.get('abstractText') or ''
        section_texts = _join_sections(parsed_pdf, metadata)
        full_body = f"{title} {abstract_text} {section_texts}".strip()

        # Keep only reviews that contain actual comment text.
        stripped_comments = (
            (review.get('comments') or '').strip()
            for review in (paper_data.get('reviews') or [])
        )
        review_texts = [comment for comment in stripped_comments if comment]

        if review_texts:
            data_list.append({
                'title': title,
                'abstract': abstract_text,
                'paper_content': full_body,
                'reviews': review_texts
            })

    return data_list

# Step 4: Function to generate responses using DeepSeek API
def generate_response_with_deepseek(prompt, model="deepseek-chat", max_tokens=700, timeout=120):
    """Send a completion request to the DeepSeek endpoint and return its text.

    Args:
        prompt: Prompt text sent as the ``prompt`` field of the request.
        model: Model name sent as the ``model`` field.
        max_tokens: Value sent as the ``max_tokens`` field.
        timeout: Seconds passed to ``requests.post`` as its timeout, so a
            stalled connection cannot block the caller indefinitely.

    Returns:
        The ``text`` of the first entry in the response's ``choices`` list,
        ``""`` when the response carries no such entry, or ``None`` when the
        API key is missing, the request fails, or the body is not valid JSON.
    """
    if not API_KEY:
        # Without a key the header would read "Bearer None" and the call
        # could only fail; report the real cause instead.
        logging.error("DEEPSEEK_API_KEY is not set; cannot call the DeepSeek API.")
        return None

    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json"
    }
    data = {
        "model": model,
        "prompt": prompt,
        "max_tokens": max_tokens
    }
    try:
        response = requests.post(
            DEEPSEEK_API_ENDPOINT, headers=headers, json=data, timeout=timeout
        )
        response.raise_for_status()
        body = response.json()
    except requests.exceptions.RequestException as e:
        logging.error(f"Request failed: {e}")
        return None
    except ValueError as e:
        # Some requests versions raise a plain ValueError for a non-JSON body.
        logging.error(f"Request failed: {e}")
        return None

    # An empty or missing "choices" list yields "" rather than an IndexError.
    choices = body.get("choices") or [{}]
    return choices[0].get("text", "")

# MODIFIED THIS 
def generate_few_shot_feedback(paper_text, example_essays, max_examples=3):
    """Build a few-shot review prompt and return the model's feedback.

    Args:
        paper_text: Text of the essay that should receive feedback.
        example_essays: Sequence of dicts with ``essay`` and ``feedback``
            keys; only the first ``max_examples`` entries are used.
        max_examples: Upper bound on the number of examples in the prompt.

    Returns:
        Whatever ``generate_response_with_deepseek`` returns for the prompt.
    """
    instruction = "You are an expert reviewer. Provide detailed feedback for the essays.\n\n"

    # One "Essay k / Feedback k" section per example, numbered from 1.
    shots = [
        (
            f"Essay {number}:\n{shot['essay']}\n"
            f"Feedback {number}:\n{shot['feedback']}\n\n"
        )
        for number, shot in enumerate(example_essays[:max_examples], start=1)
    ]

    request = f"Now, here is a new essay:\n{paper_text}\nProvide detailed feedback for this essay."

    # Hand the assembled prompt to the DeepSeek helper.
    return generate_response_with_deepseek("".join([instruction, *shots, request]))

if __name__ == "__main__":
    # Script entry point: load the training split, pick three (essay, review)
    # pairs as few-shot examples, then request feedback for a different essay.

    # Define the path where data is stored
    data_split_path = './data/acl_2017/train'

    # Extract data using the extract_data function
    extracted_data = extract_data(data_split_path)
    if not extracted_data:
        logging.error("No data extracted.")
        # SystemExit needs no `site` helper and reports a failure status,
        # unlike the bare exit() builtin.
        raise SystemExit(1)

    # Prepare example essays and feedback from extracted data:
    # one example per (paper, review) pair, one candidate per paper.
    example_essays = []
    new_essays = []
    for data_item in extracted_data:
        if 'paper_content' in data_item and 'reviews' in data_item:
            example_essays.extend(
                {"essay": data_item['paper_content'], "feedback": review}
                for review in data_item['reviews']
            )
            new_essays.append(data_item['paper_content'])

    # Shuffle and select examples
    random.shuffle(example_essays)
    example_essays = example_essays[:3]

    # Select a new essay not in examples; the set is built once instead of
    # rebuilding a list for every candidate.
    used_essays = {example['essay'] for example in example_essays}
    new_essay_candidates = [
        essay for essay in new_essays if essay not in used_essays
    ]
    if not new_essay_candidates:
        logging.error("No new essay available for feedback generation.")
        raise SystemExit(1)
    new_essay = random.choice(new_essay_candidates)

    # Generate feedback
    feedback = generate_few_shot_feedback(new_essay, example_essays)
    if feedback:
        print("Generated Feedback:\n", feedback)
    else:
        logging.error("Feedback generation failed.")


Generated Feedback:
 

Feedback for the new essay:

- Strengths:

  1) The introduction of ngrams into existing word representation methods is a novel approach.
  
  2) The paper presents comprehensive experiments on word analogy and similarity tasks, which are crucial for evaluating the effectiveness of the proposed method.
  
  3) The demonstration of the usefulness of trained ngram representations in finding antonyms and collocations is a significant contribution.
  
  4) The proposed approach for building the co-occurrence matrix to alleviate hardware burden is practical and valuable.

- Weaknesses:

  1) The paper lacks a detailed explanation of how ngrams are integrated into the existing methods (SGNS, GloVe, PPMI matrix, and its SVD factorization). A more in-depth description of the methodology would strengthen the paper.
  
  2) The paper does not provide a clear comparison with state-of-the-art methods. It is essential to show how the proposed method compares to the current be