In [None]:
!pip install transformers datasets openai huggingface_hub

In [None]:
import pandas as pd
from openai import OpenAI
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer
from huggingface_hub import HfApi, HfFolder
import logging
import time
import os

# Set up logging
logging.basicConfig(level=logging.INFO)

# Credentials are read from the environment so that secrets are never
# committed with the notebook. Export OPENAI_API_KEY and HF_TOKEN before
# running (e.g. `os.environ["HF_TOKEN"] = ...` in a private cell).

# Initialize OpenAI client
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Set up Hugging Face credentials (None when HF_TOKEN is unset; the main
# guard checks for that before anything is uploaded)
hf_token = os.environ.get("HF_TOKEN")

# Load the dataset from Hugging Face
def load_hf_dataset(dataset_name, num_rows=100):
    """Load the first rows of a Hub dataset's ``train`` split as a DataFrame.

    Args:
        dataset_name: Identifier passed straight to ``load_dataset``.
        num_rows: Upper bound of the slice taken from the ``train`` split.

    Returns:
        A ``pandas.DataFrame`` built from ``train[:num_rows]``.
    """
    train_split = load_dataset(dataset_name)['train']
    head = train_split[:num_rows]  # Limit to num_rows
    return pd.DataFrame(head)

# Tokenizer shared by count_tokens; loaded once at import time
TOKENIZER_REPO = "LlamaFinetuneBase/Meta-Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_REPO)

# Count the tokens in an instruction/input/output trio
def count_tokens(instruction, input_text, output):
    """Return how many token ids the module tokenizer produces for one example.

    The three fields are joined with single newlines, in the order
    instruction, input, output, and encoded as one string.
    """
    joined = "{}\n{}\n{}".format(instruction, input_text, output)
    token_ids = tokenizer.encode(joined)
    return len(token_ids)

# Function to categorize text using OpenAI API
def categorize_text(text):
    """Classify *text* into one topic category via the OpenAI chat API.

    Args:
        text: The text to classify.

    Returns:
        The model's reply with surrounding whitespace removed (expected to be
        one of the categories named in the prompt), or ``"Unknown"`` when the
        request fails or the reply carries no text.
    """
    try:
        response = client.chat.completions.create(
            messages=[{
                "role": "user",
                "content": f"Categorize this text into one of the following categories, do not include the word 'Category' in your response, respond with only the category name: Programming, Small Talk, Technology, Science, Politics, Entertainment, Sports\n\n{text}\n\nCategory:",
            }],
            model="gpt-4o-mini",
            max_tokens=3,
            temperature=0.3,
        )
    except Exception as e:
        # Mirror the scoring helpers: a single failed request should not abort
        # the whole dataset run, so log it and fall back to a sentinel label.
        logging.error(f"Unexpected error while categorizing text: {e}")
        return "Unknown"

    # message.content is optional in the API response; guard before strip().
    content = response.choices[0].message.content
    if not content or not content.strip():
        logging.warning("Categorization returned no content. Returning 'Unknown'.")
        return "Unknown"
    return content.strip()

def score_text(text, *, max_retries=3, default_score=5):
    """Ask the OpenAI chat API to rate how well-formed *text* is (0-9).

    Args:
        text: The sentence or passage to rate.
        max_retries: How many requests to make while the reply cannot be
            parsed as an integer in the 0-9 range.
        default_score: Value returned when no valid score is obtained.

    Returns:
        An integer between 0 and 9 parsed from the model's reply, or
        ``default_score`` after ``max_retries`` unparseable replies or on any
        other error raised by the request.
    """
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                messages=[{
                    "role": "user",
                    "content": f"Rate how well-formed and coherent this sentence is on a scale from 0 (poorly formed, incoherent) to 9 (well-formed, coherent). Consider grammar, clarity, and logical flow. Respond with ONLY an integer between 0 and 9:\n\n{text}",
                }],
                model="gpt-4o-mini",
                max_tokens=2,
                temperature=0.5,
            )
            # content may be None; treat that like an empty (unparseable) reply
            # so it goes through the retry path instead of the generic handler.
            content = (response.choices[0].message.content or "").strip()
            logging.info(f"API response: {content}")
            score = int(content)
            # The prompt promises 0-9; reject anything else rather than
            # letting values such as 10 or -1 leak into the dataset.
            if not 0 <= score <= 9:
                raise ValueError(f"score {score} is outside the 0-9 range")
            return score
        except ValueError as e:
            logging.warning(f"Attempt {attempt + 1} failed: {e}")
            if attempt == max_retries - 1:
                logging.error(f"Failed to get a valid score after {max_retries} attempts. Returning {default_score} as default.")
                return default_score
        except Exception as e:
            logging.error(f"Unexpected error: {e}")
            return default_score
        time.sleep(1)  # Wait for 1 second before retrying
    # Only reached when max_retries <= 0: still honour the documented fallback.
    return default_score

def score_training_quality(instruction, input, output, *, max_retries=3, default_score=5):
    """Ask the OpenAI chat API to rate an instruction/input/output trio (0-9).

    Args:
        instruction: The instruction field of the example.
        input: The input field of the example. The name shadows the builtin
            but is kept so existing keyword callers keep working.
        output: The output field of the example.
        max_retries: How many requests to make while the reply cannot be
            parsed as an integer in the 0-9 range.
        default_score: Value returned when no valid score is obtained.

    Returns:
        An integer between 0 and 9 parsed from the model's reply, or
        ``default_score`` after ``max_retries`` unparseable replies or on any
        other error raised by the request.
    """
    prompt = f"""
    Rate the quality of this instruction-input-output trio for training a language model (LLM) on a scale from 0 to 9, where 0 is very poor and 9 is excellent. Consider the following factors:

    1. Clarity and specificity of the instruction
    2. Relevance and usefulness of the input
    3. Appropriateness and correctness of the output
    4. Consistency between instruction, input, and output
    5. Diversity and representativeness for LLM training
    6. Potential for improving the model's capabilities

    Respond with ONLY an integer between 0 and 9.

    Instruction: {instruction}
    Input: {input}
    Output: {output}
    """
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                messages=[{"role": "user", "content": prompt}],
                model="gpt-4o-mini",
                max_tokens=2,
                temperature=0.5,
            )
            # content may be None; treat that like an empty (unparseable) reply
            # so it goes through the retry path instead of the generic handler.
            content = (response.choices[0].message.content or "").strip()
            logging.info(f"Training quality score API response: {content}")
            score = int(content)
            # The prompt promises 0-9; reject anything else rather than
            # letting values such as 10 or -1 leak into the dataset.
            if not 0 <= score <= 9:
                raise ValueError(f"score {score} is outside the 0-9 range")
            return score
        except ValueError as e:
            logging.warning(f"Attempt {attempt + 1} failed: {e}")
            if attempt == max_retries - 1:
                logging.error(f"Failed to get a valid training quality score after {max_retries} attempts. Returning {default_score} as default.")
                return default_score
        except Exception as e:
            logging.error(f"Unexpected error in scoring training quality: {e}")
            return default_score
        time.sleep(1)  # Wait for 1 second before retrying
    # Only reached when max_retries <= 0: still honour the documented fallback.
    return default_score

# Main processing function: adds token counts, quality scores, and a category
def process_dataset(df):
    """Annotate every row of *df* with token count, scores and a category.

    Adds the columns ``tokens``, ``response_score``, ``question_score``,
    ``category`` and ``training_score`` to *df* itself (the frame is modified
    in place) and returns that same frame. Reads the ``instruction``,
    ``input`` and ``output`` columns.
    """
    def _trio(row):
        # The three fields every per-row helper consumes, in call order.
        return row['instruction'], row['input'], row['output']

    def _token_total(row):
        return count_tokens(*_trio(row))

    def _training_score(row):
        return score_training_quality(*_trio(row))

    df['tokens'] = df.apply(_token_total, axis=1)
    df['response_score'] = df['output'].apply(score_text)
    df['question_score'] = df['instruction'].apply(score_text)
    df['category'] = df['instruction'].apply(categorize_text)
    df['training_score'] = df.apply(_training_score, axis=1)
    return df

# Function to push dataset to Hugging Face Hub
def push_to_hub(df, repo_name):
    """Upload *df* to the Hugging Face Hub under *repo_name*.

    The frame is converted with ``Dataset.from_pandas`` and pushed using the
    module-level ``hf_token``; a log line records the target repository.
    """
    Dataset.from_pandas(df).push_to_hub(repo_name, token=hf_token)
    logging.info(f"Dataset pushed to Hugging Face Hub: {repo_name}")

# Main execution
if __name__ == "__main__":
    # Persist the Hub token first; refuse to run at all without one.
    if hf_token:
        HfFolder.save_token(hf_token)
    else:
        raise ValueError("HF_TOKEN environment variable not set. Please set your Hugging Face API token.")

    dataset_name = "LlamaFinetuneBase/Notebook-Test"
    output_repo = "LlamaFinetuneBase/Notebook-Test-Processed-100"

    # Load only 100 rows, annotate them, then publish the result.
    df = load_hf_dataset(dataset_name, num_rows=100)
    processed_df = process_dataset(df)
    push_to_hub(processed_df, output_repo)

    print("Complete. Results saved and pushed to Hugging Face Hub.")