In [None]:
# CSV Metadata Enrichment with FLAN-T5
# This notebook processes a CSV file with title, date, question, and answer columns
# and generates enriched metadata using FLAN-T5 Large

import pandas as pd
import numpy as np
import re
import json
import time
from tqdm.notebook import tqdm
# from google.colab import files
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM, pipeline

# Report CUDA availability once up front; the device name is only queried
# when a CUDA device is actually present.
_cuda_ready = torch.cuda.is_available()
print(f"GPU available: {_cuda_ready}")
if _cuda_ready:
    _gpu_name = torch.cuda.get_device_name(0)
    print(f"GPU model: {_gpu_name}")

# Step 1: Obtain the CSV file
# In Google Colab the file is uploaded through the browser widget. Anywhere
# else (local Jupyter, plain Python) the google.colab package does not exist,
# so fall back to asking for a path on disk instead of failing on import.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    print("\n📂 Please upload your CSV file containing the title, date, question, and answer columns")
    uploaded = files.upload()
    if not uploaded:
        raise ValueError("No file was uploaded; cannot continue without a CSV file.")
    # Only the first uploaded file is processed.
    file_name = next(iter(uploaded))
else:
    print("\n📂 google.colab is not available; reading the CSV file from a local path instead")
    file_name = input("Path to your CSV file (title, date, question, answer columns): ").strip()
    if not file_name:
        raise ValueError("No CSV path was provided; cannot continue without a CSV file.")

# NOTE(review): a stray `load_dataset("json", ...)` call used to follow here.
# `load_dataset` is never imported in this file and its result was never used,
# so the call could only raise NameError; it has been dropped.

# Step 2: Read the CSV into a DataFrame and show a short preview
df = pd.read_csv(file_name)
print(f"\n✅ Successfully loaded data with {len(df)} rows")
print("\n📊 Sample of your data:")
display(df.head(3))

# Column check: report (but do not abort on) any expected column that is absent
required_columns = ['title', 'date', 'question', 'answer']
missing_columns = [name for name in required_columns if name not in df.columns]
if not missing_columns:
    print("\n✅ All required columns present")
else:
    print(f"\n⚠️ Warning: Missing required columns: {', '.join(missing_columns)}")
    print("Please ensure your CSV has 'title', 'date', 'question', and 'answer' columns.")

# Step 3: Install necessary libraries
# pip is invoked through the running interpreter so the packages land in the
# environment this kernel/script actually uses, and so the file stays valid
# Python outside IPython (a `!pip ...` shell escape is IPython-only syntax).
# NOTE(review): transformers is already imported at the top of this file, so
# on a fresh environment this install has to be run before those imports.
import subprocess
import sys

print("\n📦 Installing required libraries...")
_pip_result = subprocess.run(
    [sys.executable, "-m", "pip", "install", "-q", "transformers", "sentencepiece"],
    check=False,
)
if _pip_result.returncode != 0:
    # Best effort: the packages may already be installed (e.g. offline use),
    # so report the failure and carry on rather than aborting the run.
    print(f"⚠️ pip install exited with code {_pip_result.returncode}; continuing with the installed packages")

# Step 4: Initialize the FLAN-T5 Large model
print("\n🤖 Loading FLAN-T5 Large model...")
model_name = "google/flan-t5-large"

def load_model():
    """Build a text2text-generation pipeline for the checkpoint in ``model_name``.

    Loads the tokenizer and the seq2seq model, moves the model to CUDA when it
    is available (CPU otherwise), and wraps both in a ``transformers`` pipeline
    whose default ``max_length`` is 200.

    Returns:
        The pipeline object created by ``transformers.pipeline``.
    """
    use_cuda = torch.cuda.is_available()

    seq2seq_tokenizer = AutoTokenizer.from_pretrained(model_name)
    seq2seq_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    # Place the weights on the GPU when there is one
    seq2seq_model.to("cuda" if use_cuda else "cpu")

    # The pipeline takes a device index: 0 for the first GPU, -1 for CPU
    return pipeline(
        "text2text-generation",
        model=seq2seq_model,
        tokenizer=seq2seq_tokenizer,
        device=0 if use_cuda else -1,
        max_length=200
    )

# Instantiate the pipeline once at module level; get_llm_response calls it via `llm`
llm = load_model()
_runtime_device = 'GPU' if torch.cuda.is_available() else 'CPU'
print(f"\n✅ Model loaded and running on {_runtime_device}")

# Step 5: Create functions for metadata extraction
def extract_year(date_str):
    """Return the first standalone four-digit year (19xx or 20xx) in ``date_str``.

    Args:
        date_str: Any value; it is converted with ``str()`` before searching.

    Returns:
        int: The matched year. When the text contains no standalone 19xx/20xx
        number, or the value cannot be converted to text, the current calendar
        year is returned instead.
    """
    import datetime

    # Only the str() conversion can realistically fail, so keep the try narrow.
    try:
        text = str(date_str)
    except Exception as e:
        print(f"Error extracting year: {e}")
        text = ""

    # \b on both sides keeps longer digit runs such as "12019" from matching.
    year_match = re.search(r'\b(19|20)\d{2}\b', text)
    if year_match:
        return int(year_match.group(0))

    # Single fallback for every failure mode (previously the error path
    # returned a hard-coded 2023 while the no-match path used the current year).
    return datetime.datetime.now().year

def get_llm_response(prompt, max_length=100, temperature=0.2, num_return_sequences=1):
    """Run ``prompt`` through the module-level ``llm`` pipeline and return its text.

    The call is attempted up to three times, waiting two seconds between
    attempts. Failures never propagate to the caller.

    Args:
        prompt: Text passed to the pipeline.
        max_length: Forwarded to the pipeline as ``max_length``.
        temperature: Sampling temperature. A value greater than 0 enables
            sampling; ``None`` or a value <= 0 selects non-sampling decoding.
        num_return_sequences: Forwarded to the pipeline. Only the first
            returned sequence is used.

    Returns:
        str: The stripped ``generated_text`` of the first result, or the
        literal string "Could not generate response" when every attempt fails.
    """
    max_retries = 3

    generation_kwargs = {
        "max_length": max_length,
        "num_return_sequences": num_return_sequences,
    }
    # Hugging Face generation only applies `temperature` when sampling is on;
    # without do_sample=True the value is ignored and decoding stays greedy.
    if temperature is not None and temperature > 0:
        generation_kwargs["do_sample"] = True
        generation_kwargs["temperature"] = temperature

    for attempt in range(1, max_retries + 1):
        try:
            result = llm(prompt, **generation_kwargs)
            return result[0]['generated_text'].strip()
        except Exception as e:
            # Deliberately broad: callers rely on always getting a string back.
            if attempt < max_retries:
                print(f"Retry {attempt}/{max_retries}: {e}")
                time.sleep(2)  # Wait before retrying
            else:
                print(f"Error with LLM: {e}")

    return "Could not generate response"

def determine_setting(title, question, answer):
    """Ask the model to name the setting or format in which the exchange takes place.

    The title, question and answer are clipped to 100, 200 and 300 characters
    respectively before being placed in the prompt; falsy values become "".

    Returns:
        The model's reply with surrounding quotes, punctuation and spaces removed.
    """
    # Clip each field so the prompt stays a manageable size
    title_short = (title or "")[:100]
    question_short = (question or "")[:200]
    answer_short = (answer or "")[:300]

    # Few-shot style prompt listing example settings
    prompt = f"""
    Task: Analyze the content and determine the most appropriate setting or format where this conversation or exchange would take place.

    Examples of settings include:
    - "MIT Lecture" (academic lecture at a university)
    - "Television Interview" (TV show interview)
    - "Oxford Debate" (formal debate setting)
    - "Panel Discussion" (experts discussing at a conference)
    - "Political Rally" (political speech or campaign event)
    - "Podcast Discussion" (audio format conversation)
    - "Classroom Seminar" (educational setting)
    - "Press Conference" (official statements to media)
    - "Radio Show" (audio broadcast)
    - "Online Forum" (internet discussion)

    Content to analyze:
    TITLE: {title_short}
    QUESTION: {question_short}
    ANSWER EXCERPT: {answer_short}

    Based on the tone, formality, style, and content, what is the most appropriate setting for this exchange?
    Respond with only the setting name (2-4 words):
    """

    raw_setting = get_llm_response(prompt, max_length=50, temperature=0.1)

    # Trim wrapping quotes, punctuation and spaces from the reply
    return raw_setting.strip('"\'.,;: ')

def determine_topic(title, question, answer):
    """Ask the model for the main topic or subject matter of the exchange.

    The title, question and answer are clipped to 100, 200 and 300 characters
    respectively before being placed in the prompt; falsy values become "".

    Returns:
        The model's reply with surrounding quotes, punctuation and spaces removed.
    """
    # Clip each field so the prompt stays a manageable size
    title_short = (title or "")[:100]
    question_short = (question or "")[:200]
    answer_short = (answer or "")[:300]

    # Few-shot style prompt listing example topics
    prompt = f"""
    Task: Analyze the content and determine the main topic or subject matter being discussed.

    Examples of topics include:
    - "Syntax" (linguistic structure)
    - "Semantics" (meaning in language)
    - "Politics" (political systems or events)
    - "Foreign Policy" (international relations)
    - "Cognitive Science" (study of mind and intelligence)
    - "Media Critique" (analysis of media)
    - "neoliberalism" (economic ideology)
    - "immigration" (movement of people between countries)
    - "community" (social groups and organization)
    - "Vietnam" (country or Vietnam War)
    - "Iraq" (country or Iraq War)
    - "India" (country or Indian politics/culture)
    - "United States" (country or US politics)

    Content to analyze:
    TITLE: {title_short}
    QUESTION: {question_short}
    ANSWER EXCERPT: {answer_short}

    Based on the content, what is the main topic being discussed?
    Respond with only the topic (1-3 words):
    """

    raw_topic = get_llm_response(prompt, max_length=50, temperature=0.1)

    # Trim wrapping quotes, punctuation and spaces from the reply
    return raw_topic.strip('"\'.,;: ')

def determine_persona(title, question, answer):
    """Ask the model what kind of person or role would give this answer.

    The title, question and answer are clipped to 100, 200 and 300 characters
    respectively before being placed in the prompt; falsy values become "".

    Returns:
        The model's reply with surrounding quotes, punctuation and spaces removed.
    """
    # Clip each field so the prompt stays a manageable size
    title_short = (title or "")[:100]
    question_short = (question or "")[:200]
    answer_short = (answer or "")[:300]

    # Few-shot style prompt listing example personas
    prompt = f"""
    Task: Analyze the content and determine what type of person or professional role would typically give this kind of answer.

    Examples of personas include:
    - "Professor" (academic expert)
    - "Debater" (skilled in formal arguments)
    - "Political Activist" (advocate for political causes)
    - "Cognitive Scientist" (researcher of mind and intelligence)
    - "Journalist" (news reporter or analyst)
    - "Policy Expert" (specialist in public policy)
    - "Historian" (expert in historical events)
    - "Economist" (specialist in economics)
    - "Political Candidate" (person running for office)
    - "Cultural Critic" (analyst of cultural trends)

    Content to analyze:
    TITLE: {title_short}
    QUESTION: {question_short}
    ANSWER EXCERPT: {answer_short}

    Based on the tone, expertise level, perspective, and content, what persona would likely give this answer?
    Respond with only the persona type (1-3 words):
    """

    raw_persona = get_llm_response(prompt, max_length=50, temperature=0.1)

    # Trim wrapping quotes, punctuation and spaces from the reply
    return raw_persona.strip('"\'.,;: ')

def generate_instruction(title, setting, topic, persona):
    """Ask the model to write a short instruction for an AI assistant from the metadata.

    Args:
        title: Source title; only the first 100 characters are used, and a
            falsy value (``None`` or "") is treated as an empty title.
        setting: Setting label interpolated into the prompt.
        topic: Topic label interpolated into the prompt.
        persona: Persona label interpolated into the prompt.

    Returns:
        str: Whatever ``get_llm_response`` returns for the prompt
        (requested with ``max_length=100`` and ``temperature=0.3``).
    """
    # Guard against a missing title the same way the determine_* helpers do;
    # slicing None directly would raise TypeError.
    title_short = title[:100] if title else ""

    prompt = f"""
    Task: Create a clear, concise instruction for an AI assistant based on these parameters:

    TITLE: {title_short}
    SETTING: {setting}
    TOPIC: {topic}
    PERSONA: {persona}

    Write an instruction that tells the AI to respond as if it were a {persona} in a {setting} discussing {topic}.
    The instruction should be clear, specific, and 1-2 sentences long.
    Begin with "Respond as a..." or similar directive phrase:
    """

    return get_llm_response(prompt, max_length=100, temperature=0.3)

# Step 6: Process the data
def _cell_text(row, column):
    """Return the row's value for ``column`` as text; absent or NaN cells become ''."""
    value = row.get(column, '')
    # pandas reads empty CSV cells as NaN, and str(NaN) is the literal text
    # "nan", which would otherwise be sent to the model and saved as data.
    if pd.isna(value):
        return ''
    return str(value)


print("\n🔍 Processing the first 10 rows to extract metadata...")
rows_to_process = min(10, len(df))
enriched_data = []

for i in tqdm(range(rows_to_process)):
    row = df.iloc[i]

    # Extract raw data, with missing cells normalised to empty strings
    title = _cell_text(row, 'title')
    date_str = _cell_text(row, 'date')
    question = _cell_text(row, 'question')
    answer = _cell_text(row, 'answer')

    # Extract year from date
    year = extract_year(date_str)

    # Generate metadata, echoing each field as soon as it is produced
    print(f"\n📝 Processing row {i+1}/{rows_to_process}: '{title[:50]}...'")

    setting = determine_setting(title, question, answer)
    print(f"  Setting: {setting}")

    topic = determine_topic(title, question, answer)
    print(f"  Topic: {topic}")

    persona = determine_persona(title, question, answer)
    print(f"  Persona: {persona}")

    # The instruction prompt is built from the three labels generated above
    instruction = generate_instruction(title, setting, topic, persona)
    print(f"  Instruction: {instruction}")

    # Create enriched entry
    enriched_entry = {
        "year": year,
        "setting": setting,
        "topic": topic,
        "persona": persona,
        "instruction": instruction,
        "input": question,
        "output": answer
    }

    enriched_data.append(enriched_entry)

    # Short pause between rows
    time.sleep(0.5)

# Step 7: Persist the enriched records to disk in two formats
print("\n💾 Saving enriched data...")

output_file_json = "enriched_data.json"
output_file_csv = "enriched_data.csv"

# JSON copy, indented by two spaces for readability
with open(output_file_json, 'w') as json_out:
    json.dump(enriched_data, json_out, indent=2)

# CSV copy without the DataFrame index column
enriched_df = pd.DataFrame(enriched_data)
enriched_df.to_csv(output_file_csv, index=False)

print(f"\n✅ Data saved to {output_file_json} and {output_file_csv}")

# Step 8: Download the files
# Browser downloads are a Google Colab feature; outside Colab the
# google.colab package is missing, so the files simply stay on local disk.
print("\n📥 Preparing files for download...")
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    files.download(output_file_json)
    files.download(output_file_csv)

# Display sample of the enriched data
print("\n📊 Sample of enriched data:")
display(enriched_df.head(3))

# Only claim a download when one actually happened
if files is not None:
    print("\n🎉 Processing complete! Files have been downloaded to your computer.")
else:
    print(f"\n🎉 Processing complete! Files were saved locally as {output_file_json} and {output_file_csv}.")

# Optional: Functions to process more data if needed
def process_additional_data(start_idx, num_rows):
    """Enrich a further slice of the module-level DataFrame ``df``.

    Args:
        start_idx: Zero-based position of the first row to process.
        num_rows: Maximum number of rows to process; the slice is cut off at
            the end of ``df``.

    Returns:
        list[dict]: One entry per processed row with the keys ``year``,
        ``setting``, ``topic``, ``persona``, ``instruction``, ``input`` and
        ``output``.
    """
    def _text(row, column):
        # pandas reads empty CSV cells as NaN, and str(NaN) is the literal
        # text "nan"; map absent or NaN cells to '' instead.
        value = row.get(column, '')
        return '' if pd.isna(value) else str(value)

    end_idx = min(start_idx + num_rows, len(df))
    print(f"\n🔄 Processing rows {start_idx+1} to {end_idx}")

    additional_data = []

    for i in tqdm(range(start_idx, end_idx)):
        row = df.iloc[i]

        # Extract raw data, with missing cells normalised to empty strings
        title = _text(row, 'title')
        date_str = _text(row, 'date')
        question = _text(row, 'question')
        answer = _text(row, 'answer')

        # Extract year from date
        year = extract_year(date_str)

        # Generate metadata; the instruction is built from the three labels
        setting = determine_setting(title, question, answer)
        topic = determine_topic(title, question, answer)
        persona = determine_persona(title, question, answer)
        instruction = generate_instruction(title, setting, topic, persona)

        # Create enriched entry
        additional_data.append({
            "year": year,
            "setting": setting,
            "topic": topic,
            "persona": persona,
            "instruction": instruction,
            "input": question,
            "output": answer
        })
        time.sleep(0.5)  # Small delay

    return additional_data

# To process additional rows, remove the surrounding triple quotes and run this code
"""
# Process next 10 rows
more_data = process_additional_data(10, 10)

# Add to existing data
enriched_data.extend(more_data)

# Save updated data
with open(output_file_json, 'w') as f:
    json.dump(enriched_data, f, indent=2)

# Update CSV
pd.DataFrame(enriched_data).to_csv(output_file_csv, index=False)

# Download updated files
files.download(output_file_json)
files.download(output_file_csv)
"""

ModuleNotFoundError: No module named 'google.colab'