In [6]:
# !pip install pandas numpy tqdm
# !pip install torch torchvision torchaudio
# !pip install transformers sentencepiece


In [7]:
import pandas as pd
import numpy as np
import re
import json
import time
from tqdm import tqdm  # Use `tqdm` for the terminal if needed
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Check for GPU availability
print(f"GPU available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU model: {torch.cuda.get_device_name(0)}")

# Step 1: Load the CSV file
file_name = "cleaned_2018Before.csv"
df = pd.read_csv(file_name)
print(f"\n✅ Successfully loaded data with {len(df)} rows")
print("\n📊 Sample of your data:")
print(df.head(3))  # Using print for local running

# Verify required columns exist
required_columns = ['title', 'date', 'question', 'answer']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    print(f"\n⚠️ Warning: Missing required columns: {', '.join(missing_columns)}")
    print("Please ensure your CSV has 'title', 'date', 'question', and 'answer' columns.")
else:
    print("\n✅ All required columns present")

# Step 3: Install and import necessary libraries
# If you haven't installed the transformers and sentencepiece libraries, run the following in your terminal:
# pip install transformers sentencepiece

# Step 4: Initialize the FLAN-T5 Large model
print("\n🤖 Loading FLAN-T5 Large model...")
model_name = "google/flan-t5-large"

def load_model():
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    # Move model to GPU if available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    # Create pipeline
    pipe = pipeline(
        "text2text-generation",
        model=model,
        tokenizer=tokenizer,
        device=0 if torch.cuda.is_available() else -1,
        max_length=200
    )
    return pipe

# Load model
llm = load_model()
print(f"\n✅ Model loaded and running on {'GPU' if torch.cuda.is_available() else 'CPU'}")

# Step 5: Create functions for metadata extraction
def extract_year(date_str):
    """Extract year from date string"""
    try:
        # Try to find a 4-digit number in the string
        year_match = re.search(r'\b(19|20)\d{2}\b', str(date_str))
        if year_match:
            return int(year_match.group(0))

        # If no match found, return current year
        import datetime
        return datetime.datetime.now().year
    except Exception as e:
        print(f"Error extracting year: {e}")
        return 2023  # Default year

def get_llm_response(prompt, max_length=100, temperature=0.2, num_return_sequences=1):
    """Get response from FLAN-T5 model"""
    try:
        # Add retry mechanism for robustness
        max_retries = 3
        for attempt in range(max_retries):
            try:
                result = llm(
                    prompt,
                    max_length=max_length,
                    temperature=temperature,
                    num_return_sequences=num_return_sequences
                )
                return result[0]['generated_text'].strip()
            except Exception as e:
                if attempt < max_retries - 1:
                    print(f"Retry {attempt+1}/{max_retries}: {e}")
                    time.sleep(2)  # Wait before retrying
                else:
                    raise e
    except Exception as e:
        print(f"Error with LLM: {e}")
        return "Could not generate response"

def determine_setting(title, question, answer):
    """Generate an appropriate setting for the content"""

    # Extract meaningful content and truncate to reasonable lengths
    title_short = title[:100] if title else ""
    question_short = question[:200] if question else ""
    answer_short = answer[:300] if answer else ""

    # Create a detailed prompt
    prompt = f"""
    Task: Analyze the content and determine the most appropriate setting or format where this conversation or exchange would take place.

    Examples of settings include:
    - "MIT Lecture" (academic lecture at a university)
    - "Television Interview" (TV show interview)
    - "Oxford Debate" (formal debate setting)
    - "Panel Discussion" (experts discussing at a conference)
    - "Political Rally" (political speech or campaign event)
    - "Podcast Discussion" (audio format conversation)
    - "Classroom Seminar" (educational setting)
    - "Press Conference" (official statements to media)
    - "Radio Show" (audio broadcast)
    - "Online Forum" (internet discussion)

    Content to analyze:
    TITLE: {title_short}
    QUESTION: {question_short}
    ANSWER EXCERPT: {answer_short}

    Based on the tone, formality, style, and content, what is the most appropriate setting for this exchange?
    Respond with only the setting name (2-4 words):
    """

    response = get_llm_response(prompt, max_length=50, temperature=0.1)

    # Clean up response (remove quotes, periods, etc.)
    response = response.strip('"\'.,;: ')

    return response

def determine_topic(title, question, answer):
    """Generate an appropriate topic for the content"""

    # Extract meaningful content and truncate to reasonable lengths
    title_short = title[:100] if title else ""
    question_short = question[:200] if question else ""
    answer_short = answer[:300] if answer else ""

    # Create a detailed prompt with clearer instructions
    prompt = f"""
    Task: Analyze the content and identify the MAIN SUBJECT MATTER being discussed.

    IMPORTANT: Respond ONLY with the general topic area (1-3 words).
    - Do NOT name specific people (like "Noam Chomsky")
    - Do NOT name specific books or works
    - Focus on the BROADER FIELD or SUBJECT AREA

    Examples of correct topic responses:
    - "Syntax" (for discussions of linguistic structure)
    - "Semantics" (for discussions about meaning in language)
    - "Political Philosophy" (for discussions of political theory)
    - "Foreign Policy" (for discussions about international relations)
    - "Media Analysis" (for critiques of media)
    - "Linguistics" (for discussions about language)
    - "Economic Theory" (for discussions about economic systems or ideologies)
    - "Immigration Policy" (for discussions about migration issues)
    - "Vietnam War" (for discussions about this specific conflict)
    - "Middle East Politics" (for discussions about politics in this region)
    - "Educational Theory" (for discussions about teaching and learning)

    Content to analyze:
    TITLE: {title_short}
    QUESTION: {question_short}
    ANSWER EXCERPT: {answer_short}

    What broad academic field or subject matter is this primarily about?
    Respond with ONLY the general topic (1-3 words, no explanation):
    """

    response = get_llm_response(prompt, max_length=50, temperature=0.1)

    # Clean up response
    response = response.strip('"\'.,;: ')

    # Additional filtering to remove person names if they still appear
    name_indicators = ["chomsky", "zinn", "foucault", "kissinger"]
    if any(name.lower() in response.lower() for name in name_indicators):
        # Try to extract a topic from the title if the response contains a person's name
        backup_prompt = f"""
        Based only on this title: "{title_short}"

        What general academic field or subject area (NOT a person's name) would this fall under?
        Choose from: Linguistics, Politics, International Relations, Philosophy, Media Studies,
        History, Economics, Sociology, Psychology, Literature, Military Studies, Education

        Respond with only 1-3 words:
        """
        response = get_llm_response(backup_prompt, max_length=20, temperature=0.1)

    return response

def determine_persona(title, question, answer):
    """Generate an appropriate persona for the content"""

    # Extract meaningful content and truncate to reasonable lengths
    title_short = title[:100] if title else ""
    question_short = question[:200] if question else ""
    answer_short = answer[:300] if answer else ""

    # Create a detailed prompt
    prompt = f"""
    Task: Analyze the content and determine what type of person or professional role would typically give this kind of answer.

    Examples of personas include:
    - "Professor" (academic expert)
    - "Debater" (skilled in formal arguments)
    - "Political Activist" (advocate for political causes)
    - "Cognitive Scientist" (researcher of mind and intelligence)
    - "Journalist" (news reporter or analyst)
    - "Policy Expert" (specialist in public policy)
    - "Historian" (expert in historical events)
    - "Economist" (specialist in economics)
    - "Political Candidate" (person running for office)
    - "Cultural Critic" (analyst of cultural trends)

    Content to analyze:
    TITLE: {title_short}
    QUESTION: {question_short}
    ANSWER EXCERPT: {answer_short}

    Based on the tone, expertise level, perspective, and content, what persona would likely give this answer?
    Respond with only the persona type (1-3 words):
    """

    response = get_llm_response(prompt, max_length=50, temperature=0.1)

    # Clean up response
    response = response.strip('"\'.,;: ')

    return response

def generate_instruction(title, setting, topic, persona):
    """Generate an instruction for an AI based on metadata"""

    prompt = f"""
    Task: Create a clear, concise instruction for an AI assistant based on these parameters:

    TITLE: {title[:100]}
    SETTING: {setting}
    TOPIC: {topic}
    PERSONA: {persona}

    Write an instruction that tells the AI to respond as if it were a {persona} in a {setting} discussing {topic}.
    The instruction should be clear, specific, and 1-2 sentences long.
    Begin with "Respond as a..." or similar directive phrase:
    """

    response = get_llm_response(prompt, max_length=100, temperature=0.3)
    return response

# Step 6: Process all rows to extract metadata
print("\n🔍 Processing all rows to extract metadata...")
enriched_data = []

for index, row in tqdm(df.iterrows(), total=len(df)):
    # Extract raw data
    title = str(row.get('title', ''))
    date_str = str(row.get('date', ''))
    question = str(row.get('question', ''))
    answer = str(row.get('answer', ''))

    # Extract year from date
    year = extract_year(date_str)

    # Generate metadata
    setting = determine_setting(title, question, answer)
    topic = determine_topic(title, question, answer)
    persona = determine_persona(title, question, answer)
    instruction = generate_instruction(title, setting, topic, persona)

    # Create enriched entry
    enriched_entry = {
        "year": year,
        "setting": setting,
        "topic": topic,
        "persona": persona,
        "instruction": instruction,
        "input": question,
        "output": answer
    }

    enriched_data.append(enriched_entry)
    time.sleep(0.5)  # Small delay

# Step 7: Save the enriched data
print("\n💾 Saving enriched data...")

# Save as JSON with pretty formatting
output_file_json = "enriched_data_before.json"
with open(output_file_json, 'w') as f:
    json.dump(enriched_data, f, indent=2)

# Save as CSV
output_file_csv = "enriched_data_before.csv"
enriched_df = pd.DataFrame(enriched_data)
enriched_df.to_csv(output_file_csv, index=False)

print(f"\n✅ Data saved to {output_file_json} and {output_file_csv}")

# Instead of downloading (as in Colab), inform the user where to find the files locally.
print("\n📥 The enriched data files have been saved to your current working directory.")
print("\n📊 Sample of enriched data:")
print(enriched_df.head(3))

print("\n🎉 Processing complete! You can now use the generated JSON and CSV files from your local directory.")

# Optional: Functions to process more data if needed
def process_additional_data(start_idx, num_rows):
    """Process additional rows from the dataset"""
    print(f"\n🔄 Processing rows {start_idx+1} to {min(start_idx+num_rows, len(df))}")

    additional_data = []
    end_idx = min(start_idx + num_rows, len(df))

    for i in tqdm(range(start_idx, end_idx)):
        row = df.iloc[i]

        # Extract raw data
        title = str(row.get('title', ''))
        date_str = str(row.get('date', ''))
        question = str(row.get('question', ''))
        answer = str(row.get('answer', ''))

        # Extract year from date
        year = extract_year(date_str)

        # Generate metadata
        setting = determine_setting(title, question, answer)
        topic = determine_topic(title, question, answer)
        persona = determine_persona(title, question, answer)
        instruction = generate_instruction(title, setting, topic, persona)

        # Create enriched entry
        enriched_entry = {
            "year": year,
            "setting": setting,
            "topic": topic,
            "persona": persona,
            "instruction": instruction,
            "input": question,
            "output": answer
        }

        additional_data.append(enriched_entry)
        time.sleep(0.5)  # Small delay

    return additional_data

# Uncomment the following block to process additional rows locally
"""
# Process next 10 rows
more_data = process_additional_data(10, 10)

# Add to existing data
enriched_data.extend(more_data)

# Save updated data
with open(output_file_json, 'w') as f:
    json.dump(enriched_data, f, indent=2)

# Update CSV
pd.DataFrame(enriched_data).to_csv(output_file_csv, index=False)
print(f"\n✅ Updated data saved to {output_file_json} and {output_file_csv}")
"""


GPU available: True
GPU model: NVIDIA GeForce RTX 3090 Ti

✅ Successfully loaded data with 7537 rows

📊 Sample of your data:
          title  date                                           question  \
0  Noam Chomsky  2017  How do you view your own contributions to the ...   
1  Noam Chomsky  2017  What's your perspective on the Chomsky hierarc...   
2  Noam Chomsky  2017  Could you elaborate on the concept of a univer...   

                                              answer  
0  Considered the founder of modern linguistics, ...  
1  I introduced the Chomsky hierarchy, which has ...  
2  The concept of a universal grammar underlies a...  

✅ All required columns present

🤖 Loading FLAN-T5 Large model...


Device set to use cuda:0



✅ Model loaded and running on GPU

🔍 Processing all rows to extract metadata...


 83%|████████▎ | 6285/7537 [2:29:14<27:53,  1.34s/it]  Token indices sequence length is longer than the specified maximum sequence length for this model (538 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 7537/7537 [2:55:08<00:00,  1.39s/it]



💾 Saving enriched data...

✅ Data saved to enriched_data_before.json and enriched_data_before.csv

📥 The enriched data files have been saved to your current working directory.

📊 Sample of enriched data:
   year      setting              topic              persona  \
0  2017  MIT Lecture        Linguistics            Professor   
1  2017   Radio Show         Philosophy  Cognitive Scientist   
2  2017  MIT Lecture  Universal grammar  Cognitive Scientist   

                                         instruction  \
0  Respond as a Professor in a MIT Lecture discus...   
1  Respond as a Cognitive Scientist in a Radio Sh...   
2  Respond as a Cognitive Scientist in a MIT Lect...   

                                               input  \
0  How do you view your own contributions to the ...   
1  What's your perspective on the Chomsky hierarc...   
2  Could you elaborate on the concept of a univer...   

                                              output  
0  Considered the founder of mode

'\n# Process next 10 rows\nmore_data = process_additional_data(10, 10)\n\n# Add to existing data\nenriched_data.extend(more_data)\n\n# Save updated data\nwith open(output_file_json, \'w\') as f:\n    json.dump(enriched_data, f, indent=2)\n\n# Update CSV\npd.DataFrame(enriched_data).to_csv(output_file_csv, index=False)\nprint(f"\n✅ Updated data saved to {output_file_json} and {output_file_csv}")\n'