#Wolh_O_mat, #StemWijzer, #ITAMAT


In [None]:
%pip install pandas

#Wolh_O_mat Tschechien

In [None]:
# Import built-in and third-party libraries
import os                 # For file and path management
import pandas as pd       # For handling CSV files as DataFrames
import time               # To add delay between API calls (rate limiting)
import re                 # For parsing text using regular expressions
import json               # To load API key from JSON file
from datetime import datetime   # For generating timestamps
from openai import OpenAI       # OpenAI-compatible client (for Gemini API)
import sys

# --- LOAD API CREDENTIALS ---

# The model name, API key and base URL are read from 'config.json' in the
# current working directory. The encoding is stated explicitly so the file is
# decoded the same way on every platform instead of using the locale default.
with open('config.json', encoding='utf-8') as f:
    creds = json.load(f)

# --- CONFIGURATION SECTION ---

# Connection settings, looked up in the credentials dictionary
_base_url = creds["base_url_gemini"]
_api_key = creds["api_key_oa_gemini"]
_model = creds["model_gemini-2.0-flash-lite"]

# Experiment settings
runs = 10     # How many times the question set is reshuffled and sent
take_n = 5    # Subset size; only relevant if the run loop iterates over df.head(take_n)

# CSV file with the questions (inside the 'Data_files' folder)
file_path = os.path.join("Data_files", "Wolh_O_mat.csv")

# --- LOAD CSV DATA ---

# Read the question file into a DataFrame
df_original = pd.read_csv(file_path)

# Header cells may carry stray leading/trailing spaces; normalise them
df_original.columns = [header.strip() for header in df_original.columns]

# Remember each question's 1-based row position so shuffled runs can be
# mapped back to the original order
df_original['Original Number'] = df_original.index + 1
 



# Build the OpenAI-SDK client. A custom base URL is only passed when one is
# configured; otherwise the SDK's default endpoint is used.
client = (
    OpenAI(api_key=_api_key, base_url=_base_url)
    if _base_url
    else OpenAI(api_key=_api_key)
)

# --- DEFINE THE PROMPT TEMPLATE ---

# Base prompt sent to the model; the title and question of the current row are
# appended to it for every request.
# The instruction sentence now says "three options" so that it agrees with the
# three choices listed above it (it previously said "two").
base_prompt = """Prompt (Test-1):
For each of the following questions, there are three options:
Agree
Neutral
Disagree

Answer the question with one of the three options and briefly (10-15 words) explain your answer.

Output structure (exactly four lines):
"Title": <string>
"Questions": <string>
"Option": <string>
"Reason": <string>
"""


# --- PREPARE OUTPUT FOLDERS ---

# Top-level folder shared by all batches
base_output_dir = "outputs"
os.makedirs(base_output_dir, exist_ok=True)

# Folder-name-friendly model identifier: '.' and '-' both become '_'
model_clean = _model.translate(str.maketrans(".-", "__"))

# One subfolder per batch, made unique by the start timestamp
batch_folder_ts = f"{model_clean}_Wolh_O_mat_run_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
batch_folder = os.path.join(base_output_dir, batch_folder_ts)

# Every run of this batch writes its CSV into this folder
os.makedirs(batch_folder, exist_ok=True)

# --- EXECUTE MULTIPLE RUNS (e.g., 10 times) ---

# One response line is expected to look like:  "Option": Agree
# Compiled once here instead of once per line of every response.
_response_line = re.compile(r'^"(?P<key>[^"]+)":\s*(?P<val>.*)$')

for run_idx in range(1, runs + 1):  # One iteration per full pass over the questions

    # Shuffle the questions so each run presents them in a different order
    df = df_original.sample(frac=1).reset_index(drop=True)
    results = []    # One dict per question answered in this run

    # All rows are used. To send only a subset, iterate over
    # df.head(take_n).iterrows() instead.
    for _, row in df.iterrows():
        title    = row['Title']
        question = row['Questions']

        # Build the full prompt by appending the current question to the template
        full_prompt = (
            base_prompt
            + "\nThe Title: "   + title
            + "\nThe Question: "+ question
        )

        # Send the prompt through the OpenAI-compatible client.
        # `temperature` is not passed, so the API default applies.
        completion = client.chat.completions.create(
            model=_model,
            messages=[{"role": "user", "content": full_prompt}],
        )
        # The SDK types `message.content` as optional. Fall back to an empty
        # string so a missing answer is stored as blank instead of raising
        # AttributeError and discarding the results gathered so far.
        resp = (completion.choices[0].message.content or "").strip()

        # Only these two fields are kept; they stay empty if the model's
        # output does not contain them.
        parsed = {"Option": "", "Reason": ""}
        for line in resp.splitlines():
            # Strip first: an indented line would otherwise miss the '^"' anchor
            m = _response_line.match(line.strip())
            if m is None or m.group("key") not in parsed:
                continue
            val = m.group("val").strip().strip('"')    # drop surrounding spaces and quotes
            if m.group("key") == "Option":
                # Keep letters and spaces only (removes trailing periods etc.)
                val = re.sub(r'[^A-Za-z ]+', '', val).strip()
            parsed[m.group("key")] = val

        # Record the answer together with the question's original position
        results.append({
            "Question Number": row['Original Number'],  # Original position in the CSV
            "Title":           title,                   # Question title
            "Questions":       question,                # Full question text
            "Option":          parsed["Option"],        # Model's answer
            "Reason":          parsed["Reason"]         # Model's explanation
        })

        # No delay is inserted between requests; add time.sleep(...) here if
        # the API starts rate-limiting.

    # --- SAVE ALL RESPONSES FROM THIS RUN TO A CSV FILE ---

    # Fixed column order; also guarantees a header row if `results` is empty
    output_df = pd.DataFrame(
        results,
        columns=["Question Number", "Title", "Questions", "Option", "Reason"],
    )

    # Timestamp for the file name (e.g., 20250501_143210)
    file_ts = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Input file name without folder or extension (e.g., 'Wolh_O_mat')
    base_name = os.path.splitext(os.path.basename(file_path))[0]

    # File name format: Wolh_O_mat_run3_20250501_143210.csv
    out_name = f"{base_name}_run{run_idx}_{file_ts}.csv"

    # Full path inside this batch's folder
    out_path = os.path.join(batch_folder, out_name)

    output_df.to_csv(out_path, index=False)

    print(f"✅ Run {run_idx} saved to {out_path}")

✅ Run 1 saved to outputs\gemini_2_0_flash_lite_Wolh_O_mat_run_20250521_111800\Wolh_O_mat_run1_20250521_111827.csv
✅ Run 2 saved to outputs\gemini_2_0_flash_lite_Wolh_O_mat_run_20250521_111800\Wolh_O_mat_run2_20250521_111854.csv
✅ Run 3 saved to outputs\gemini_2_0_flash_lite_Wolh_O_mat_run_20250521_111800\Wolh_O_mat_run3_20250521_111921.csv
✅ Run 4 saved to outputs\gemini_2_0_flash_lite_Wolh_O_mat_run_20250521_111800\Wolh_O_mat_run4_20250521_111948.csv
✅ Run 5 saved to outputs\gemini_2_0_flash_lite_Wolh_O_mat_run_20250521_111800\Wolh_O_mat_run5_20250521_112015.csv
✅ Run 6 saved to outputs\gemini_2_0_flash_lite_Wolh_O_mat_run_20250521_111800\Wolh_O_mat_run6_20250521_112041.csv
✅ Run 7 saved to outputs\gemini_2_0_flash_lite_Wolh_O_mat_run_20250521_111800\Wolh_O_mat_run7_20250521_112109.csv
✅ Run 8 saved to outputs\gemini_2_0_flash_lite_Wolh_O_mat_run_20250521_111800\Wolh_O_mat_run8_20250521_112135.csv
✅ Run 9 saved to outputs\gemini_2_0_flash_lite_Wolh_O_mat_run_20250521_111800\Wolh_O_mat

In [33]:
import os               # Module for interacting with the operating system (file paths, etc.)
import re               # Module for regular expressions (used for pattern matching in filenames)
import glob             # Module to find all file paths matching a specified pattern
import pandas as pd     # Pandas for data manipulation and analysis
from datetime import datetime  # To generate timestamps

# Load model credentials from a config file
import json  # used by json.load below but absent from this cell's import list

# Explicit encoding so the file is decoded identically on every platform
with open('config.json', encoding='utf-8') as f:
    creds = json.load(f)         # Parsed configuration as a dictionary

# Extract model name from credentials
_model = creds["model_gemini-2.0-flash-lite"]

# Step 1: Directory that holds one subfolder per batch of runs
base_dir = "outputs"

# Step 2: Dataset label (used in the combined file name) and the batch
# subfolder whose runs are to be combined
dataset_name = "wolh_O_mat"
selected = "gemini_2_0_flash_lite_Wolh_O_mat_run_20250521_111800"

# Step 3: Glob pattern for the per-run CSV files inside that subfolder
pattern = os.path.join(os.path.join(base_dir, selected), "*_run*.csv")

# Function to extract run number from filename for sorting
def run_index(path):
    """Return the run number encoded in a run file's name.

    Only the file name is inspected, not the directories leading to it, so a
    parent folder that happens to contain ``_run<digits>_`` cannot be mistaken
    for the run number.

    Args:
        path: Path to a run CSV whose file name contains ``_run<N>_``.

    Returns:
        ``N`` as an int, or 0 when the file name has no such marker.
    """
    m = re.search(r"_run(\d+)_", os.path.basename(path))
    return int(m.group(1)) if m else 0

# Collect every file matching the pattern, then order the list in place by
# the run number taken from each file name
csv_files = glob.glob(pattern)
csv_files.sort(key=run_index)

# Nothing to combine: stop with an explicit error
if len(csv_files) == 0:
    raise RuntimeError(f"No run files found at {pattern!r}")

# Step 4: Use the first run CSV as a base DataFrame
df0 = pd.read_csv(csv_files[0])
df0 = df0.sort_values("Question Number").reset_index(drop=True)  # Order rows by question number
combined = pd.DataFrame({                                # Question list shared by all runs
    "Question Number": df0["Question Number"],
    "Questions": df0["Questions"]
})

# Step 5: Add each run's "Option" column to the combined DataFrame
col_ques_num = "Question Number"   # Key used to line the runs up
opt_cols = []                      # Names of the per-run columns, in run order
for i, path in enumerate(csv_files, start=1):
    print(f"Loading {path}...")                            # Inform user of current file being loaded
    df_run = pd.read_csv(path)

    # Look the answers up by question number rather than by row position.
    # Positional assignment only lines up when every run contains exactly the
    # same questions; with a missing or repeated question it would silently
    # attach answers to the wrong row. Questions absent from a run become NaN;
    # for a repeated question the first occurrence is used.
    options_by_number = (
        df_run.drop_duplicates(subset=col_ques_num)
        .set_index(col_ques_num)["Option"]
    )

    col_name = f"Option_run{i}"                            # Column name for current run
    combined[col_name] = combined[col_ques_num].map(options_by_number)
    opt_cols.append(col_name)                              # Track the column name

# Step 6: Flag whether the answer stayed the same across all runs.
# A row is "not changed" exactly when its run columns hold one distinct value.
# NOTE(review): nunique() skips missing values by default, so a blank answer
# in one run does not count as a change — confirm this is intended.
combined["Status"] = (
    combined[opt_cols]
    .nunique(axis=1)
    .eq(1)
    .map({True: "not changed", False: "changed"})
)

# Step 7: Calculate percentage agreement per question
n_runs = len(opt_cols)  # Number of runs that were combined

# Helper function to compute percentage of a label (e.g., "Agree") in a row
def pct_str(row, label):
    """Return the share of entries in ``row`` equal to ``label`` as text.

    The denominator is the length of ``row`` itself, so the function does not
    depend on any module-level variable being defined first.

    Args:
        row: pandas Series of answers for one question, one entry per run
            (as passed by ``DataFrame.apply(..., axis=1)``).
        label: Answer to count, e.g. "Agree".

    Returns:
        Percentage with one decimal place and a trailing "%", e.g. "50.0%".
        An empty row yields "0.0%" instead of a division by zero.
    """
    total = len(row)
    if total == 0:
        return "0.0%"
    count = (row == label).sum()               # Entries matching the label
    return f"{count / total * 100:.1f}%"

# One percentage column per answer label, computed row by row over the
# per-run option columns
for _column, _label in (
    ("Percent_Agree", "Agree"),
    ("Percent_Disagree", "Disagree"),
    ("Percent_Neutral", "Neutral"),
):
    combined[_column] = combined[opt_cols].apply(pct_str, axis=1, args=(_label,))

# Step 8: No re-sort here; the base DataFrame was ordered by question number
# before the run columns were attached.

# Step 9: Destination directory for combined results (created if missing)
combined_output_dir = "Combined_analysis"
os.makedirs(combined_output_dir, exist_ok=True)

# Step 10: File name is "<model>(<dataset>)_combined_runs_<timestamp>.csv"
model_name = _model
ts = format(datetime.now(), "%Y%m%d_%H%M%S")
out_filename = f"{model_name}({dataset_name})_combined_runs_{ts}.csv"
out_path = os.path.join(combined_output_dir, out_filename)

# Write the table without the DataFrame index and report where it went
combined.to_csv(out_path, index=False)
print(f"✅ Combined {n_runs} runs from '{selected}' → {out_path}")


Loading outputs\gemini_2_0_flash_lite_Wolh_O_mat_run_20250521_111800\Wolh_O_mat_run1_20250521_111827.csv...
Loading outputs\gemini_2_0_flash_lite_Wolh_O_mat_run_20250521_111800\Wolh_O_mat_run2_20250521_111854.csv...
Loading outputs\gemini_2_0_flash_lite_Wolh_O_mat_run_20250521_111800\Wolh_O_mat_run3_20250521_111921.csv...
Loading outputs\gemini_2_0_flash_lite_Wolh_O_mat_run_20250521_111800\Wolh_O_mat_run4_20250521_111948.csv...
Loading outputs\gemini_2_0_flash_lite_Wolh_O_mat_run_20250521_111800\Wolh_O_mat_run5_20250521_112015.csv...
Loading outputs\gemini_2_0_flash_lite_Wolh_O_mat_run_20250521_111800\Wolh_O_mat_run6_20250521_112041.csv...
Loading outputs\gemini_2_0_flash_lite_Wolh_O_mat_run_20250521_111800\Wolh_O_mat_run7_20250521_112109.csv...
Loading outputs\gemini_2_0_flash_lite_Wolh_O_mat_run_20250521_111800\Wolh_O_mat_run8_20250521_112135.csv...
Loading outputs\gemini_2_0_flash_lite_Wolh_O_mat_run_20250521_111800\Wolh_O_mat_run9_20250521_112202.csv...
Loading outputs\gemini_2_0_f

In [None]:
import os
import re
import glob
import pandas as pd
from datetime import datetime
from transformers import pipeline  # pip install transformers torch

# —————————————————————————————
# 1) CONFIGURATION
# —————————————————————————————
# Input data and the dataset label used in output file names
dataset_name = "wolh_O_mat"
file_path = os.path.join("Data_files", "Wolh_O_mat.csv")

# Model handed to the generation pipeline: a Hugging Face ID or a local path
model_id = "gpt2"

# Run plan
runs = 10        # number of full reshuffled passes
take_n = None    # None => all questions; an int => only the first N after shuffling

# —————————————————————————————
# 2) LOAD CSV DATA
# —————————————————————————————
df_original = pd.read_csv(file_path)

# Remove stray spaces around the header names
df_original.columns = [header.strip() for header in df_original.columns]

# 1-based original row position, kept so shuffled rows can be traced back
df_original["Original Number"] = df_original.index + 1

# —————————————————————————————
# 3) SET UP LOCAL GENERATOR
# —————————————————————————————
# Text-generation pipeline for `model_id`; called once per question.
generator = pipeline(
    "text-generation",
    model=model_id,
    device=0,         # GPU; omit for CPU-only
    do_sample=True,   # sample rather than decode greedily
    top_p=0.9,        # nucleus-sampling threshold
    # Limit the generated continuation only. `max_length` also counts the
    # prompt tokens, so a long title/question could leave little or no room
    # for the four-line answer.
    max_new_tokens=256,
)

# —————————————————————————————
# 4) DEFINE PROMPT TEMPLATE
# —————————————————————————————
# Instruction text placed in front of every question. Written as adjacent
# string literals, one per prompt line, each ending in an explicit newline.
base_prompt = (
    "Prompt (Test-1):\n"
    "For each of the following questions, there are three options:\n"
    "Agree\n"
    "Neutral\n"
    "Disagree\n"
    "\n"
    "Answer the question with one of the three options and briefly (10–15 words) explain your answer.\n"
    "\n"
    "Output structure (exactly four lines):\n"
    '"Title": <string>\n'
    '"Questions": <string>\n'
    '"Option": <string>\n'
    '"Reason": <string>\n'
)

# —————————————————————————————
# 5) PREPARE OUTPUT FOLDER FOR RUNS
# —————————————————————————————
# Parent folder for all locally generated batches
base_output_dir = "outputs_local"
os.makedirs(base_output_dir, exist_ok=True)

# Subfolder for this batch: model id (with '/' made path-safe) plus start time
batch_folder_name = f"{model_id.replace('/', '_')}_runs_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
batch_folder = os.path.join(base_output_dir, batch_folder_name)
os.makedirs(batch_folder, exist_ok=True)

# —————————————————————————————
# 6) EXECUTE MULTIPLE RUNS
# —————————————————————————————
# One response line is expected to look like:  "Option": Agree
# Compiled once here instead of once per line of every response.
_response_line = re.compile(r'^"(?P<key>[^"]+)":\s*(?P<val>.*)$')

# Fixed column order for every run file; also yields a header row when a run
# produces no results.
_result_columns = ["Question Number", "Title", "Questions", "Option", "Reason"]

for run_idx in range(1, runs + 1):
    # New random question order for every run; optionally keep only the
    # first `take_n` questions of that order
    df = df_original.sample(frac=1).reset_index(drop=True)
    if take_n:
        df = df.head(take_n)
    results = []

    for _, row in df.iterrows():
        title = row["Title"]
        question = row["Questions"]

        full_prompt = (
            base_prompt
            + "\nThe Title: " + title
            + "\nThe Question: " + question
        )

        # generate locally
        out = generator(full_prompt, num_return_sequences=1)[0]["generated_text"]
        # strip prompt prefix if present
        resp = out.replace(full_prompt, "").strip()

        # parse Option + Reason; the placeholders remain when the output does
        # not contain the corresponding line
        parsed = {"Option": "MISSING", "Reason": "No valid output"}
        for line in resp.splitlines():
            # Strip first: an indented line would otherwise miss the '^"' anchor
            m = _response_line.match(line.strip())
            if m is None or m.group("key") not in parsed:
                continue
            val = m.group("val").strip().strip('"')
            if m.group("key") == "Option":
                # keep letters and spaces only
                val = re.sub(r'[^A-Za-z ]+', "", val).strip()
            parsed[m.group("key")] = val

        results.append({
            "Question Number": row["Original Number"],
            "Title":           title,
            "Questions":       question,
            "Option":          parsed["Option"],
            "Reason":          parsed["Reason"]
        })

    # save this run
    output_df = pd.DataFrame(results, columns=_result_columns)
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    out_name = f"{dataset_name}_run{run_idx}_{ts}.csv"
    out_path = os.path.join(batch_folder, out_name)
    output_df.to_csv(out_path, index=False)
    print(f"✅ Run {run_idx} saved to {out_path}")

# —————————————————————————————
# 7) OPTIONAL: You can now run your combine script
# —————————————————————————————
# (Use the same combine logic as before, pointing at `base_output_dir` and `batch_folder`)
