In [1]:
import pandas as pd

df = pd.read_csv('Data/ukraine_water_reasons.csv')

In [9]:
import pandas as pd
import requests
import os
from tqdm import tqdm
import time
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Together API endpoint and key
endpoint = 'https://api.together.xyz/inference'
TOGETHER_API_KEY = os.getenv('TOGETHER_API_KEY')

def prepare_prompt(text, code):
    """Prepare prompt for a specific code and text."""
    codebook = {
        'env_problems': "Is the text about an environmental problem?",
        'pollution': "Is the text about environmental pollution?",
        'treatment': "Is the text about treatment plants or environmental technologies?",
        'climate': "Is the text about climatic indicators?",
        'biomonitoring': "Is the text about biological, biotic monitoring in water or in a river basin?"
    }
    
    system_prompt = f"""You are a qualitative coder who is annotating water quality reports.
    Your task is to determine if the text matches this specific code: {codebook[code]}
    
    Text to analyze: {text}
    
    Respond ONLY with one of these two formats:
    Yes: [A clear, concise reason why the text matches the code]
    No: [A clear, concise reason why the text does not match the code]"""
    
    return system_prompt

def clean_response_text(text):
    """Clean the response text to get just Yes/No and reason."""
    # Remove any quotes and newlines
    text = text.replace('"""', '').replace("'''", '')
    text = text.replace('\\n', ' ').strip()
    
    # Remove the full prompt if it appears in the response
    if 'You are a qualitative coder' in text:
        text = text.split('You are a qualitative coder')[0].strip()
    
    # Remove "answer =" if it appears
    if 'answer =' in text:
        text = text.split('answer =')[1].strip()
    
    # Remove any remaining quotes
    text = text.strip('"').strip("'")
    
    # Split into decision and reason
    if ':' in text:
        decision, reason = text.split(':', 1)
    elif '.' in text:
        first_part = text.split('.', 1)
        if 'yes' in first_part[0].lower() or 'no' in first_part[0].lower():
            decision = first_part[0]
            reason = first_part[1] if len(first_part) > 1 else ''
        else:
            decision = 'Invalid'
            reason = text
    else:
        decision = text
        reason = ''
    
    # Clean up decision
    decision = decision.strip().lower()
    if 'yes' in decision:
        decision = 'Yes'
    elif 'no' in decision:
        decision = 'No'
    else:
        decision = 'Invalid'
    
    # Clean up reason
    reason = reason.strip()
    if reason.lower().startswith('the text'):
        reason = "T" + reason[1:]
    
    return decision, reason

def classify_text(text, code, retry_count=3, retry_delay=1):
    """Classify text for a specific code with retry logic."""
    for attempt in range(retry_count):
        try:
            headers = {
                "Authorization": f"Bearer {TOGETHER_API_KEY}",
                "Content-Type": "application/json"
            }
            
            data = {
                "model": "meta-llama/Llama-3-70b-chat-hf",
                "prompt": prepare_prompt(text, code),
                "max_tokens": 100,
                "temperature": 0.1,
                "top_p": 1,
                "top_k": 40,
                "repetition_penalty": 1
            }
            
            response = requests.post(endpoint, json=data, headers=headers)
            response.raise_for_status()
            
            response_text = response.json().get('output', {}).get('choices', [{}])[0].get('text', '')
            decision, reason = clean_response_text(response_text)
            
            # If we got an Invalid response, try one more time
            if decision == 'Invalid' and attempt < retry_count - 1:
                time.sleep(retry_delay)
                continue
                
            return decision, reason
            
        except Exception as e:
            if attempt == retry_count - 1:
                return "Error", str(e)
            time.sleep(retry_delay * (attempt + 1))
    
    return "Error", "Maximum retries exceeded"

def process_water_reports(df, test_rows=None):
    """Process water quality reports and add new model predictions while preserving existing columns."""
    # Make a copy of the dataframe (either full or test portion)
    if test_rows is not None:
        df_processed = df.head(test_rows).copy()
    else:
        df_processed = df.copy()
    
    # Add new columns for Llama3 predictions
    df_processed['new_model_code'] = None
    df_processed['new_model_reason'] = None
    
    # Process each report
    for idx, row in tqdm(df_processed.iterrows(), total=len(df_processed)):
        try:
            decision, reason = classify_text(row['text'], row['code_name'])
            df_processed.at[idx, 'new_model_code'] = decision
            df_processed.at[idx, 'new_model_reason'] = reason
        except Exception as e:
            print(f"\nError processing row {idx}: {str(e)}")
            continue
    
    return df_processed

# Example usage:
df_results = process_water_reports(df)

100%|██████████| 500/500 [11:07<00:00,  1.34s/it]


In [None]:
#df_results.to_csv('results_csvs/water_quality_full_results.csv', index=False)


In [10]:
df_results

Unnamed: 0,text_id,code_id,code_name,text,original_code,replicated_code,model_code,reason,new_model_code,new_model_reason
0,0,0,env_problems,Autumn period especially its second half is ch...,No,No,No,The text is not about an environmental problem...,No,The text does not mention any environmental pr...
1,0,1,pollution,Autumn period especially its second half is ch...,No,No,No,"The text is not about environmental pollution,...",No,The text does not mention environmental pollut...
2,0,2,treatment,Autumn period especially its second half is ch...,No,No,No,The text is not about treatment plants or envi...,No,The text does not mention treatment plants or ...
3,0,3,climate,Autumn period especially its second half is ch...,Yes,Yes,No,"The text is not about climatic indicators, but...",Yes,The text matches the code because it describes...
4,0,4,biomonitoring,Autumn period especially its second half is ch...,No,No,No,The text is not about biological or biotic mon...,No,"The text does not mention biological, biotic m..."
...,...,...,...,...,...,...,...,...,...,...
495,99,0,env_problems,Mineralization of organic phosphorus in bottom...,Yes,No,Yes,The text discusses an environmental problem re...,Yes,The text matches the code because it discusses...
496,99,1,pollution,Mineralization of organic phosphorus in bottom...,No,No,Yes,The text discusses the mineralization of organ...,Yes,The text matches the code because it discusses...
497,99,2,treatment,Mineralization of organic phosphorus in bottom...,No,No,No,The text is not about treatment plants or envi...,No,The text does not mention treatment plants or ...
498,99,3,climate,Mineralization of organic phosphorus in bottom...,No,No,No,"The text is not about climatic indicators, but...",No,"The text does not mention climatic indicators,..."


In [11]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, cohen_kappa_score, confusion_matrix

def perform_similarity_analysis(df):
    # Clean the data by filling NaN values
    df_results = df.fillna('No')  # or whatever default value makes sense
    
    # Make sure all values are strings
    columns_to_convert = ['original_code', 'replicated_code', 'model_code', 'new_model_code']
    for col in columns_to_convert:
        df_results[col] = df_results[col].astype(str)
    
    # Define comparisons to analyze
    comparisons = [
        ('model_code', 'new_model_code', 'Original Model vs New Model'),
        ('original_code', 'model_code', 'Original Code vs Original Model'),
        ('original_code', 'new_model_code', 'Original Code vs New Model'),
        ('replicated_code', 'model_code', 'Replicated Code vs Original Model'),
        ('replicated_code', 'new_model_code', 'Replicated Code vs New Model')
    ]
    
    results = []
    
    for col1, col2, name in comparisons:
        # Skip if column doesn't exist
        if col1 not in df_results.columns or col2 not in df_results.columns:
            print(f"Warning: Columns {col1} or {col2} not found in DataFrame")
            continue
            
        accuracy = accuracy_score(df_results[col1], df_results[col2])
        kappa = cohen_kappa_score(df_results[col1], df_results[col2])
        matching_cases = (df_results[col1] == df_results[col2]).sum()
        total_cases = len(df_results)
        match_percentage = (matching_cases / total_cases) * 100
        
        results.append({
            'Comparison': name,
            'Accuracy': round(accuracy, 3),
            'Kappa Score': round(kappa, 3),
            'Matching Cases': matching_cases,
            'Total Cases': total_cases,
            'Match Percentage': round(match_percentage, 2)
        })
    
    # Create results DataFrame
    results_df = pd.DataFrame(results)
    
    # Print summary
    print("\nSimilarity Analysis Summary:")
    print("=" * 100)
    print(results_df.to_string(index=False))
    
    # Print detailed analysis
    print("\nDetailed Analysis:")
    print("=" * 100)
    
    for col1, col2, name in comparisons:
        if col1 not in df_results.columns or col2 not in df_results.columns:
            continue
            
        matrix = confusion_matrix(df_results[col1], df_results[col2])
        print(f"\nConfusion Matrix for {name}:")
        categories = sorted(df_results[col1].unique().tolist())
        print(f"Categories: {categories}")
        print(matrix)
    
    return results_df

# Run the analysis
results_df = perform_similarity_analysis(df_results)
# save_analysis_results(results_df)


Similarity Analysis Summary:
                       Comparison  Accuracy  Kappa Score  Matching Cases  Total Cases  Match Percentage
      Original Model vs New Model     0.892        0.737             446          500              89.2
  Original Code vs Original Model     0.856        0.631             428          500              85.6
       Original Code vs New Model     0.866        0.675             433          500              86.6
Replicated Code vs Original Model     0.738        0.251             369          500              73.8
     Replicated Code vs New Model     0.758        0.360             379          500              75.8

Detailed Analysis:

Confusion Matrix for Original Model vs New Model:
Categories: ['No', 'Yes']
[[  0   0   0]
 [  3 331  34]
 [  1  16 115]]

Confusion Matrix for Original Code vs Original Model:
Categories: ['No', 'Yes']
[[331  35]
 [ 37  97]]

Confusion Matrix for Original Code vs New Model:
Categories: ['No', 'Yes']
[[  0   0   0]
 [  2 32

In [12]:
df_results.to_csv('results_csvs/water_quality_prompt_final.csv', index=False)
