In [3]:
import re
import time
from pathlib import Path

import ollama
import pandas as pd
from tqdm import tqdm

def prepare_prompt(text, code):
    """Prepare prompt for a specific code and text."""
    codebook = {
        'env_problems': "Is the text about an environmental problem?",
        'pollution': "Is the text about environmental pollution?",
        'treatment': "Is the text about treatment plants or environmental technologies?",
        'climate': "Is the text about climatic indicators?",
        'biomonitoring': "Is the text about biological, biotic monitoring in water or in a river basin?"
    }
    
    # Add examples for each code type
    examples = {
        'env_problems': """
Example 1:
Text: "The river shows significant erosion along its banks, causing sediment buildup."
Answer: Yes: The text describes erosion, which is an environmental problem affecting water quality.

Example 2:
Text: "Water samples were collected according to standard procedures."
Answer: No: The text only describes sampling methodology, not an environmental problem.""",
        
        'pollution': """
Example 1:
Text: "High levels of nitrates were detected, exceeding safety thresholds."
Answer: Yes: The text directly addresses water pollution through nitrate contamination.

Example 2:
Text: "Monthly temperature readings were recorded at three stations."
Answer: No: The text only mentions temperature monitoring, not pollution."""
    }
    
    system_prompt = f"""You are an expert analyzing water quality reports.
Your task is to determine if the text matches this specific criterion:
{codebook[code]}

Instructions:
1. Read the text carefully
2. Consider ONLY the specific criterion above
3. Respond with EXACTLY one of these two formats:
Yes: [brief, clear reason]
No: [brief, clear reason]

{examples.get(code, '')}

Text to analyze:
---
{text}
---"""
    
    return system_prompt

def clean_response_text(text):
    """Clean and standardize the response text."""
    text = text.strip()
    
    # Extract decision and reason
    if ':' in text:
        decision, reason = text.split(':', 1)
    else:
        # Try to find Yes/No at the start
        first_word = text.split()[0].lower() if text else ''
        if first_word in ['yes', 'no']:
            decision = first_word
            reason = ' '.join(text.split()[1:])
        else:
            return 'Invalid', text
    
    # Clean up decision
    decision = decision.strip().lower()
    if decision.startswith('yes'):
        decision = 'Yes'
    elif decision.startswith('no'):
        decision = 'No'
    else:
        decision = 'Invalid'
    
    # Clean up reason
    reason = reason.strip()
    if not reason:
        reason = "No reason provided"
    
    return decision, reason

def classify_text(text, code, retry_count=3, retry_delay=1):
    """Classify text for a specific code using local Ollama."""
    for attempt in range(retry_count):
        try:
            response = ollama.generate(
                model='llama3.2:latest',
                prompt=prepare_prompt(text, code),
                stream=False,
                options={
                    'temperature': 0.1,  # Low temperature for consistent Yes/No decisions
                    'top_p': 0.9,
                    'num_predict': 100,  # Short responses needed
                    'stop': ["\n\n", "Text:", "Example"]  # Stop tokens
                }
            )
            
            output = response['response']
            
            if not output:
                raise ValueError("Empty response from model")
            
            decision, reason = clean_response_text(output)
            
            # If invalid response and not last attempt, retry
            if decision == 'Invalid' and attempt < retry_count - 1:
                time.sleep(retry_delay)
                continue
            
            return decision, reason
                
        except Exception as e:
            if attempt == retry_count - 1:
                return "Error", f"Processing failed after {retry_count} attempts: {str(e)}"
            time.sleep(retry_delay * (attempt + 1))
            continue

def process_water_reports(df, test_rows=None):
    """Process water quality reports with progress tracking."""
    # Handle test subset if specified
    if test_rows is not None:
        df_processed = df.head(test_rows).copy()
    else:
        df_processed = df.copy()
    
    # Add columns for new predictions
    df_processed['new_model_code'] = None
    df_processed['new_model_reason'] = None
    
    print(f"Processing {len(df_processed)} reports...")
    
    with tqdm(total=len(df_processed)) as pbar:
        for idx, row in df_processed.iterrows():
            try:
                # Get text and code from row
                text = str(row['text'])
                code_name = str(row['code_name'])
                
                # Skip invalid entries
                if text.lower() in ['nan', 'none', ''] or len(text.strip()) < 10:
                    df_processed.at[idx, 'new_model_code'] = 'No'
                    df_processed.at[idx, 'new_model_reason'] = 'Text too short or empty'
                    pbar.update(1)
                    continue
                
                # Classify the text
                decision, reason = classify_text(text, code_name)
                
                # Store results
                df_processed.at[idx, 'new_model_code'] = decision
                df_processed.at[idx, 'new_model_reason'] = reason
                
                pbar.update(1)
                
                # Add small delay every 50 rows
                if idx % 50 == 0:
                    time.sleep(1)
                    
            except Exception as e:
                print(f"\nError processing row {idx}: {str(e)}")
                df_processed.at[idx, 'new_model_code'] = 'Error'
                df_processed.at[idx, 'new_model_reason'] = str(e)
                pbar.update(1)
                continue
    
    # Print summary
    success_count = df_processed['new_model_code'].isin(['Yes', 'No']).sum()
    error_count = df_processed['new_model_code'].isin(['Error', 'Invalid']).sum()
    print(f"\nProcessing complete!")
    print(f"Successfully processed: {success_count} rows")
    print(f"Errors: {error_count} rows")
    
    return df_processed

# Example usage:
if __name__ == "__main__":
    # Load your DataFrame
    df = pd.read_csv('Data/ukraine_water_reasons.csv')
    # Process full dataset
    df_results = process_water_reports(df)
    # df_results.to_csv('water_quality_results.csv', index=False)
    
    # Or process test subset
    # df_results = process_water_reports(df, test_rows=10)
    print("Ready to process! Load your DataFrame and call process_water_reports()")

Processing 500 reports...


100%|██████████| 500/500 [15:50<00:00,  1.90s/it]


Processing complete!
Successfully processed: 500 rows
Errors: 0 rows
Ready to process! Load your DataFrame and call process_water_reports()





In [2]:
df_results

Unnamed: 0,text_id,code_id,code_name,text,original_code,replicated_code,model_code,reason,new_model_code,new_model_reason
0,0,0,env_problems,Autumn period especially its second half is ch...,No,No,No,The text is not about an environmental problem...,No,The text does not describe a specific environm...
1,0,1,pollution,Autumn period especially its second half is ch...,No,No,No,"The text is not about environmental pollution,...",No,The text describes weather conditions during t...
2,0,2,treatment,Autumn period especially its second half is ch...,No,No,No,The text is not about treatment plants or envi...,No,The text does not mention treatment plants or ...
3,0,3,climate,Autumn period especially its second half is ch...,Yes,Yes,No,"The text is not about climatic indicators, but...",No,The text does not mention any climatic indicat...
4,0,4,biomonitoring,Autumn period especially its second half is ch...,No,No,No,The text is not about biological or biotic mon...,No,The text does not mention biological or biotic...
5,1,0,env_problems,In the lower part of the basin snow cover is s...,No,No,No,The text is not discussing an environmental pr...,No,The text appears to describe seasonal changes ...
6,1,1,pollution,In the lower part of the basin snow cover is s...,No,No,No,"The text is not about environmental pollution,...",No,The text describes seasonal changes in a natur...
7,1,2,treatment,In the lower part of the basin snow cover is s...,No,No,No,The text is not about treatment plants or envi...,No,The text does not mention treatment plants or ...
8,1,3,climate,In the lower part of the basin snow cover is s...,Yes,Yes,No,"The text is not about climatic indicators, but...",No,"The text does not mention temperature, precipi..."
9,1,4,biomonitoring,In the lower part of the basin snow cover is s...,No,No,No,The text is not about biological or biotic mon...,No,The text does not mention biological or biotic...


In [4]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, cohen_kappa_score, confusion_matrix

def perform_similarity_analysis(df):
    # Clean the data by filling NaN values
    df_results = df.fillna('No')  # or whatever default value makes sense
    
    # Make sure all values are strings
    columns_to_convert = ['original_code', 'replicated_code', 'model_code', 'new_model_code']
    for col in columns_to_convert:
        df_results[col] = df_results[col].astype(str)
    
    # Define comparisons to analyze
    comparisons = [
        ('model_code', 'new_model_code', 'Original Model vs New Model'),
        ('original_code', 'model_code', 'Original Code vs Original Model'),
        ('original_code', 'new_model_code', 'Original Code vs New Model'),
        ('replicated_code', 'model_code', 'Replicated Code vs Original Model'),
        ('replicated_code', 'new_model_code', 'Replicated Code vs New Model')
    ]
    
    results = []
    
    for col1, col2, name in comparisons:
        # Skip if column doesn't exist
        if col1 not in df_results.columns or col2 not in df_results.columns:
            print(f"Warning: Columns {col1} or {col2} not found in DataFrame")
            continue
            
        accuracy = accuracy_score(df_results[col1], df_results[col2])
        kappa = cohen_kappa_score(df_results[col1], df_results[col2])
        matching_cases = (df_results[col1] == df_results[col2]).sum()
        total_cases = len(df_results)
        match_percentage = (matching_cases / total_cases) * 100
        
        results.append({
            'Comparison': name,
            'Accuracy': round(accuracy, 3),
            'Kappa Score': round(kappa, 3),
            'Matching Cases': matching_cases,
            'Total Cases': total_cases,
            'Match Percentage': round(match_percentage, 2)
        })
    
    # Create results DataFrame
    results_df = pd.DataFrame(results)
    
    # Print summary
    print("\nSimilarity Analysis Summary:")
    print("=" * 100)
    print(results_df.to_string(index=False))
    
    # Print detailed analysis
    print("\nDetailed Analysis:")
    print("=" * 100)
    
    for col1, col2, name in comparisons:
        if col1 not in df_results.columns or col2 not in df_results.columns:
            continue
            
        matrix = confusion_matrix(df_results[col1], df_results[col2])
        print(f"\nConfusion Matrix for {name}:")
        categories = sorted(df_results[col1].unique().tolist())
        print(f"Categories: {categories}")
        print(matrix)
    
    return results_df

# Run the analysis
results_df = perform_similarity_analysis(df_results)
# save_analysis_results(results_df)


Similarity Analysis Summary:
                       Comparison  Accuracy  Kappa Score  Matching Cases  Total Cases  Match Percentage
      Original Model vs New Model     0.904        0.736             452          500              90.4
  Original Code vs Original Model     0.856        0.631             428          500              85.6
       Original Code vs New Model     0.848        0.585             424          500              84.8
Replicated Code vs Original Model     0.738        0.251             369          500              73.8
     Replicated Code vs New Model     0.750        0.211             375          500              75.0

Detailed Analysis:

Confusion Matrix for Original Model vs New Model:
Categories: ['No', 'Yes']
[[357  11]
 [ 37  95]]

Confusion Matrix for Original Code vs Original Model:
Categories: ['No', 'Yes']
[[331  35]
 [ 37  97]]

Confusion Matrix for Original Code vs New Model:
Categories: ['No', 'Yes']
[[342  24]
 [ 52  82]]

Confusion Matrix for R

In [5]:
df_results.to_csv('results_csvs/ukraine_small_model.csv', index=False)
print("Results saved")

Results saved
