In [1]:
import pandas as pd

df = pd.read_csv('Data/ukraine_water_reasons.csv')

In [17]:
import pandas as pd
import requests
import os
from tqdm import tqdm
import time
import logging
from dotenv import load_dotenv
from concurrent.futures import ThreadPoolExecutor, as_completed
import re

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# Load environment variables
load_dotenv()

# Together API endpoint and key
endpoint = 'https://api.together.xyz/inference'
TOGETHER_API_KEY = os.getenv('TOGETHER_API_KEY')

def prepare_prompt(text, code):
    """Prepare prompt for a specific code and text using Llama 3.3 instruct format."""
    codebook = {
        'env_problems': "Is the text about an environmental problem?",
        'pollution': "Is the text about environmental pollution?",
        'treatment': "Is the text about treatment plants or environmental technologies?",
        'climate': "Is the text about climatic indicators?",
        'biomonitoring': "Is the text about biological, biotic monitoring in water or in a river basin?"
    }
    
    return f"""[INST] Analyze this text and determine if it matches the given code definition. You must follow the exact format below and provide a clear Yes/No decision.

TEXT TO ANALYZE:
{text}

CODE DEFINITION:
{codebook[code]}

Your response must contain these exact sections in this order:

Key Elements:
List 2-3 key elements from the text that are most relevant to the code definition.

Analysis:
Write 1-2 sentences explaining how these elements relate to the code definition.

Decision:
Write ONLY "Yes" or "No" - no other words or explanations here.

Reason:
Write exactly one clear sentence explaining your decision.

IMPORTANT: 
- Keep responses direct and structured
- Decision section must contain ONLY "Yes" or "No"
- Each section must be clearly labeled
- Reason must be one complete sentence
[/INST]"""

def clean_response_text(text):
    """Clean the response text by removing unwanted artifacts and formatting."""
    # Remove common LLM artifacts
    artifacts = [
        r'\[/?INST\]',          # [INST] tags
        r'\$\\boxed{.*?}\$',    # LaTeX boxed answers
        r'Step \d+:.*?(?=\n|$)', # Step-by-step instructions
        r'The final answer is:.*?(?=\n|$)',
        r'<rewritten_response>.*?(?=\n|$)',
        r'becomes.*?(?=\n|$)'
    ]
    
    cleaned = text
    for pattern in artifacts:
        cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE | re.MULTILINE)
    
    # Split into lines and clean each line
    cleaned_lines = []
    for line in cleaned.split('\n'):
        line = line.strip()
        if line:
            cleaned_lines.append(line)
            
    return '\n'.join(cleaned_lines)

def extract_decision_and_reason(response_text):
    """Extract decision and reason from the structured response with enhanced reliability."""
    try:
        if not response_text:
            return "Error", "Empty response received"
            
        # Clean and normalize the text
        response_text = response_text.replace('\r', '\n')
        response_text = re.sub(r'\n+', '\n', response_text)
        
        # First try to find an explicit Yes/No decision
        decision_pattern = r'(?:Decision:|.*?\bfinal decision is\b.*?:?)\s*([Yy]es|[Nn]o)'
        decision_match = re.search(decision_pattern, response_text)
        
        if not decision_match:
            # Fallback: Look for yes/no anywhere in the text
            yes_patterns = [r'\b[Yy]es\b', r'\b[Yy]ep\b', r'\b[Yy]eah\b']
            no_patterns = [r'\b[Nn]o\b', r'\b[Nn]ope\b', r'\b[Nn]ah\b']
            
            has_yes = any(re.search(pattern, response_text) for pattern in yes_patterns)
            has_no = any(re.search(pattern, response_text) for pattern in no_patterns)
            
            if has_yes and not has_no:
                decision = "Yes"
            elif has_no and not has_yes:
                decision = "No"
            else:
                # If still no clear decision, analyze the text sentiment
                negative_indicators = ['lacks', 'does not', 'doesn\'t', 'absent', 'missing']
                positive_indicators = ['contains', 'shows', 'demonstrates', 'indicates', 'presents']
                
                has_positive = any(indicator in response_text.lower() for indicator in positive_indicators)
                has_negative = any(indicator in response_text.lower() for indicator in negative_indicators)
                
                if has_positive and not has_negative:
                    decision = "Yes"
                elif has_negative and not has_positive:
                    decision = "No"
                else:
                    # Final fallback: analyze the overall response
                    text_lower = response_text.lower()
                    if any(word in text_lower for word in ['matches', 'aligns', 'relevant', 'consistent']):
                        decision = "Yes"
                    else:
                        decision = "No"
        else:
            decision = decision_match.group(1).capitalize()

        # Extract reason with multiple fallback strategies
        reason = None
        
        # Strategy 1: Look for explicit Reason section
        reason_match = re.search(r'Reason:(.+?)(?:\n|$)', response_text, re.DOTALL)
        if reason_match:
            reason = reason_match.group(1).strip()
            
        # Strategy 2: Look for Analysis section
        if not reason or len(reason) < 20:
            analysis_match = re.search(r'Analysis:(.+?)(?:\n(?:[A-Za-z]+:|$)|$)', response_text, re.DOTALL)
            if analysis_match:
                analysis_text = analysis_match.group(1).strip()
                sentences = [s.strip() for s in analysis_text.split('.') if len(s.strip()) >= 20]
                if sentences:
                    reason = sentences[0]

        # Strategy 3: Extract any sentence containing decision-related words
        if not reason or len(reason) < 20:
            sentences = [s.strip() + '.' for s in response_text.split('.') if len(s.strip()) >= 20]
            decision_words = ['because', 'since', 'as', 'therefore', 'hence', 'indicates', 'shows']
            for sentence in sentences:
                if any(word in sentence.lower() for word in decision_words):
                    reason = sentence
                    break
            
            # If still no reason, take the longest sentence
            if not reason and sentences:
                reason = max(sentences, key=len)

        # Final fallback
        if not reason:
            reason = f"Based on the analysis of the provided text, the decision is {decision.lower()}."
        
        # Ensure reason ends with proper punctuation
        if not reason.endswith(('.', '!', '?')):
            reason += '.'
            
        return decision, reason
        
    except Exception as e:
        logging.error(f"Error parsing response: {str(e)}\nFull response: {response_text}")
        return "Error", str(e)

def classify_text(text, code, retry_count=3):
    """Classify text with retries."""
    for attempt in range(retry_count):
        try:
            response = requests.post(
                endpoint,
                headers={
                    "Authorization": f"Bearer {TOGETHER_API_KEY}",
                    "Content-Type": "application/json"
                },
                json={
                    "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
                    "prompt": prepare_prompt(text, code),
                    "max_tokens": 400,
                    "temperature": 0.1,
                    "top_p": 0.7,
                    "top_k": 40,
                    "repetition_penalty": 1.1
                }
            )
            response.raise_for_status()
            
            response_json = response.json()
            response_text = (
                response_json.get('output', {}).get('choices', [{}])[0].get('text', '')
                if isinstance(response_json.get('output'), dict)
                else response_json.get('output', '')
            )
            
            return extract_decision_and_reason(clean_response_text(response_text))
            
        except Exception as e:
            if attempt == retry_count - 1:
                return "Error", str(e)
            time.sleep(2 ** attempt)
    
    return "Error", "Maximum retries exceeded"

def process_single_report(args):
    """Process a single water quality report for parallel execution."""
    idx, row = args
    try:
        decision, reason = classify_text(row['text'], row['code_name'])
        return {
            'index': idx,
            'prediction': decision,
            'reasoning': reason,
            'success': decision not in ["Error", None]
        }
    except Exception as e:
        logging.error(f"Error processing row {idx}: {str(e)}")
        return {
            'index': idx,
            'prediction': "Error",
            'reasoning': str(e),
            'success': False
        }

def process_water_reports(df, max_workers=10):
    """Process water quality reports in parallel with true concurrency."""
    df_processed = df.copy()
    df_processed['model_prediction'] = None
    df_processed['model_reasoning'] = None
    
    success_count = 0
    error_count = 0
    
    # Create list of (index, row) tuples for parallel processing
    tasks = list(df_processed.iterrows())
    
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(process_single_report, task) for task in tasks]
        
        with tqdm(total=len(df_processed), desc="Processing reports") as pbar:
            for future in as_completed(futures):
                result = future.result()
                df_processed.at[result['index'], 'model_prediction'] = result['prediction']
                df_processed.at[result['index'], 'model_reasoning'] = result['reasoning']
                
                if result['success']:
                    success_count += 1
                else:
                    error_count += 1
                    
                pbar.update(1)
    
    logging.info(f"\nProcessing complete!")
    logging.info(f"Successfully processed: {success_count} rows")
    logging.info(f"Errors: {error_count} rows")
    
    return df_processed
df_results = process_water_reports(df, max_workers=5)

Processing reports: 100%|██████████| 500/500 [14:45<00:00,  1.77s/it]
2024-12-23 17:00:55,359 - INFO - 
Processing complete!
2024-12-23 17:00:55,359 - INFO - Successfully processed: 500 rows
2024-12-23 17:00:55,360 - INFO - Errors: 0 rows


In [16]:
# Display all rows where there was an error
error_rows = df_results[df_results['model_prediction'] == 'Error']
print(error_rows[['text', 'code_name', 'model_prediction', 'model_reasoning']])

                                                  text      code_name  \
56   In the Southern Bug basin the main part of the...      pollution   
177  It is worth of mentioning the low level of tre...      treatment   
215  This method allows extracting only a part of p...   env_problems   
226  Phosphorus belongs to the elements which hardl...      pollution   
276  Compared with vodocanals discharging 488 tons ...      pollution   
354  Southern Bug basin is located on the territory...  biomonitoring   

    model_prediction                       model_reasoning  
56             Error  Could not find clear Yes/No decision  
177            Error  Could not find clear Yes/No decision  
215            Error  Could not find clear Yes/No decision  
226            Error  Could not find clear Yes/No decision  
276            Error  Could not find clear Yes/No decision  
354            Error  Could not find clear Yes/No decision  


In [19]:
#df_results.to_csv('results_csvs/water_quality_llama_33.csv', index=False)


In [18]:
df_results

Unnamed: 0,text_id,code_id,code_name,text,original_code,replicated_code,model_code,reason,model_prediction,model_reasoning
0,0,0,env_problems,Autumn period especially its second half is ch...,No,No,No,The text is not about an environmental problem...,No,The text describes typical seasonal weather co...
1,0,1,pollution,Autumn period especially its second half is ch...,No,No,No,"The text is not about environmental pollution,...",No,The text describes natural weather conditions ...
2,0,2,treatment,Autumn period especially its second half is ch...,No,No,No,The text is not about treatment plants or envi...,No,The text does not discuss treatment plants or ...
3,0,3,climate,Autumn period especially its second half is ch...,Yes,Yes,No,"The text is not about climatic indicators, but...",Yes,The text specifically mentions various weather...
4,0,4,biomonitoring,Autumn period especially its second half is ch...,No,No,No,The text is not about biological or biotic mon...,No,The text does not discuss biological or biotic...
...,...,...,...,...,...,...,...,...,...,...
495,99,0,env_problems,Mineralization of organic phosphorus in bottom...,Yes,No,Yes,The text discusses an environmental problem re...,No,The text does not explicitly describe or imply...
496,99,1,pollution,Mineralization of organic phosphorus in bottom...,No,No,Yes,The text discusses the mineralization of organ...,No,The text discusses a natural process of minera...
497,99,2,treatment,Mineralization of organic phosphorus in bottom...,No,No,No,The text is not about treatment plants or envi...,No,The text does not explicitly discuss treatment...
498,99,3,climate,Mineralization of organic phosphorus in bottom...,No,No,No,"The text is not about climatic indicators, but...",No,The text primarily discusses a biochemical pro...


In [7]:
df_results['model_prediction'].value_counts()

model_prediction
No       302
Yes      197
Error      1
Name: count, dtype: int64

In [8]:
print(df_results[df_results['model_prediction'] == 'Error'])

     text_id  code_id     code_name  \
200       40        0  env_problems   

                                                  text original_code  \
200  The average amount of precipitation in the low...            No   

    replicated_code model_code  \
200              No         No   

                                                reason model_prediction  \
200  The text is providing information about precip...            Error   

                          model_reasoning  
200  Could not find clear Yes/No decision  


In [20]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, cohen_kappa_score, confusion_matrix

def perform_similarity_analysis(df):
    # Clean the data by filling NaN values
    df_results = df.fillna('No')  # or whatever default value makes sense
    
    # Make sure all values are strings
    columns_to_convert = ['original_code', 'replicated_code', 'model_code', 'model_prediction']
    for col in columns_to_convert:
        df_results[col] = df_results[col].astype(str)
    
    # Define comparisons to analyze
    comparisons = [
        ('model_code', 'model_prediction', 'Original Model vs New Model'),
        ('original_code', 'model_code', 'Original Code vs Original Model'),
        ('original_code', 'model_prediction', 'Original Code vs New Model'),
        ('replicated_code', 'model_code', 'Replicated Code vs Original Model'),
        ('replicated_code', 'model_prediction', 'Replicated Code vs New Model')
    ]
    
    results = []
    
    for col1, col2, name in comparisons:
        # Skip if column doesn't exist
        if col1 not in df_results.columns or col2 not in df_results.columns:
            print(f"Warning: Columns {col1} or {col2} not found in DataFrame")
            continue
            
        accuracy = accuracy_score(df_results[col1], df_results[col2])
        kappa = cohen_kappa_score(df_results[col1], df_results[col2])
        matching_cases = (df_results[col1] == df_results[col2]).sum()
        total_cases = len(df_results)
        match_percentage = (matching_cases / total_cases) * 100
        
        results.append({
            'Comparison': name,
            'Accuracy': round(accuracy, 3),
            'Kappa Score': round(kappa, 3),
            'Matching Cases': matching_cases,
            'Total Cases': total_cases,
            'Match Percentage': round(match_percentage, 2)
        })
    
    # Create results DataFrame
    results_df = pd.DataFrame(results)
    
    # Print summary
    print("\nSimilarity Analysis Summary:")
    print("=" * 100)
    print(results_df.to_string(index=False))
    
    # Print detailed analysis
    print("\nDetailed Analysis:")
    print("=" * 100)
    
    for col1, col2, name in comparisons:
        if col1 not in df_results.columns or col2 not in df_results.columns:
            continue
            
        matrix = confusion_matrix(df_results[col1], df_results[col2])
        print(f"\nConfusion Matrix for {name}:")
        categories = sorted(df_results[col1].unique().tolist())
        print(f"Categories: {categories}")
        print(matrix)
    
    return results_df

# Run the analysis
results_df = perform_similarity_analysis(df_results)
# save_analysis_results(results_df)


Similarity Analysis Summary:
                       Comparison  Accuracy  Kappa Score  Matching Cases  Total Cases  Match Percentage
      Original Model vs New Model     0.904        0.761             452          500              90.4
  Original Code vs Original Model     0.856        0.631             428          500              85.6
       Original Code vs New Model     0.864        0.663             432          500              86.4
Replicated Code vs Original Model     0.738        0.251             369          500              73.8
     Replicated Code vs New Model     0.758        0.342             379          500              75.8

Detailed Analysis:

Confusion Matrix for Original Model vs New Model:
Categories: ['No', 'Yes']
[[337  31]
 [ 17 115]]

Confusion Matrix for Original Code vs Original Model:
Categories: ['No', 'Yes']
[[331  35]
 [ 37  97]]

Confusion Matrix for Original Code vs New Model:
Categories: ['No', 'Yes']
[[326  40]
 [ 28 106]]

Confusion Matrix for R

In [24]:
import pandas as pd
import numpy as np
from collections import Counter
import re
from scipy.stats import chi2_contingency

class ModelOutputAnalyzer:
    def __init__(self, df_results):
        """
        Initialize with the df_results DataFrame containing model outputs
        Required columns: text_id, code_id, model_prediction, model_reasoning
        """
        self.df = df_results.copy()
        self._validate_data()
        
    def _validate_data(self):
        required_cols = ['text_id', 'code_id', 'model_prediction', 'model_reasoning']
        missing_cols = [col for col in required_cols if col not in self.df.columns]
        if missing_cols:
            raise ValueError(f"Missing required columns: {missing_cols}")
    
    def analyze_internal_consistency(self):
        """Analyze how consistent the model is across different codes for the same text"""
        consistency = self.df.groupby('text_id').agg({
            'model_prediction': lambda x: len(set(x)),  # Unique predictions per text
            'code_id': 'count'  # Total codes per text
        })
        
        results = {
            'fully_consistent_texts': (consistency['model_prediction'] == 1).mean(),
            'avg_unique_predictions': consistency['model_prediction'].mean(),
            'total_texts': len(consistency)
        }
        
        return results
    
    def analyze_code_patterns(self):
        """Analyze patterns in how different codes are assigned"""
        code_stats = self.df.groupby('code_id').agg({
            'model_prediction': lambda x: (x == 'Yes').mean(),
            'text_id': 'count'
        }).round(3)
        
        code_stats.columns = ['positive_rate', 'total_assignments']
        return code_stats
    
    def analyze_reasoning_patterns(self):
        """Analyze patterns in the reasoning provided"""
        # Extract key phrases (words following common indicators)
        indicators = ['because', 'as', 'since', 'due to', 'indicates', 'shows']
        
        def extract_key_phrase(text):
            text = str(text).lower()
            for indicator in indicators:
                if indicator in text:
                    idx = text.index(indicator) + len(indicator)
                    return text[idx:idx + 100].strip()  # Get 100 chars after indicator
            return None
            
        reasoning_analysis = {
            'avg_length': self.df['model_reasoning'].str.len().mean(),
            'unique_patterns': len(self.df['model_reasoning'].unique()),
            'common_phrases': Counter([
                phrase for phrase in self.df['model_reasoning'].apply(extract_key_phrase)
                if phrase is not None
            ]).most_common(5)
        }
        
        return reasoning_analysis
    
    def analyze_prediction_dependencies(self):
        """Analyze if predictions for different codes seem dependent"""
        code_pairs = []
        unique_codes = self.df['code_id'].unique()
        
        for i in range(len(unique_codes)):
            for j in range(i + 1, len(unique_codes)):
                code1, code2 = unique_codes[i], unique_codes[j]
                
                # Get predictions for texts that have both codes
                texts_with_both = set(self.df[self.df['code_id'] == code1]['text_id']) & \
                                set(self.df[self.df['code_id'] == code2]['text_id'])
                
                if texts_with_both:
                    pred1 = self.df[
                        (self.df['code_id'] == code1) & 
                        (self.df['text_id'].isin(texts_with_both))
                    ]['model_prediction']
                    
                    pred2 = self.df[
                        (self.df['code_id'] == code2) & 
                        (self.df['text_id'].isin(texts_with_both))
                    ]['model_prediction']
                    
                    # Create contingency table
                    cont_table = pd.crosstab(pred1, pred2)
                    
                    # Calculate chi-square test
                    try:
                        _, p_value, _, _ = chi2_contingency(cont_table)
                        code_pairs.append({
                            'code_id_1': code1,
                            'code_id_2': code2,
                            'p_value': p_value,
                            'n_texts': len(texts_with_both)
                        })
                    except:
                        continue
                        
        return pd.DataFrame(code_pairs) if code_pairs else pd.DataFrame()
    
    def get_full_analysis(self):
        """Run all analyses and return comprehensive results"""
        return {
            'consistency': self.analyze_internal_consistency(),
            'code_patterns': self.analyze_code_patterns(),
            'reasoning_patterns': self.analyze_reasoning_patterns(),
            'dependencies': self.analyze_prediction_dependencies()
        }

# Example usage with df_results:
analyzer = ModelOutputAnalyzer(df_results)
analysis_results = analyzer.get_full_analysis()

print("\nInternal Consistency Analysis:")
print(f"Percentage of fully consistent texts: {analysis_results['consistency']['fully_consistent_texts']:.1%}")
print(f"Average unique predictions per text: {analysis_results['consistency']['avg_unique_predictions']:.2f}")
print(f"Total number of texts analyzed: {analysis_results['consistency']['total_texts']}")

print("\nCode Pattern Analysis:")
print("Distribution of Yes/No predictions by code ID:")
print(analysis_results['code_patterns'])

print("\nReasoning Pattern Analysis:")
print(f"Average reasoning length: {analysis_results['reasoning_patterns']['avg_length']:.1f} characters")
print(f"Number of unique reasoning patterns: {analysis_results['reasoning_patterns']['unique_patterns']}")
print("\nMost common reasoning phrases:")
for phrase, count in analysis_results['reasoning_patterns']['common_phrases']:
    print(f"- {phrase[:50]}... ({count} occurrences)")

print("\nCode Dependencies Analysis:")
dependencies = analysis_results['dependencies']
if isinstance(dependencies, pd.DataFrame) and not dependencies.empty:
    significant_deps = dependencies[dependencies['p_value'] < 0.05]
    print(f"Found {len(significant_deps)} significant dependencies between codes")
    if len(significant_deps) > 0:
        print("\nMost significant dependencies:")
        print(significant_deps.sort_values('p_value').head())
else:
    print("No significant dependencies found between codes")


Internal Consistency Analysis:
Percentage of fully consistent texts: 25.0%
Average unique predictions per text: 1.75
Total number of texts analyzed: 100

Code Pattern Analysis:
Distribution of Yes/No predictions by code ID:
         positive_rate  total_assignments
code_id                                  
0                 0.47                100
1                 0.48                100
2                 0.18                100
3                 0.19                100
4                 0.14                100

Reasoning Pattern Analysis:
Average reasoning length: 192.3 characters
Number of unique reasoning patterns: 499

Most common reasoning phrases:
- in.... (13 occurrences)
- defined by the code.... (2 occurrences)
- ins.... (2 occurrences)
- pects.... (2 occurrences)
- snow cover and frost penetration depths.... (2 occurrences)

Code Dependencies Analysis:
No significant dependencies found between codes


In [12]:
#df_results.to_csv('results_csvs/water_quality_prompt_final.csv', index=False)
