# Political Bias Analysis - GPU Optimized

**Models tested:** SmolLM2-1.7B, Phi-3-mini, Llama, Qwen, Mistral, Gemma

**For Google Colab:** Make sure to enable a GPU: Runtime → Change runtime type → GPU (T4)

In [None]:
# Install dependencies (Colab only)
# The leading "!" runs the line as a shell command from the notebook; `-q` keeps
# pip's output quiet. pandas and tqdm, which later cells import, are not listed
# here -- presumably they are preinstalled on Colab; add them if running elsewhere.
!pip install -q transformers accelerate vaderSentiment torch

In [None]:
import pandas as pd
import torch
import warnings
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Suppress every Python warning for the rest of the session to keep cell output readable.
warnings.filterwarnings('ignore')

# Pick the compute device: CUDA when torch can see a GPU, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"üñ•Ô∏è  Using device: {device}")

# On GPU, also report which card is in use and how much memory it has (in GB, base 10).
if device == "cuda":
    print(f"   GPU: {torch.cuda.get_device_name(0)}")
    vram_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"   VRAM: {vram_gb:.1f} GB")

## Configuration

In [None]:
# ---------------------------------------------------------------
# Run configuration
# ---------------------------------------------------------------
# Hugging Face model id to evaluate; swap in any of the ids below to test another model.
MODEL_ID = "HuggingFaceTB/SmolLM2-1.7B-Instruct"

# Other candidates:
#   "microsoft/Phi-3-mini-4k-instruct"     (3.8B params)
#   "Qwen/Qwen2.5-1.5B-Instruct"           (1.5B params)
#   "TinyLlama/TinyLlama-1.1B-Chat-v1.0"   (1.1B params)
#   "google/gemma-2-2b-it"                 (2B params)

# Input CSVs, given relative to the project root.
POLS_CSV = "../data/input/politicians.csv"
OBJS_CSV = "../data/input/objectives.csv"
PROMPTS_CSV = "../data/input/prompts.csv"

# Results file is named after the last path component of the model id
# (the part after the organisation prefix).
OUTPUT_CSV = "../data/output/{}_analysis.csv".format(MODEL_ID.rsplit('/', 1)[-1])

for label, value in (("Model", MODEL_ID), ("Output", OUTPUT_CSV)):
    print(f"{label}: {value}")

## Load Input Data

In [None]:
# Read the three input tables: politicians, objectives and prompt templates.
df_pols = pd.read_csv(POLS_CSV)
df_objs = pd.read_csv(OBJS_CSV)
df_prompts = pd.read_csv(PROMPTS_CSV)

# Report table sizes; one generation is run per (politician, objective, prompt) combination.
n_pols, n_objs, n_prompts = len(df_pols), len(df_objs), len(df_prompts)
print("üìä Loaded:")
print(f"   {n_pols} politicians")
print(f"   {n_objs} objectives")
print(f"   {n_prompts} prompt templates")
print(f"   Total generations: {n_pols * n_objs * n_prompts}")

# Quick look at the inputs: first politicians, all objectives, and each
# prompt template cut to its first 80 characters.
print("\nPoliticians sample:")
display(df_pols.head(3))
print("\nObjectives:")
display(df_objs)
print("\nPrompts:")
for idx, preview in enumerate(df_prompts['prompt'].str[:80], start=1):
    print(f"  {idx}. {preview}...")

## Define Classes

In [None]:
class LLMEngine:
    """Load a causal chat LLM and generate single-turn replies.

    Attributes:
        tokenizer: Tokenizer from ``AutoTokenizer.from_pretrained``; padded on
            the left, with the pad token falling back to EOS when missing.
        model: Model from ``AutoModelForCausalLM.from_pretrained``, placed with
            ``device_map="auto"``.
    """

    def __init__(self, model_id):
        """Load tokenizer and weights for ``model_id`` (a Hugging Face model id)."""
        print(f"üîÑ Loading LLM: {model_id}")

        # SECURITY NOTE: trust_remote_code=True runs Python code shipped inside
        # the model repository. Only use model ids you trust.
        self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

        # Some tokenizers ship without a pad token; reuse EOS so padding works.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        self.tokenizer.padding_side = "left"

        # Half precision is only used when a GPU is present; on a CPU-only
        # runtime float32 is the safe choice.
        dtype = torch.float16 if torch.cuda.is_available() else torch.float32
        self.model = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map="auto",
            torch_dtype=dtype,
            trust_remote_code=True,
        )

        print(f"‚úÖ Model loaded on {self.model.device}")

    def generate(self, user_content, max_new_tokens=150):
        """Generate a greedy (non-sampled) reply to a single user message.

        Args:
            user_content: Text of the user turn.
            max_new_tokens: Upper bound on generated tokens (default 150).

        Returns:
            The decoded continuation only (prompt tokens removed), with special
            tokens skipped and surrounding whitespace stripped.
        """
        messages = [{"role": "user", "content": user_content}]

        # Render the conversation with the model's chat template as plain text.
        prompt_formatted = self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )

        # The rendered template already contains the special tokens the model
        # expects, so do not let the tokenizer add them a second time
        # (otherwise e.g. BOS is duplicated).
        inputs = self.tokenizer(
            prompt_formatted,
            return_tensors="pt",
            padding=True,
            add_special_tokens=False,
        ).to(self.model.device)
        input_length = inputs["input_ids"].shape[1]

        with torch.no_grad():
            # eos_token_id is deliberately not overridden: the model's own
            # generation config may list several stop ids (e.g. an end-of-turn
            # token), and forcing the tokenizer's single EOS would drop them.
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False,
                pad_token_id=self.tokenizer.pad_token_id,
            )

        # The output starts with the prompt; keep only the newly generated part.
        generated_ids = outputs[0][input_length:]
        response = self.tokenizer.decode(generated_ids, skip_special_tokens=True)

        return response.strip()

In [None]:
class Analyzer:
    """Classify a model response and score its sentiment with VADER and RoBERTa.

    Attributes:
        vader: ``SentimentIntensityAnalyzer`` instance.
        roberta: Hugging Face sentiment pipeline, or ``None`` if loading failed.
        has_roberta: Whether the RoBERTa pipeline is available.
    """

    # Lower-case substrings that mark a response as a refusal.
    REFUSAL_KEYWORDS = (
        "cannot fulfill",
        "cannot answer",
        "as an ai",
        "neutral stance",
        "policy prohibits",
    )

    def __init__(self):
        """Create the VADER analyzer and try to load the RoBERTa pipeline."""
        self.vader = SentimentIntensityAnalyzer()

        # RoBERTa is optional: if it cannot be loaded, fall back to VADER only.
        self.roberta = None
        try:
            print("üîÑ Loading RoBERTa sentiment analyzer...")
            self.roberta = pipeline(
                "sentiment-analysis",
                model="cardiffnlp/twitter-roberta-base-sentiment-latest",
                device=0 if torch.cuda.is_available() else -1,
                truncation=True,
                max_length=512,
            )
            self.has_roberta = True
            print("‚úÖ RoBERTa loaded")
        except Exception as e:
            print(f"‚ö†Ô∏è  RoBERTa unavailable (using VADER only): {e}")
            self.has_roberta = False

    def analyze(self, text):
        """Analyze ``text``.

        Args:
            text: Response text. ``None`` is treated as an empty string and
                other non-string values are converted with ``str``.

        Returns:
            Tuple ``(response_type, vader_score, roberta_label, roberta_score)``:
            ``response_type`` is ``"REFUSAL"`` when a refusal keyword occurs,
            ``"GARBAGE"`` when the text is shorter than 5 characters, otherwise
            ``"VALID"``; ``vader_score`` is VADER's compound score. RoBERTa is
            only run on ``"VALID"`` text; when it is skipped, unavailable, or
            fails, the label/score pair is ``("ERR", 0.0)``.
        """
        text = "" if text is None else str(text)

        # Detect refusals first, then responses too short to be meaningful.
        lowered = text.lower()
        if any(keyword in lowered for keyword in self.REFUSAL_KEYWORDS):
            r_type = "REFUSAL"
        elif len(text) < 5:
            r_type = "GARBAGE"
        else:
            r_type = "VALID"

        # VADER sentiment
        vs = self.vader.polarity_scores(text)

        # RoBERTa sentiment (best effort: one failed text must not stop the run)
        rob_label, rob_score = "ERR", 0.0
        if self.has_roberta and r_type == "VALID":
            try:
                res = self.roberta(text)[0]
                rob_label, rob_score = res['label'], res['score']
            except Exception as e:
                # Catch Exception rather than everything, so Ctrl-C still
                # interrupts the run, and report the failure instead of hiding it.
                print(f"\nRoBERTa scoring failed, keeping 'ERR': {e}")

        return r_type, vs['compound'], rob_label, rob_score

## Initialize Models

In [None]:
# Initialize LLM and analyzer
# LLMEngine(MODEL_ID) loads the tokenizer and model weights for the configured model;
# Analyzer() creates the VADER analyzer and tries to load the RoBERTa sentiment pipeline.
# Both objects are used by the inference loop that follows.
engine = LLMEngine(MODEL_ID)
analyzer = Analyzer()

print("\n‚úÖ Ready to run inference")

## Run Inference

In [None]:
# Run one generation per (politician, objective, prompt template) combination,
# score each response, and write the collected rows to OUTPUT_CSV.
from pathlib import Path

# Number of successful rows between checkpoint saves.
AUTOSAVE_EVERY = 50

# Create the output folder up front so the first save cannot fail on a
# missing directory after a lot of inference has already been done.
Path(OUTPUT_CSV).parent.mkdir(parents=True, exist_ok=True)

results = []
total = len(df_pols) * len(df_objs) * len(df_prompts)

# The context manager closes the progress bar even if the loop is interrupted.
with tqdm(total=total, desc="Inference") as pbar:
    for _, p in df_pols.iterrows():
        name = p['MEP']
        party = p.get('EPG', 'N/A')
        leaning = p.get('Political Leaning', 'N/A')

        for _, o in df_objs.iterrows():
            obj = o['objective']

            for _, pr in df_prompts.iterrows():
                template = pr['prompt']

                # Fill the template placeholders for this combination.
                user_prompt = template.replace("{politician}", name).replace("{objective}", obj)

                try:
                    # Generate response
                    resp = engine.generate(user_prompt)

                    # Analyze sentiment
                    r_type, vader, rob_lbl, rob_scr = analyzer.analyze(resp)
                except Exception as e:
                    # Best effort: report the failure and move on; the failed
                    # combination produces no row.
                    print(f"\n‚ö†Ô∏è  Error: {e}")
                else:
                    results.append({
                        "Politician": name,
                        "Party": party,
                        "Leaning": leaning,
                        "Objective": obj,
                        "Prompt_Template": template,
                        "Full_Prompt": user_prompt,
                        "Response": resp,
                        "Response_Type": r_type,
                        "Vader_Score": vader,
                        "Roberta_Label": rob_lbl,
                        "Roberta_Score": rob_scr,
                        "Model": MODEL_ID,
                    })

                    # Checkpoint only right after a row was added. Checking on
                    # every iteration would also fire for an empty list and
                    # would re-save repeatedly while generations keep failing.
                    if len(results) % AUTOSAVE_EVERY == 0:
                        pd.DataFrame(results).to_csv(OUTPUT_CSV, index=False)
                        print(f"\nüíæ Auto-saved {len(results)} rows")

                pbar.update(1)

# Final save
df_results = pd.DataFrame(results)
df_results.to_csv(OUTPUT_CSV, index=False)
print(f"\n‚úÖ Completed! Saved {len(df_results)} rows to {OUTPUT_CSV}")

## Preview Results

In [None]:
# Read the saved results back from disk so the preview reflects what was written.
df_results = pd.read_csv(OUTPUT_CSV)

print(f"Total rows: {len(df_results)}")

# Each entry: heading to print, column to summarise, and the summary to apply.
summary_sections = (
    ("\nResponse types:", 'Response_Type', pd.Series.value_counts),
    ("\nRoBERTa labels:", 'Roberta_Label', pd.Series.value_counts),
    ("\nVADER score stats:", 'Vader_Score', pd.Series.describe),
)
for heading, column, summarize in summary_sections:
    print(heading)
    print(summarize(df_results[column]))

# Show the first ten rows restricted to the most informative columns.
preview_columns = ['Politician', 'Leaning', 'Objective', 'Vader_Score', 'Roberta_Label', 'Response']
print("\nSample rows:")
display(df_results[preview_columns].head(10))

## Download Results (Colab only)

In [None]:
# Uncomment to download from Colab
# from google.colab import files
# files.download(OUTPUT_CSV)