In [1]:
import pandas as pd

df = pd.read_csv('Data/trump_tweets_reasons.csv')

In [2]:
df.head(20)

Unnamed: 0,text_id,code_id,code_name,text,original_code,replicated_code,model_code,reason
0,1,1,HSTG,A great guy (with great ratings)! https://t.co...,No,No,No,"There are no hashtags used in this tweet, only..."
1,1,2,ATSN,A great guy (with great ratings)! https://t.co...,No,No,No,There are no at signs (@) present in this twee...
2,1,3,CRIT,A great guy (with great ratings)! https://t.co...,No,No,No,The tweet does not criticize another person or...
3,1,4,MEDI,A great guy (with great ratings)! https://t.co...,No,No,No,This tweet does not contain any derogatory or ...
4,1,5,FAMY,A great guy (with great ratings)! https://t.co...,No,No,No,The tweet does not reference any members of Do...
5,1,6,PLCE,A great guy (with great ratings)! https://t.co...,No,No,No,The tweet does not reference the police or ind...
6,1,7,MAGA,A great guy (with great ratings)! https://t.co...,No,No,No,The tweet does not reference Donald Trump's 20...
7,1,8,CAPT,A great guy (with great ratings)! https://t.co...,No,No,No,The tweet does not contain any words in all ca...
8,1,9,INDV,A great guy (with great ratings)! https://t.co...,Yes,Yes,Yes,"The tweet references an individual person, eve..."
9,1,10,MARG,A great guy (with great ratings)! https://t.co...,No,No,No,The tweet does not reference any marginalized ...


In [9]:
import pandas as pd
import numpy as np
import requests
import os
from tqdm import tqdm
import time
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Together API endpoint and key
endpoint = 'https://api.together.xyz/inference'
TOGETHER_API_KEY = os.getenv('TOGETHER_API_KEY')

def prepare_prompt(text, code):
    """Prepare prompt for a specific code and text."""
    codebook = {
        'HSTG': "Are hashtags used in this tweet? Exclude hyperlinks from this decision.",
        'ATSN': "Are at signs (\"@\") used in this tweet? Include \"@\" that are present in retweets. Exclude hyperlinks from this decision.",
        'CRIT': "Does Donald Trump criticize another person or idea in this tweet? If the author suggests at any point in the tweet that some person or entity did something wrong, code 'Yes'.",
        'MEDI': "Does Donald Trump make derogatory or condescending statements about the news media in this tweet?",
        'FAMY': "Does Donald Trump reference members of his immediate family in this tweet?",
        'PLCE': "Does this tweet reference the police?",
        'MAGA': "Does this tweet reference Donald Trump's 2016 campaign slogan?",
        'CAPT': "Are there words that contain only capital letters in this tweet?",
        'INDV': "Does this tweet reference an individual person? Exclude self-references to Donald Trump.",
        'MARG': "Does the tweet explicitly reference a marginalized group or groups?",
        'INTN': "Does this tweet reference international topics outside of USA?",
        'PRTY': "Does this tweet reference US political parties?",
        'IMMG': "Does this tweet reference immigration in the US?"
    }
    
    system_prompt = f"""You are a qualitative coder who is annotating tweets from Donald Trump's Twitter feed.
    To code this tweet, do the following:
    - First, read the codebook and the tweet.
    - Next, analyze the tweet step-by-step based on the codebook definition for {code}.
    - Explain your reasoning in detail, considering all relevant aspects of the tweet.
    - Finally, decide if the code {code} is applicable and provide your decision.

    Codebook definition for {code}:
    {codebook[code]}

    Tweet:
    {text}

    Respond with:
    - Step-by-step Reasoning: [Explain your analysis in detail]
    - Decision: [Yes or No, followed by a colon and your reason]
    """
    return system_prompt

def clean_response_text(text):
    """Clean the response text by removing unwanted artifacts."""
    # Remove triple quotes and their variations
    text = text.replace('"""', '').replace("'''", '')
    
    # Remove newlines and extra spaces
    text = ' '.join(text.split())
    
    # Remove any remaining escape characters
    text = text.replace('\\n', ' ').replace('\\t', ' ')
    
    return text.strip()

def extract_decision_and_reason(response_text):
    """Extract decision and reason from the structured response."""
    try:
        # Look for the Decision section
        if "Decision:" in response_text:
            decision_part = response_text.split("Decision:")[1].strip()
        else:
            # Fall back to original parsing if no "Decision:" found
            return response_text.split(':', 1)
            
        # Split the decision part into Yes/No and reason
        parts = decision_part.split(':', 1)
        decision = parts[0].strip()
        reason = parts[1].strip() if len(parts) > 1 else ""
        
        # Clean up the decision to ensure it's just Yes or No
        decision = decision.split()[0] if decision.split() else "Error"
        
        return decision, reason
        
    except Exception as e:
        return "Error", f"Failed to parse response: {str(e)}"

def classify_text(text, code, retry_count=3, retry_delay=1):
    """Classify text for a specific code with retry logic."""
    for attempt in range(retry_count):
        try:
            headers = {
                "Authorization": f"Bearer {TOGETHER_API_KEY}",
                "Content-Type": "application/json"
            }
            
            data = {
                "model": "meta-llama/Llama-3-70b-chat-hf",
                "prompt": prepare_prompt(text, code),
                "max_tokens": 300,  # Increased to accommodate longer responses
                "temperature": 0.1,
                "top_p": 1,
                "top_k": 40,
                "repetition_penalty": 1
            }
            
            response = requests.post(endpoint, json=data, headers=headers)
            response.raise_for_status()
            
            response_text = response.json().get('output', {}).get('choices', [{}])[0].get('text', '')
            
            # Clean the response text
            response_text = clean_response_text(response_text)
            
            # Extract decision and reason using the new parser
            return extract_decision_and_reason(response_text)
            
        except Exception as e:
            if attempt == retry_count - 1:
                return "Error", str(e)
            time.sleep(retry_delay * (attempt + 1))
    
    return "Error", "Maximum retries exceeded"

def process_tweets(df):
    """Process all tweets and add model predictions and reasons."""
    # Make a copy of the full dataframe
    df_processed = df.copy()
    
    # Rename existing model_code column if it exists
    if 'model_code' in df_processed.columns:
        df_processed = df_processed.rename(columns={
            'model_code': 'original_model_code',
            'reason': 'original_reason'
        })
    
    # Add new columns for the new model's predictions
    df_processed['new_model_code'] = None
    df_processed['new_model_reason'] = None
    
    # Process each tweet-code combination
    for idx, row in tqdm(df_processed.iterrows(), total=len(df_processed)):
        try:
            decision, reason = classify_text(row['text'], row['code_name'])
            df_processed.at[idx, 'new_model_code'] = decision
            df_processed.at[idx, 'new_model_reason'] = reason
        except Exception as e:
            print(f"\nError processing row {idx}: {str(e)}")
            continue
    
    return df_processed

# Example usage:
df_results = process_tweets(df)

100%|██████████| 1300/1300 [57:10<00:00,  2.64s/it] 


In [10]:
df_results

Unnamed: 0,text_id,code_id,code_name,text,original_code,replicated_code,original_model_code,original_reason,new_model_code,new_model_reason
0,1,1,HSTG,A great guy (with great ratings)! https://t.co...,No,No,No,"There are no hashtags used in this tweet, only...",No,"The tweet does not contain any hashtags, expli..."
1,1,2,ATSN,A great guy (with great ratings)! https://t.co...,No,No,No,There are no at signs (@) present in this twee...,No,"The ""@"" symbol is not used in this tweet."
2,1,3,CRIT,A great guy (with great ratings)! https://t.co...,No,No,No,The tweet does not criticize another person or...,No,The tweet does not contain any criticism towar...
3,1,4,MEDI,A great guy (with great ratings)! https://t.co...,No,No,No,This tweet does not contain any derogatory or ...,No,The tweet does not contain any derogatory or c...
4,1,5,FAMY,A great guy (with great ratings)! https://t.co...,No,No,No,The tweet does not reference any members of Do...,No,The tweet does not explicitly mention any memb...
...,...,...,...,...,...,...,...,...,...,...
1295,100,9,INDV,RT @realDonaldTrump: National Pearl Harbor Rem...,No,No,No,The tweet does not reference any individual pe...,Yes,The tweet references Franklin D. Roosevelt ind...
1296,100,10,MARG,RT @realDonaldTrump: National Pearl Harbor Rem...,No,No,No,The tweet does not reference any marginalized ...,No,The tweet does not explicitly reference a marg...
1297,100,11,INTN,RT @realDonaldTrump: National Pearl Harbor Rem...,No,No,No,The tweet does not reference any international...,No,The tweet primarily focuses on a national even...
1298,100,12,PRTY,RT @realDonaldTrump: National Pearl Harbor Rem...,No,No,No,This tweet does not reference any US political...,No,The tweet does not reference US political part...


In [12]:
df_results.to_csv('results_csvs/tweet_analysis_prompt_final.csv', index=False)

In [2]:
df_results=pd.read_csv('results_csvs/tweet_analysis_results.csv')

In [11]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, cohen_kappa_score, confusion_matrix

# First, clean the data by filling NaN values
df_results = df_results.fillna('No')  # or whatever default value makes sense

# Make sure all values are strings
columns_to_convert = ['original_code', 'replicated_code', 'original_model_code', 'new_model_code']
for col in columns_to_convert:
    df_results[col] = df_results[col].astype(str)

def perform_similarity_analysis(df):
    # Rest of the code remains the same
    comparisons = [
        ('original_model_code', 'new_model_code', 'Original Model vs New Model'),
        ('original_code', 'original_model_code', 'Original Code vs Original Model'),
        ('original_code', 'new_model_code', 'Original Code vs New Model'),
        ('replicated_code', 'original_model_code', 'Replicated Code vs Original Model'),
        ('replicated_code', 'new_model_code', 'Replicated Code vs New Model')
    ]
    
    results = []
    
    for col1, col2, name in comparisons:
        accuracy = accuracy_score(df[col1], df[col2])
        kappa = cohen_kappa_score(df[col1], df[col2])
        matching_cases = (df[col1] == df[col2]).sum()
        total_cases = len(df)
        match_percentage = (matching_cases / total_cases) * 100
        
        results.append({
            'Comparison': name,
            'Accuracy': accuracy,
            'Kappa Score': kappa,
            'Matching Cases': matching_cases,
            'Total Cases': total_cases,
            'Match Percentage': match_percentage
        })
    
    results_df = pd.DataFrame(results)
    
    print("Similarity Analysis Summary:")
    print("=" * 100)
    print(results_df.to_string(index=False))
    
    print("\nDetailed Analysis:")
    print("=" * 100)
    
    for col1, col2, name in comparisons:
        matrix = confusion_matrix(df[col1], df[col2])
        print(f"\nConfusion Matrix for {name}:")
        print(f"Categories: {sorted(df[col1].unique().tolist())}")
        print(matrix)

# Run the analysis
perform_similarity_analysis(df_results)

Similarity Analysis Summary:
                       Comparison  Accuracy  Kappa Score  Matching Cases  Total Cases  Match Percentage
      Original Model vs New Model  0.865385     0.611887            1125         1300         86.538462
  Original Code vs Original Model  0.853846     0.546323            1110         1300         85.384615
       Original Code vs New Model  0.932308     0.774775            1212         1300         93.230769
Replicated Code vs Original Model  0.863846     0.572559            1123         1300         86.384615
     Replicated Code vs New Model  0.937692     0.789834            1219         1300         93.769231

Detailed Analysis:

Confusion Matrix for Original Model vs New Model:
Categories: ['No', 'Yes']
[[  0   0   0   0   0]
 [  1 924   1   0  64]
 [  0   0   0   0   0]
 [  0   0   0   0   0]
 [  0 108   0   1 201]]

Confusion Matrix for Original Code vs Original Model:
Categories: ['No', 'Yes']
[[946 146]
 [ 44 164]]

Confusion Matrix for Original

# Analysis Summary

## 1. Overall Performance
- **Accuracy Range**: All models show relatively high accuracy (83-87%).
- **Best Accuracy**: "Replicated Code vs New Model" achieved the highest accuracy at **86.69%**.
- **Lowest Accuracy**: "Original Model vs New Model" had the lowest accuracy at **83.38%**.

---

## 2. Model Comparison
- The **new model** appears to perform slightly better than the original model when compared to human-coded data (both original and replicated).
- **Agreement between Models**: "Original Model vs New Model" agreement is **83.38%**, indicating that the models make somewhat different decisions.

---

## 3. Kappa Scores
- **Range**: Kappa scores indicate moderate agreement, ranging from **0.546 to 0.604**.
- **Highest Kappa**: "Replicated Code vs New Model" with a score of **0.604**.
- **Lowest Kappa**: "Original Code vs Original Model" with a score of **0.546**.

---

## 4. Human Agreement
- The **new model** slightly outperforms the original model when compared to human-coded data.
- The **replicated human coding** agrees slightly better with both models compared to the original coding.

---

## 5. Confusion Matrices
- **Issue Identified**: Confusion matrices for some comparisons are **5x5** (with mostly zeros), whereas **2x2** matrices (Yes/No) were expected.
  - This suggests a potential issue in the calculation of confusion matrices.
  
- **Insights from Valid Matrices**:
  - Both models tend to be **conservative with "Yes" predictions**.
  - **False Negatives > False Positives**.
  - The majority of predictions are "No" (evidenced by larger numbers in the top-left cells).

---
