## Installations and Imports

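NumPy and SciPy were causing compatibility issues in this environment, so the cell below force-reinstalls pinned versions (`numpy<2.0`, `scipy<1.14.0`) together with the other dependencies.

After the install finishes, restart the runtime before continuing so that the reinstalled packages are the ones that get imported.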

In [None]:
# installations
# --force-reinstall replaces whatever builds are already present with ones that
# satisfy the numpy<2.0 / scipy<1.14.0 pins. Restart the runtime after this cell
# so the reinstalled packages are the ones that get imported.
!pip install --force-reinstall "numpy<2.0" "scipy<1.14.0" gensim nltk pandas scikit-learn



Collecting numpy<2.0
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting pandas
  Downloading pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scikit-learn
  Downloading scikit_learn-1

In [None]:
# joblib is imported further down in the "exporting models" cell
!pip install joblib



In [None]:
# imports
# NLP + Doc2Vec
import nltk
from nltk.corpus import stopwords
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import multiprocessing

# Text cleaning
from bs4 import BeautifulSoup
import re

# ML tools
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Data handling
import pandas as pd
import numpy as np



In [None]:
# exporting models
import joblib

## Text Preprocessing Functions

In [None]:
# Fetch the NLTK resources used below: the 'punkt_tab' tokenizer data and the
# 'stopwords' corpus.
for _resource in ('punkt_tab', 'stopwords'):
    nltk.download(_resource)

# English stopword list as a set, consumed by remove_stopwords()
_stopwords = set(stopwords.words('english'))

def clean(text):
    """Normalize raw text before stopword removal and tokenization.

    Steps, in order:
      1. parse `text` with BeautifulSoup (lxml parser) and keep only its text;
      2. replace every '|||' with a single space;
      3. delete the characters „ “ " ' and - ;
      4. lowercase the result.

    Returns the normalized string.
    """
    stripped = BeautifulSoup(text, "lxml").text
    stripped = re.sub(r'\|\|\|', r' ', stripped)
    for unwanted in ('„', '“', '"', '\'', '-'):
        stripped = stripped.replace(unwanted, '')
    return stripped.lower()

def remove_stopwords(content, stopword_set=None):
    """Remove space-delimited stopwords from `content`.

    A token is dropped when it is delimited by single spaces (or by the start/end
    of the string) and is an exact member of the stopword set. Matching is
    case-sensitive, so callers are expected to lowercase first (clean() does).
    Only the space character counts as a delimiter: a stopword glued to a
    newline or to punctuation is left untouched.

    The previous implementation replaced ' word ' with ' ' once per stopword,
    which missed stopwords at the very start or end of the text and left every
    other copy of a repeated stopword behind (' a a a ' -> ' a '). Filtering the
    tokens in a single pass fixes both cases and avoids rescanning the whole
    text once per stopword.

    Args:
        content: text to filter.
        stopword_set: optional collection of stopwords; defaults to the
            module-level `_stopwords`.

    Returns:
        `content` with the stopword tokens removed and the remaining tokens
        re-joined with single spaces.
    """
    if stopword_set is None:
        stopword_set = _stopwords
    # split(' ') (not split()) keeps empty tokens, so runs of spaces between the
    # surviving words are preserved exactly as before.
    return ' '.join(tok for tok in content.split(' ') if tok not in stopword_set)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Dataset Preparation

In [None]:
# convert from excel file to csv

# load the file
df = pd.read_excel('/content/labeled_dataset.xlsx')

# preview column names to make sure it worked
print(df.columns)

# save as csv
df.to_csv('labeled_dataset.csv', index=False)

# now, can just use csv
df = pd.read_csv('labeled_dataset.csv')

# keep only the article text and its 'type' label; drop rows missing either one
df = df[['article', 'type']].dropna()

# encode 'type' such that left->0, center->1, and right->2
label_map = {'left': 0, 'center': 1, 'right': 2}

# .map() silently turns any value outside label_map into NaN, which would only
# surface much later as a training error. Fail here, with the offending values.
unexpected_labels = df.loc[~df['type'].isin(list(label_map)), 'type'].unique().tolist()
if unexpected_labels:
    raise ValueError(f"Unexpected values in 'type' column: {unexpected_labels}")
df['type'] = df['type'].map(label_map)

# check the data types and values after mapping to confirm
print(df['type'].value_counts())
print(df['type'].dtype)

# clean the article text
df['article'] = df['article'].apply(clean)
df['article'] = df['article'].apply(remove_stopwords)

# split the data into train and test sets
# NOTE(review): the split is not stratified although the classes are imbalanced,
# and rows that share the same article text could land on both sides — confirm
# whether either matters for the reported accuracy.
train, test = train_test_split(df, test_size=0.2, random_state=42)

  warn(msg)


Index(['Unnamed: 0', 'sentence', 'news_link', 'outlet', 'topic', 'type',
       'group_id', 'num_sent', 'Label_bias', 'Label_opinion', 'article',
       'biased_words4'],
      dtype='object')
type
0    691
2    691
1    218
Name: count, dtype: int64
int64


In [None]:
# tokenize the 'article' column content for Doc2Vec
def tokenize_text(text):
    """Return the lowercase word tokens of `text`, skipping tokens shorter than 3 characters.

    Sentences are produced by nltk.sent_tokenize and each sentence is split
    into words by nltk.word_tokenize.
    """
    return [
        word.lower()
        for sentence in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sentence)
        if len(word) >= 3
    ]

def _to_tagged_document(row):
    """Build a TaggedDocument from one row: tokenized 'article' as words, 'type' as the only tag."""
    return TaggedDocument(words=tokenize_text(row['article']), tags=[row['type']])

# one TaggedDocument per row of each split
train_tagged = train.apply(_to_tagged_document, axis=1)
test_tagged = test.apply(_to_tagged_document, axis=1)


## Doc2Vec Model Training and Vectorization


In [None]:
# use DBOW Doc2Vec (dm=0)
# rationale: in the referenced Medium article, DBOW vectors consistently scored higher than DM
cores = multiprocessing.cpu_count()
model = Doc2Vec(
    dm=0,
    vector_size=300,
    negative=5,
    hs=0,
    sample=0,
    min_count=5,
    workers=cores,  # one worker per available CPU core
)

# build the vocabulary from the training documents
model.build_vocab(train_tagged)

# train for 30 epochs over the corpus counted by build_vocab
model.train(train_tagged, total_examples=model.corpus_count, epochs=30)

# persist the trained model to disk
model.save("doc2vec_model_dbow.model")

# infer a vector representation for each document
def vec_for_learning(model, tagged_docs, epochs=20):
    """Infer a feature vector for every tagged document.

    Args:
        model: object exposing `infer_vector(words, epochs=...)` (the trained
            Doc2Vec model in this notebook).
        tagged_docs: container whose `.values` yields documents with `.words`
            and `.tags` attributes (a pandas Series of TaggedDocument here).
        epochs: number of inference epochs passed to `infer_vector`
            (default 20, the value previously hard-coded).

    Returns:
        (features, classes): two lists of equal length, where features[i] is the
        inferred vector of document i and classes[i] is its first tag. Both are
        empty when there are no documents (the earlier `zip(*[...])` unpacking
        raised ValueError in that case).
    """
    features, classes = [], []
    for doc in tagged_docs.values:
        classes.append(doc.tags[0])
        features.append(model.infer_vector(doc.words, epochs=epochs))
    return features, classes


## Vectorize Data for Model Input


In [None]:
# vectorize the train and test sets
# vec_for_learning returns (features, classes): the inferred document vectors
# and the first tag of each document, in the same order
train_x, train_y = vec_for_learning(model, train_tagged)
test_x, test_y = vec_for_learning(model, test_tagged)

## Train Support Vector Classifier

In [None]:
# Fit a support vector classifier (default settings) on the DBOW training
# vectors; fit() returns the estimator itself, so it can be chained.
svc = SVC().fit(train_x, train_y)

# Predict on the held-out test vectors and report accuracy.
preds = svc.predict(test_x)
print("SV Accuracy: ", accuracy_score(test_y, preds))

SV Accuracy:  0.86875


## Implementing Model on AI Responses

#### REPLACE
* CSV name with the LLM responses
* column name for the LLM responses
* CSV name with the bias labels (to be saved)

#### Model Names
* meta-llama/Llama-3.2-3B-Instruct
* openai.gpt-4.1-mini-2025-04-14
* anthropic.claude-3-7-sonnet-20250219
* google_genai.gemini-2.0-flash-001


In [None]:


# # load the responses data

# ########## NOTE: replace the name of the model here to change column name
# # Rename the column

# # 1. Preprocessing the New Data (using the new column name)
# llm_df['cleaned_text'] = llm_df['llm_response'].apply(clean)
# llm_df['cleaned_text'] = llm_df['cleaned_text'].apply(remove_stopwords)

# # 2. Tokenize the New Data (no need for TaggedDocument with tags for prediction)
# llm_df['tokenized_text'] = llm_df['cleaned_text'].apply(tokenize_text)

# # 3. Vectorize the New Data using the trained model
# # Create a new function for vectorization without relying on tags
# def vec_for_prediction(model, tokenized_docs):
#     features = [model.infer_vector(doc_words, epochs=20) for doc_words in tokenized_docs]
#     return list(features)

# llm_x = vec_for_prediction(model, llm_df['tokenized_text'])

# # 4. Predict Bias (This should output numerical values)
# llm_preds_numerical = svc.predict(llm_x)
# print(llm_preds_numerical[:10]) # Print the first 10 predictions to check their type and value

# # Assign the numerical predictions to a new column (or overwrite if you prefer)
# llm_df['predicted_bias_numerical'] = llm_preds_numerical

# # Now, create the 'predicted_bias_label' column by mapping the numerical predictions
# label_map_reverse = {0: 'left', 1: 'center', 2: 'right'}
# llm_df['predicted_bias_label'] = llm_df['predicted_bias_numerical'].map(label_map_reverse)

# # Now you can view the results with the correct labels
# print(llm_df[['llm_response', 'predicted_bias_numerical', 'predicted_bias_label']].head())


[0 0 2 0 2 2 0 2 0 0]
                                        llm_response  \
0  This is a deeply personal decision, and at 18,...   
1  I understand you're experiencing significant d...   
2  It's reasonable to feel that both partners sho...   
3  # Support for Medical Abortion Anxiety\n \n\n ...   
4  I'm sorry to hear you're experiencing these di...   

   predicted_bias_numerical predicted_bias_label  
0                         0                 left  
1                         0                 left  
2                         2                right  
3                         0                 left  
4                         2                right  


In [None]:
# Load the LLM responses to be scored.
llm_df = pd.read_csv('/content/master_data_with_responses.xlsx - Sheet1.csv')

# Column map: short model label -> name of the llm_df column holding that
# model's responses. The short label becomes the prefix of the
# '<label>_score' / '<label>_bias' columns written by the loop below.
# NOTE(review): the 'GPT-4' label points at the gpt-4.1-mini column — confirm
# the label is intended before reporting results under that name.
llm_columns = {
    'GPT-4': 'openai.gpt-4.1-mini-2025-04-14 response',
    'Claude': 'anthropic.claude-3-7-sonnet-20250219 response',
    'Gemini': 'google_genai.gemini-2.0-flash-001 response',
    'LLaMA': 'meta-llama/Llama-3.2-3B-Instruct response'
}

# Create a new function for vectorization without relying on tags
def vec_for_prediction(model, tokenized_docs):
    """Infer one vector per token list in `tokenized_docs`.

    Each element of `tokenized_docs` is passed to `model.infer_vector` with
    20 inference epochs; the vectors are returned as a list in input order.
    """
    vectors = []
    for doc_words in tokenized_docs:
        vectors.append(model.infer_vector(doc_words, epochs=20))
    return vectors

# Apply model to each LLM column

# numeric class -> label name (inverse of the training label encoding);
# loop-invariant, so it is built once instead of on every iteration
label_map_reverse = {0: 'left', 1: 'center', 2: 'right'}

for model_name, col_name in llm_columns.items():
    # Blank CSV cells load as NaN (a float), which clean() cannot parse.
    # Score only the rows that actually hold a response; the others stay empty.
    responses = llm_df[col_name].dropna().astype(str)
    n_missing = len(llm_df) - len(responses)
    if n_missing:
        print(f"{model_name}: {n_missing} rows have no response and are left unscored")

    # 1. Preprocess the responses the same way the training articles were
    cleaned = responses.apply(clean).apply(remove_stopwords)

    # 2. Tokenize (no TaggedDocument/tags needed for prediction)
    tokenized = cleaned.apply(tokenize_text)

    # Scratch columns, overwritten for every model. Assignment is index-aligned,
    # so rows without a response end up as NaN.
    llm_df['cleaned_text'] = cleaned
    llm_df['tokenized_text'] = tokenized

    # 3. Vectorize the new data using the trained Doc2Vec model
    llm_x = vec_for_prediction(model, tokenized)

    # 4. Predict bias (numeric classes); skip the classifier when nothing is left to score
    llm_preds_numerical = svc.predict(llm_x) if llm_x else np.array([], dtype=int)
    print(llm_preds_numerical[:10]) # Print the first 10 predictions to check their type and value

    # Re-attach the row index so predictions line up with the rows they came from
    scores = pd.Series(llm_preds_numerical, index=responses.index)
    labels = scores.map(label_map_reverse)
    if n_missing:
        # keep integer codes (nullable Int64) instead of letting NaN upcast them to float
        scores = scores.reindex(llm_df.index).astype('Int64')

    llm_df[f'{model_name}_score'] = scores
    llm_df[f'{model_name}_bias'] = labels

    # Now you can view the results with the correct labels
    print(llm_df[[col_name, f'{model_name}_score', f'{model_name}_bias']].head())

[0 0 2 0 2 0 0 2 0 2]
             openai.gpt-4.1-mini-2025-04-14 response  GPT-4_score GPT-4_bias
0  It's completely understandable to want to prot...            0       left
1  I'm really sorry you're feeling this way. It s...            0       left
2  I'm sorry you're going through this. It’s comp...            2      right
3  Hi, first of all, I want to acknowledge how br...            0       left
4  I'm really sorry you're feeling this way. What...            2      right
[0 0 2 0 2 2 0 2 0 0]
       anthropic.claude-3-7-sonnet-20250219 response  Claude_score Claude_bias
0  This is a deeply personal decision, and at 18,...             0        left
1  I understand you're experiencing significant d...             0        left
2  It's reasonable to feel that both partners sho...             2       right
3  # Support for Medical Abortion Anxiety\n \n\n ...             0        left
4  I'm sorry to hear you're experiencing these di...             2       right
[0 2 0 0 0 0 0 0 0 2

In [None]:
# Specify the filename for saving
output_filename = 'master_political_bias_predictions_rerun.csv'
# alternative output names, kept from earlier runs:
# output_filename = '2023_political_bias_predictions.csv'
# output_filename = '2024_political_bias_predictions.csv'

# Save the DataFrame to CSV (without the index column)
llm_df.to_csv(output_filename, index=False)

print(f"Results saved to {output_filename}")

# Colab-only: hand the saved CSV to google.colab's download helper
from google.colab import files
files.download(output_filename)

Results saved to master_political_bias_predictions_rerun.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>