In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import shutil
import os

# Define paths
drive_dataset_path = '/content/drive/MyDrive/datasets/nlp'
colab_temp_path = '/content/nlp'  # Destination in Colab

# Copy all files recursively
shutil.copytree(drive_dataset_path, colab_temp_path)

print(f"Datasets copied to: {colab_temp_path}")
print("Files:", os.listdir(colab_temp_path))

Datasets copied to: /content/nlp
Files: ['annotators.csv', 'labeled_dataset.xlsx', 'annotations.xlsx']


In [None]:
# Core libraries
import numpy as np
import pandas as pd
import os
from tqdm import tqdm  # Progress bars

# BERT Embeddings
from transformers import BertTokenizer, BertModel
import torch

# Visualization (Optional)
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE  # For embedding visualization

# Warnings (Optional)
import warnings
warnings.filterwarnings('ignore')  # Suppress non-critical alerts

In [None]:
df = pd.read_excel('/content/nlp/labeled_dataset.xlsx')

In [None]:
print(df.head())
print(df['Label_bias'].value_counts())


   Unnamed: 0                                           sentence  \
0           0  YouTube is making clear there will be no “birt...   
1           1  The increasingly bitter dispute between Americ...   
2           2  So while there may be a humanitarian crisis dr...   
3           3  A professor who teaches climate change classes...   
4           4  Looking around the United States, there is nev...   

                                           news_link      outlet  \
0  https://eu.usatoday.com/story/tech/2020/02/03/...   usa-today   
1  https://www.nbcnews.com/news/sports/women-s-te...       msnbc   
2  https://www.alternet.org/2019/01/here-are-5-of...    alternet   
3  https://www.breitbart.com/politics/2019/05/09/...   breitbart   
4  https://thefederalist.com/2020/03/11/woman-who...  federalist   

            topic    type  group_id  num_sent  Label_bias  \
0  elections-2020  center         1         1      Biased   
1           sport    left         1         1  Non-biased   

In [None]:
import ast
df = df[['sentence', 'Label_bias', 'biased_words4']]

# Convert string-lists to actual lists (for biased_words4)
df['biased_words'] = df['biased_words4'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else [])

# Drop rows where Label_bias is "No agreement"
df = df[df['Label_bias'] != 'No agreement']

# Map labels to binary (Biased=1, Non-biased=0)
df['label'] = df['Label_bias'].map({'Biased': 1, 'Non-biased': 0})

# Verify
print(df['label'].value_counts())
print(df.head())


label
1    1018
0     533
Name: count, dtype: int64
                                            sentence  Label_bias  \
0  YouTube is making clear there will be no “birt...      Biased   
1  The increasingly bitter dispute between Americ...  Non-biased   
2  So while there may be a humanitarian crisis dr...      Biased   
3  A professor who teaches climate change classes...  Non-biased   
4  Looking around the United States, there is nev...      Biased   

                                       biased_words4  \
0                          ['belated', 'birtherism']   
1                                         ['bitter']   
2                                         ['crisis']   
3                                     ['legitimate']   
4  ['killing', 'never', 'developing', 'humans', '...   

                                   biased_words  label  
0                         [belated, birtherism]      1  
1                                      [bitter]      0  
2                              

Preprocessing the sentences

In [None]:
import re

def clean_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text

df['clean_text'] = df['sentence'].apply(clean_text)

In [None]:
from sklearn.model_selection import train_test_split

X = df['clean_text']
Y = df['label']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [None]:
X_test.head()

Unnamed: 0,clean_text
1666,spencer speculated that the media may be soull...
683,officials at dartmouth college looked the othe...
385,as the worlds scientists and pharmaceutical co...
386,though it was major league baseball that he me...
561,the fact that the abortion rate among american...


Apply Embeddings

In [None]:

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model_bert = BertModel.from_pretrained('bert-base-uncased')

def get_bert_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model_bert(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

# # Example for one sentence
# sample_embedding = get_bert_embeddings(X_train.iloc[0])
# print(sample_embedding.shape)  # Should be (768,)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
sample_embedding = get_bert_embeddings(X_test.iloc[0])
print(sample_embedding.shape)  # Should be (768,)

(768,)


In [None]:
X_train_bert = np.array([get_bert_embeddings(text) for text in X_train])
np.save('X_train_bert.npy', X_train_bert)

In [None]:
X_test_bert = np.array([get_bert_embeddings(text) for text in X_test])
np.save('X_test_bert.npy', X_test_bert)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout, BatchNormalization

model = Sequential([
    Dense(256, activation='relu', input_dim=768, kernel_regularizer='l2'),
    BatchNormalization(),
    Dropout(0.6),  # Increased from 0.5
    Dense(128, activation='relu', kernel_regularizer='l2'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.summary()

In [None]:
# Custom learning rate schedule
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.losses import BinaryCrossentropy


optimizer = Adam(learning_rate=0.0001)
model.compile(
    optimizer=optimizer,
    loss=BinaryCrossentropy(label_smoothing=0.1),
    metrics=['accuracy']
)
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3)

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# 1. Define callbacks
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
model_checkpoint = ModelCheckpoint(
    'best_model.h5',
    monitor='val_accuracy',  # Can also use 'val_loss'
    mode='max',             # 'max' for accuracy, 'min' for loss
    save_best_only=True,
    verbose=1
)

# 2. Train with both callbacks
history = model.fit(
    X_train_bert, Y_train,
    validation_data=(X_test_bert, Y_test),
    epochs=20,
    batch_size=16,
    callbacks=[early_stop, lr_scheduler, model_checkpoint]  # Added ModelCheckpoint
)

# 3. Load the best saved model
from tensorflow.keras.models import load_model
best_model = load_model('best_model.h5')

Epoch 1/20
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 0.5545 - loss: 6.4556
Epoch 1: val_accuracy improved from -inf to 0.63023, saving model to best_model.h5




[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 37ms/step - accuracy: 0.5545 - loss: 6.4547 - val_accuracy: 0.6302 - val_loss: 6.0087 - learning_rate: 1.0000e-04
Epoch 2/20
[1m64/78[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m0s[0m 2ms/step - accuracy: 0.5881 - loss: 6.1971
Epoch 2: val_accuracy improved from 0.63023 to 0.69132, saving model to best_model.h5




[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.5908 - loss: 6.1866 - val_accuracy: 0.6913 - val_loss: 5.8282 - learning_rate: 1.0000e-04
Epoch 3/20
[1m62/78[0m [32m━━━━━━━━━━━━━━━[0m[37m━━━━━[0m [1m0s[0m 3ms/step - accuracy: 0.6007 - loss: 6.0430
Epoch 3: val_accuracy improved from 0.69132 to 0.71704, saving model to best_model.h5




[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6011 - loss: 6.0313 - val_accuracy: 0.7170 - val_loss: 5.6745 - learning_rate: 1.0000e-04
Epoch 4/20
[1m63/78[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m0s[0m 2ms/step - accuracy: 0.6089 - loss: 5.8954
Epoch 4: val_accuracy improved from 0.71704 to 0.74920, saving model to best_model.h5




[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6158 - loss: 5.8822 - val_accuracy: 0.7492 - val_loss: 5.5282 - learning_rate: 1.0000e-04
Epoch 5/20
[1m60/78[0m [32m━━━━━━━━━━━━━━━[0m[37m━━━━━[0m [1m0s[0m 3ms/step - accuracy: 0.6840 - loss: 5.6728
Epoch 5: val_accuracy improved from 0.74920 to 0.75241, saving model to best_model.h5




[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.6839 - loss: 5.6663 - val_accuracy: 0.7524 - val_loss: 5.4132 - learning_rate: 1.0000e-04
Epoch 6/20
[1m62/78[0m [32m━━━━━━━━━━━━━━━[0m[37m━━━━━[0m [1m0s[0m 3ms/step - accuracy: 0.6967 - loss: 5.5213
Epoch 6: val_accuracy did not improve from 0.75241
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.6925 - loss: 5.5222 - val_accuracy: 0.7492 - val_loss: 5.3121 - learning_rate: 1.0000e-04
Epoch 7/20
[1m59/78[0m [32m━━━━━━━━━━━━━━━[0m[37m━━━━━[0m [1m0s[0m 3ms/step - accuracy: 0.6977 - loss: 5.4130
Epoch 7: val_accuracy improved from 0.75241 to 0.75884, saving model to best_model.h5




[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.6916 - loss: 5.4123 - val_accuracy: 0.7588 - val_loss: 5.1969 - learning_rate: 1.0000e-04
Epoch 8/20
[1m63/78[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m0s[0m 2ms/step - accuracy: 0.6886 - loss: 5.3625
Epoch 8: val_accuracy did not improve from 0.75884
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.6942 - loss: 5.3487 - val_accuracy: 0.7588 - val_loss: 5.0990 - learning_rate: 1.0000e-04
Epoch 9/20
[1m61/78[0m [32m━━━━━━━━━━━━━━━[0m[37m━━━━━[0m [1m0s[0m 3ms/step - accuracy: 0.7038 - loss: 5.1822
Epoch 9: val_accuracy improved from 0.75884 to 0.76849, saving model to best_model.h5




[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.7076 - loss: 5.1749 - val_accuracy: 0.7685 - val_loss: 4.9936 - learning_rate: 1.0000e-04
Epoch 10/20
[1m62/78[0m [32m━━━━━━━━━━━━━━━[0m[37m━━━━━[0m [1m0s[0m 2ms/step - accuracy: 0.7178 - loss: 5.0948
Epoch 10: val_accuracy did not improve from 0.76849
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7161 - loss: 5.0899 - val_accuracy: 0.7685 - val_loss: 4.8956 - learning_rate: 1.0000e-04
Epoch 11/20
[1m61/78[0m [32m━━━━━━━━━━━━━━━[0m[37m━━━━━[0m [1m0s[0m 3ms/step - accuracy: 0.7127 - loss: 5.0416
Epoch 11: val_accuracy improved from 0.76849 to 0.78457, saving model to best_model.h5




[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.7130 - loss: 5.0317 - val_accuracy: 0.7846 - val_loss: 4.7967 - learning_rate: 1.0000e-04
Epoch 12/20
[1m63/78[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m0s[0m 2ms/step - accuracy: 0.7283 - loss: 4.9071
Epoch 12: val_accuracy did not improve from 0.78457
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7276 - loss: 4.9025 - val_accuracy: 0.7749 - val_loss: 4.7104 - learning_rate: 1.0000e-04
Epoch 13/20
[1m62/78[0m [32m━━━━━━━━━━━━━━━[0m[37m━━━━━[0m [1m0s[0m 3ms/step - accuracy: 0.7199 - loss: 4.7867
Epoch 13: val_accuracy did not improve from 0.78457
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.7195 - loss: 4.7878 - val_accuracy: 0.7460 - val_loss: 4.6463 - learning_rate: 1.0000e-04
Epoch 14/20
[1m71/78[0m [32m━━━━━━━━━━━━━━━━━━[0m



[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.7330 - loss: 4.7069 - val_accuracy: 0.7910 - val_loss: 4.5450 - learning_rate: 1.0000e-04
Epoch 15/20
[1m73/78[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 4ms/step - accuracy: 0.7375 - loss: 4.6146
Epoch 15: val_accuracy improved from 0.79100 to 0.80064, saving model to best_model.h5




[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.7382 - loss: 4.6114 - val_accuracy: 0.8006 - val_loss: 4.4542 - learning_rate: 1.0000e-04
Epoch 16/20
[1m71/78[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 4ms/step - accuracy: 0.7923 - loss: 4.4646
Epoch 16: val_accuracy did not improve from 0.80064
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.7899 - loss: 4.4665 - val_accuracy: 0.7910 - val_loss: 4.3907 - learning_rate: 1.0000e-04
Epoch 17/20
[1m75/78[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 3ms/step - accuracy: 0.7659 - loss: 4.4016
Epoch 17: val_accuracy did not improve from 0.80064
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7655 - loss: 4.4013 - val_accuracy: 0.7717 - val_loss: 4.3166 - learning_rate: 1.0000e-04
Epoch 18/20
[1m65/78[0m [32m━━━━━━━━━━━━━━━━[0m[3



In [None]:
from tensorflow.keras.models import load_model
import numpy as np

# Load the best model
best_model = load_model('best_model.h5')
# Evaluate the best saved model
loss, accuracy = best_model.evaluate(X_test_bert, Y_test)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

# 1. Get model architecture
print("="*50)
print("Model Architecture Summary:")
best_model.summary()

# 2. Get training metrics at save point
print("\n" + "="*50)
print("Performance When Saved:")
print(f"Validation Accuracy: {0.80064:.4f}")  # From your logs
print(f"Training Accuracy: ~0.7382")  # From epoch 15 logs

# 3. Get layer details
print("\n" + "="*50)
print("Layer Details:")
for i, layer in enumerate(best_model.layers):
    print(f"Layer {i+1}: {layer.name}")
    print(f"   Config: {layer.get_config()}")
    if hasattr(layer, 'weights'):
        print(f"   # Weights: {len(layer.weights)}")

# 4. Verify current performance
loss, accuracy = best_model.evaluate(X_test_bert, Y_test)
print("\n" + "="*50)
print(f"Current Test Accuracy: {accuracy:.4f}")




[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step - accuracy: 0.8138 - loss: 4.4515
Test Loss: 4.4542
Test Accuracy: 0.8006
Model Architecture Summary:



Performance When Saved:
Validation Accuracy: 0.8006
Training Accuracy: ~0.7382

Layer Details:
Layer 1: dense_8
   Config: {'name': 'dense_8', 'trainable': True, 'dtype': {'module': 'keras', 'class_name': 'DTypePolicy', 'config': {'name': 'float32'}, 'registered_name': None}, 'units': 256, 'activation': 'relu', 'use_bias': True, 'kernel_initializer': {'module': 'keras.initializers', 'class_name': 'GlorotUniform', 'config': {'seed': None}, 'registered_name': None}, 'bias_initializer': {'module': 'keras.initializers', 'class_name': 'Zeros', 'config': {}, 'registered_name': None}, 'kernel_regularizer': {'module': 'keras.regularizers', 'class_name': 'L2', 'config': {'l2': 0.01}, 'registered_name': None}, 'bias_regularizer': None, 'kernel_constraint': None, 'bias_constraint': None}
   # Weights: 2
Layer 2: batch_normalization_4
   Config: {'name': 'batch_normalization_4', 'trainable': True, 'dtype': {'module': 'keras', 'class_name': 'DTypePolicy', 'config': {'name': 'float32'}, 'registered

In [None]:
def predict_bias(text, model, tokenizer, bert_model):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = bert_model(**inputs)
    embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy().reshape(1, -1)

    prediction = model.predict(embedding)[0][0]
    label = "Biased" if prediction > 0.5 else "Unbiased"
    print(f"Prediction Score: {prediction:.4f} → {label}")
    return label


In [None]:
# ✅ UNBIASED SENTENCES (Expected Output: Unbiased)
unbiased_texts = [
    "The economy of the United Kingdom grew by 2% last year.",
    "Scientists discovered a new exoplanet orbiting a nearby star.",
    "Public transport in the city has been improved recently.",
    "She enjoys painting landscapes in her free time.",
    "Healthcare reform remains a debated topic among policymakers.",
    "The new law was passed with bipartisan support.",
    "The library opens at 9 AM every weekday.",
    "Apples are rich in fiber and vitamin C.",
    "He submitted the assignment before the deadline.",
    "The course curriculum was updated in 2023."
]

# ❌ BIASED SENTENCES (Expected Output: Biased)
biased_texts = [
    "Immigrants are ruining our country.",
    "The media is always lying to us.",
    "Those people don't belong here.",
    "Only the elite benefit from the current system.",
    "The president is a complete failure at everything.",
    "She got the job only because of her gender.",
    "That religion is inherently violent.",
    "Foreigners are taking all our jobs.",
    "He’s too old to understand technology.",
    "Politicians are all corrupt and useless."
]

# ⚠️ EDGE CASES / MIXED SENTIMENT (Expected Output: Depends on model sensitivity)
mixed_texts = [
    "Immigrants often face challenges, but contribute significantly to society.",
    "The government's new policy may disproportionately affect certain communities.",
    "Some believe the media has a hidden agenda, while others disagree.",
    "Critics argue that the system favors the wealthy.",
    "While controversial, the decision was supported by many experts.",
    "He tends to speak without thinking, which some find offensive.",
    "The film portrayed a particular culture in a stereotypical way.",
    "We must be cautious about making generalizations.",
    "There's growing concern about misinformation online.",
    "Opponents of the bill say it targets specific groups unfairly."
]


In [None]:
for text in unbiased_texts + biased_texts + mixed_texts:
    print(text)
    predict_bias(text, best_model, tokenizer, model_bert)


The economy of the United Kingdom grew by 2% last year.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
Prediction Score: 0.0077 → Unbiased
Scientists discovered a new exoplanet orbiting a nearby star.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
Prediction Score: 0.1834 → Unbiased
Public transport in the city has been improved recently.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
Prediction Score: 0.0182 → Unbiased
She enjoys painting landscapes in her free time.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
Prediction Score: 0.0198 → Unbiased
Healthcare reform remains a debated topic among policymakers.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
Prediction Score: 0.5257 → Biased
The new law was passed with bipartisan support.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
Prediction Score: 0.0583 → Unbiased
The library opens at