In [1]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [2]:
ls

[0m[01;34mgdrive[0m/  [01;34msample_data[0m/


In [3]:
cd gdrive/MyDrive/projects-bias-bot-2/src

/content/gdrive/MyDrive/projects-bias-bot-2/src


In [4]:
import numpy as np
import pandas as pd
import pickle
import re
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pack_padded_sequence

from tqdm import tqdm
import gensim.downloader as api
from sklearn.metrics import classification_report

from Article import Article

In [5]:
from tqdm import tqdm

In [6]:
class LSTM(nn.Module):
	def __init__(self, input_size, emb_dim, output_size, num_layers, embeds=None):
		super().__init__()
		self.emb = nn.Embedding(input_size, emb_dim)
		if embeds is not None:
			self.emb.weight = nn.Parameter(torch.Tensor(embeds))
		
		self.lstm = nn.LSTM(emb_dim, emb_dim, num_layers=num_layers, bidirectional=True)
		self.linear = nn.Linear(emb_dim*2, output_size)
		
	def forward(self, input_seq):

		embeds = self.emb( input_seq )

		output_seq , (h_last, c_last) = self.lstm( embeds )

		h_direc_1 = h_last[4,:,:]
		h_direc_2 = h_last[5,:,:]
		h_direc_12 = torch.cat( (h_direc_1, h_direc_2), dim=1 )

		return self.linear(h_direc_12)

In [7]:
def load_vocab(data):
	word_to_index = {"UNK":0,"FOX":1,"CNN":2,"BBC":3,"Liberal":4,"Conservative":5,"Independent":6,"Other":7}
	vocab = []
	count = len(word_to_index)+1
	for item in data:
		for word in item.headline + item.text:
			if word not in word_to_index:
				vocab.append(word)
				word_to_index[word] = count 
				count += 1
	return vocab, word_to_index

def load_bigrams(data):
	bigram_to_index = dict()
	count = 0
	for article in data:
		full_text = article.headline + article.text
		for i, word in enumerate(full_text):
			if i==0:
				continue
			bigram = (full_text[i-1],word)
			if bigram not in bigram_to_index:
				bigram_to_index[bigram]=count
				count+=1
	return bigram_to_index

def make_unigrams(data, word_to_index, party=None):
	processed_data = []
	for article in data:
		datapoint = []
		if party==None and article.party not in ["Liberal", "Conservative"]:
			continue
		elif article.party != party and party!="Combined":
			continue
		elif party=="Combined":
			datapoint = [word_to_index[article.party]]
		datapoint += [word_to_index[article.source]] + [word_to_index[word] for word in article.headline] + [word_to_index[word] for word in article.text]
		label = label_to_index[article.label]

		processed_data.append( (datapoint, label) )
	return processed_data

def make_bigrams(data, bigram_to_index, party):
	processed_data = []
	for article in data:
		if article.party != party:
			continue
		# if article.party not in ["Liberal", "Conservative"]:
		# 	continue

		datapoint = []
		full_text = article.headline + article.text
		for i, word in enumerate(full_text):
			if i==0:
				continue
			datapoint.append(bigram_to_index[(full_text[i-1],word)])
		label = label_to_index[article.label]

		processed_data.append( (datapoint, label) )
	return processed_data
	
def split_data(processed_data):
	return processed_data[:math.floor(0.8*len(processed_data))], processed_data[math.floor(0.8*len(processed_data)):]


def process_batch(batch):
	x = torch.zeros((len(batch), max_len), dtype=torch.long)
	y = torch.zeros((len(batch)), dtype=torch.long)
	for idx, (text, label) in enumerate(batch):
		# print(torch.Tensor(text))
		# print(len(text))
		# print(torch.Tensor(text).size())
		# print()
		x[idx,:len(text)] = torch.Tensor(text)
		y[idx] = label
	return x.to(device), y.to(device)

def get_error(scores, labels):
    bs=scores.size(0)
    predicted_labels = scores.argmax(dim=1)
    indicator = (predicted_labels == labels)
    num_matches=indicator.sum()
    
    return 1-num_matches.float()/bs  

def evaluate(model, test_data):
	with torch.no_grad():
		model.eval()
		x_test, y_test = process_batch(test_data)
	
		# for i in range(10):
		# 	item = x_test[i].tolist()
		# 	print(item)
		# 	print([index_to_word[index] for index in item])
	 
		# print("X test")
		# print(x_test)
		# print(x_test.size())


		x_test = x_test.transpose(0,1)
	
		# print("X test")
		# print(x_test)
		# print(x_test.size())

		pred_y_test = model(x_test)
	
		# print("Pred y test")
		# print(pred_y_test)
		# print(pred_y_test.size())

		labels = y_test.tolist()
		predictions = [torch.argmax(pred).item() for pred in pred_y_test]

		# print("Labels")
		# print(labels)
		# print(len(labels))

		# print("Predictions")
		# print(predictions)
		# print(len(predictions))

		# print("Evaluation on test set:")
		print(classification_report(labels, predictions, target_names=["is-biased","is-not-biased"], zero_division=0))
		print("Error:",get_error(pred_y_test, y_test).item())



In [8]:
device= torch.device("cuda")
print(device)

# Run only for a single political group
# party = "Liberal"
# party = "Conservative"
party = "Combined"

mode = "unigram"

# Load data
print("Loading data...")
with open("../data/processed_articles.p", "rb") as f:
	data = pickle.load(f)

vocab, word_to_index = load_vocab(data)
index_to_word = {v: k for k, v in word_to_index.items()}

print(word_to_index)
print(index_to_word)

bigram_to_index = load_bigrams(data)
index_to_bigram = {v: k for k, v in bigram_to_index.items()}


label_to_index = {"is-biased":0, "is-not-biased":1}
max_len = max([len(article.headline + article.text) for article in data]) + 3

print("Creating train data set...")

if mode=="unigram":
	unigrams = make_unigrams(data, word_to_index, party)
	train_data, test_data = split_data(unigrams)
	input_size = len(word_to_index)+1
elif mode=="bigram":
	bigrams = make_bigrams(data, bigram_to_index, party)
	train_data, test_data = split_data(bigrams)
	input_size = len(bigram_to_index)+1



# Hyper parameters
output_size = 2 
num_layers = 3
batch_size = 30
learning_rate = 0.001
epochs = 20

#Load pre-trained word embeddings, if using them.
# embeds = api.load('glove-twitter-25').vectors
# emb_dim = embeds.shape[1]

embeds = None
emb_dim = 200


# Build model
model = LSTM(input_size, emb_dim, output_size, num_layers, embeds).to(device)
criterion = nn.CrossEntropyLoss()

# train_data = train_data[0:400]

train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=False, collate_fn=process_batch)

cuda
Loading data...
Creating train data set...


In [9]:
print(len(data))

3692


In [10]:
print(len(train_data))
print(len(test_data))

2953
739


In [11]:
print(max_len)

1513


In [12]:
print(len([item for item in train_data if item[1]==1]))
print(len([item for item in train_data if item[1]==0]))

1072
1881


In [13]:
for i in range(10):
  item1 = train_data[i]
  print([index_to_word[index] for index in item1[0]])

['Independent', 'FOX', 'ghislaine', 'maxwell', 'juror', 'calls', 'false', 'answer', 'questionnaire', 'biggest', 'mistake', 'fox', 'news', 'flash', 'top', 'headlines', 'check', "'s", 'clicking', 'foxnewscom', 'ghislaine', 'maxwell', 'juror', "n't", 'disclose', 'sexually', 'abused', 'pretrial', 'questionnaire', 'said', 'hearing', 'tuesday', 'deeply', 'regretted', 'error', 'called', 'inadvertent', "'s", 'biggest', 'mistakes', "'ve", 'ever', 'made', 'life', 'said', 'seated', 'courtroom', 'witness', 'box', 'response', 'questioning', 'us', 'district', 'judge', 'alison', 'nathan', 'manhattan', 'federal', 'court', 'juror', 'accompanied', 'attorney', 'todd', 'spodek', 'new', 'york', 'ny', 'june', '11', 'ghislaine', 'maxwell', 'attends', 'national', 'urban', 'tech', 'center', '2014', 'gala', 'three', 'sixty', 'june', '11', '2014', 'new', 'york', 'city', 'photo', 'jimi', 'celestepatrick', 'mcmullan', 'via', 'getty', 'image', 'error', 'could', 'put', 'british', 'socialite', "'s", 'sex', 'trafficki

In [14]:
index_to_word[0]

'UNK'

In [16]:
train_data_subset = train_data[0:10]
evaluate(model, train_data_subset)

               precision    recall  f1-score   support

    Is Biased       0.50      1.00      0.67         5
Is Not Biased       0.00      0.00      0.00         5

     accuracy                           0.50        10
    macro avg       0.25      0.50      0.33        10
 weighted avg       0.25      0.50      0.33        10

Error: 0.5


In [None]:
# Train loop
for epoch in range(epochs):

	print(f"\n\nEpoch {epoch}")

	if epoch >= 5:
		learning_rate = learning_rate/2
	optimizer = optim.Adam(model.parameters(), lr=learning_rate)

	model.train()

	running_error = 0
	count = 0

	for x,y in tqdm(train_dataloader):
   
		if x.size()[0] != batch_size:
			continue 

		# print("before")
		# for i in range(10):
		# 	item = x[i].tolist()
		# 	print(item)
		# 	print([index_to_word[index] for index in item])

		x = x.transpose(0,1)
	
		# print("\n\n\nafter:")
		# for i in range(10):
		# 	item = x[i].tolist()
		# 	print(item)
		# 	print([index_to_word[index] for index in item])


		scores = model(x)
		scores = scores.view(-1,2)

		loss = criterion(scores, y)
		loss.backward()
		optimizer.step()
		optimizer.zero_grad()

		error = get_error(scores, y)
		running_error += error.item()
		count += 1


	# print("Evaluate on test:")
	# evaluate(model, test_data)
	print("Evaluate on train:")
	evaluate(model, train_data[0:400])
 

	print("Running Error:", running_error/count)

# Evaluate
torch.save(model.state_dict(),f"model_{party}")






Epoch 0


100%|██████████| 99/99 [01:55<00:00,  1.17s/it]


Evaluate on train:
               precision    recall  f1-score   support

    Is Biased       0.48      0.82      0.61       190
Is Not Biased       0.56      0.20      0.30       210

     accuracy                           0.50       400
    macro avg       0.52      0.51      0.45       400
 weighted avg       0.52      0.50      0.45       400

Error: 0.5024999976158142
Running Error: 0.37619044221177395


Epoch 1


100%|██████████| 99/99 [01:55<00:00,  1.17s/it]


Evaluate on train:
               precision    recall  f1-score   support

    Is Biased       0.53      0.67      0.59       190
Is Not Biased       0.61      0.47      0.53       210

     accuracy                           0.56       400
    macro avg       0.57      0.57      0.56       400
 weighted avg       0.57      0.56      0.56       400

Error: 0.4375
Running Error: 0.34693873840935374


Epoch 2


100%|██████████| 99/99 [01:55<00:00,  1.16s/it]


Evaluate on train:
               precision    recall  f1-score   support

    Is Biased       0.55      0.70      0.61       190
Is Not Biased       0.64      0.48      0.54       210

     accuracy                           0.58       400
    macro avg       0.59      0.59      0.58       400
 weighted avg       0.59      0.58      0.58       400

Error: 0.41750001907348633
Running Error: 0.33129247779748877


Epoch 3


100%|██████████| 99/99 [01:55<00:00,  1.17s/it]


Evaluate on train:
               precision    recall  f1-score   support

    Is Biased       0.52      0.81      0.63       190
Is Not Biased       0.64      0.31      0.42       210

     accuracy                           0.55       400
    macro avg       0.58      0.56      0.52       400
 weighted avg       0.58      0.55      0.52       400

Error: 0.45249998569488525
Running Error: 0.32857139256535745


Epoch 4


100%|██████████| 99/99 [01:55<00:00,  1.17s/it]


Evaluate on train:
               precision    recall  f1-score   support

    Is Biased       0.56      0.75      0.64       190
Is Not Biased       0.67      0.46      0.54       210

     accuracy                           0.60       400
    macro avg       0.61      0.60      0.59       400
 weighted avg       0.62      0.60      0.59       400

Error: 0.4025000333786011
Running Error: 0.32006798775828615


Epoch 5


 42%|████▏     | 42/99 [00:49<01:07,  1.18s/it]

In [None]:
"model = LSTM(input_size, emb_dim, output_size, num_layers, embeds).to(device)
model.load_state_dict(torch.load("model_Combined"), strict=False)

In [None]:
evaluate(model, test_data[0:100])

In [None]:
test_dataloader = DataLoader(test_data, batch_size=1, shuffle=False, collate_fn=process_batch)

In [None]:
def assign_label(party, source, text):
  sentence = [party] + [source] + text.split()
  tokenized = [word_to_index[word] for word in sentence]
  label = 0 
  data = [(tokenized,label)]
  x,y = process_batch(data)
  x = x.view(-1,1)
  scores = model(x)
  predicted_label = torch.argmax(scores).item() 

  if predicted_label == 0:
    return "Un-biased"
  else:
    return "Biased"

In [None]:
assign_label("Independent","BBC","test")

In [None]:
assign_label("Conservative","CNN","cops bad")