Install packages

In [1]:
!pip install transformers
!pip install gluonnlp
!pip install mxnet

Collecting gluonnlp
  Downloading gluonnlp-0.10.0.tar.gz (344 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m344.5/344.5 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: gluonnlp
  Building wheel for gluonnlp (setup.py) ... [?25l[?25hdone
  Created wheel for gluonnlp: filename=gluonnlp-0.10.0-cp310-cp310-linux_x86_64.whl size=661772 sha256=690b8bed82e8b8db2ac3deeb885397ace4e2b7ccf248eace539d41a3f7d9d2e7
  Stored in directory: /root/.cache/pip/wheels/1a/1e/0d/99f55911d90f2b95b9f7c176d5813ef3622894a4b30fde6bd3
Successfully built gluonnlp
Installing collected packages: gluonnlp
Successfully installed gluonnlp-0.10.0
Collecting mxnet
  Downloading mxnet-1.9.1-py3-none-manylinux2014_x86_64.whl (49.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.1/49.1 MB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
Collecting graphviz<0.9.0,>=0.8.1 (from mxnet)
  Dow

Import packages

In [2]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm.notebook import tqdm
import pandas as pd
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup
import csv

Choose device settings according to runtime environment

In [3]:
# device = torch.device("cuda:0") # GPU
device = torch.device("cpu") # CPU

In [None]:
from google.colab import drive
drive.mount("/content/gdrive", force_remount=True)
gdrive_path = "/content/gdrive/My Drive/Hackathon/" # change according to your file path

Mounted at /content/gdrive


Load fine-tuned model

In [6]:
import tensorflow as tf
from transformers import BertTokenizer, BertModel, BertForSequenceClassification

# Load a trained model and vocabulary that you have fine-tuned
output_dir = gdrive_path + "model_save/"
model = BertForSequenceClassification.from_pretrained(output_dir)
tokenizer = BertTokenizer.from_pretrained(output_dir)

# Copy the model to the GPU.
model.to(device)
model.eval()

HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': 'Hackathon/model_save/'. Use `repo_type` argument if needed.

In [None]:
# setup variables
batch_size = 32
max_len = 128

Political bias evaluation

In [None]:
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler

def evaluate(texts):
  # Tokenize all of the sentences and map the tokens to thier word IDs.
  input_ids = []
  attention_masks = []

  # For every sentence...
  for text in texts:
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
                        text,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = max_len,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                        truncation = True
                   )
    # Add the encoded sentence to the list.
    input_ids.append(encoded_dict['input_ids'])

    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

  # Convert the lists into tensors.
  input_ids = torch.cat(input_ids, dim=0)
  attention_masks = torch.cat(attention_masks, dim=0)

  prediction_data = TensorDataset(input_ids, attention_masks)
  prediction_sampler = SequentialSampler(prediction_data)
  prediction_loader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)


  # evaluate

  eval = []

  # Predict
  for batch in prediction_loader:
    print("processing new batch")
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)

    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask = batch

    # Telling the model not to compute or store gradients, saving memory and
    # speeding up prediction
    with torch.no_grad():
      # Forward pass, calculate logit predictions
      outputs = model(b_input_ids, token_type_ids=None,
                      attention_mask=b_input_mask)

    logits = outputs[0]

    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()

    # Calculate evaluations
    sigmoid = lambda x:1 / (1 + np.exp(-x))
    eval += [sigmoid(val[1] - val[0]) for val in list(logits)]
    print("finished batch")
  return eval


Evaluate all the articles from csv file and save the data as csv file.

In [None]:
def evaluate_csv(csv_path, out_dir, out_name, max_len = None, encoding = 'UTF-8', mode = None):
  dataset = pd.read_csv(csv_path, names=['time',	'category_name',	'text_company',	'text_headline',	'text_sentence',	'content_url'], encoding = encoding).drop(0)
  if max_len: dataset = dataset.sample(max_len)
  titles = dataset.text_headline.values
  texts = dataset.text_sentence.values
  print(len(titles))

  eval = evaluate(texts)

  list = dataset.values.tolist()
  pair_data, notext_data = [], []
  print(eval)
  for i in range(len(eval)):
    pair_data.append([list[i][3], list[i][4], eval[i].item()])
    notext_data.append(list[i][:4]+list[i][5:]+[eval[i].item()])

  pair_data.sort(key = lambda x: x[2])
  pair_data = [['text_headline', 'text_sentence', 'bias_label']] + pair_data
  notext_data = [['time',	'category_name',	'text_company',	'text_headline',	'content_url', "bias_label"]] + notext_data

  csv_path_1 = out_dir + out_name + "_pair_sorted.csv"
  csv_path_2 = out_dir + out_name + "_notext.csv"
  with open(csv_path_1, 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerows(pair_data)
  csvfile.close()
  if not mode == "pair":
    with open(csv_path_2, 'w', newline='') as csvfile:
      csv_writer = csv.writer(csvfile)
      csv_writer.writerows(notext_data)
    csvfile.close()

  print("pair data at "+csv_path_1)
  print("notext data at "+csv_path_2)

  return

In [None]:
import csv

dataset_path = gdrive_path + "dataset/articles/Article_정치_20240101_20240201.csv"

evaluate_csv(dataset_path, gdrive_path + "output/", "정치")


349




processing new batch
finished batch
processing new batch
finished batch
processing new batch
finished batch
processing new batch
finished batch
processing new batch
finished batch
processing new batch
finished batch
processing new batch
finished batch
processing new batch
finished batch
processing new batch
finished batch
processing new batch
finished batch
processing new batch
finished batch
[0.9997192649091218, 0.9993543237144058, 0.0002966254509851064, 0.0002763185491441242, 0.030206482714213976, 0.9998016827194881, 0.0003012854650358806, 0.9991854137048427, 0.9994011092115678, 0.9997633144169369, 0.9231334253900888, 0.9998588053157546, 0.009147887332299143, 0.00025552823100350976, 0.9997608203727333, 0.01056274125829042, 0.00046869809606274145, 0.00048808551792117516, 0.9962173360596647, 0.999741978073166, 0.01053559674399003, 0.05779041561000817, 0.011263312409975221, 0.999794051521481, 0.000230246652851118, 0.008908945737824065, 0.0002779138359905338, 0.04294639490634105, 0.00033

merge all pair_sorted files

In [None]:
all_data = []

file_names = ['특검', '태영', '총선_2024', '총선_2020', '총선_2016', '정치', '생활문화', '북한', '금리']
for name in file_names:
  csv_path = gdrive_path + "output/done/" + name + "_pair_sorted.csv"
  encoding = 'utf-8'
  # if name == '생활문화': encoding = 'cp949'
  dataset = pd.read_csv(csv_path, names=['text_headline',	'text_sentence',	'bias_label'], encoding = encoding).drop(0)
  all_data += dataset.values.tolist()

all_data.sort(key = lambda x: x[2])
all_data = [['text_headline', 'text_sentence', 'bias_label']] + all_data
csv_path = gdrive_path + "output/all_pairs.csv"
with open(csv_path, 'w', newline='') as csvfile:
  csv_writer = csv.writer(csvfile)
  csv_writer.writerows(all_data)
csvfile.close()

merge all notext files except 생활문화, because of encoding issue

In [None]:
import pandas as pd
import csv

all_data = []

file_names = ['특검', '태영', '총선', '총선_2020', '총선_2016', '정치', '북한', '금리']

for name in file_names:
  csv_path = gdrive_path + "output/done/" + name + "_notext.csv"
  encoding = 'utf-8'
  # if name == '생활문화': encoding = 'cp949'
  dataset = pd.read_csv(csv_path, names=['time',	'category_name',	'text_company',	'text_headline',	'content_url', "bias_label"], encoding = encoding).drop(0)
  dataset = dataset.sample(min(40, len(dataset)))
  all_data += dataset.values.tolist()

all_data = [['time',	'category_name',	'text_company',	'text_headline',	'content_url', "bias_label"]] + all_data
csv_path = gdrive_path + "output/all_notext.csv"
with open(csv_path, 'w', newline='') as csvfile:
  csv_writer = csv.writer(csvfile)
  csv_writer.writerows(all_data)
csvfile.close()