<a href="https://colab.research.google.com/github/monash-tp37-project/iteration_1_model/blob/master/bert_model_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load and use pretrained BERT model for news classification

In [13]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [14]:
#Installing the Hugging Face PyTorch Interface for Bert
# !pip install pytorch-pretrained-bert pytorch-nlp
!pip install -q transformers

In [15]:
import os
import torch
import torch.nn.functional as F
from transformers import BertTokenizer, BertConfig
from transformers import AdamW, BertForSequenceClassification, get_linear_schedule_with_warmup

In [16]:
current_dir = os.getcwd()
current_dir

'/content'

In [17]:
#import torch
#from transformers import AutoTokenizer, AutoModelForSequenceClassification

# read in trained tokenizer and model
tokenizer = BertTokenizer.from_pretrained(f"{current_dir}/drive/MyDrive/colab_data/tokenizer_200k")
model = BertForSequenceClassification.from_pretrained(f"{current_dir}/drive/MyDrive/colab_data/model_200k_128")

In [18]:
sequences = ["Previously unreported surveillance video captured one such effort in August in the rural Colorado town of Kiowa. Footage obtained by Reuters through a public-records request shows Elbert County Clerk Dallas Schroeder, the county’s top election official, fiddling with cables and typing on his phone as he copied computer drives containing sensitive voting information."]

# tokenize text input
tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

In [19]:
with torch.no_grad():
  # make predictions
  output = model(**tokens)

  # using a softmax activation function get discrete predictions
  predictions = F.softmax(output.logits, dim=1)
  labels = int(torch.argmax(predictions, dim=1))

  # using a sigmoid function get continous predictions 
  label = torch.sigmoid(predictions).tolist()[0]
  reliable = round(label[0],2)
  unreliable = round(label[1],2)
  print(f'Your news has a {unreliable}% probability of being unreliable and a {reliable}% of being reliable.')

  if reliable < unreliable:

    print('Therefore, our model predicts your news is likely to be unreliable.')

  else:

    print('Therefore, our model predicts your news is likely to be reliable.')

Your news has a 0.5% probability of being unreliable and a 0.73% of being reliable.
Therefore, our model predicts your news is likely to be reliable.


In [36]:
!pip3 install keybert
!pip3 install keyphrase-vectorizers
from keybert import KeyBERT
import keyphrase_vectorizers
from keyphrase_vectorizers import KeyphraseCountVectorizer

Collecting keyphrase-vectorizers
  Downloading keyphrase_vectorizers-0.0.7-py3-none-any.whl (24 kB)
Collecting nltk>=3.6.1
  Downloading nltk-3.7-py3-none-any.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 10.8 MB/s 
[?25hCollecting spacy>=3.0.1
  Downloading spacy-3.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.2 MB)
[K     |████████████████████████████████| 6.2 MB 68.6 MB/s 
Collecting psutil>=5.8.0
  Downloading psutil-5.9.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (280 kB)
[K     |████████████████████████████████| 280 kB 72.7 MB/s 
[?25hCollecting scipy>=1.7.3
  Downloading scipy-1.7.3-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (38.1 MB)
[K     |████████████████████████████████| 38.1 MB 1.2 MB/s 
Collecting regex>=2021.8.3
  Downloading regex-2022.4.24-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (749 kB)
[K     |████████████████████████████████| 749 kB 5

In [37]:
kw_model = KeyBERT()

In [39]:
keywords = kw_model.extract_keywords(docs=sequences, vectorizer=KeyphraseCountVectorizer())

  "Although extracting keywords for multiple documents is faster "
1it [00:00, 434.15it/s]


In [47]:
[x[0] for x in sorted(keywords[0], key=lambda x: x[1], reverse=True)][0:3]

['elbert county clerk dallas schroeder',
 'sensitive voting information',
 'county']