<a href="https://colab.research.google.com/github/nug1209/Cek_Hoaks/blob/main/CekHoaks_ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [122]:
# !pip install transformers
# !pip install torch
# !pip install scikit-learn

import pandas as pd
import numpy as np
import re
import string
from torch import clamp
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
from datetime import date

In [123]:
class TokenSimilarity:
  """Cosine similarity between two texts using mean-pooled encoder outputs.

  ``load_pretrained`` must be called before ``predict``: ``predict`` uses the
  ``tokenizer`` and ``model`` attributes that ``load_pretrained`` assigns.
  """

  def load_pretrained(self, from_pretrained: str = 'indobenchmark/indobert-base-p1'):
    """Load the tokenizer and the encoder for the given checkpoint.

    Args:
      from_pretrained: Checkpoint identifier passed unchanged to
        ``AutoTokenizer.from_pretrained`` and ``AutoModel.from_pretrained``.
    """
    self.tokenizer = AutoTokenizer.from_pretrained(from_pretrained)
    self.model = AutoModel.from_pretrained(from_pretrained)

  def __cleaning(self, text: str):
    """Remove ASCII punctuation and collapse whitespace runs.

    Args:
      text: Raw input string.

    Returns:
      The text without any character from ``string.punctuation``, with every
      run of whitespace replaced by one space and the ends stripped.
    """
    text = text.translate(str.maketrans('', '', string.punctuation))
    # The pattern used to be r'/s+' (a literal slash followed by "s"), which
    # could never match because '/' is already removed as punctuation above.
    text = re.sub(r'\s+', ' ', text).strip()

    return text

  def __process(self, first_token: str, second_token: str):
    """Encode both texts and mean-pool their token embeddings.

    Reads ``self.max_length``, ``self.truncation`` and ``self.padding``, which
    ``predict`` sets before calling this method.

    Args:
      first_token: First cleaned text.
      second_token: Second cleaned text.

    Returns:
      A NumPy array whose row 0 is the pooled embedding of ``first_token``
      and row 1 that of ``second_token``.
    """
    # Local import keeps this method self-contained; the file imports only
    # `clamp` from torch at the top.
    from torch import no_grad

    inputs = self.tokenizer(
      [first_token, second_token],
      max_length=self.max_length,
      truncation=self.truncation,
      padding=self.padding,
      return_tensors='pt',
    )
    attention = inputs.attention_mask

    # Inference only: skip building an autograd graph that would be thrown
    # away by the detach() below.
    with no_grad():
      outputs = self.model(**inputs)
    embeddings = outputs.last_hidden_state

    # Broadcast the attention mask over the embedding axis so that padded
    # positions contribute nothing to the sum.
    mask = attention.unsqueeze(-1).expand(embeddings.shape).float()
    masked_embeddings = embeddings * mask
    summed = masked_embeddings.sum(1)
    # Clamp guards against division by zero if a row has no attended tokens.
    counts = clamp(mask.sum(1), min=1e-9)
    mean_pooled = summed / counts

    return mean_pooled.detach().numpy()

  def predict(self, first_token: str, second_token: str, return_as_embeddings: bool = False, max_length: int = 16, truncation: bool = True, padding: str = 'max_length'):
    """Compare two texts.

    Args:
      first_token: First text.
      second_token: Second text.
      return_as_embeddings: When true, return the pooled embeddings instead
        of the similarity.
      max_length: Forwarded to the tokenizer as ``max_length``. With the
        defaults, each text is padded or cut to 16 tokens (per the
        tokenizer's ``truncation``/``padding`` handling), so only the start
        of a long text is compared.
      truncation: Forwarded to the tokenizer as ``truncation``.
      padding: Forwarded to the tokenizer as ``padding``.

    Returns:
      The two-row embedding array when ``return_as_embeddings`` is true;
      otherwise the result of ``cosine_similarity`` on the two rows, a
      1x1 array (callers can use ``.item()`` to get the scalar).
    """
    # Tokenizer settings are stored on the instance because __process reads
    # them from there.
    self.max_length = max_length
    self.truncation = truncation
    self.padding = padding
    first_token = self.__cleaning(first_token)
    second_token = self.__cleaning(second_token)
    mean_pooled_arr = self.__process(first_token, second_token)

    if return_as_embeddings:
      return mean_pooled_arr

    similarity = cosine_similarity([mean_pooled_arr[0]], [mean_pooled_arr[1]])

    return similarity

In [124]:
# Build the similarity scorer that check_text uses and load the
# 'indobert-base-p2' checkpoint (load_pretrained's own default is the
# '-p1' checkpoint).
model = TokenSimilarity()
model.load_pretrained('indobenchmark/indobert-base-p2')

In [125]:
# Load the reference articles from a semicolon-delimited CSV. Later cells
# read its 'text' and 'date' columns.
df = pd.read_csv('hoax.csv', sep=';')
# df.head()

In [126]:
# Query text that is scored against every entry of df['text'].
to_check = 'barista starbucks'

In [127]:
def check_text(to_check):
  """Score a query against every article text.

  Uses the module-level ``model`` and ``df``.

  Args:
    to_check: Query text to compare with each value of ``df['text']``.

  Returns:
    A list of float similarities, one per row of ``df``, in row order.
  """
  # Iterate the column values rather than positions: df['text'][i] is a
  # label lookup, so it fails or misaligns once the index is not 0..n-1
  # (for example after filtering or sorting without a reset).
  return [model.predict(to_check, text).item() for text in df['text']]


In [128]:
# An empty query used to skip the assignment silently, leaving `results`
# undefined (or stale from an earlier run) for the code that consumes it.
# Fail here with a clear message instead.
if not to_check:
  raise ValueError('to_check is empty; set a non-empty query before scoring')
results = check_text(to_check)

# results

In [129]:
# Attach one score per row; `results` holds one value per entry of
# df['text'], in the same order.
df['similarity'] = results
# df.head()

In [130]:
# Parse the day/month/year strings in 'date' into datetimes so that they can
# be used in date arithmetic.
df['date'] = pd.to_datetime(df['date'], format='%d/%m/%Y')

In [131]:
# Rank rows from most to least similar and renumber them from 0, discarding
# the pre-sort index.
df = df.sort_values(by=['similarity'], ascending=False).reset_index(drop=True)

In [132]:
# Age of each article, taken as the whole-day component of the difference
# between the timestamp created here and the article's 'date'.
today = pd.Timestamp('today')
df['time_delta'] = (today - df['date']).dt.days

In [134]:
# Preview the first rows of the table.
df.head()

Unnamed: 0,title,link,date,text,similarity,time_delta
0,JUDUL2,LINK2,2023-05-16,[SALAH] Loker Kasir dan Barista Starbucks Impo...,0.551712,10
1,JUDUL3,LINK3,2023-05-16,[SALAH] Anies Membaca buku “RAHASIA MENYIMPAN ...,0.331022,10
2,JUDUL1,LINK1,2023-05-16,NARASI: “Tour 7 hari Jadwal Kunjungan Kota Yog...,0.28994,10
3,JUDUL6,LINK6,2023-05-01,Cek Fakta: Hoaks Kabar Jokowi Ambil Alih Parta...,0.284268,25
4,JUDUL7,LINK7,2023-04-16,[SALAH] “GANJAR PERINTAHKAN UNTUK RUSAK ATRIBU...,0.276881,40


In [135]:
# Write the table to results.csv; index=False leaves the row index out of
# the file.
df.to_csv('results.csv', index=False)