In [18]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [19]:
# Load sastrawi tokenized data
df = pd.read_csv("./text_tokenized_sastrawi.csv")
df.head()

Unnamed: 0,title_clean,content_clean,title_tokens_sastrawi,content_tokens_sastrawi
0,kejagung sudah periksa 20 orang lebih terkait ...,kejaksaan agung ri tengah mengusut kasus dugaa...,"['jagung', 'periksa', '20', 'orang', 'lebih', ...","['jaksa', 'agung', 'ri', 'tengah', 'usut', 'ka..."
1,jimly asshiddiqie-mahfud ke istana jelang pela...,sejumlah tokoh merapat ke istana kepresidenan ...,"['jimly', 'asshiddiqie', 'mahfud', 'istana', '...","['jumlah', 'tokoh', 'rapat', 'istana', 'presid..."
2,kejagung limpahkan berkas nadiem makarim dkk k...,kejaksaan agung ri akan melimpahkan berkas per...,"['jagung', 'limpah', 'berkas', 'nadiem', 'maka...","['jaksa', 'agung', 'ri', 'limpah', 'berkas', '..."
3,pramono anung cek lokasi ledakan di sman 72 ja...,gubernur dki jakarta pramono anung tiba di lok...,"['pramono', 'anung', 'cek', 'lokasi', 'ledak',...","['gubernur', 'dki', 'jakarta', 'pramono', 'anu..."
4,pan akan taati putusan mkd dpr soal uya kuya d...,waketum pan eddy soeparno menanggapi putusan m...,"['pan', 'taat', 'putus', 'mkd', 'dpr', 'soal',...","['waketum', 'pan', 'eddy', 'soeparno', 'tangga..."


In [20]:
# Join tokens back to space-separated strings
df["title_text"] = df["title_tokens_sastrawi"].astype(str).str.strip("[]").str.replace(",", "").str.replace("'", "")
df["content_text"] = df["content_tokens_sastrawi"].astype(str).str.strip("[]").str.replace(",", "").str.replace("'", "")

print("Joined Title Tokens: \n", df["title_text"][:5])
print("\n")
print("Joined Content Tokens: \n", df["content_text"][:5])

Joined Title Tokens: 
 0    jagung periksa 20 orang lebih kait kasus limba...
1    jimly asshiddiqie mahfud istana jelang lantik ...
2    jagung limpah berkas nadiem makarim dkk jpu pe...
3       pramono anung cek lokasi ledak sman 72 jakarta
4      pan taat putus mkd dpr soal uya kuya eko patrio
Name: title_text, dtype: object


Joined Content Tokens: 
 0    jaksa agung ri tengah usut kasus duga korupsi ...
1    jumlah tokoh rapat istana presiden jakarta jel...
2    jaksa agung ri limpah berkas perkara sangka ka...
3    gubernur dki jakarta pramono anung tiba lokasi...
4    waketum pan eddy soeparno tanggap putus mahkam...
Name: content_text, dtype: object


In [21]:
# Fit TF-IDF on combined corpus for consistent vocabulary
corpus = pd.concat([df["title_text"], df["content_text"]], axis=0)
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(corpus)

# Split tfidf vectors back into title & content matrices
n = len(df)
title_vecs = tfidf_matrix[:n]
content_vecs = tfidf_matrix[n:]
# print(title_vecs)

# Cosine similarity title vs content
cos_sim = cosine_similarity(title_vecs, content_vecs).diagonal()
df["cosine_similarity"] = cos_sim

print("cosine sim 5 first row: \n", df["cosine_similarity"][:5])

cosine sim 5 first row: 
 0    0.243354
1    0.397082
2    0.478500
3    0.577085
4    0.696648
Name: cosine_similarity, dtype: float64


In [22]:
# Token overlap (Jaccard-like on sets)
def token_overlap(row):
  t = set(str(row["title_tokens_sastrawi"]).strip("[]").replace("'","").split())
  c = set(str(row["content_tokens_sastrawi"]).strip("[]").replace("'","").split())
  if not t:
    return 0.0
  return len(t & c) / len(t)

df["token_overlap"] = df.apply(token_overlap, axis=1)

print("token_overlap first row: \n", df["token_overlap"][:5])

token_overlap first row: 
 0    0.888889
1    0.888889
2    0.666667
3    0.750000
4    0.800000
Name: token_overlap, dtype: float64


In [23]:
# Determine moderate thresholds from data distribution (quantile-based)
cos_th = df["cosine_similarity"].quantile(0.3)  # lower 30%
ov_th = df["token_overlap"].quantile(0.3)

print("token_overlap first row: \n", ov_th)

token_overlap first row: 
 0.7233766233766226


In [24]:
# Auto label: clickbait if both similarity & overlap low
df["is_clickbait_auto"] = ((df["cosine_similarity"] < cos_th) & (df["token_overlap"] < ov_th)).astype(int)

In [25]:
# Save full CSV
# csv_path = "./text_with_similarity_labels.csv"
# df.to_csv(csv_path, index=False)

# # Save 10 random samples to JSON
# json_sample_path = "./text_with_similarity_labels.jsonl"
# df.sample(10, random_state=42).to_json(json_sample_path, orient="records", lines=True, force_ascii=False)

# (csv_path, json_sample_path, df[["cosine_similarity","token_overlap","is_clickbait_auto"]].head())