In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os

import re
import string
import collections
import fse
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torchnlp.metrics import get_moses_multi_bleu

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.manual_seed(1)

import nltk
nltk.download('punkt')

# Any results you write to the current directory are saved as output.

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nurrizkyimani/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Import the csv
all_agree_path = "2022_thesis_styletransfer/annotated/combined/csv/all_agree.csv"
all_agree_pd = pd.read_csv(all_agree_path)
df_clickbait = all_agree_pd[all_agree_pd['label'] == "clickbait"]
df_nonclickbait = all_agree_pd[all_agree_pd['label'] == "non-clickbait"]
df_both = pd.concat((df_clickbait, df_nonclickbait), ignore_index=True)

df_both_copy = df_both.copy()
df_both_copy['label'] = df_both_copy['label'].astype('category')
df_both_copy.head(-1)

Unnamed: 0,title,label,label_score
0,Viral! Driver Ojol di Bekasi Antar Pesanan Mak...,clickbait,1
1,"Ada Motor Nyangkut di Atas Bambu di Sleman, Ko...",clickbait,1
2,Pesan Gamblang Poyuono Menolak Revisi UU KPK,clickbait,1
3,Kocak! Maling di Rumah Mewah Jakut Terekam CCT...,clickbait,1
4,"Viral Video Diduga Baku Tembak di Sleman, Ini ...",clickbait,1
...,...,...,...
8607,"Smart SIM Diluncurkan, Wakapolri Harap Bisa Me...",non-clickbait,0
8608,"Wamena Papua Kembali Membara, Kantor Bupati Di...",non-clickbait,0
8609,BMKG Angkat Bicara Soal Langit Merah Jambi,non-clickbait,0
8610,Operasional Bandara Wamena Dihentikan Akibat R...,non-clickbait,0


In [3]:
df_both_label = df_both_copy.drop('label_score', axis=1)
df_both = df_both_copy.drop('label', axis=1)
df_both.head(-5)

Unnamed: 0,title,label_score
0,Viral! Driver Ojol di Bekasi Antar Pesanan Mak...,1
1,"Ada Motor Nyangkut di Atas Bambu di Sleman, Ko...",1
2,Pesan Gamblang Poyuono Menolak Revisi UU KPK,1
3,Kocak! Maling di Rumah Mewah Jakut Terekam CCT...,1
4,"Viral Video Diduga Baku Tembak di Sleman, Ini ...",1
...,...,...
8603,Komnas HAM Minta Aparat Keamanan Tak Gunakan K...,0
8604,Mahasiswa Minta TNI Turun Bersama Kawal Aksi D...,0
8605,Polisi Pulangkan 56 Mahasiswa Setelah Sempat D...,0
8606,KPK Tepis Tudingan Moeldoko Soal Hambat Investasi,0


In [4]:
from sklearn.preprocessing import LabelEncoder

labelencoder = LabelEncoder()
df_both_copy['label_encoding'] = labelencoder.fit_transform(df_both_copy['label'])
df_encode_clean = df_both_copy.copy()
df_encode_clean = df_both = df_both_copy.drop(['label_score'], axis=1)
df_encode_clean.head()

Unnamed: 0,title,label,label_encoding
0,Viral! Driver Ojol di Bekasi Antar Pesanan Mak...,clickbait,0
1,"Ada Motor Nyangkut di Atas Bambu di Sleman, Ko...",clickbait,0
2,Pesan Gamblang Poyuono Menolak Revisi UU KPK,clickbait,0
3,Kocak! Maling di Rumah Mewah Jakut Terekam CCT...,clickbait,0
4,"Viral Video Diduga Baku Tembak di Sleman, Ini ...",clickbait,0


# Preprocessing : Lower Casing

In [5]:
# preprocessing : lower casing
df_encode_clean['title'] = df_encode_clean['title'].str.lower()
df_encode_clean.title

0       viral! driver ojol di bekasi antar pesanan mak...
1       ada motor nyangkut di atas bambu di sleman, ko...
2            pesan gamblang poyuono menolak revisi uu kpk
3       kocak! maling di rumah mewah jakut terekam cct...
4       viral video diduga baku tembak di sleman, ini ...
                              ...                        
8608    wamena papua kembali membara, kantor bupati di...
8609           bmkg angkat bicara soal langit merah jambi
8610    operasional bandara wamena dihentikan akibat r...
8611     asap karhutla riau mulai merambah ke nias, bm...
8612    tolak ruu pertanahan, ribuan petani siap gelar...
Name: title, Length: 8613, dtype: object

Convert the titl into lower case for all label

In [6]:
df_encode_clean_t =  df_encode_clean.copy()
df_encode_clean_t['title'] = df_encode_clean_t['title'].apply(lambda y: " ".join((re.sub(r'([!/?/./,/%/\'/\"/-/:])', lambda x: ' ' + x.group()+' ' , y)).split()) , 1)
df_encode_clean_t

Unnamed: 0,title,label,label_encoding
0,viral ! driver ojol di bekasi antar pesanan ma...,clickbait,0
1,"ada motor nyangkut di atas bambu di sleman , k...",clickbait,0
2,pesan gamblang poyuono menolak revisi uu kpk,clickbait,0
3,kocak ! maling di rumah mewah jakut terekam cc...,clickbait,0
4,"viral video diduga baku tembak di sleman , ini...",clickbait,0
...,...,...,...
8608,"wamena papua kembali membara , kantor bupati d...",non-clickbait,1
8609,bmkg angkat bicara soal langit merah jambi,non-clickbait,1
8610,operasional bandara wamena dihentikan akibat r...,non-clickbait,1
8611,"asap karhutla riau mulai merambah ke nias , bm...",non-clickbait,1


In [7]:
df_clickbait_clean = df_encode_clean_t[df_encode_clean_t['label'] == "clickbait"]
df_nonclickbait_clean = df_encode_clean_t[df_encode_clean_t['label'] == "non-clickbait"]

In [8]:
# create a d_both list
import itertools
d_both = pd.concat((df_clickbait_clean, df_nonclickbait_clean ), ignore_index=True)
d_both = d_both.drop(['label', 'label_encoding'], axis=1)
d_both = d_both.values.tolist()
d_both = list(itertools.chain.from_iterable(d_both))

In [9]:
d_both

['viral ! driver ojol di bekasi antar pesanan makanan pakai sepeda',
 'ada motor nyangkut di atas bambu di sleman , kok bisa ?',
 'pesan gamblang poyuono menolak revisi uu kpk',
 'kocak ! maling di rumah mewah jakut terekam cctv bingung cari jalan kabur',
 'viral video diduga baku tembak di sleman , ini kata polisi',
 'waspada ! ada penipuan catut pertamina , korbannya rugi puluhan juta',
 'jaksa beberkan senpi-peluru tajam pembelian kivlan zen cs , ini rinciannya',
 'sering quality time bersama keluarga ? ternyata ini 3 manfaatnya !',
 'menteri jokowi 55 % profesional , ini kandidatnya ?',
 "pria misterius berjubah putih viral di sumut , ingatkan ' tuhan murka '",
 'video porno di sumedang disebar pemeran pria , apa motifnya ?',
 'terungkap ! video porno di sumedang diperankan pasangan selingkuh',
 'kendaraan listrik bebas dari aturan ganjil-genap , ini alasannya',
 'menanti " kebijakan gila " jokowi',
 'akhirnya ! pemkot pekanbaru liburkan sd dan smp karena kabut asap',
 '2 ribu hekt

In [10]:
# create list of clickbait and nonclickbait headline

df_clickbait_t = df_clickbait_clean.drop(['label', 'label_encoding'], axis=1).values.tolist()
clickbait_l_c = list(itertools.chain.from_iterable(df_clickbait_t))

df_nonclickbait_t = df_nonclickbait_clean.drop(['label', 'label_encoding'], axis=1).values.tolist()
nonclickbait_l_c = list(itertools.chain.from_iterable(df_nonclickbait_t))

print(nonclickbait_l_c[0:5])
print(clickbait_l_c[0:5])

['masuk radar pilwalkot medan , menantu jokowi bertemu dpw nasdem sumut', 'malaysia sudutkan ri : isu kabut asap hingga invasi babi', 'kemensos salurkan rp 7 , 3 m bagi korban kerusuhan sosial di papua', 'mpr : amandemen uud 1945 tak akan melebar ke mana-mana', 'peringati tahun baru islam , banyuwangi kembali gelar festival muharam']
['viral ! driver ojol di bekasi antar pesanan makanan pakai sepeda', 'ada motor nyangkut di atas bambu di sleman , kok bisa ?', 'pesan gamblang poyuono menolak revisi uu kpk', 'kocak ! maling di rumah mewah jakut terekam cctv bingung cari jalan kabur', 'viral video diduga baku tembak di sleman , ini kata polisi']


# Preprocessed: Ngram [100% DONE]

In [11]:
#these are methods that will become useful when extracting attribute markers
#why do we need all this? well... that's like 5 hours of debugging...
def flatten(foo):
    return list(_flatten(foo))

def _flatten(foo):
    for x in foo:
        if isinstance(x, collections.Iterable) and not isinstance(x, str):
            for y in _flatten(x):
                yield y
        else:
            yield x

def array_to_string(a):
    return ' '.join(flatten(a))

def is_in_string_array(elements, original): #deprecated, does not take into account sequence order
    return np.isin(array_to_string(elements).split(), array_to_string(original).split()).any()
# no usage in the func
def insert_string(string, inserted_string, index):
    return string[:index] + inserted_string + string[index:]

# modified from https://stackoverflow.com/questions/41752946/replacing-a-character-from-a-certain-index
def replace_string(s, newstring, index, nofail=False):
    # raise an error if index is outside of the string
    if not nofail and index not in range(len(s)):
        raise ValueError("index outside given string. index:" + index)

    # if not erroring, but the index is still not in the correct range..
    if index < 0:  # add it to the beginning
        return newstring + s
    if index > len(s):  # add it to the end
        return s + newstring

    # insert the new string between "slices" of the original
    return s[:index] + newstring + s[index + len(newstring):]

In [12]:
# creating function for making ngram from list of headline
import nltk
from nltk.util import ngrams
nltk.download('punkt')
from collections import Counter

def ngram_maker_v2(list_sentences, min_length, max_length):
    lengths = range(min_length, max_length + 1)
    res_ngram = {length : [] for length in lengths}

    for leng in lengths:
        for sentence in list_sentences:
            temp_set = set()
            n_grams = ngrams(nltk.word_tokenize(sentence), leng)

            for grams in n_grams:
                res = ' '.join(grams)
                temp_set.add(res)
                # print(temp_set)

            for key in temp_set:
                split_t = tuple(key.split(" "))
                res_ngram[leng].append(split_t)

    return res_ngram

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nurrizkyimani/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [13]:
# create a function that count the ngram occurance
def counter_for_ngram(ngram_dict, min_length, max_length):
    lengths = range(min_length, max_length + 1)
    ngram_count_res = {length : [] for length in lengths}

    for key in ngram_dict:
        ng_count = Counter(ngram_dict[key])
        ngram_count_res[key] = ng_count

    return ngram_count_res

In [122]:
param_span_t = 3
clickbait_ngrams_v2= ngram_maker_v2(list_sentences=clickbait_l_c, min_length=1, max_length=param_span_t)
clickbait_ngram_count_v2 = counter_for_ngram(ngram_dict=clickbait_ngrams_v2,min_length=1, max_length=param_span_t)

non_clickbait_ngrams_v2= ngram_maker_v2(list_sentences=nonclickbait_l_c, min_length=1, max_length=param_span_t)
non_clickbait_ngram_count_v2 = counter_for_ngram(ngram_dict=non_clickbait_ngrams_v2,min_length=1, max_length=param_span_t)

In [15]:
# creat function for get ngram from the word in the one sentence reference only
def ngram_from_sentence_v3(ori_sentence, min_length, max_length):
    sent_in_arr = [ori_sentence]
    res_ngram_sentence = []
    stc_ngram_only = ngram_maker_v2(list_sentences=sent_in_arr, min_length=min_length, max_length=max_length)
    return stc_ngram_only

def count_ngram_sentence_onlyV3(ngram_dictionary, ngram_from_sentence, min_length, max_length):
    lengths = range(min_length, max_length + 1)
    ngram_compare_arr = []

    for length in lengths:
        for split in ngram_from_sentence[length]:
            ngram_compare_arr.append([split, ngram_dictionary[length][split]])

    return np.array(ngram_compare_arr)

In [213]:
# create function for get the attribute of the sentence

smoothing_parameter = 1

def array_to_string(a):
    return ' '.join(flatten(a))

# the word is in the final array;
def is_in_string_array_v2(elements, original):
    return np.isin(array_to_string(elements).split(), array_to_string(original).split()).any()

# the the attribute based on the sentence, with min_length, max_length and the reference clickbait/nonclickbait dictinoary
def get_attribute_sentence(sentence, min_length, max_length, ref_bait, param_threshold):
    # ngram sentence as reference
    sentence_ngram_ref = ngram_from_sentence_v3(ori_sentence=sentence, min_length=min_length, max_length=max_length)

    # count the clickbait sentence, clickbait ngram count as based reference
    counts_sentence_clickbait = count_ngram_sentence_onlyV3(clickbait_ngram_count_v2,sentence_ngram_ref,min_length,max_length)

    # count the non-clickbait sentence, non-clickbait ngram count as based reference
    counts_sentence_nonclickbait = count_ngram_sentence_onlyV3(non_clickbait_ngram_count_v2,sentence_ngram_ref,min_length,max_length)

    importances = 0

    csn_int = counts_sentence_nonclickbait[:,1]
    csn_ngram = counts_sentence_nonclickbait[:,0]

    csc_int = counts_sentence_clickbait[:,1]
    csc_ngram = counts_sentence_clickbait[:,0]

    if ref_bait == "to-clickbait":
        top = (csc_int + smoothing_parameter)
        bottom = (csn_int + smoothing_parameter)
        importances = (top/bottom)
    elif ref_bait == "to-non-clickbait":
        top = (csn_int+ smoothing_parameter)
        bottom = (csc_int + smoothing_parameter)
        importances = (top/bottom)

    # print(counts_sentence_clickbait)

    de = counts_sentence_clickbait[:, 0]
    print(de)
    print(importances)
    importances_t = np.vstack((importances, de)).T

    attribute_words = []
    for importance in importances_t:
        if importance[0] > param_threshold and not is_in_string_array_v2(importance[1], attribute_words):
            attribute_words.append(' '.join(importance[1]))

    return attribute_words

In [212]:
# run the get attribute sentence
test11 = get_attribute_sentence("viral ini ! bj habibie meninggal", 1, 2, "to-clickbait", 4 )

[[('ini',) 931]
 [('meninggal',) 34]
 [('viral',) 94]
 [('habibie',) 95]
 [('!',) 257]
 [('bj',) 82]
 [('bj', 'habibie') 82]
 [('!', 'bj') 0]
 [('habibie', 'meninggal') 13]
 [('ini', '!') 8]
 [('viral', 'ini') 0]]


  return np.array(ngram_compare_arr)


In [214]:
def separate(sentence, style_src):
    attributes = get_attribute_sentence(sentence, 1, 2, "to-clickbait", 2 )
    c = sentence

    replace_indexes = []

    for a in attributes:
        replace_index = -1
        replace_index = c.find(a)
        replace_indexes.append(replace_index)
        c = c.replace(a, " "*len(a))

    if len(attributes) == 0:
        return {'c': c, 'a': [], 'i': [], 's': sentence}

    replace_indexes, attributes = zip(*sorted(zip(replace_indexes, attributes)))
    return {'c': c, 'a': attributes, 'i': replace_indexes, 's': sentence}

def get_c(sentence, style):
    return re.sub(' +', ' ', separate(sentence, style)['c'])

def get_a(sentence, style):
    a = separate(sentence, style)['a']
    if len(a) > 0:
        return ' '.join(a)
    else:
        return ""

In [215]:
stc_test_retrieve = "kocak ! maling di rumah mewah jakut terekam cctv bingung cari jalan kabur"

get_attribute_sentence(stc_test_retrieve, 1, 2, "to-clickbait", 5)

separate(stc_test_retrieve, "to-clickbait")

[('cari',) ('kabur',) ('jakut',) ('kocak',) ('maling',) ('jalan',)
 ('bingung',) ('di',) ('cctv',) ('rumah',) ('!',) ('terekam',) ('mewah',)
 ('kocak', '!') ('cari', 'jalan') ('cctv', 'bingung') ('rumah', 'mewah')
 ('terekam', 'cctv') ('mewah', 'jakut') ('di', 'rumah') ('jalan', 'kabur')
 ('jakut', 'terekam') ('maling', 'di') ('bingung', 'cari') ('!', 'maling')]
[0.625 6.0 0.5555555555555556 10.0 2.0 0.36585365853658536 3.5
 0.5658499234303216 0.5 0.6511627906976745 43.0 5.0 2.3333333333333335 3.0
 2.0 2.0 1.0 2.0 2.0 1.2 2.0 2.0 2.0 2.0 2.0]
[('cari',) ('kabur',) ('jakut',) ('kocak',) ('maling',) ('jalan',)
 ('bingung',) ('di',) ('cctv',) ('rumah',) ('!',) ('terekam',) ('mewah',)
 ('kocak', '!') ('cari', 'jalan') ('cctv', 'bingung') ('rumah', 'mewah')
 ('terekam', 'cctv') ('mewah', 'jakut') ('di', 'rumah') ('jalan', 'kabur')
 ('jakut', 'terekam') ('maling', 'di') ('bingung', 'cari') ('!', 'maling')]
[0.625 6.0 0.5555555555555556 10.0 2.0 0.36585365853658536 3.5
 0.5658499234303216 0.5

  return np.array(ngram_compare_arr)


{'c': '        maling di rumah       jakut         cctv         cari jalan      ',
 'a': ('kocak', '!', 'mewah', 'terekam', 'bingung', 'kabur'),
 'i': (0, 6, 24, 36, 49, 68),
 's': 'kocak ! maling di rumah mewah jakut terekam cctv bingung cari jalan kabur'}

# Preprocessed : TFIDF

to get similar sentece in the clickbait headline list

In [80]:
# 3 of all headline list
# d_both
# nonclickbait_l_c
# clickbait_l_c

In [144]:
# import all needed librarty
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer=TfidfVectorizer(use_idf=True, stop_words=None)
tfidf_vectorizer_vectors=tfidf_vectorizer.fit_transform(d_both)
dict_idf = dict(zip(tfidf_vectorizer.get_feature_names(), tfidf_vectorizer.idf_))



In [209]:
def get_overlap(a, b):
    #     print(a, b)
    a_counter = collections.Counter(a.split())
    b_counter = collections.Counter(b.split())
    overlap = a_counter & b_counter
    return overlap

def get_weighted_overlap(a, b):
    overlap = get_overlap(a, b)
    a_counter = collections.Counter(a.split())
    #calculate
    weighted_overlap = 0
    for word in overlap:

        word_tf = a_counter[word]#/len(a.split()) -> commented out cause division by constant value doesn't matter

        get_idf = dict_idf.get(word)
        word_idf = 1 if get_idf == None else get_idf #get rid of error when idf not in dict

        word_tfidf = word_tf*word_idf
        weighted_overlap+=overlap[word]*word_tfidf

    return weighted_overlap

# 0 = nonclickbait_l_c
# 1 = clickbait_l_c

def get_closest_sentence_tfidf(sentence, style_src):
    opposite_dataset = clickbait_l_c if style_src else nonclickbait_l_c

    highest_overlap = 0
    closest_sentence = ""

    min_attribute_markers=len(get_attribute_sentence(sentence, 1, 2, "clickbait", 4 ))

    print(min_attribute_markers)
    num_markers = 0

    previous_sentences = []
    backoff_count = 0
    while(num_markers < min_attribute_markers and backoff_count < param_backoff_limit):
        for sentence_b in opposite_dataset:
            weighted_overlap = get_weighted_overlap(sentence, sentence_b)
            if weighted_overlap > highest_overlap and sentence_b not in previous_sentences:
                highest_overlap = weighted_overlap
                closest_sentence = sentence_b
        highest_overlap = 0
        backoff_count += 1
        previous_sentences.append(closest_sentence)
        num_markers = len((get_attribute_sentence(closest_sentence, 1, 2, "clickbait", 4 )))

    return closest_sentence

# Retrieve using tfidf
def retrieve(sentence, style_src):
    similar_stc =  get_closest_sentence_tfidf(sentence, 1)
    return separate(similar_stc, "clickbait")

In [216]:
# stc_test_retrieve = "rumah mewah"
# stc_test_retrieve = "jalan kabur"
# stc_test_retrieve = "tokoh papua"
stc_test_2 = "polisi pulangkan 56 mahasiswa setelah sempat ditahan usai demo di dpr"

t2 = get_closest_sentence_tfidf(stc_test_2, 1)
# retrieve(stc_test_retrieve, 0)

[('ditahan',) ('56',) ('setelah',) ('pulangkan',) ('polisi',)
 ('mahasiswa',) ('dpr',) ('usai',) ('di',) ('demo',) ('sempat',)
 ('ditahan', 'usai') ('di', 'dpr') ('mahasiswa', 'setelah')
 ('setelah', 'sempat') ('polisi', 'pulangkan') ('pulangkan', '56')
 ('usai', 'demo') ('56', 'mahasiswa') ('demo', 'di') ('sempat', 'ditahan')]
0


  return np.array(ngram_compare_arr)


ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 1 and the array at index 1 has size 21

# Generate: Dictionary Preprocess

NameError: name 'test_retrive' is not defined

# Dataset:  Train, Validation, Test Split

# NN Model : Encoder, Decoder, Seq2Seq

# Training: Training & Validation