In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = ''

In [2]:
# !wget https://huggingface.co/datasets/mesolitica/malaysian-news/resolve/main/news-2022-11-18.json

In [3]:
import json
import malaya
from tqdm import tqdm
from unidecode import unidecode

In [4]:
import re

def cleaning(string):
    splitted = malaya.text.function.split_into_sentences(string)
    if not len(splitted):
        splitted = '. '.join([k.strip() for k in string.split('.') if len(k.strip())])
    if splitted[0][0] == '-':
        splitted[0] = splitted[0].replace('- ','')
    points = [f'{no + 1}. {s}' for no, s in enumerate(splitted)]
    points = ' '.join(points)
    return points

def simple_cleaning(string):
    return re.sub(r'[ ]+', ' ', unidecode(string).replace('\n', ' ').replace('--', ' ').replace('/', ' ')).strip()

In [5]:
!tail -n 2 populate-news.json.semisupervised

{"title": "Pelancong ke M'sia nak tengok Mat, bukan Doraemon", "url": "https://www.malaysiakini.com/letters/473507", "authors": [], "top-image": "https://i.malaysiakini.com/777/13e94b92eae8996a850667bb8c2c06a2.jpg", "text": "Saya pertama kali menonton watak Doraemon di televisyen sekitar tahun 1993, ketika sedang dalam latihan amali di sebuah kilang di Sungai Way.\n\nSaya dalam satu rombongan ke Cameron Highlands bersama-sama pekerja kilang dan kami menginap di sebuah banglo di sana. Sebaik sampai, kami berehat dan menonton televisyen beramai-ramai di ruang tamu dan itulah pertama kali menyaksikan watak kartun itu. Sebelumnya, saya hanya mendengar dari cakap-cakap orang sahaja.\n\nMaknanya, Doraemon ini sudah masuk ke negara kita begitu lama. Sebelum tahun 1993, ianya telah bertapak sebagai satu watak kegemaran kanak-kanak (dan sebilangan orang dewasa juga).\n\nIa seperti juga watak Ultraman, telah menjadi topik perbualan sejak dari waktu itu lagi. Menariknya, Ultraman telah bertukar g

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from malaya.text.vectorizer import SkipGramCountVectorizer

stopwords = malaya.text.function.get_stopwords()
bow = CountVectorizer(
    ngram_range = (1, 4),
    stop_words = stopwords,
    lowercase = False,
)

stopwords = malaya.text.function.get_stopwords()
skip_bow = SkipGramCountVectorizer(
    ngram_range = (1, 4),
    stop_words = stopwords,
    lowercase = False,
    skip = 2
)

In [7]:
before, after = [], []

count = 0
rejected = []
languages = []
accepted = []
para = []
malaysian_news = {
    'kosmo',
    'hmetro',
    'malaymail',
    'projekmm',
    'bharian',
    'utusan',
    'astroawani',
    'themalaysianinsight',
    'malaysiakini',
    'bernama'
}

def reject(data):
    if data['news'] in malaysian_news:
        return False
    if any([n in data['top-image'] for n in malaysian_news]):
        return False
    if any([n in data['url'] for n in malaysian_news]):
        return False
    if 'com.my' in data['top-image']:
        return False
    if data['language'] == 'malay':
        return False
    if 'Siaran Pers' in data['news']:
        return True
    if '.id' in data['news']:
        return True
    
    return True

with open('populate-news.json.semisupervised') as fopen:
    for l in tqdm(fopen):
        data = json.loads(l)
        text = re.sub(r'[ ]+', ' ', data['text']).strip()
        if 'kindly register' in text.lower() or 'disabled in your browser' in text.lower():
            continue
        if len(text.split()) < 30:
            rejected.append(data)
            continue
            
        if data['language'] == 'ENGLISH':
            continue
            
        accepted.append(data)
            
        
#         befores = data['semisupervised-summaries']
#         for b in befores:
#             b = cleaning(b)
#             if len(b.split()) < 3:
#                 continue
#             before.append(b)
#             after.append(text)
            
#         if count == 10:
#             break
            
        count += 1

81717it [00:05, 15792.67it/s]


In [8]:
len(accepted)

79560

In [9]:
accepted[0]

{'title': 'Ibu saudara Haziq terharu sokongan rakyat Malaysia',
 'url': 'https://www.themalaysianinsight.com/bahasa/s/142491',
 'news': 'themalaysianinsight',
 'language': 'malay',
 'top-image': 'https://www.themalaysianinsight.com/resources/stories_images/142491/perhimpunanan_solidarity_kedamaian_03__full.jpg',
 'text': 'WARIS keluarga Allahyarham Muhammad Haziq Mohd Tarmizi, 17, yang terkorban dalam tragedi tembakan di Christchurch, New Zealand, pada 15 Mac lepas, melahirkan rasa terharu akan sokongan diberi rakyat Malaysia semasa perhimpunan Solidariti Kedamaian.\n\nZarina Shuib , ibu saudara Muhammad Haziq, memanjatkan kesyukuran kepada Allah SWT kerana berkesempatan menyertai rakyat Malaysia dalam perhimpunan itu di Kuala Lumpur hari ini.',
 'date': '2019-03-23T03:52:02',
 'date_utc': '2019-03-22T19:52:02',
 'semisupervised-summaries': ['Perhimpunan Solidariti Kedamaian () adalah perhimpunan solidariti yang diadakan di Kuala Lumpur, Malaysia, pada 15 Mac 2013.',
  'WARIS keluarga 

In [60]:
rejected = {'saya', 'awak', 'kami', 'dia', 'mereka', 'anda', 'kita'}

tokenize = malaya.tokenizer.Tokenizer().tokenize

def headline(string, length = 300):
    splitted = malaya.text.function.split_into_sentences(string)
    splitted = [s for s in splitted if all([r not in s for r in rejected])]
    selected, index, results = '', 0, []
    while len(selected) <= length and index < len(splitted):
        selected += splitted[index]
        results.append(splitted[index])
        index += 1
    return ' '.join(results)

def reject(string):
    splitted = malaya.text.function.split_into_sentences(string)
    splitted = [s for s in splitted if not len(set(tokenize(s.lower())) & rejected)]
    return ' '.join(splitted)

In [57]:
s = 'IBU negara Korea Selatan, Seoul menduduki tempat ketujuh dalam senarai bandar raya paling mahal untuk didiami pada tahun ini, lapor agensi berita Yonhap.'

In [58]:
reject(accepted[i]['text'])

['KUALA LUMPUR, 26 Mac (Bernama) -- Malaysia akan sentiasa bertegas dan vokal dalam isu yang melibatkan penindasan ke atas etnik Rohingya di Myanmar dan konsisten untuk membantu pelarian tersebut daripada terus dinafikan hak mereka, kata Menteri Luar Datuk Saifuddin Abdullah.', 'Beliau berkata tiga perkara utama yang akan diteruskan kerajaan ialah menegakkan keadilan dan membawa mereka yang terlibat dalam pelanggaran hak asasi manusa itu ke pengadilan dan diadili mengikut undang-undang antarabangsa, pemberian status kewarganegaraan Myanmar kepada etnik Rohingya dan membawa pulang mereka yang melarikan diri ke Bangladesh untuk pulang ke Myanmar".', 'Malaysia memainkan peranan yang besar khususnya di Hospital Cox\'s Bazar, Bangladesh dan kita juga membantu mereka di dalam negara".', 'Semuanya telah kita laksanakan tetapi isu ini masih juga belum selesai.', 'Kita bimbang jika ia tidak diselesaikan, ia semakin berpanjangan dan lebih ramai (Etnik Rohingya di Myanmar) yang akan melarikan dir

'KUALA LUMPUR, 26 Mac (Bernama) -- Malaysia akan sentiasa bertegas dan vokal dalam isu yang melibatkan penindasan ke atas etnik Rohingya di Myanmar dan konsisten untuk membantu pelarian tersebut daripada terus dinafikan hak mereka, kata Menteri Luar Datuk Saifuddin Abdullah. Beliau berkata tiga perkara utama yang akan diteruskan kerajaan ialah menegakkan keadilan dan membawa mereka yang terlibat dalam pelanggaran hak asasi manusa itu ke pengadilan dan diadili mengikut undang-undang antarabangsa, pemberian status kewarganegaraan Myanmar kepada etnik Rohingya dan membawa pulang mereka yang melarikan diri ke Bangladesh untuk pulang ke Myanmar". Malaysia memainkan peranan yang besar khususnya di Hospital Cox\'s Bazar, Bangladesh dan kita juga membantu mereka di dalam negara". Semuanya telah kita laksanakan tetapi isu ini masih juga belum selesai. Kita bimbang jika ia tidak diselesaikan, ia semakin berpanjangan dan lebih ramai (Etnik Rohingya di Myanmar) yang akan melarikan diri dan lebih r

In [13]:
import random

In [14]:
keywords = malaya.keyword.extractive.rake(accepted[2]['text'], 
                                                      vectorizer = bow, 
                                                      top_k = random.randint(3, 10))
keywords = [k[1] for k in keywords]
keywords_rake = malaya.keyword.extractive.rake(accepted[2]['text'], atleast = 1,
                                          top_k = random.randint(3, 10))
keywords_rake = [k[1] for k in keywords_rake]
keywords, keywords_rake

(['Hospital Pulau Pinang mangsa',
  'Pulau Pinang mangsa menafikan',
  'mangsa laporan polis mangsa',
  'kelmarin Christopher individu nama'],
 ['Ketua Polis Daerah Barat Daya',
  '000 menerusi nombor akaun CIMB',
  'individu nama Lee Song',
  'mangsa menafikan mengenali individu',
  'nama Christopher Anak Anor',
  'disambungkan menerusi telefon',
  'PULAU PINANG 23 Mac'])

In [15]:
months = {
    'january',
    'jan',
    'januari',
    'february',
    'feb',
    'februari',
    'march',
    'mac',
    'april',
    'apr',
    'may',
    'mei',
    'june',
    'jun',
    'july',
    'julai',
    'august',
    'ogos',
    'aug',
    'september',
    'sep',
    'october',
    'oktober',
    'oct',
    'november',
    'nov',
    'december',
    'disember',
    'dec',
    'utusan',
    'malaysiakini',
    'astroawani',
    'bernama',
    'com',
}

In [16]:
from malaya.text.rouge import postprocess_summary, filter_rouge, _get_word_ngrams, _rouge_clean, cal_rouge
from malaya.text.function import split_into_sentences

def filtering_rouge(summary, contents, start = 0.15, increment = 0.05, break_at = 120, n = 1):
    filtered = contents[:]
    i = 0
    while len(filtered.split()) > break_at:
        filtered = filter_rouge(summary, filtered, n = n, threshold = start + increment * i)
        i += 1
    return filtered

In [17]:
i = 2
keywords = malaya.keyword.extractive.rake(accepted[i]['text'], 
                                                  vectorizer = bow, 
                                                  top_k = 50)
keywords = [simple_cleaning(k[1]) for k in keywords if len(k[1].split()) > 1 and len(k[1]) > 10 \
                and len(set(k[1].lower().replace('-', '').split()) & months) == 0]
keywords = keywords[:random.randint(3, 10)]
keywords, 

(['Hospital Pulau Pinang mangsa',
  'Pulau Pinang mangsa menafikan',
  'mangsa laporan polis mangsa',
  'kelmarin Christopher individu nama'],)

In [61]:
before, after = [], []
for i in tqdm(range(len(accepted))):
    h = reject(accepted[i]['text'])
    try:

        keywords = malaya.keyword.extractive.rake(h, vectorizer = bow, top_k = 100)
        keywords = [simple_cleaning(k[1]) for k in keywords if len(k[1].split()) > 1 and len(k[1]) > 10 \
                        and len(set(k[1].lower().replace('-', '').split()) & months) == 0]
        keywords = keywords[:random.randint(3, 10)]
        
        r = filtering_rouge('. '.join(keywords), h, n = 2, start = 0.01)
        
        if len(r.split()) > 20:
            before.append(keywords)
            after.append(r)
        
        keywords_rake = malaya.keyword.extractive.rake(h, top_k = 100)
        keywords_rake = [simple_cleaning(k[1]) for k in keywords_rake if len(k[1].split()) > 1 and len(k[1]) > 10 \
                        and len(set(k[1].lower().replace('-', '').split()) & months) == 0]
        keywords_rake = keywords_rake[:random.randint(3, 10)]
        r = filtering_rouge('. '.join(keywords_rake), h, n = 2, start = 0.01)
        
        if len(r.split()) > 20:
            before.append(keywords_rake)
            after.append(r)
        
    except Exception as e:
        # print(h, accepted[i]['text'])
        print(e)

  0%|▏                                     | 274/79560 [00:02<10:00, 132.05it/s]

empty vocabulary; perhaps the documents only contain stop words


  0%|▏                                     | 381/79560 [00:03<10:40, 123.55it/s]

empty vocabulary; perhaps the documents only contain stop words


  1%|▎                                     | 608/79560 [00:04<09:35, 137.28it/s]

empty vocabulary; perhaps the documents only contain stop words
empty vocabulary; perhaps the documents only contain stop words


  1%|▍                                     | 933/79560 [00:07<11:19, 115.76it/s]

empty vocabulary; perhaps the documents only contain stop words


  2%|▌                                    | 1293/79560 [00:10<10:57, 119.01it/s]

empty vocabulary; perhaps the documents only contain stop words
empty vocabulary; perhaps the documents only contain stop words


  3%|█▏                                   | 2453/79560 [00:21<10:18, 124.76it/s]

empty vocabulary; perhaps the documents only contain stop words


  4%|█▎                                   | 2793/79560 [00:23<10:32, 121.30it/s]

empty vocabulary; perhaps the documents only contain stop words


  4%|█▌                                    | 3374/79560 [00:29<16:11, 78.46it/s]

empty vocabulary; perhaps the documents only contain stop words


  5%|█▋                                    | 3639/79560 [00:32<13:03, 96.85it/s]

empty vocabulary; perhaps the documents only contain stop words
empty vocabulary; perhaps the documents only contain stop words


  5%|█▊                                   | 3849/79560 [00:34<11:24, 110.65it/s]

empty vocabulary; perhaps the documents only contain stop words


  6%|██▏                                  | 4630/79560 [00:43<11:49, 105.64it/s]

empty vocabulary; perhaps the documents only contain stop words


  6%|██▏                                  | 4727/79560 [00:44<11:11, 111.48it/s]

empty vocabulary; perhaps the documents only contain stop words


  7%|██▌                                  | 5637/79560 [00:53<09:05, 135.57it/s]

empty vocabulary; perhaps the documents only contain stop words


  8%|██▊                                  | 6078/79560 [00:57<09:06, 134.44it/s]

empty vocabulary; perhaps the documents only contain stop words


  9%|███▏                                 | 6831/79560 [01:03<09:12, 131.66it/s]

empty vocabulary; perhaps the documents only contain stop words


  9%|███▎                                 | 7000/79560 [01:04<10:05, 119.76it/s]

empty vocabulary; perhaps the documents only contain stop words


  9%|███▎                                 | 7228/79560 [01:06<10:13, 117.95it/s]

empty vocabulary; perhaps the documents only contain stop words


 10%|███▊                                 | 8270/79560 [01:16<09:55, 119.64it/s]

empty vocabulary; perhaps the documents only contain stop words


 11%|████                                 | 8632/79560 [01:19<10:30, 112.43it/s]

empty vocabulary; perhaps the documents only contain stop words


 13%|████▋                                | 9977/79560 [01:32<10:05, 114.89it/s]

empty vocabulary; perhaps the documents only contain stop words


 14%|████▉                               | 11029/79560 [01:43<08:10, 139.58it/s]

empty vocabulary; perhaps the documents only contain stop words


 14%|█████                               | 11267/79560 [01:44<09:06, 125.06it/s]

empty vocabulary; perhaps the documents only contain stop words


 15%|█████▎                              | 11654/79560 [01:47<08:53, 127.32it/s]

empty vocabulary; perhaps the documents only contain stop words


 15%|█████▎                              | 11683/79560 [01:48<08:41, 130.14it/s]

empty vocabulary; perhaps the documents only contain stop words


 15%|█████▎                              | 11828/79560 [01:49<09:42, 116.35it/s]

empty vocabulary; perhaps the documents only contain stop words


 15%|█████▍                              | 12046/79560 [01:51<09:38, 116.70it/s]

empty vocabulary; perhaps the documents only contain stop words


 16%|█████▋                              | 12516/79560 [01:55<10:58, 101.85it/s]

empty vocabulary; perhaps the documents only contain stop words


 17%|██████                              | 13291/79560 [02:04<10:32, 104.80it/s]

empty vocabulary; perhaps the documents only contain stop words


 18%|██████▎                             | 13953/79560 [02:09<08:03, 135.82it/s]

empty vocabulary; perhaps the documents only contain stop words


 18%|██████▎                             | 14042/79560 [02:10<10:07, 107.84it/s]

empty vocabulary; perhaps the documents only contain stop words


 18%|██████▎                             | 14078/79560 [02:10<09:58, 109.38it/s]

empty vocabulary; perhaps the documents only contain stop words


 18%|██████▍                             | 14142/79560 [02:11<08:57, 121.80it/s]

empty vocabulary; perhaps the documents only contain stop words


 18%|██████▍                             | 14193/79560 [02:11<09:33, 113.91it/s]

empty vocabulary; perhaps the documents only contain stop words


 20%|███████▏                            | 15787/79560 [02:27<08:24, 126.39it/s]

empty vocabulary; perhaps the documents only contain stop words


 21%|███████▍                            | 16450/79560 [02:32<07:53, 133.28it/s]

empty vocabulary; perhaps the documents only contain stop words


 21%|███████▌                            | 16833/79560 [02:35<09:11, 113.75it/s]

empty vocabulary; perhaps the documents only contain stop words


 21%|███████▋                            | 17019/79560 [02:37<09:51, 105.73it/s]

empty vocabulary; perhaps the documents only contain stop words


 23%|████████▍                           | 18681/79560 [02:52<08:06, 125.25it/s]

empty vocabulary; perhaps the documents only contain stop words


 24%|████████▋                           | 19092/79560 [02:56<09:22, 107.56it/s]

empty vocabulary; perhaps the documents only contain stop words


 24%|████████▉                            | 19331/79560 [02:58<10:11, 98.45it/s]

empty vocabulary; perhaps the documents only contain stop words


 25%|█████████                           | 19965/79560 [03:05<09:30, 104.47it/s]

empty vocabulary; perhaps the documents only contain stop words


 25%|█████████                           | 20004/79560 [03:06<08:24, 117.99it/s]

empty vocabulary; perhaps the documents only contain stop words
empty vocabulary; perhaps the documents only contain stop words


 26%|█████████▌                          | 21062/79560 [03:14<06:44, 144.68it/s]

empty vocabulary; perhaps the documents only contain stop words


 27%|█████████▊                          | 21691/79560 [03:20<08:32, 112.87it/s]

empty vocabulary; perhaps the documents only contain stop words
empty vocabulary; perhaps the documents only contain stop words


 28%|██████████                          | 22116/79560 [03:23<07:57, 120.34it/s]

empty vocabulary; perhaps the documents only contain stop words


 28%|██████████                          | 22197/79560 [03:24<07:41, 124.31it/s]

empty vocabulary; perhaps the documents only contain stop words


 28%|██████████                          | 22275/79560 [03:25<07:47, 122.50it/s]

empty vocabulary; perhaps the documents only contain stop words


 28%|██████████▏                         | 22472/79560 [03:26<07:32, 126.03it/s]

empty vocabulary; perhaps the documents only contain stop words


 32%|███████████▊                         | 25345/79560 [03:57<12:51, 70.28it/s]

empty vocabulary; perhaps the documents only contain stop words


 32%|███████████▌                        | 25419/79560 [03:57<07:57, 113.44it/s]

empty vocabulary; perhaps the documents only contain stop words


 33%|████████████                        | 26597/79560 [04:10<07:12, 122.38it/s]

empty vocabulary; perhaps the documents only contain stop words


 34%|████████████▏                       | 26865/79560 [04:13<08:21, 105.04it/s]

empty vocabulary; perhaps the documents only contain stop words


 39%|██████████████▍                      | 30987/79560 [05:03<10:00, 80.90it/s]

empty vocabulary; perhaps the documents only contain stop words


 43%|███████████████▊                     | 33992/79560 [05:42<09:08, 83.12it/s]

empty vocabulary; perhaps the documents only contain stop words


 46%|█████████████████▏                   | 36901/79560 [06:20<08:47, 80.90it/s]

empty vocabulary; perhaps the documents only contain stop words


 47%|█████████████████▍                   | 37466/79560 [06:27<09:01, 77.76it/s]

empty vocabulary; perhaps the documents only contain stop words


 48%|█████████████████▎                  | 38235/79560 [06:37<06:27, 106.66it/s]

empty vocabulary; perhaps the documents only contain stop words


 51%|██████████████████▎                 | 40597/79560 [07:06<06:23, 101.58it/s]

empty vocabulary; perhaps the documents only contain stop words


 56%|████████████████████▊                | 44695/79560 [07:58<08:03, 72.08it/s]

empty vocabulary; perhaps the documents only contain stop words


 61%|██████████████████████▌              | 48464/79560 [08:44<05:56, 87.20it/s]

empty vocabulary; perhaps the documents only contain stop words


 67%|████████████████████████▋            | 53097/79560 [09:39<05:07, 86.08it/s]

empty vocabulary; perhaps the documents only contain stop words


 73%|██████████████████████████▉          | 57977/79560 [10:40<04:08, 86.93it/s]

empty vocabulary; perhaps the documents only contain stop words


 80%|█████████████████████████████▍       | 63253/79560 [11:46<03:06, 87.26it/s]

empty vocabulary; perhaps the documents only contain stop words


 81%|████████████████████████████▉       | 64066/79560 [11:56<02:30, 102.75it/s]

empty vocabulary; perhaps the documents only contain stop words


 84%|██████████████████████████████▎     | 67048/79560 [12:31<01:14, 167.35it/s]

empty vocabulary; perhaps the documents only contain stop words


 88%|███████████████████████████████▌    | 69800/79560 [13:04<01:29, 109.24it/s]

empty vocabulary; perhaps the documents only contain stop words


 88%|███████████████████████████████▌    | 69849/79560 [13:05<01:26, 111.98it/s]

empty vocabulary; perhaps the documents only contain stop words


 90%|█████████████████████████████████▎   | 71756/79560 [13:26<01:42, 75.89it/s]

empty vocabulary; perhaps the documents only contain stop words


 94%|█████████████████████████████████▉  | 74950/79560 [14:03<00:40, 113.07it/s]

empty vocabulary; perhaps the documents only contain stop words


 94%|██████████████████████████████████▉  | 75101/79560 [14:04<00:52, 85.63it/s]

empty vocabulary; perhaps the documents only contain stop words


 96%|███████████████████████████████████▌ | 76374/79560 [14:18<00:34, 92.28it/s]

empty vocabulary; perhaps the documents only contain stop words


 97%|███████████████████████████████████▋ | 76779/79560 [14:23<00:28, 96.07it/s]

empty vocabulary; perhaps the documents only contain stop words


 99%|████████████████████████████████████▋| 78945/79560 [14:48<00:06, 97.15it/s]

empty vocabulary; perhaps the documents only contain stop words


100%|█████████████████████████████████████| 79560/79560 [14:55<00:00, 88.85it/s]


In [62]:
len(before), len(after)

(155967, 155967)

In [64]:
before[100], after[100]

(['Perdana Menteri Pakistan Imran Khan menyifatkan rakan sejawatnya Pertana Menteri Malaysia',
  'krisis ekonomi melanda Asia Tenggara',
  'meningkatkan taraf hidup rakyatnya',
  'tenaga kerja warganegara Pakistan',
  'isu-isu membabitkan dunia Islam',
  'pengganas menembak mati',
  'peluang-peluang pekerjaan diwujudkan',
  'umat Islam dilayan',
  '3 bilion umat Islam',
  'menjejaskan dunia Islam'],
 ': Perdana Menteri Pakistan Imran Khan menyifatkan rakan sejawatnya Pertana Menteri Malaysia Tun Dr Mahathir Mohamad sebagai negarawan Islam yang telah merubah Malaysia menjadi sebuah model teladan untuk dunia Islam. Beliau berkata Pakistan berbangga dengan cara Malaysia membangun dan meningkatkan taraf hidup rakyatnya di bawah kepimpinan Dr Mahathir". Seperti memerangi keganasan, bagaimana ia telah menjejaskan dunia Islam. Bagaimana umat Islam dilayan disebabkan perjuangan politik yang mengelirukan dan disengajakan". Kini tenaga kerja warganegara Pakistan sudah mula pergi ke Malaysia... p

In [65]:
with open('keywords-headline.json', 'w') as fopen:
    json.dump({'before': before, 'after': after}, fopen)