In [1]:
import json
import spacy
from glob import glob

nlp = spacy.load('en_core_web_md')

In [2]:
files = glob('KPTimes.*.jsonl')
files

['KPTimes.valid.jsonl', 'KPTimes.train.jsonl', 'KPTimes.test.jsonl']

In [3]:
def case_of(text):
    return (
        str.upper
        if text.isupper()
        else str.lower
        if text.islower()
        else str.title
        if text.istitle()
        else str
    )

def capitalize(string, keyword):
    keywords = keyword.split(';')
    doc = nlp(string)
    entities = [entity.text for entity in doc.ents]
    entities_short = [''.join([w[0] for w in word.split()]) for word in entities]
    tainted = {}
    
    actual_keywords = []
    for key in keywords:
        for no, entity in enumerate(entities_short):
            if entity.lower().find(key) > 0:
                result = []
                index = 0
                for i in range(entity.lower().index(key), len(entity)):
                    result.append(case_of(entity[i])(key[index]))
                    index += 1
                actual_keywords.append(''.join(result))
                actual_keywords.append(entities[no])
                tainted[key] = True
                
    
    string_lower = string.lower().split()
    string = string.split()
    for key in keywords:
        if key in string_lower:
            actual_keywords.append(string[string_lower.index(key)])
            tainted[key] = True
            
    
    for key in keywords:
        if not tainted.get(key, False):
            result = []
            for token in nlp(key):
                if len(token.ent_type_):
                    if token.ent_type_ in ['TIME', 'MONEY']:
                        t = token.text.upper()
                    else:
                        t = token.text.title()

                else:
                    t = token.text
                result.append(t)
            actual_keywords.append(' '.join(result))
    
    actual_keywords = list(set(actual_keywords))
        
    return actual_keywords

In [4]:
from tqdm import tqdm

X, Y, titles = [], [], []

for file in files:
    with open(file) as fopen:
        data = list(filter(None, fopen.read().split('\n')))
    
    print(file)
    
    for i in tqdm(range(len(data))):
    
        row = json.loads(data[i])
        keywords = capitalize(row['abstract'], row['keyword'])
        X.append(row['abstract'])
        Y.append(keywords)
        titles.append(row['title'])

  0%|          | 0/10000 [00:00<?, ?it/s]

KPTimes.valid.jsonl


100%|██████████| 10000/10000 [30:21<00:00,  5.49it/s] 
  0%|          | 0/259923 [00:00<?, ?it/s]

KPTimes.train.jsonl


 31%|███       | 81003/259923 [4:03:40<13:10:07,  3.77it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 34%|███▍      | 89552/259923 [4:29:25<8:35:11,  5.51it/s] IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 39%|███▉      | 101362/259923 [5:05:04<6:56:11,  6.35it/s] IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config vari

KPTimes.test.jsonl


  1%|          | 165/20000 [00:22<45:17,  7.30it/s]  


IndexError: string index out of range

In [9]:
file = 'KPTimes.test.jsonl'

with open(file) as fopen:
    data = list(filter(None, fopen.read().split('\n')))

print(file)

for i in tqdm(range(165, len(data))):
    try:
        row = json.loads(data[i])
        keywords = capitalize(row['abstract'], row['keyword'])
        X.append(row['abstract'])
        Y.append(keywords)
        titles.append(row['title'])
    except:
        pass

  0%|          | 1/19835 [00:00<53:12,  6.21it/s]

KPTimes.test.jsonl


100%|██████████| 19835/19835 [57:25<00:00,  5.76it/s]  


In [10]:
titles[-1]

'E.U. Leaders Chart Future as David Cameron Leaves Brussels'

In [11]:
Y[-1]

['Great Britain', 'David Cameron', 'Eu', 'Brexit']

In [13]:
with open('kptimes.json', 'w') as fopen:
    json.dump({'X': X, 'Y': Y, 'titles': titles}, fopen)

In [15]:
import re
import json

def cleaning(string):
    string = re.sub(r'[ ]+', ' ', string.replace('\n',' ')).strip()
    return string

def limit(string, max_len = 3500):
    string = string.split()
    r = ''
    for s in string:
        if len(r + ' ' + s) > max_len:
            break
        r = r + ' ' + s
    return cleaning(r)

In [16]:
limit(X[-1])

'BRUSSELS — With Britain’s prime minister omitted for the first time, European Union leaders at a summit meeting wrestled on Wednesday with an existential question: how to salvage a venture that has provided peace and relative prosperity to 500 million people but has lost public support. “We all need to wake up and smell the coffee,” President Dalia Grybauskaite of Lithuania said at the start of private talks in Brussels on how to relaunch the European Union after the shock of Britain’s vote last week to leave. In the absence of Prime Minister David Cameron of Britain, who was already back in London after attending an initial day of talks, the group’s remaining 27 leaders all agreed that the European Union needs to change the way it works if it is to curb a rising tide of populism driven in large part by hostility toward Brussels. Presenting the leaders with his own analysis of why 52 percent of Britons had voted to withdraw from the European Union, Mr. Cameron, at a somber dinner late

In [18]:
titles[-1]

'E.U. Leaders Chart Future as David Cameron Leaves Brussels'

In [19]:
combined = f"{titles[-1]} [[EENNDD]] {'; '.join(Y[-1])}"
combined

'E.U. Leaders Chart Future as David Cameron Leaves Brussels [[EENNDD]] Great Britain; David Cameron; Eu; Brexit'

In [22]:
import re
from tqdm import tqdm
import json

data = []

for i in tqdm(range(len(X))):
    string = X[i]
    string = cleaning(string)
    string = limit(string)
    keywords = Y[i]
    combined = f"{string} [[EENNDD]] {'; '.join(keywords)}"
    
    data.append({'string': string, 'keywords': keywords, 'combined': combined})
    
    combined = f"{titles[i]} [[EENNDD]] {'; '.join(keywords)}"
    
    data.append({'string': titles[i], 'keywords': keywords, 'combined': combined})

100%|██████████| 289904/289904 [06:01<00:00, 802.24it/s] 


In [23]:
len(data)

579808

In [24]:
batch_size = 50000
for i in range(0, len(data), batch_size):
    index = min(i + batch_size, len(data))
    x = data[i: index]
    with open(f'kptimes-{i}.json', 'w') as fopen:
        json.dump(x, fopen)