#### Dataset preparation

##### Data loading

In [2]:
import json, gzip

def read_jsonl_gz(path):
    with gzip.open(path) as f:
        for l in f:
            yield json.loads(l)

val_data = list(read_jsonl_gz('D:/code/nlp_projects/l5/dataset/val.jsonl.gz'))
test_data = list(read_jsonl_gz('D:/code/nlp_projects/l5/dataset/test.jsonl.gz'))
train_data = list(read_jsonl_gz('D:/code/nlp_projects/l5/dataset/train.jsonl.gz'))

##### Data transform

In [6]:
import pandas as pd

def transform_row(ch):
    summ = ch['summary']
    articles = []
    for a in ch['articles']:
        articles.append({'text':a['text'], 'summary':summ})
    return articles

def transform_dataset(data):
    articles = []
    for r in data:
        articles.extend(transform_row(r))
    return pd.DataFrame(articles)
        
        
val_df = transform_dataset(val_data)
train_df = transform_dataset(train_data).append(transform_dataset(test_data), ignore_index=True)
print(train_df.head())

                                                text  \
0  Rodrigo Duterte, the new president of the Phil...   
1  Rodrigo Duterte, the new president of the Phil...   
2  Is the media serving justice to alleged victim...   
3  Jaypee Bertes was bruised and battered, his ar...   
4  CAGAYAN DE ORO CITY (MindaNews/24 June) — Memb...   

                                             summary  
0  The death toll in Rodrigo Duterte's war on dru...  
1  The death toll in Rodrigo Duterte's war on dru...  
2  The death toll in Rodrigo Duterte's war on dru...  
3  The death toll in Rodrigo Duterte's war on dru...  
4  The death toll in Rodrigo Duterte's war on dru...  


  train_df = transform_dataset(train_data).append(transform_dataset(test_data), ignore_index=True)


##### Data cleaning

In [7]:
import re

#Removes non-alphabetic characters:
def text_strip(column):
    for row in column:
        
        row=re.sub("(\\t)", ' ', str(row)).lower() #delete escape charecters
        row=re.sub("(\\r)", ' ', str(row)).lower() 
        row=re.sub("(\\n)", ' ', str(row)).lower()
        
        row=re.sub("(__+)", ' ', str(row)).lower()   #delete _ if more than one time repeatedly
        row=re.sub("(--+)", ' ', str(row)).lower()   #delete - if more than one time repeatedly
        row=re.sub("(~~+)", ' ', str(row)).lower()   #delete ~ if more than one time repeatedly
        row=re.sub("(\+\++)", ' ', str(row)).lower()   #delete + if more than one time repeatedly
        row=re.sub("(\.\.+)", ' ', str(row)).lower()   #delete . if more than one time repeatedly
        
        row=re.sub(r"[<>()|&©ø\[\]\'\",;?~*!]", ' ', str(row)).lower() #delete <>()|&©ø"',;?~*!
        
        row=re.sub("(mailto:)", ' ', str(row)).lower() #delete mailto:
        row=re.sub(r"(\\x9\d)", ' ', str(row)).lower() #delete \x9* in text
        row=re.sub("([iI][nN][cC]\d+)", 'INC_NUM', str(row)).lower() #replace INC nums to INC_NUM
        row=re.sub("([cC][mM]\d+)|([cC][hH][gG]\d+)", 'CM_NUM', str(row)).lower() #replace CM# and CHG# to CM_NUM
        
        
        row=re.sub("(\.\s+)", ' ', str(row)).lower() #delete full stop at end of words(not between)
        row=re.sub("(\-\s+)", ' ', str(row)).lower() #delete - at end of words(not between)
        row=re.sub("(\:\s+)", ' ', str(row)).lower() #delete : at end of words(not between)
        
        row=re.sub("(\s+.\s+)", ' ', str(row)).lower() #delete any single charecters hanging between 2 spaces
        
        row = re.sub("(\s+)",' ',str(row)).lower() #delete multiple spaces
        
        #Should always be last
        row=re.sub("(\s+.\s+)", ' ', str(row)).lower() #delete any single charecters hanging between 2 spaces

        yield row

In [8]:
clean_train_text = text_strip(train_df['text'])
clean_train_summ = text_strip(train_df['summary'])

clean_val_text = text_strip(val_df['text'])
clean_val_summ = text_strip(val_df['summary'])

In [15]:
import spacy
from tqdm import tqdm
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser']) 


train_text = [str(doc) for doc in tqdm(nlp.pipe(clean_train_text, batch_size=50))]
train_summary = ['_START_ '+ str(doc) + ' _END_' for doc in tqdm(nlp.pipe(clean_train_summ, batch_size=50))]

val_text = [str(doc) for doc in tqdm(nlp.pipe(clean_val_text, batch_size=50))]
val_summary = ['_START_ '+ str(doc) + ' _END_' for doc in tqdm(nlp.pipe(clean_val_summ, batch_size=50))]

561867it [3:53:38, 40.08it/s] 
571868it [20:06, 474.15it/s]
77707it [33:04, 39.17it/s] 
77707it [03:01, 427.23it/s]


In [1]:
print(train_text)

NameError: name 'train_text' is not defined