In [1]:
import pandas as pd
import numpy as np
import re

import os
from bs4 import BeautifulSoup

# Get evaluation dataset from TIPSTER data

### Get files

`./resources/tipster/*.xml`

In [2]:
path = "resources/tipster/"
ext = ".xml"

filenames = []
for _, _, files in os.walk(path):
    filenames = [os.path.join(path, f) for f in files if f.endswith(ext)]
print(len(filenames))

183


### Store text in dictionary

* Get text
* Get BeautifulSoup to split abstract and text from xml

In [3]:
data = {"abstract":[], "text":[]}

In [4]:
for file in filenames:
    with open(file, "r", encoding="latin1") as f:
        xml = f.read()
        soup = BeautifulSoup(xml, "xml")
        
        abstract_raw = soup.find("ABSTRACT").find_all("P")
        abstract = " ".join([s.text.replace("\n", " ") for s in abstract_raw])
        data["abstract"].append(abstract)
        
        text_raw = soup.find("BODY").find_all("P")
        text = " ".join([s.text.replace("\n", " ") for s in text_raw])
        data["text"].append(text)

### Save as CSV file

In [5]:
df = pd.DataFrame(data)

In [6]:
df.to_csv("data/tipster_test_data.csv")

# Get evaluation dataset from BBC News data

### Get files

Grabbing only one category to speed up processing

`./resources/bbcnews/News Articles/entertainment/*.txt`
`./resources/bbcnews/Summaries/entertainment/*.txt`

In [3]:
textpath = "resources/bbcnews/News Articles/entertainment/"
summpath = "resources/bbcnews/Summaries/entertainment/"
ext = ".txt"

textfilenames = []
for _, _, files in os.walk(textpath):
    textfilenames = [os.path.join(textpath, f) for f in files if f.endswith(ext)]

summfilenames = []
for _, _, files in os.walk(summpath):
    summfilenames = [os.path.join(summpath, f) for f in files if f.endswith(ext)]

print(len(textfilenames),"=?",len(summfilenames))


386 =? 386


### Store text in dictionary

* Ensure filenames are sorted
* Add text and summary to dictionary

In [7]:
bbcdata = {"abstract":[], "text":[]}

textfilenames.sort()
summfilenames.sort()

for file in textfilenames:
    with open(file, "r") as f:
        bbcdata["text"].append(f.read())

for file in summfilenames:
    with open(file, "r") as f:
        bbcdata["abstract"].append(f.read())

### Save as CSV file

In [9]:
df = pd.DataFrame(bbcdata)
df.head()

Unnamed: 0,abstract,text
0,"The messages will be ""unwrapped"" by sculptor R...",Gallery unveils interactive tree\n\nA Christma...
1,Bloom is to be formally presented with the Han...,Jarre joins fairytale celebration\n\nFrench mu...
2,The classic film It's A Wonderful Life is to b...,Musical treatment for Capra film\n\nThe classi...
3,"""It was very hard to follow last year's extrem...",Richard and Judy choose top books\n\nThe 10 au...
4,Mary Poppins was originally created by author ...,Poppins musical gets flying start\n\nThe stage...


In [10]:
df.to_csv("data/bbc_test_data.csv")

# Get evaluation dataset from News Summary (Kaggle) data

### Get files

Grabbing only one category to speed up processing

`./resources/news-summary/news_summary.csv`

In [5]:
df = pd.read_csv("resources/news-summary/news_summary.csv", encoding="latin1")
df.head()

Unnamed: 0,author,date,headlines,read_more,text,ctext
0,Chhavi Tyagi,"03 Aug 2017,Thursday",Daman & Diu revokes mandatory Rakshabandhan in...,http://www.hindustantimes.com/india-news/raksh...,The Administration of Union Territory Daman an...,The Daman and Diu administration on Wednesday ...
1,Daisy Mowke,"03 Aug 2017,Thursday",Malaika slams user who trolled her for 'divorc...,http://www.hindustantimes.com/bollywood/malaik...,Malaika Arora slammed an Instagram user who tr...,"From her special numbers to TV?appearances, Bo..."
2,Arshiya Chopra,"03 Aug 2017,Thursday",'Virgin' now corrected to 'Unmarried' in IGIMS...,http://www.hindustantimes.com/patna/bihar-igim...,The Indira Gandhi Institute of Medical Science...,The Indira Gandhi Institute of Medical Science...
3,Sumedha Sehra,"03 Aug 2017,Thursday",Aaj aapne pakad liya: LeT man Dujana before be...,http://indiatoday.intoday.in/story/abu-dujana-...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...
4,Aarushi Maheshwari,"03 Aug 2017,Thursday",Hotel staff to get training to spot signs of s...,http://indiatoday.intoday.in/story/sex-traffic...,Hotels in Maharashtra will train their staff t...,Hotels in Mumbai and other Indian cities are t...


In [7]:
df.drop(["author", "date", "headlines", "read_more"], axis=1, inplace=True)

In [9]:
df.columns = ["text", "abstract"]
df.head()

Unnamed: 0,text,abstract
0,The Administration of Union Territory Daman an...,The Daman and Diu administration on Wednesday ...
1,Malaika Arora slammed an Instagram user who tr...,"From her special numbers to TV?appearances, Bo..."
2,The Indira Gandhi Institute of Medical Science...,The Indira Gandhi Institute of Medical Science...
3,Lashkar-e-Taiba's Kashmir commander Abu Dujana...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...
4,Hotels in Maharashtra will train their staff t...,Hotels in Mumbai and other Indian cities are t...


In [10]:
df.to_csv("data/kaggle_test_data.csv")