In [2]:
import pandas as pd
import numpy as np
import re

import os
from bs4 import BeautifulSoup

# Get evaluation dataset from TIPSTER data

### Get files

`./resources/tipster/*.xml`

In [2]:
path = "resources/tipster/"
ext = ".xml"

# Collect every matching file under `path`.
# * Each name is joined to the directory os.walk actually found it in, and the
#   results accumulate across directories. Reassigning the list on every
#   iteration kept only the last directory visited and glued its names onto
#   `path`, producing paths that do not exist when sub-folders are present.
# * os.walk gives no ordering guarantee, so sort to make the file order (and
#   therefore the row order of the dataset built from it) reproducible.
filenames = sorted(
    os.path.join(root, name)
    for root, _, names in os.walk(path)
    for name in names
    if name.endswith(ext)
)
print(len(filenames))

183


### Store text in dictionary

* Read each XML file
* Use BeautifulSoup to split the abstract and the body text out of the XML

In [3]:
# Column store for the TIPSTER dataset: one list per CSV column, filled in
# parallel (entry i of each list belongs to the same document).
data = dict(abstract=[], text=[])

In [4]:
# Parse every TIPSTER file into an (abstract, body text) pair.
#
# The pairs are collected in fresh local lists and only published to `data`
# once every file has parsed. This keeps the cell idempotent (re-running it
# replaces the rows instead of appending duplicates) and guarantees the two
# columns can never end up with different lengths after a failure part-way.
abstracts = []
texts = []

for filename in filenames:
    # Read the whole file first so the handle is closed before parsing.
    with open(filename, "r", encoding="latin1") as f:
        xml = f.read()
    soup = BeautifulSoup(xml, "xml")

    sections = {}
    for tag in ("ABSTRACT", "BODY"):
        section = soup.find(tag)
        if section is None:
            # soup.find returns None for an absent tag; fail with the file
            # name rather than an AttributeError on None.
            raise ValueError("{}: missing <{}> element".format(filename, tag))
        # Join all <P> paragraphs of the section into one line of text.
        sections[tag] = " ".join(
            p.text.replace("\n", " ") for p in section.find_all("P")
        )

    abstracts.append(sections["ABSTRACT"])
    texts.append(sections["BODY"])

data["abstract"] = abstracts
data["text"] = texts

### Save as CSV file

In [5]:
# One row per TIPSTER document; the dict keys ("abstract", "text") become the columns.
df = pd.DataFrame.from_dict(data)

In [6]:
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (no-op when it is already there).
os.makedirs("data", exist_ok=True)
df.to_csv("data/tipster_test_data.csv")

# Get evaluation dataset from BBC News data

### Get files

Only one category (entertainment) is loaded, to speed up processing.

`./resources/bbcnews/News Articles/entertainment/*.txt`
`./resources/bbcnews/Summaries/entertainment/*.txt`

In [3]:
textpath = "resources/bbcnews/News Articles/entertainment/"
summpath = "resources/bbcnews/Summaries/entertainment/"
ext = ".txt"


def list_files(directory, extension):
    """Return the sorted paths of all files under `directory` ending in `extension`.

    The directory tree is walked recursively. Each file name is joined to the
    directory it was found in, and matches from every directory are
    accumulated (reassigning the result per directory would keep only the
    last directory visited and build paths against the wrong folder).

    Args:
        directory: Root folder to search.
        extension: File-name suffix to keep, e.g. ".txt".

    Returns:
        A list of paths sorted lexicographically; empty if `directory` does
        not exist or contains no matching files.
    """
    return sorted(
        os.path.join(root, name)
        for root, _, names in os.walk(directory)
        for name in names
        if name.endswith(extension)
    )


textfilenames = list_files(textpath, ext)
summfilenames = list_files(summpath, ext)

# The two counts must be equal: every article needs exactly one summary.
print(len(textfilenames),"=?",len(summfilenames))


386 =? 386


### Store text in dictionary

* Sort both filename lists so that each article lines up with its summary
* Add text and summary to dictionary

In [7]:
bbcdata = {"abstract":[], "text":[]}

textfilenames.sort()
summfilenames.sort()

# Articles and summaries are matched purely by position after sorting, so
# check that both folders hold the same file names before pairing them.
# Without this, a missing or extra file on one side with equal counts would
# silently attach articles to the wrong summaries.
text_names = [os.path.basename(p) for p in textfilenames]
summ_names = [os.path.basename(p) for p in summfilenames]
if text_names != summ_names:
    unmatched = sorted(set(text_names) ^ set(summ_names))
    raise ValueError(
        "article and summary files do not line up; unmatched names: {}".format(unmatched)
    )

# Decode explicitly instead of relying on the platform default encoding.
# latin1 maps every byte to a character, so reading cannot fail, and it is the
# same encoding the TIPSTER loader in this notebook uses.
# NOTE(review): confirm latin1 is the right encoding for the BBC corpus.
for filename in textfilenames:
    with open(filename, "r", encoding="latin1") as f:
        bbcdata["text"].append(f.read())

for filename in summfilenames:
    with open(filename, "r", encoding="latin1") as f:
        bbcdata["abstract"].append(f.read())

### Save as CSV file

In [9]:
# Assemble the article/summary pairs into a table and preview the first rows.
df = pd.DataFrame.from_dict(bbcdata)
df.head(n=5)

Unnamed: 0,abstract,text
0,"The messages will be ""unwrapped"" by sculptor R...",Gallery unveils interactive tree\n\nA Christma...
1,Bloom is to be formally presented with the Han...,Jarre joins fairytale celebration\n\nFrench mu...
2,The classic film It's A Wonderful Life is to b...,Musical treatment for Capra film\n\nThe classi...
3,"""It was very hard to follow last year's extrem...",Richard and Judy choose top books\n\nThe 10 au...
4,Mary Poppins was originally created by author ...,Poppins musical gets flying start\n\nThe stage...


In [10]:
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (no-op when it is already there).
os.makedirs("data", exist_ok=True)
df.to_csv("data/bbc_test_data.csv")