# W266 - Headline Project

### Split the data into train, dev, test

In [120]:
# libraries and filepaths
import os
import csv
import numpy as np
from NYT_parser import NYTArticle

root_path = './data/' # point this to the unzipped & untared data subfolder in NYT Annotated corpus 
log_file="./log_file.log" # this file contains complete list of all xml files


In [121]:
# this creates a log file containing the names of all xml files in the corpus
with open(log_file,'w') as resultFile:
    wr = csv.writer(resultFile)
    for root, dirs, files in os.walk(root_path):
        for file in files:
            if file.endswith(".xml"):
                filepath = os.path.join(root, file)
                if "data/" in filepath:
                    filepath = filepath[filepath.find("data/")+5:]
                wr.writerow([filepath])
            

In [122]:
# read log with all xml filenames in and put into a list
file_list = []
with open(log_file, newline='') as f:
    reader = csv.reader(f)
    for row in reader:
        if row:
            file_list.append(row[0])

In [123]:
# check you've got it loaded
print("Read", len(file_list), "files from the log.")

Read 1855658 files from the log.


In [124]:
# shuffle the filenames in place in the list
np.random.seed(seed=100)
np.random.shuffle(file_list)

In [125]:
# split the train, dev, test sets
num_files = len(file_list)
train_split = int(num_files * 0.7)
dev_split = train_split + int(num_files * 0.1) # rest is test
train_files = file_list[0:train_split]
dev_files = file_list[train_split:dev_split]
test_files = file_list[dev_split:]
print("There are",num_files,"total files in the dataset.")
print("There are",len(train_files),"train set files," ,len(dev_files),"dev set files,",len(test_files),"test set files.")

There are 1855658 total files in the dataset.
There are 1298960 train set files, 185565 dev set files, 371133 test set files.


In [126]:
# write the train, dev, test file IDs to log files
with open("train_IDs.csv",'w') as resultFile:
    wr = csv.writer(resultFile)
    for file_id in train_files:    
        wr.writerow([file_id])
with open("dev_IDs.csv",'w') as resultFile:
    wr = csv.writer(resultFile)
    for file_id in dev_files:
        wr.writerow([file_id])
with open("test_IDs.csv",'w') as resultFile:
    wr = csv.writer(resultFile)
    for file_id in test_files:
        wr.writerow([file_id])

In [127]:
# IF YOU WANT, YOU COULD JUST START HERE
# loads in the train, dev, test file IDs into lists
train_IDs = []
dev_IDs = []
test_IDs = []
with open("train_IDs.csv", newline='') as f:
    reader = csv.reader(f)
    for row in reader:
        if row:
            train_IDs.append(row[0])
with open("dev_IDs.csv", newline='') as f:
    reader = csv.reader(f)
    for row in reader:
        if row:
            dev_IDs.append(row[0])
with open("test_IDs.csv", newline='') as f:
    reader = csv.reader(f)
    for row in reader: 
        if row:
            test_IDs.append(row[0])

In [128]:
# confirming sizes
print("There are",num_files,"total files in the dataset.")
print("There are",len(train_IDs),"train set files," ,len(dev_IDs),"dev set files,",len(test_IDs),"test set files.")

There are 1855658 total files in the dataset.
There are 1298960 train set files, 185565 dev set files, 371133 test set files.


In [129]:
# now, you can look at train data easily...

# open the very first train doc
with open(root_path+train_IDs[0],encoding = 'utf-8') as f:
    article = NYTArticle.from_file(f)


In [130]:
# how does this differ from the headline??
article.title

"The President's Address: 'We Heard America Shouting'"

In [131]:
# document ID (filename)
article.docid

'739212'

In [132]:
# date in a datetime format
article.date

datetime.datetime(1995, 1, 25, 0, 0)

In [133]:
# summary -- few docs seem to have?
article.summary

[]

In [134]:
# dateline -- location (city, state) and pub date
article.dateline

[]

In [135]:
# headline that ran in the print edition
article.print_hede

["The President's Address: 'We Heard America Shouting'"]

In [136]:
# headline that ran on the website version
article.online_hede

[]

In [137]:
# opening paragraph
article.lede

["Following is President Clinton's State of the Union Message last night, as recorded by The New York Times:",
 'Mr. President, Mr. Speaker, members of the 104th Congress, my fellow Americans, again we are here in the sanctuary of democracy. And once again, our democracy has spoken.']

In [138]:
# topic tags
article.descriptors

['biographical information',
 'law and legislation',
 'state of the union message (us)',
 'united states politics and government']

In [139]:
# more extensive tags?
article.general_descriptors

['law and legislation',
 'politics and government',
 'state of the union message (us)',
 'united states politics and government']

In [140]:
# more extensive tags?
article.types_of_material

['text']

In [141]:
article.wordcount

'8630'

In [142]:
# section of the print newspaper
article.section

'A'

In [143]:
# body text but also includes subhead, book title data, endbyline, section title -- all kinds of stuff
article.paragraphs

["Following is President Clinton's State of the Union Message last night, as recorded by The New York Times:",
 'Mr. President, Mr. Speaker, members of the 104th Congress, my fellow Americans, again we are here in the sanctuary of democracy. And once again, our democracy has spoken.',
 'So let me begin by congratulating all of you here in the 104th Congress, and congratulating you, Mr. Speaker.',
 'If we agree on nothing else tonight, we must agree that the American people certainly voted for change in 1992 and in 1994.',
 'And as I look out at you, I know how some of you must have felt in 1992.',
 "I must say that in both years we didn't hear America singing, we heard America shouting. And now all of us, Republicans and Democrats alike, must say: We hear you. We will work together to earn the jobs you have given us. For we are the keepers of the sacred trust and we must be faithful to it in this new and very demanding era.",
 'Over 200 years ago, our founders changed the entire course