# W266 - Headline Project

### Split the data into train, dev, and test sets

In [24]:
# libraries and filepaths
import os
import csv
import numpy as np
from NYT_parser import NYTArticle

root_path = './data/' # point this to the unzipped & untarred data subfolder of the NYT Annotated Corpus
log_file="./log_file.log" # CSV log listing every xml file found under root_path (one path per row)


In [25]:
# Build the log file: one CSV row per .xml file in the corpus, holding the
# file's path relative to root_path (e.g. 1995/01/25/0739212.xml).
with open(log_file, 'w', encoding='utf-8', newline='') as resultFile:
    wr = csv.writer(resultFile)
    # Sort the walk and the file names so the log order -- and therefore the
    # seeded shuffle applied to it afterwards -- is the same on every run.
    for root, dirs, files in sorted(os.walk(root_path)):
        for file in sorted(files):
            if file.endswith(".xml"):
                # relpath removes exactly the root_path prefix. Searching for
                # the substring "data/" instead cuts at its first occurrence
                # anywhere in the path, which is wrong for roots such as
                # /home/data/nyt/data/ and does nothing for roots without it.
                filepath = os.path.relpath(os.path.join(root, file), root_path)
                wr.writerow([filepath])
            

In [26]:
# Load every logged xml path back into a list, skipping blank CSV rows
with open(log_file, encoding='utf-8', newline='') as log_handle:
    file_list = [record[0] for record in csv.reader(log_handle) if record]

In [27]:
# sanity check: report how many paths were read back from the log
# (the full corpus should give 1855658 files)
num_loaded = len(file_list)
print("Read", num_loaded, "files from the log.")

Read 1855658 files from the log.


In [28]:
# Seed the global NumPy RNG so the shuffle, and every split cut from it, is reproducible.
# The shuffle happens in place: re-running this cell without first reloading
# file_list shuffles the already-shuffled list and will generally give a different order.
np.random.seed(100)
np.random.shuffle(file_list)

# with the full corpus, the first entry after shuffling should be:
# 1995/01/25/0739212.xml
first_file = file_list[0]
print(first_file)

1995/01/25/0739212.xml


In [29]:
# Cut the shuffled list into 70% train / 10% dev; whatever remains is test
num_files = len(file_list)
train_split = int(num_files * 0.7)
dev_split = train_split + int(num_files * 0.1)
train_files, dev_files, test_files = (
    file_list[:train_split],
    file_list[train_split:dev_split],
    file_list[dev_split:],
)
print("There are", num_files, "total files in the dataset.")
print(
    "There are", len(train_files), "train set files,",
    len(dev_files), "dev set files,",
    len(test_files), "test set files.",
)

There are 1855658 total files in the dataset.
There are 1298960 train set files, 185565 dev set files, 371133 test set files.


In [30]:
# Persist each split's file IDs, one ID per CSV row.
# newline='' is what the csv module requires for files handed to csv.writer;
# without it every row ends in \r\r\n on Windows, which reads back as blank rows.
for ids_path, split_files in (
    ("train_IDs.csv", train_files),
    ("dev_IDs.csv", dev_files),
    ("test_IDs.csv", test_files),
):
    with open(ids_path, 'w', newline='') as resultFile:
        wr = csv.writer(resultFile)
        wr.writerows([file_id] for file_id in split_files)

In [31]:
# IF YOU WANT, YOU COULD JUST START HERE
# Rebuild the train, dev, test ID lists from their CSV files (blank rows are skipped)
train_IDs, dev_IDs, test_IDs = [], [], []
for ids_path, id_list in (
    ("train_IDs.csv", train_IDs),
    ("dev_IDs.csv", dev_IDs),
    ("test_IDs.csv", test_IDs),
):
    with open(ids_path, newline='') as id_file:
        id_list.extend(record[0] for record in csv.reader(id_file) if record)

In [32]:
# confirming sizes
# Derive the total from the loaded ID lists instead of relying on num_files
# from the split cell, so this check also runs when the notebook is started
# from the ID-loading cell on a fresh kernel (where num_files is undefined).
num_files = len(train_IDs) + len(dev_IDs) + len(test_IDs)
print("There are",num_files,"total files in the dataset.")
print("There are",len(train_IDs),"train set files," ,len(dev_IDs),"dev set files,",len(test_IDs),"test set files.")

# Expected output:
# There are 1855658 total files in the dataset.
# There are 1298960 train set files, 185565 dev set files, 371133 test set files.

There are 1855658 total files in the dataset.
There are 1298960 train set files, 185565 dev set files, 371133 test set files.
