# Headline project

See the shared Google Doc for the buildout plan:
https://docs.google.com/document/d/1GrwFtcygBsiHBWx3GpUJIW-Pr7bfTim9xVAksxTRZh8/edit

### Load libraries

In [38]:
import random
import os
import csv
import re
import time
import statsmodels.formula.api
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from NYT_parser import NYTArticle

[nltk_data] Downloading package punkt to /home/arnoldyb/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Set global variables

In [60]:
# Total number of XML files in the NYT Annotated corpus
NYT_TOTAL_FILES = 1855658

# Seed used to split the data into train, dev, test in a replicable manner
RANDOMIZATION_SEED = 100

# Used as a check of the randomization
FIRST_PATH = '1995/01/25/0739212.xml'

### Set up file system

In [None]:
# Your directories should be set up as follows to run this notebook:
# headline_generation (folder)
#      |___ main.ipynb           The main notebook for training and showing test results
#      |___ NYT_parser.ipynb     A class for parsing the raw XML files
#      |___ utilities.ipynb      Some helper functions to keep code from getting cluttered
#      |___ EDA.ipynb            Some initial exploratory data analysis work
#      |___ Gavrilov.py          Tensor2Tensor subclass that defines our Problem 
#      |___ logs (folder)        Lists of filepaths based on various filters and train/dev/test split
#      |___ data (folder)
#             |___ sentiment (folder)
#                    |___ positive-words.txt (unrar from: http://www.cs.uic.edu/~liub/FBS/opinion-lexicon-English.rar)
#                    |___ negative-words.txt (unrar from: http://www.cs.uic.edu/~liub/FBS/opinion-lexicon-English.rar)
#             |___ glove (folder)
#                    |___ glove.42B.300d.txt (unzip from: http://nlp.stanford.edu/data/glove.42B.300d.zip)
#             |___ nyt (folder - unzip/untar from https://catalog.ldc.upenn.edu/download/a22bbeb044db7cb70954c21e130aec48c512cb90a2874a6746e3bc722b3f)
#                    |___ 1987 (folders for all years from 1987 to 2007)
#                           |___ 01 (folders for all months 1 to 12)
#                                 |___ 01 (folders for all days of month)
#                                       |___ 0000000.xml (1.8 million xml files numbered sequentially from 0)

In [None]:
# Code to setup the above structure

# NOTE: Since some of the sites are password protected, this may not fully work automatically.
# You will most likely need to do some manual downloading, unpacking, and moving of files.

# TODO: Write this cell after asking the professors how they want setup handled.
# TODO: There is a lot of mkdir and wget code in the old notebooks that can be adapted here.
# TODO: We could also add asserts where the file paths are defined below to make sure the data is organized properly.

In [3]:
# Locations of the input data and logs, all relative to the current working directory
nyt_path = './data/nyt/'  # folder holding the years folders of the NYT Annotated corpus
sentiment_path = './data/sentiment/'  # folder holding the sentiment classification data files
glove_path = './data/glove/'  # folder holding the glove embeddings txt file
log_path = './logs/'  # folder holding all the logs
all_data_log = os.path.join(log_path, 'all_data.log')  # file listing the filepaths of all NYT xml files

### Create log of all raw data

In [29]:
# This creates a log file containing the names of all xml files in the corpus.
# One path is written per CSV row, expressed relative to nyt_path.
with open(all_data_log, 'w', encoding='utf-8', newline='') as resultFile:
    wr = csv.writer(resultFile)
    # Sorting the walk results and the file names makes the row order independent
    # of the order in which the file system happens to list entries.
    for root, dirs, files in sorted(os.walk(nyt_path)):
        for file in sorted(files):
            if file.endswith(".xml"):
                filepath = os.path.join(root, file)
                # Strip the corpus prefix by computing the path relative to nyt_path.
                # This replaces a hard-coded 11-character slice that was only correct
                # while nyt_path was exactly './data/nyt/'.
                wr.writerow([os.path.relpath(filepath, nyt_path)])

In [61]:
# Load the logged xml filenames back into a flat list of strings
all_files_list = []
with open(all_data_log, encoding='utf-8', newline='') as log_file:
    # Each non-empty CSV row carries one path in its first column; empty rows are skipped
    all_files_list.extend(record[0] for record in csv.reader(log_file) if record)

In [56]:
# checksum: compare the number of logged paths against the expected corpus size
corpus_fully_logged = len(all_files_list) == NYT_TOTAL_FILES
print(
    "You have logged all" if corpus_fully_logged else "WARNING! You do not seem to have logged all",
    NYT_TOTAL_FILES,
    "files in the NYT Annotated corpus.",
)

You have logged all 1855658 files in the NYT Annotated corpus.


### Data preprocessing

In [62]:
# shuffle the data and split into train, dev, test
# NOTE(review): only the shuffle is shown in this part of the cell.

# Seed NumPy's global random generator so the shuffle below is repeatable
np.random.seed(seed=RANDOMIZATION_SEED)
# Shuffles all_files_list in place. Because the list itself is reordered,
# re-running this cell without first re-reading the log shuffles an already
# shuffled list and produces a different order.
np.random.shuffle(all_files_list)

# checksum
if all_files_list[0] != FIRST_PATH:
    print("Your randomization differs")