# Headline project

See shared google doc for buildout plan:
https://docs.google.com/document/d/1GrwFtcygBsiHBWx3GpUJIW-Pr7bfTim9xVAksxTRZh8/edit

### Load libraries

In [1]:
import random
import os
import csv
import re
import time
import statsmodels.formula.api
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from NYT_parser import NYTArticle
from utilities import *

[nltk_data] Downloading package punkt to /home/arnoldyb/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /home/arnoldyb/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Set global variables

In [2]:
NYT_TOTAL_FILES = 1855658 # The total number of XML files in the NYT Annotated corpus
RANDOMIZATION_SEED = 100 # The seed is used to split data into train, dev, text in replicatable manner
FIRST_PATH = '1995/01/25/0739212.xml' # Used as a check of the randomization
TRAIN_SPLIT = 0.7 # Put 70% of the data into the train set
DEV_SPLIT = 0.1 # Put 10% of the data into the dev set
TEST_SPLIT = 0.2 # Put 20% of the data into the test set

### Set up file system

In [3]:
# Your directories should be set up as follows to run this notebook:
# headline_generation (folder)
#      |___ main.ipynb           The main notebook for training and showing test results
#      |___ NYT_parser.ipynb     A class for parsing the raw XML files
#      |___ utilities.ipynb      Some helper functions to keep code from getting cluttered
#      |___ EDA.ipynb            Some initial exploratory data analysis work
#      |___ Gavrilov.py          Tensor2Tensor subclass that defines our Problem 
#      |___ logs (folder)        Lists of filepaths based on various filters and train/dev/test split
#      |___ data (folder)
#             |___ sentiment (folder)
#                    |___ positive-words.txt (unrar from: http://www.cs.uic.edu/~liub/FBS/opinion-lexicon-English.rar)
#                    |___ negative-words.txt (unrar from: http://www.cs.uic.edu/~liub/FBS/opinion-lexicon-English.rar)
#             |___ glove (folder)
#                    |___ glove.42B.300d.txt (unzip from: http://nlp.stanford.edu/data/glove.42B.300d.zip)
#             |___ nyt (folder - unzip/untar from https://catalog.ldc.upenn.edu/download/a22bbeb044db7cb70954c21e130aec48c512cb90a2874a6746e3bc722b3f)
#                    |___ 1987 (folders for all years from 1987 to 2007)
#                           |___ 01 (folders for all months 1 to 12)
#                                 |___ 01 (folders for all days of month)
#                                       |___ 0000000.xml (1.8 million xml files numbered sequentially from 0)

In [4]:
# Code to setup the above structure

# NOTE: Since some of the sites are password protected, this may not fully work automatically.
# You will most likely need to do some manual downloading, unpacking, and moving of files.

# WRITE THIS LATER AFTER ASKING PROFS HOW THEY WANT THIS HANDLED
# THERE IS A TON OF MKDIR AND WGET CODE FROM OLD NOTEBOOKS THAT CAN BE ADAPTED
# WE COULD ALSO DO A SET OF ASSERTS WHEN DEFINING FILE PATHS BELOW TO MAKE SURE DATA IS ORGANIZED PROPERLY

In [5]:
# Define filepaths
nyt_path = './data/nyt/' # points to folder containing the years folders of the NYT Annotated corpus 
sentiment_path = './data/sentiment/' # points to folder containing sentiment classification data files
glove_path = './data/glove/glove.42B.300d.txt' # point to file containing glove embeddings
log_path = './logs/' # points to folder containing all the logs
all_data_log = log_path + 'all_data.log' # points to file containing filepaths for all NYT xml files

### Create log of all raw data

In [6]:
# This creates a log file containing the names of all xml files in the corpus
with open(all_data_log, 'w' ,encoding='utf-8', newline='') as resultFile:
    wr = csv.writer(resultFile)
    for root, dirs, files in sorted(os.walk(nyt_path)):
        for file in sorted(files):
            if file.endswith(".xml"):
                filepath = os.path.join(root, file)
                if nyt_path in filepath: # truncate the set path to NYT data in the log file
                    filepath = filepath[filepath.find(nyt_path)+11:]
                wr.writerow([filepath])       

In [7]:
# read log with all xml filenames in and put into a list
all_files_list = []
with open(all_data_log, encoding='utf-8', newline='') as f:
    reader = csv.reader(f)
    for row in reader:
        if row:
            all_files_list.append(row[0])

all_files_count = len(all_files_list)

In [8]:
# checksum
if all_files_count == NYT_TOTAL_FILES:
    print("You have logged all", NYT_TOTAL_FILES,"files in the NYT Annotated corpus.")
else:
    print("WARNING! You do not seem to have logged all", NYT_TOTAL_FILES,"files in the NYT Annotated corpus.")

You have logged all 1855658 files in the NYT Annotated corpus.


### Train sentiment model

In [9]:
embeddings = load_embeddings(glove_path) # load embeddigs
pos_words = load_lexicon(sentiment_path+'positive-words.txt')
neg_words = load_lexicon(sentiment_path+'negative-words.txt')
pos_vectors = embeddings.loc[pos_words].dropna()
neg_vectors = embeddings.loc[neg_words].dropna()
vectors = pd.concat([pos_vectors, neg_vectors])
targets = np.array([1 for entry in pos_vectors.index] + [-1 for entry in neg_vectors.index])
labels = list(pos_vectors.index) + list(neg_vectors.index)
train_vectors, test_vectors, train_targets, test_targets, train_labels, test_labels = \
    train_test_split(vectors, targets, labels, test_size=0.1, random_state=0)
model = SGDClassifier(loss='log', random_state=0, n_iter=100)
model.fit(train_vectors, train_targets)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  after removing the cwd from sys.path.
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  """


SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', max_iter=None, n_iter=100,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=0, shuffle=True,
       tol=None, verbose=0, warm_start=False)

In [10]:
# helper functions for sentiment analysis 

def vecs_to_sentiment(vecs):
    # predict_log_proba gives the log probability for each class
    predictions = model.predict_log_proba(vecs)
    # To see an overall positive vs. negative classification in one number,
    # we take the log probability of positive sentiment minus the log
    # probability of negative sentiment.
    return predictions[:, 1] - predictions[:, 0]

def words_to_sentiment(words):
    vecs = embeddings.loc[words].dropna()
    log_odds = vecs_to_sentiment(vecs)
    return pd.DataFrame({'sentiment': log_odds}, index=vecs.index)

def text_to_sentiment(text):
    tokens = word_tokenize(text.lower())
    try: 
        sentiments = words_to_sentiment(tokens)
    except: # handle case where there's no known words in input
        return 0
    return sentiments['sentiment'].mean()

def score_article(text):
    score = 0
    num_sentences = 0
    for sentence in text:
        num_sentences += 1
        score += text_to_sentiment(sentence)
    if num_sentences == 0:
        return 0
    else:
        return score / num_sentences

### Create meta data log file

In [None]:
# THIS CELL TAKES A LONG TIME TO RUN
with open(log_path+"meta_data.log",'w') as resultFile:
    wr = csv.writer(resultFile)
    # set time
    start_time = time.time()
    # set counter
    filecount = 0
    for filepath in all_files_list:
        filecount += 1
        if (filecount%10000 == 0):
            print("Processing file",filecount,"after",time.time() - start_time, "seconds.")
        # get NYT meta data
        article = NYTArticle.from_file(os.path.join(nyt_path, filepath))
        if article.pass_filters(): # applies Gavrilov's basic filtering: no obits, hedes and body text within a wordcount range
            hede_size, section, wordcount = article.get_meta()
            # calc sentiment data
            sent_hede = score_article(article.print_hede)
            sent_lede = score_article(article.lede)
            sent_body = score_article(article.paragraphs)
            # write row of meta data        
            wr.writerow([filepath, hede_size, wordcount, section, sent_hede, sent_lede, sent_body])
            

### Split the Data

In [None]:
colnames = ["filepath", "hede_size", "wordcount", "section", "sent_hede", "sent_lede", "sent_body"]

# read the meta log into a pd.DataFrame
meta_df = pd.read_csv(log_path+"meta_data.log", sep=",", header=None, names=colnames, 
                 dtype={'filepath': str,'hede_size': int,'wordcount': int,'section': str, 'sent_hede': float, 'sent_lede': float, 'sent_body': float})

# check if splits are rational
if (TRAIN_SPLIT + DEV_SPLIT + TEST_SPLIT) != 1.0:
    print("WARNING! Your train/dev/test splits do not toal 1.0.")

# shuffle!
rando = np.random.seed(seed=RANDOMIZATION_SEED)
meta_df = meta_df.sample(frac=1, axis=0, random_state=rando).reset_index(drop=True) # this shuffles randomly

# set breaks
train_break = int(all_files_count * TRAIN_SPLIT)
dev_break = train_break + int(all_files_count * DEV_SPLIT) # rest is test

# split the train, dev, test sets
train_df = meta_df[0:train_break]
dev_df = meta_df[train_break:dev_break]
test_df = meta_df[dev_break:]

# output number of files in each split
print("There are",len(train_df),"train set files,",len(dev_df),"dev set files,",len(test_df),"test set files.")

# checksum
if train_df['filepath'][0] != FIRST_PATH:
    print("WARNING! Your randomization differs. This will leave you with different train/dev/test splits.")
    
# write the split data to individual meta log files
train_df.to_csv(path_or_buf=log_path+"meta_train_unfltrd.log", index=False, header=True)
dev_df.to_csv(path_or_buf=log_path+"meta_dev.log", index=False, header=True)
test_df.to_csv(path_or_buf=log_path+"meta_test.log", index=False, header=True)

### Filter training data based on sentiment

In [None]:
# load in unfiltered train log file
train_df = pd.read_csv(log_path+"meta_train_unfltrd.log", sep=",", header=0, 
                 dtype={'filepath': str,'hede_size': int,'wordcount': int,'section': str, 'sent_hede': float, 'sent_lede': float, 'sent_body': float})

# Conduct sentiment filtering on the train data
# INITIAL TEST: filter out all articles that have opposite sentiment in headline and lede
drop_count = 0
for index, row in train_df.iterrows():
    headline = float(row['sent_hede'])
    lede = float(row['sent_lede'])
    if (headline > 0 and lede < 0) or (headline < 0 and lede > 0):
        drop_count += 1
        train_df.drop(index, inplace=True)
print("Filtered out", drop_count,"training files due to sentiment.")
        
# write (filtered) train log file
train_df.to_csv(path_or_buf=log_path+"meta_train.log", index=False, header=True)