In [1]:
# libraries
import os
import math
import csv
import re
import statsmodels.formula.api
import pandas as pd
from statistics import mean 
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from NYT_parser import NYTArticle
from utilities import *

In [None]:
# filepath vars -- set MODEL_VARIANT to choose which model's files are ingested
MODEL_VARIANT = 'rf' # 'rf' = randomly filtered model, 'sf' = sentiment filtered model
if MODEL_VARIANT not in ('rf', 'sf'):
    raise ValueError("MODEL_VARIANT must be 'rf' or 'sf', got %r" % (MODEL_VARIANT,))

nyt_path = './data/nyt/' # points to folder containing the years folders of the NYT Annotated corpus
sentiment_path = './data/sentiment/' # points to folder containing sentiment classification data files
glove_path = './data/glove/glove.42B.300d.txt' # point to file containing glove embeddings
log_path = './logs/' # points to folder containing all the logs
postp_path = './postprocess/' # points to folder containing logs and files for evaluating the models
test_log = 'meta_test.log' # points to the test file log
decoder_path = './decoder/' # points to folder with decoder output files

# The decoder headlines file and the sentiment scores file must always belong to the
# same model, so both names are derived from MODEL_VARIANT rather than toggled by hand.
decoder_output = 'decoder_' + MODEL_VARIANT + '.txt' # file with decoder headlines from the selected model
postp_sentiments = 'sents_' + MODEL_VARIANT + '.txt' # file with sentiment scores from the selected model

### Produce files for decoder ingestion

In [2]:
# Baseline headline length: the mean headline size (hede_size) over the training
# metadata log, rounded up to a whole number of words.
train_log_file = os.path.join(log_path, "meta_train.log")
df_train = pd.read_csv(train_log_file)
hede_avg = math.ceil(df_train["hede_size"].mean())
# show the unrounded mean as the cell output
df_train["hede_size"].mean()

7.528217631618259

In [3]:
# Load the test-set metadata log and preview its first rows.
test_log_file = os.path.join(log_path, test_log)
df = pd.read_csv(test_log_file)
df.head()

Unnamed: 0,filepath,hede_size,wordcount,section,sent_hede,sent_lede,sent_body
0,2003/08/22/1513837.xml,11,832,"['bombs and explosives', 'international relati...",-1.066886,0.401246,0.136112
1,2004/06/19/1590476.xml,8,636,"['education and schools', 'grading of students...",-0.553291,1.198004,1.375394
2,2002/10/08/1430194.xml,6,102,"['news and news media', 'public opinion', 'tel...",2.452781,0.980704,0.255741
3,1995/08/11/0781753.xml,6,514,"['blood', 'transfusions']",-0.997738,-0.442929,0.241921
4,1991/06/30/0456560.xml,4,337,['music'],4.567766,0.980704,1.183736


In [None]:
GRAF_LIMIT = 3 # number of leading paragraphs written per article to bodies.txt

# Write three files with one line per test article, in the row order of df:
#   headlines.txt - first element of the article's print_hede
#                   (presumably the print headline -- confirm against NYT_parser)
#   bodies.txt    - the first GRAF_LIMIT paragraphs joined by spaces
#   baseline.txt  - the first hede_avg whitespace-separated words of the article body
# Output and corpus locations come from the config cell (postp_path / nyt_path) so
# this cell follows any change made there.
with open(os.path.join(postp_path, "headlines.txt"), "w") as headlines, \
    open(os.path.join(postp_path, "bodies.txt"), "w") as bodies, \
    open(os.path.join(postp_path, "baseline.txt"), "w") as baseline:

    for filepath in df.filepath:
        article = NYTArticle.from_file(os.path.join(nyt_path, filepath))

        headlines.write(article.print_hede[0]+"\n")
        bodies.write(" ".join(article.paragraphs[:GRAF_LIMIT])+"\n")
        # Slicing is safe for articles shorter than hede_avg words, so no fallback
        # branch is needed; every article always gets exactly one newline-terminated
        # line, which keeps the three files line-aligned.
        baseline_words = " ".join(article.paragraphs).split()[:hede_avg]
        baseline.write(" ".join(baseline_words)+"\n")

### Train sentiment model

In [None]:
# Train a word-level sentiment classifier: embedding vectors of lexicon words are the
# features, +1 (positive lexicon) / -1 (negative lexicon) are the targets.
# NOTE(review): `np` is not imported explicitly in this notebook; presumably it is
# provided by `from utilities import *` -- confirm.
embeddings = load_embeddings(glove_path) # load embeddings
pos_words = load_lexicon(sentiment_path+'positive-words.txt')
neg_words = load_lexicon(sentiment_path+'negative-words.txt')
# reindex() turns lexicon words that have no embedding into all-NaN rows, which
# dropna() then removes. Selecting a list with missing labels through .loc raises
# KeyError on current pandas.
# assumes `embeddings` is a DataFrame indexed by word with unique labels -- TODO confirm
pos_vectors = embeddings.reindex(pos_words).dropna()
neg_vectors = embeddings.reindex(neg_words).dropna()
vectors = pd.concat([pos_vectors, neg_vectors])
targets = np.array([1 for entry in pos_vectors.index] + [-1 for entry in neg_vectors.index])
labels = list(pos_vectors.index) + list(neg_vectors.index)
# hold out 10% of the lexicon words; fixed seed for a reproducible split
train_vectors, test_vectors, train_targets, test_targets, train_labels, test_labels = \
    train_test_split(vectors, targets, labels, test_size=0.1, random_state=0)
# Logistic-regression loss is required for predict_log_proba. `n_iter` was removed
# from scikit-learn; max_iter=100 with tol=None runs the same fixed 100 epochs.
# 'log_loss' is the current name of the former 'log' loss (scikit-learn < 1.1 spells it 'log').
model = SGDClassifier(loss='log_loss', random_state=0, max_iter=100, tol=None)
model.fit(train_vectors, train_targets)

In [1]:
# cached (faster) helper functions for sentiment analysis

# token -> sentiment score cache: a token that was scored once is looked up here
# instead of being scored by the model again
sentiment_dict = dict()

def vecs_to_sentiment2(vecs):
    """Score embedding vectors with the trained sentiment model.

    Returns one value per input vector: the log probability of positive
    sentiment minus the log probability of negative sentiment, so a value
    above zero means the model leans positive and below zero negative.
    """
    # predict_log_proba yields one column of log probabilities per class
    log_probs = model.predict_log_proba(vecs)
    negative_log_prob = log_probs[:, 0]
    positive_log_prob = log_probs[:, 1]
    return positive_log_prob - negative_log_prob

def words_to_sentiment2(words):
    """Return the sentiment log-odds of every known word in `words`.

    Scores are cached in `sentiment_dict`, so a word is scored by the model
    only the first time it is seen. Words that have no usable embedding are
    skipped, so the result may be shorter than `words` (and may be empty).
    assumes `embeddings` is a DataFrame indexed by token -- TODO confirm
    """
    log_odds = [] # holds log odds
    for word in words:
        if word in sentiment_dict: # seen before: use the cached score instead of the model
            log_odds.append(sentiment_dict[word])
            continue
        if word not in embeddings.index: # unknown word: no vector to score
            continue
        vecs = embeddings.loc[[word]].dropna()
        if len(vecs) == 0: # vector contains NaN values, treat the word as unknown
            continue
        score = vecs_to_sentiment2(vecs)[0]
        sentiment_dict[word] = score # remember the score for next time
        log_odds.append(score)
    return log_odds

def text_to_sentiment2(text):
    """Return the mean sentiment log-odds of the known words in `text`.

    The text is lower-cased and tokenized with word_tokenize. Unknown words
    are ignored; 0 is returned when the text contains no known word at all.
    """
    tokens = word_tokenize(text.lower())
    sentiments = words_to_sentiment2(tokens)
    if not sentiments: # handle case where there's no known words in input
        return 0
    return mean(sentiments)

### Score decoder output headlines for sentiment

In [None]:
# load the test log with the preprocessed sentiment scores into a pd.DataFrame
column_types = {
    'filepath': str,
    'hede_size': int,
    'wordcount': int,
    'section': str,
    'sent_hede': float,
    'sent_lede': float,
    'sent_body': float,
}
scores_df = pd.read_csv(postp_path+test_log, sep=",", header=0, dtype=column_types)

# collect the decoder generated headlines, one per line of the decoder output file
headlines = []
with open(decoder_path+decoder_output) as f:
    for line in f:
        headlines.append(line.rstrip('\n'))

# score every decoder headline, attach the scores as a new column and save the result
genhede_sent = list(map(text_to_sentiment2, headlines))
scores_df['sent_decoder'] = genhede_sent
scores_df.to_csv(path_or_buf=postp_path+postp_sentiments, index=False, header=True)

### Calculate sentiment metrics

In [None]:
# F1_sentiment
# A headline counts as positive when its sentiment score is >= 0 and as negative
# otherwise (a NaN score is not >= 0, so it is counted as negative).
# The test headline (sent_hede) is the reference label and the decoder generated
# headline (sent_decoder) is the prediction; positive polarity is the positive class.
# F1 = 2 * (Precision*Recall) / (Precision+Recall)
# The "Confirming" value is the share of generated headlines whose polarity (+/-)
# matches the test headline polarity, i.e. accuracy. It is a sanity check and is
# in general not equal to the f1-score.
# Sentiment difference measures the average test headline label sentiment minus average decoder generated headline sentiment.

hede_positive = scores_df.sent_hede >= 0
decoder_positive = scores_df.sent_decoder >= 0

TP = int((hede_positive & decoder_positive).sum()) # true positive count
TN = int((~hede_positive & ~decoder_positive).sum()) # true negative count
FP = int((~hede_positive & decoder_positive).sum()) # false positive count
FN = int((hede_positive & ~decoder_positive).sum()) # false negative count
match = int((hede_positive == decoder_positive).sum()) # rows where both polarities agree
total = len(scores_df.sent_decoder)

average_hede = mean(scores_df.sent_hede)
average_decoder = mean(scores_df.sent_decoder)
average_difference = average_hede - average_decoder

# guard every ratio so a test set without (predicted) positives reports 0 instead
# of raising ZeroDivisionError
precision = TP / (TP + FP) if (TP + FP) else 0.0
recall = TP / (TP + FN) if (TP + FN) else 0.0
f1_score = 2 * (precision*recall) / (precision+recall) if (precision + recall) else 0.0
confirm = match / total if total else 0.0

print("The f1-score is", f1_score,"Confirming:", confirm)
print("The average test headline score is", average_hede,"The average decoder headline score is", average_decoder,"Sentiment difference is", average_difference)
