In [1]:
# Author: Nicholas Benavides, Ray Thai, & Crystal Zheng
# Code liberally inspired by and lifted from:
# https://github.com/kolchinski/reddit-sarc

In [2]:
import os
import csv
import json
from itertools import islice, chain
import nltk
from collections import Counter
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB


In [3]:
pol_dir = '../SARC/2.0/pol'
comments_file = os.path.join(pol_dir, 'comments.json')
train_file = os.path.join(pol_dir, 'train-balanced.csv')

In [4]:
with open(comments_file, 'r') as f:
    comments = json.load(f)

In [5]:
for x in islice(comments.items(), 2):
    print(x, '\n')

('7uxqr', {'text': 'Nancyt Pelosi messes up.. 500 Million Jobs lost every month that the economic recovery plan is not passed.. LMAO', 'author': 'Fishbum', 'score': 0, 'ups': 2, 'downs': 4, 'date': '2009-02', 'created_utc': 1233788424, 'subreddit': 'politics'}) 

('7vewt', {'text': 'Netflix CEO: "Please raise my taxes"', 'author': 'jdl2003', 'score': 1733, 'ups': 1985, 'downs': 252, 'date': '2009-02', 'created_utc': 1233940024, 'subreddit': 'politics'}) 



In [6]:
train_ancestors = []
train_responses = []
train_labels = []
lower = True
with open(train_file, 'r') as f:
    reader = csv.reader(f, delimiter='|')
    for row in reader:
        ancestors = row[0].split(' ')
        responses = row[1].split(' ')
        labels = row[2].split(' ')
        if lower:
            train_ancestors.append([comments[r]['text'].lower() for r in ancestors])
            train_responses.append([comments[r]['text'].lower() for r in responses])
        else:
            train_ancestors.append([comments[r]['text'] for r in ancestors])
            train_responses.append([comments[r]['text'] for r in responses])
        train_labels.append(labels)

In [7]:
[(len(x), len(y)) for x,y in zip(train_ancestors, train_responses)][:9]

[(1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (2, 2)]

In [8]:
train_ancestors[8], train_responses[8], train_labels[8]

(['new jersey governor chris christie thinks a family making $6,000 a year is too rich to qualify for medicaid',
  "if you have a kid and the best you can do is a minimum wage job then i think you've made some bad choices in the past"],
 ["yah, at that point they don't even count as people anymore!",
  'do you by any chance know what the term "structural unemployment" means?'],
 ['1', '0'])

In [9]:
from collections import defaultdict
train_vocab = defaultdict(int)
for pair in train_responses:
    for comment in pair:
        for w in nltk.word_tokenize(comment):
            train_vocab[w] += 1
train_vocab = Counter(train_vocab)
print(len(train_vocab))

13630


In [10]:
train_vocab.most_common(5)

[('.', 7226), ('the', 6553), (',', 5269), ('to', 4080), ('a', 3342)]

In [11]:
def unigrams_phi_c(comment):
    return Counter(nltk.word_tokenize(comment))

In [12]:
def concat_phi_r(response_features_pair):
    assert len(response_features_pair) == 2
    #print(response_features_pair[0].shape, response_features_pair[1].shape)
    cat = np.concatenate((response_features_pair[0], response_features_pair[1]))
    return cat
    

In [13]:
#phi_c turns comments into features
#phi_a combines ancestor features into summary
#phi_r combines response features into summary
#Note that this is for the "balanced" framing!
#TODO: Initially ignoring ancestors, include them as another vector later
def build_dataset(ancestors, responses, labels, phi_c, phi_a, phi_r, vectorizer=None, vectorize = True):
    X = []
    Y = []
    feat_dicts = [[],[]]
    N = len(ancestors)
    assert N == len(responses) == len(labels)
    print(N)
    for i in range(N):
        assert len(responses[i]) == 2
        feat_dicts[0].append(phi_c(responses[i][0]))
        feat_dicts[1].append(phi_c(responses[i][1]))
    
        #We only care about the first of the two labels since in the balanced setting
        #they're either 0 1 or 1 0
        Y.append(int(labels[i][0]))
            
    if vectorize:
        # In training, we want a new vectorizer:
        if vectorizer == None:
            vectorizer = DictVectorizer(sparse=False)
            #print(feat_dicts[0][:10], feat_dicts[1][:10])
            feat_matrix = vectorizer.fit_transform(feat_dicts[0] + feat_dicts[1])
        # In assessment, we featurize using the existing vectorizer:
        else:
            feat_matrix = vectorizer.transform(chain(feat_dicts[0], feat_dicts[1]))
        
        response_pair_feats = [feat_matrix[:N], feat_matrix[N:]]
    else:
        response_pair_feats = feat_dicts
        #print(response_pair_feats[0])

    #assert len(feat_matrix == 2*N) 
    #print((feat_matrix[0]), len(feat_matrix[1]))
    
    X = [phi_r((response_pair_feats[0][i], response_pair_feats[1][i])) for i in range(N)]
    #X = list(map(phi_r, response_pair_feats))
    
    return {'X': np.array(X),
            'y': np.array(Y),
            'vectorizer': vectorizer,
            'raw_examples': (ancestors, responses)}

        

In [14]:
responses = train_responses
phi_c = unigrams_phi_c
N = len(responses)
feat_dicts = [[],[]]
for i in range(N):
    assert len(responses[i]) == 2
    feat_dicts[0].append(phi_c(responses[i][0]))
    feat_dicts[1].append(phi_c(responses[i][1]))

In [15]:
unigram_dataset = build_dataset(train_ancestors, train_responses, train_labels, unigrams_phi_c, None, concat_phi_r)

unigram_dataset['X'].shape

6834


(6834, 27260)

In [16]:
def fit_maxent_classifier(X, y):  
    #print(X.shape, y.shape)
    mod = LogisticRegression(fit_intercept=True)
    mod.fit(X, y)
    return mod

In [17]:
def fit_naive_bayes_classifier(X, y):  
    #print(X.shape, y.shape)
    mod = MultinomialNB()
    mod.fit(X, y)
    return mod

In [18]:
def xval_model(model_fit_fn, X, y, folds):
    kf = KFold(folds)
    for train, test in kf.split(X, y):
        model = model_fit_fn(X[train], y[train])
        predictions = model.predict(X[test])
        print(classification_report(y[test], predictions, digits=3))

In [19]:
xval_model(fit_maxent_classifier, unigram_dataset['X'], unigram_dataset['y'], 10)



              precision    recall  f1-score   support

           0      0.736     0.727     0.731       352
           1      0.714     0.723     0.719       332

   micro avg      0.725     0.725     0.725       684
   macro avg      0.725     0.725     0.725       684
weighted avg      0.725     0.725     0.725       684





              precision    recall  f1-score   support

           0      0.698     0.729     0.713       329
           1      0.738     0.707     0.722       355

   micro avg      0.718     0.718     0.718       684
   macro avg      0.718     0.718     0.718       684
weighted avg      0.719     0.718     0.718       684





              precision    recall  f1-score   support

           0      0.748     0.757     0.752       345
           1      0.749     0.740     0.745       339

   micro avg      0.749     0.749     0.749       684
   macro avg      0.749     0.748     0.748       684
weighted avg      0.749     0.749     0.749       684





              precision    recall  f1-score   support

           0      0.747     0.745     0.746       365
           1      0.709     0.712     0.710       319

   micro avg      0.730     0.730     0.730       684
   macro avg      0.728     0.728     0.728       684
weighted avg      0.730     0.730     0.730       684





              precision    recall  f1-score   support

           0      0.728     0.723     0.725       347
           1      0.716     0.720     0.718       336

   micro avg      0.722     0.722     0.722       683
   macro avg      0.722     0.722     0.722       683
weighted avg      0.722     0.722     0.722       683





              precision    recall  f1-score   support

           0      0.698     0.698     0.698       331
           1      0.716     0.716     0.716       352

   micro avg      0.707     0.707     0.707       683
   macro avg      0.707     0.707     0.707       683
weighted avg      0.707     0.707     0.707       683





              precision    recall  f1-score   support

           0      0.735     0.801     0.767       347
           1      0.774     0.702     0.736       336

   micro avg      0.753     0.753     0.753       683
   macro avg      0.755     0.752     0.752       683
weighted avg      0.754     0.753     0.752       683





              precision    recall  f1-score   support

           0      0.704     0.687     0.695       332
           1      0.710     0.726     0.718       351

   micro avg      0.707     0.707     0.707       683
   macro avg      0.707     0.707     0.707       683
weighted avg      0.707     0.707     0.707       683





              precision    recall  f1-score   support

           0      0.759     0.701     0.729       345
           1      0.717     0.772     0.744       338

   micro avg      0.736     0.736     0.736       683
   macro avg      0.738     0.737     0.736       683
weighted avg      0.738     0.736     0.736       683





              precision    recall  f1-score   support

           0      0.704     0.751     0.727       345
           1      0.727     0.678     0.701       338

   micro avg      0.714     0.714     0.714       683
   macro avg      0.715     0.714     0.714       683
weighted avg      0.715     0.714     0.714       683



In [20]:
xval_model(fit_naive_bayes_classifier, unigram_dataset['X'], unigram_dataset['y'], 3)

              precision    recall  f1-score   support

           0      0.727     0.732     0.730      1147
           1      0.727     0.721     0.724      1131

   micro avg      0.727     0.727     0.727      2278
   macro avg      0.727     0.727     0.727      2278
weighted avg      0.727     0.727     0.727      2278

              precision    recall  f1-score   support

           0      0.696     0.716     0.706      1156
           1      0.699     0.677     0.688      1122

   micro avg      0.697     0.697     0.697      2278
   macro avg      0.697     0.697     0.697      2278
weighted avg      0.697     0.697     0.697      2278

              precision    recall  f1-score   support

           0      0.710     0.720     0.715      1135
           1      0.718     0.709     0.713      1143

   micro avg      0.714     0.714     0.714      2278
   macro avg      0.714     0.714     0.714      2278
weighted avg      0.714     0.714     0.714      2278



In [21]:
i=0
fasttext_lookup = {}
with open('../../static/wiki-news-300d-1M-subword.vec') as f:
    while True:
        try:
            x = next(f)
        except:
            break
        try:
            fields = x.strip().split()
            idx = fields[0]
            if idx not in train_vocab: continue
            if idx in fasttext_lookup:
                print("Duplicate! ", idx)
            vec = np.array(fields[1:], dtype=np.float32)
            fasttext_lookup[idx] = vec
            i += 1
            if i%500 == 0: print(i)
        except:
            pass


print(len(fasttext_lookup))
print(type(fasttext_lookup['the']), fasttext_lookup['the'].shape, sum(fasttext_lookup['the']))

500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
11500
11990
<class 'numpy.ndarray'> (300,) 0.03160001061769435


In [22]:
def embed_phi_c(comment, embeddings):
    words = nltk.word_tokenize(comment)
    unk = np.zeros(next(iter(embeddings.values())).shape)
    return np.sum([embeddings[w] if w in embeddings else unk for w in words], axis=0)

In [23]:
def fasttext_phi_c(comment):
    return embed_phi_c(comment, fasttext_lookup)

In [24]:
fasttext_dataset = build_dataset(
    train_ancestors, train_responses, train_labels, fasttext_phi_c, None, concat_phi_r, None, False)

fasttext_dataset['X'].shape

6834


(6834, 600)

In [25]:
xval_model(fit_maxent_classifier, fasttext_dataset['X'], fasttext_dataset['y'], 3)



              precision    recall  f1-score   support

           0      0.703     0.709     0.706      1147
           1      0.702     0.696     0.699      1131

   micro avg      0.702     0.702     0.702      2278
   macro avg      0.702     0.702     0.702      2278
weighted avg      0.702     0.702     0.702      2278





              precision    recall  f1-score   support

           0      0.716     0.702     0.709      1156
           1      0.699     0.713     0.706      1122

   micro avg      0.707     0.707     0.707      2278
   macro avg      0.707     0.707     0.707      2278
weighted avg      0.707     0.707     0.707      2278





              precision    recall  f1-score   support

           0      0.698     0.707     0.702      1135
           1      0.705     0.696     0.701      1143

   micro avg      0.701     0.701     0.701      2278
   macro avg      0.702     0.702     0.701      2278
weighted avg      0.702     0.701     0.701      2278



In [26]:
%ls ../../static/glove/

glove.6B.100d.txt  glove.6B.200d.txt  glove.6B.300d.txt  glove.6B.50d.txt


In [27]:
i=0
glove_lookup = {}
with open('../../static/glove/glove.6B.300d.txt') as f:
#with open('../../static/') as f:
    while True:
        try:
            x = next(f)
        except:
            break
        try:
            fields = x.strip().split()
            idx = fields[0]
            if idx not in train_vocab: continue
            if idx in glove_lookup:
                print("Duplicate! ", idx)
            vec = np.array(fields[1:], dtype=np.float32)
            glove_lookup[idx] = vec
            i += 1
            if i%500 == 0: print(i)
        except:
            pass


print(len(glove_lookup))
print(type(glove_lookup['the']), glove_lookup['the'].shape, sum(glove_lookup['the']))

500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
11500
11824
<class 'numpy.ndarray'> (300,) 3.823568901862018


In [28]:
def glove_phi_c(comment):
    return embed_phi_c(comment, glove_lookup)

glove_dataset = build_dataset(
    train_ancestors, train_responses, train_labels, glove_phi_c, None, concat_phi_r, None, False)

fasttext_dataset['X'].shape

6834


(6834, 600)

In [29]:
xval_model(fit_maxent_classifier, glove_dataset['X'], glove_dataset['y'], 5)



              precision    recall  f1-score   support

           0      0.669     0.674     0.672       681
           1      0.674     0.669     0.672       686

   micro avg      0.672     0.672     0.672      1367
   macro avg      0.672     0.672     0.672      1367
weighted avg      0.672     0.672     0.672      1367





              precision    recall  f1-score   support

           0      0.688     0.684     0.686       708
           1      0.662     0.666     0.664       659

   micro avg      0.675     0.675     0.675      1367
   macro avg      0.675     0.675     0.675      1367
weighted avg      0.675     0.675     0.675      1367





              precision    recall  f1-score   support

           0      0.675     0.652     0.664       679
           1      0.668     0.690     0.679       688

   micro avg      0.672     0.672     0.672      1367
   macro avg      0.672     0.671     0.671      1367
weighted avg      0.672     0.672     0.671      1367





              precision    recall  f1-score   support

           0      0.669     0.665     0.667       680
           1      0.670     0.674     0.672       687

   micro avg      0.669     0.669     0.669      1367
   macro avg      0.669     0.669     0.669      1367
weighted avg      0.669     0.669     0.669      1367





              precision    recall  f1-score   support

           0      0.681     0.688     0.684       690
           1      0.678     0.670     0.674       676

   micro avg      0.679     0.679     0.679      1366
   macro avg      0.679     0.679     0.679      1366
weighted avg      0.679     0.679     0.679      1366



In [30]:
from allennlp.commands.elmo import ElmoEmbedder

In [31]:
elmo = ElmoEmbedder()

In [32]:
def elmo_phi_c(comment):
    vecs = elmo.embed_sentence(nltk.word_tokenize(comment))
    elmo_avg_vec = vecs.mean(axis = 0)
    #print(elmo_avg_vec)
    return elmo_avg_vec[0]

In [33]:
elmo_dataset = build_dataset(
    train_ancestors, train_responses, train_labels, elmo_phi_c, None, concat_phi_r, None, False)

6834


In [34]:
xval_model(fit_maxent_classifier, elmo_dataset['X'], elmo_dataset['y'], 10)



              precision    recall  f1-score   support

           0      0.699     0.705     0.702       352
           1      0.684     0.678     0.681       332

   micro avg      0.692     0.692     0.692       684
   macro avg      0.691     0.691     0.691       684
weighted avg      0.691     0.692     0.691       684





              precision    recall  f1-score   support

           0      0.666     0.690     0.678       329
           1      0.703     0.679     0.691       355

   micro avg      0.684     0.684     0.684       684
   macro avg      0.684     0.684     0.684       684
weighted avg      0.685     0.684     0.684       684





              precision    recall  f1-score   support

           0      0.649     0.643     0.646       345
           1      0.640     0.646     0.643       339

   micro avg      0.645     0.645     0.645       684
   macro avg      0.645     0.645     0.645       684
weighted avg      0.645     0.645     0.645       684





              precision    recall  f1-score   support

           0      0.718     0.677     0.697       365
           1      0.653     0.696     0.674       319

   micro avg      0.686     0.686     0.686       684
   macro avg      0.685     0.686     0.685       684
weighted avg      0.688     0.686     0.686       684





              precision    recall  f1-score   support

           0      0.671     0.651     0.661       347
           1      0.650     0.670     0.660       336

   micro avg      0.660     0.660     0.660       683
   macro avg      0.660     0.660     0.660       683
weighted avg      0.661     0.660     0.660       683





              precision    recall  f1-score   support

           0      0.612     0.604     0.608       331
           1      0.632     0.639     0.636       352

   micro avg      0.622     0.622     0.622       683
   macro avg      0.622     0.622     0.622       683
weighted avg      0.622     0.622     0.622       683





              precision    recall  f1-score   support

           0      0.683     0.709     0.696       347
           1      0.687     0.661     0.674       336

   micro avg      0.685     0.685     0.685       683
   macro avg      0.685     0.685     0.685       683
weighted avg      0.685     0.685     0.685       683





              precision    recall  f1-score   support

           0      0.646     0.693     0.669       332
           1      0.688     0.641     0.664       351

   micro avg      0.666     0.666     0.666       683
   macro avg      0.667     0.667     0.666       683
weighted avg      0.668     0.666     0.666       683





              precision    recall  f1-score   support

           0      0.705     0.672     0.688       345
           1      0.681     0.713     0.697       338

   micro avg      0.693     0.693     0.693       683
   macro avg      0.693     0.693     0.692       683
weighted avg      0.693     0.693     0.692       683





              precision    recall  f1-score   support

           0      0.657     0.667     0.662       345
           1      0.655     0.645     0.650       338

   micro avg      0.656     0.656     0.656       683
   macro avg      0.656     0.656     0.656       683
weighted avg      0.656     0.656     0.656       683

