Conversations Gone Awry:
https://arxiv.org/abs/1805.05345

Trouble on the Horizon:
https://arxiv.org/abs/1909.01362

We can try extending the model later if there is tme (there won't be time).

So this is pretty much just an adaptation of functions in the IMDB sentiment analysis project.

In [2]:
from convokit import Corpus, download

In [3]:
corpus = Corpus(filename=download('subreddit-Cornell'))
corpus.print_summary_stats()

Downloading subreddit-Cornell to /root/.convokit/downloads/subreddit-Cornell
Downloading subreddit-Cornell from http://zissou.infosci.cornell.edu/convokit/datasets/subreddit-corpus/corpus-zipped/CookingScrewups~-~CrappyDesign/Cornell.corpus.zip (11.2MB)... Done
Number of Speakers: 7568
Number of Utterances: 74467
Number of Conversations: 10744


In [4]:
for conv in corpus.get_conversation_ids():
  utterances = corpus.get_conversation(conv).get_utterance_ids()
  for utt in utterances:
    utt_data = corpus.get_conversation(conv).get_utterance(utt)
    text = utt_data.text
    print(utt_data.meta)

Output hidden; open in https://colab.research.google.com to view.

In [5]:
import os
import sys

import nltk
from nltk import word_tokenize
nltk.download('punkt')
import torch

#Sparse matrix implementation
from scipy.sparse import csr_matrix
import numpy as np
from collections import Counter

np.random.seed(1)

class Vocab:
    def __init__(self, vocabFile=None):
        self.locked = False
        self.nextId = 0
        self.word2id = {}
        self.id2word = {}
        if vocabFile:
            for line in open(vocabFile):
                line = line.rstrip('\n')
                (word, wid) = line.split('\t')
                self.word2id[word] = int(wid)
                self.id2word[wid] = word
                self.nextId = max(self.nextId, int(wid) + 1)

    def GetID(self, word):
        if not word in self.word2id:
            if self.locked:
                return -1        #UNK token is -1.
            else:
                self.word2id[word] = self.nextId
                self.id2word[self.word2id[word]] = word
                self.nextId += 1
        return self.word2id[word]

    def HasWord(self, word):
        return self.word2id.has_key(word)

    def HasId(self, wid):
        return self.id2word.has_key(wid)

    def GetWord(self, wid):
        return self.id2word[wid]

    def SaveVocab(self, vocabFile):
        fOut = open(vocabFile, 'w')
        for word in self.word2id.keys():
            fOut.write("%s\t%s\n" % (word, self.word2id[word]))

    def GetVocabSize(self):
        #return self.nextId-1
        return self.nextId

    def GetWords(self):
        return self.word2id.keys()

    def Lock(self):
        self.locked = True

class Convdata:
    def __init__(self, vocab=None):
        """ Reads in data into sparse matrix format """

        if not vocab:
            self.vocab = Vocab()
        else:
            self.vocab = vocab

        #For csr_matrix (see http://docs.scipy.org/doc/scipy-0.15.1/reference/generated/scipy.sparse.csr_matrix.html#scipy.sparse.csr_matrix)
        X_values = []
        X_row_indices = []
        X_col_indices = []
        Y = []

        XwordList = []
        XconvList = []

        self.conv2rows = dict()
        self.row2utt = dict()

        #Read positive files
        i = 0
        for conv in corpus.get_conversation_ids():
            utterances = corpus.get_conversation(conv).get_utterance_ids()
            rows_in_convo = list()
            for utt in utterances:
                utt_data = corpus.get_conversation(conv).get_utterance(utt)
                line = utt_data.text
                wordList   = [self.vocab.GetID(w.lower()) for w in word_tokenize(line) if self.vocab.GetID(w.lower()) >= 0]
                XwordList.append(wordList)
                XconvList.append(conv)
                wordCounts = Counter(wordList)
                for (wordId, count) in wordCounts.items():
                    if wordId >= 0:
                        X_row_indices.append(i)
                        X_col_indices.append(wordId)
                        X_values.append(count)
                i += 1
                # Add things in here if you want to track something at the comment level
                Y.append(int(utt_data.meta['score']>0)) # This isn't what we actually want to track for Y, I was just using it to try some simple models
                rows_in_convo.append(i)
                self.row2utt[i] = utt
            # Add things in here if you want to track something at the conversation level
            self.conv2rows[conv] = rows_in_convo

        self.vocab.Lock()

        #Create a sparse matrix in csr format
        self.X = csr_matrix((X_values, (X_row_indices, X_col_indices)), shape=(i, self.vocab.GetVocabSize()))
        self.Y = np.asarray(Y)

        #Randomly shuffle - un-implemented
        index = np.arange(self.X.shape[0])
        # I commented the below line to stop the shuffling
        # We don't really need it. If we want it though, know that we need to
        # implement shuffling in conv2rows and row2utt
        #np.random.shuffle(index)
        self.X = self.X[index,:]
        self.XwordList = [torch.LongTensor(XwordList[i]) for i in index]  #Two different sparse formats, csr and lists of IDs (XwordList).
        self.XconvList = [XconvList[i] for i in index]
        #self.Y = self.Y[index]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [6]:
train = Convdata()
train.vocab.Lock()

In [7]:
# X contains the design matrix representing the training data.
print(f"Train.X has {train.X.shape[0]} rows and {train.X.shape[1]} columns.")

Train.X has 74467 rows and 60206 columns.


In [8]:
i=10
print(np.sum(train.X[-i,:]))
print(train.vocab.GetWord(np.argmax(train.X[-i,:])))

24
the


In [9]:
print(np.min(train.Y))

0


In [10]:
train.conv2rows

{'nyx4d': [1, 2],
 'o0145': [3, 4, 5, 6, 7, 8, 9, 10],
 'o1gca': [11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
 'o0ss4': [21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  38,
  39,
  40,
  41,
  42,
  43,
  44,
  45,
  46,
  47,
  48,
  49,
  50,
  51,
  52,
  53,
  54,
  55,
  56,
  57,
  58,
  59,
  60,
  61,
  62,
  63,
  64,
  65,
  66],
 'o31u0': [67, 68, 69, 70, 71],
 'o4ipd': [72, 73, 74, 75, 76, 77, 78],
 'o456r': [79, 80],
 'o4544': [81],
 'o3l7i': [82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98],
 'o3fqm': [99, 100, 101, 102],
 'o617k': [103, 104, 105, 106],
 'o5009': [107,
  108,
  109,
  110,
  111,
  112,
  113,
  114,
  115,
  116,
  117,
  118,
  119,
  120],
 'o4oyt': [121, 122, 123, 124, 125, 126, 127, 128],
 'o75ge': [129, 130, 131, 132, 133, 134, 135, 136],
 'o9g5w': [137,
  138,
  139,
  140,
  141,
  142,
  143,
  144,
  145,
  146,
  147,
  148,
  149,
  150,
  151,
  152,
  153,
  154,
  155,
  

# Proof of concept

Below is a very sloppy, very basic NN that predicts whether a post gets a positive or negative score (upvotes - downvotes) just to demonstrate that the data is now in the format we have been using throughout this class up to this point. Closer inspection will show that the train, dev, and test sets are all the same dataset (overfit much?).

In [12]:
import tqdm
import torch
import torch.nn as nn
from torch import optim
import random
import numpy as np

class NBOW(nn.Module):
    def __init__(self, VOCAB_SIZE, DIM_EMB=300, NUM_CLASSES=2):
        super(NBOW, self).__init__()
        self.NUM_CLASSES=NUM_CLASSES
        #TODO: Initialize parameters.
        self.emb = nn.EmbeddingBag(VOCAB_SIZE, DIM_EMB)
        self.lin = nn.Linear(DIM_EMB, NUM_CLASSES)
        self.logSoftmax = nn.LogSoftmax(dim=0)

    def forward(self, X):
        #TODO: Implement forward computation.
        return self.logSoftmax(self.lin(self.emb(X,torch.tensor([0]).cuda()).squeeze()))

def EvalNet(data, net):
    num_correct = 0
    Y = (data.Y + 1.0) / 2.0
    X = data.XwordList
    for i in range(len(X)):
        logProbs = net.forward(X[i].cuda())
        pred = torch.argmax(logProbs)
        if pred == Y[i]:
            num_correct += 1
    print("Accuracy: %s" % (float(num_correct) / float(len(X))))

def SavePredictions(data, outFile, net):
    fOut = open(outFile, 'w')
    for i in range(len(data.XwordList)):
        logProbs = net.forward(data.XwordList[i].cuda())
        pred = torch.argmax(logProbs)
        fOut.write(f"{data.XfileList[i]}\t{pred}\n")

def Train(net, X, Y, n_iter, dev):
    print("Start Training!")
    #TODO: initialize optimizer.
    optimizer = optim.Adam(net.parameters(), lr=0.001)
    num_classes = len(set(Y))
    batch_size = 10

    for epoch in range(n_iter):
        num_correct = 0
        total_loss = 0.0
        net.train()   #Put the network into training mode
        loss = 0.0
        run = 0
        for i in tqdm.notebook.tqdm(range(len(X))):
            #pass
            #TODO: compute gradients, do parameter update, compute loss.
            y_onehot = torch.zeros(num_classes).cuda()
            y_onehot[int(Y[i])] = 1
            logProbs = net.forward(X[i].cuda())
            loss += torch.neg(logProbs).dot(y_onehot)

            if i%batch_size == 0:
              loss.backward()
              optimizer.step()
              total_loss += loss
              loss = 0.0
              net.zero_grad()

        net.eval()    #Switch to eval mode
        print(f"loss on epoch {epoch} = {total_loss}")
        EvalNet(dev, net)

nbow = NBOW(train.vocab.GetVocabSize()).cuda()
Train(nbow, train.XwordList, train.Y, 5, train)

Start Training!


  0%|          | 0/74467 [00:00<?, ?it/s]

loss on epoch 0 = 18665.228515625
Accuracy: 0.9206494151771926


  0%|          | 0/74467 [00:00<?, ?it/s]

loss on epoch 1 = 17116.369140625
Accuracy: 0.9189036754535566


  0%|          | 0/74467 [00:00<?, ?it/s]

loss on epoch 2 = 16264.466796875
Accuracy: 0.9171579357299207


  0%|          | 0/74467 [00:00<?, ?it/s]

loss on epoch 3 = 15606.9013671875
Accuracy: 0.9148481877878792


  0%|          | 0/74467 [00:00<?, ?it/s]

loss on epoch 4 = 15075.185546875
Accuracy: 0.913008446694509


In [14]:
EvalNet(train, nbow)

Accuracy: 0.913008446694509
