In [2]:
from convokit import Corpus, download

In [3]:
# Fetch the r/Cornell subreddit dataset (saved under the local ConvoKit
# downloads directory), then load it from that path as a Corpus.
corpus_path = download('subreddit-Cornell')
corpus = Corpus(filename=corpus_path)

Downloading subreddit-Cornell to /root/.convokit/downloads/subreddit-Cornell
Downloading subreddit-Cornell from http://zissou.infosci.cornell.edu/convokit/datasets/subreddit-corpus/corpus-zipped/CookingScrewups~-~CrappyDesign/Cornell.corpus.zip (11.2MB)... Done


In [4]:
# Print how many speakers, utterances and conversations the corpus contains.
corpus.print_summary_stats()

Number of Speakers: 7568
Number of Utterances: 74467
Number of Conversations: 10744


## Some new Conversation functionality

In [5]:
# Look up a single Conversation by its id.
convo = corpus.get_conversation('o31u0')

In [6]:
# Print the reply tree of the conversation. With no argument, each utterance
# is rendered as its speaker's id, and replies are indented under their parent.
convo.print_conversation_structure()

cchambo
    jklol
    djnap
    Brimwoodboy
        jklol


In [7]:
# Print the reply tree again, rendering each utterance with a custom
# function -- here, the utterance's own id.
convo.print_conversation_structure(lambda utt: utt.id)

o31u0
    c3dzmtu
    c3e0ou0
    c3f7l5b
        c3feqc4


In [8]:
# Flat list of the conversation's Utterance objects (per the method name,
# ordered chronologically). The repr of each Utterance is long.
convo.get_chronological_utterance_list()

[Utterance({'obj_type': 'utterance', 'meta': {'score': 27, 'top_level_comment': None, 'retrieved_on': -1, 'gilded': -1, 'gildings': None, 'subreddit': 'Cornell', 'stickied': False, 'permalink': '/r/Cornell/comments/o31u0/cornell_scientists_create_hole_in_time_where/', 'author_flair_text': 'SNES 2015'}, 'vectors': [], 'speaker': Speaker({'obj_type': 'speaker', 'meta': {}, 'vectors': [], 'owner': <convokit.model.corpus.Corpus object at 0x7f4ff525be10>, 'id': 'cchambo'}), 'conversation_id': 'o31u0', 'reply_to': None, 'timestamp': 1325714498, 'text': '', 'owner': <convokit.model.corpus.Corpus object at 0x7f4ff525be10>, 'id': 'o31u0'}),
 Utterance({'obj_type': 'utterance', 'meta': {'score': 3, 'top_level_comment': 'c3dzmtu', 'retrieved_on': 1428124647, 'gilded': 0, 'gildings': None, 'subreddit': 'Cornell', 'stickied': False, 'permalink': '', 'author_flair_text': 'AEP 2011'}, 'vectors': [], 'speaker': Speaker({'obj_type': 'speaker', 'meta': {}, 'vectors': [], 'owner': <convokit.model.corpus.

In [9]:
# Speaker ids in the order returned by get_chronological_utterance_list().
# The Utterance attribute is `speaker` (see the Utterance repr in this
# notebook); `user` is the older name for it, so use `speaker` here.
[utt.speaker.id for utt in convo.get_chronological_utterance_list()]

['cchambo', 'jklol', 'djnap', 'Brimwoodboy', 'jklol']

In [10]:
# Every path from the conversation's root utterance down to a leaf,
# returned as a list of lists of Utterance objects.
convo.get_root_to_leaf_paths()

[[Utterance({'obj_type': 'utterance', 'meta': {'score': 27, 'top_level_comment': None, 'retrieved_on': -1, 'gilded': -1, 'gildings': None, 'subreddit': 'Cornell', 'stickied': False, 'permalink': '/r/Cornell/comments/o31u0/cornell_scientists_create_hole_in_time_where/', 'author_flair_text': 'SNES 2015'}, 'vectors': [], 'speaker': Speaker({'obj_type': 'speaker', 'meta': {}, 'vectors': [], 'owner': <convokit.model.corpus.Corpus object at 0x7f4ff525be10>, 'id': 'cchambo'}), 'conversation_id': 'o31u0', 'reply_to': None, 'timestamp': 1325714498, 'text': '', 'owner': <convokit.model.corpus.Corpus object at 0x7f4ff525be10>, 'id': 'o31u0'}),
  Utterance({'obj_type': 'utterance', 'meta': {'score': 2, 'top_level_comment': 'c3e0ou0', 'retrieved_on': 1428125150, 'gilded': 0, 'gildings': None, 'subreddit': 'Cornell', 'stickied': False, 'permalink': '', 'author_flair_text': 'Engineering 2015'}, 'vectors': [], 'speaker': Speaker({'obj_type': 'speaker', 'meta': {}, 'vectors': [], 'owner': <convokit.mod

In [11]:
# Show each root-to-leaf path as the sequence of speaker ids along it.
# The Utterance attribute is `speaker`; `user` is the older name for it.
for path in convo.get_root_to_leaf_paths():
    print([utt.speaker.id for utt in path])

['cchambo', 'djnap']
['cchambo', 'jklol']
['cchambo', 'Brimwoodboy', 'jklol']


## Cumulative BoW

In [12]:
from convokit import Forecaster

Let's set up a forecasting task to predict whether a Reddit comment will have a positive score, i.e. upvotes > downvotes.

In [13]:
# Attach the binary target the forecaster will be fit to:
# 'pos_score' is 1 when the utterance's score is above zero, otherwise 0.
for utt in corpus.iter_utterances():
    is_positive = utt.meta['score'] > 0
    utt.add_meta('pos_score', int(is_positive))

In [14]:
def positive_score_label(utt):
    """Return the utterance's 'pos_score' metadata value, used as the forecasting label."""
    return utt.meta['pos_score']


# No model is passed, so Forecaster falls back to its default model
# (see the messages printed below this cell).
forecaster = Forecaster(
    label_func=positive_score_label,
    skip_broken_convos=True,
)

No model passed to Forecaster. Initializing default forecaster model: Cumulative Bag-of-words...
Initializing default unigram CountVectorizer...
Initializing default classification model (standard scaled logistic regression)


In [15]:
# Fit the forecaster's model (here the default cumulative bag-of-words
# classifier) on the corpus, using the 'pos_score' labels.
forecaster.fit(corpus)

Fitting cumulative BoW classification model...
Done.




In [16]:
# Run the fitted forecaster over the corpus. The forecasts end up in utterance
# metadata under 'forecast' and 'forecast_prob'; the call returns a Corpus.
forecaster.transform(corpus)

<convokit.model.corpus.Corpus at 0x7f4ff525be10>

In [17]:
# Collect the forecasts into a DataFrame indexed by utterance id, with
# 'forecast' and 'forecast_prob' columns.
forecast_df = forecaster.summarize(corpus)

In [18]:
# (rows, columns) of the summary -- presumably one row per utterance that
# received a forecast, which is fewer than the corpus's total utterance count.
forecast_df.shape

(72930, 2)

In [19]:
# First rows of the summary (the rows appear to be sorted by forecast_prob,
# highest first).
forecast_df.head()

Unnamed: 0_level_0,forecast,forecast_prob
utt_id,Unnamed: 1_level_1,Unnamed: 2_level_1
d0mbn8y,1.0,1.0
c39q886,1.0,1.0
d89sah8,1.0,1.0
csijpuk,1.0,1.0
csihtq1,1.0,1.0


In [20]:
# Last ten rows of the summary: the utterances with the lowest forecast_prob
# shown here, all forecast as 0.
forecast_df.tail(10)

Unnamed: 0_level_0,forecast,forecast_prob
utt_id,Unnamed: 1_level_1,Unnamed: 2_level_1
d3jxtqu,0.0,0.000208
dw5hgjc,0.0,0.000194
caryxnd,0.0,0.000103
cxs67g1,0.0,4.9e-05
e56e5ub,0.0,4.5e-05
e7z3lqk,0.0,3.6e-05
dncvjzz,0.0,3.3e-05
d54qiql,0.0,2.4e-05
c3si9dy,0.0,3e-06
d54rl1r,0.0,2e-06


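Let's examine a Conversation that contains utterances forecasted to have a non-positive score (label 0, i.e. score ≤ 0). We look it up via utterance `dpn8e4v`, which itself has an actual score of 0 but was forecast as positive.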

In [21]:
# Inspect one utterance; its metadata now includes 'pos_score' as well as the
# 'forecast' and 'forecast_prob' values.
corpus.get_utterance('dpn8e4v')

Utterance({'obj_type': 'utterance', 'meta': {'score': 0, 'top_level_comment': 'dp95zls', 'retrieved_on': 1512515249, 'gilded': 0, 'gildings': None, 'subreddit': 'Cornell', 'stickied': False, 'permalink': '/r/Cornell/comments/7a75x0/what_is_the_best_college_for_law_school/dpn8e4v/', 'author_flair_text': '', 'pos_score': 0, 'forecast': 1.0, 'forecast_prob': 0.8382448473755922}, 'vectors': [], 'speaker': Speaker({'obj_type': 'speaker', 'meta': {}, 'vectors': [], 'owner': <convokit.model.corpus.Corpus object at 0x7f4ff525be10>, 'id': 'Trumpsamerican'}), 'conversation_id': '7a75x0', 'reply_to': 'dpm8anu', 'timestamp': 1510358978, 'text': "If you don't mind me asking, is your gpa over a 3.7? Because I heard that ILR's average GPA is a 3.5, and that's incredibly low given that History majors typically get 3.8+. I applied to ILR, btw.\n\nLastly, pertaining to the last portion of your statement, is it possible to take only HR classes, and things in that realm, and avoid history/law classes?", '

In [22]:
# Id of the conversation this utterance belongs to. The Utterance field is
# `conversation_id` (see its repr); `root` is the older name for it.
corpus.get_utterance('dpn8e4v').conversation_id



'7a75x0'

In [23]:
# Reply tree (speaker ids) of the conversation containing utterance 'dpn8e4v'.
# `conversation_id` is used instead of the older `root` attribute name.
corpus.get_conversation(corpus.get_utterance('dpn8e4v').conversation_id).print_conversation_structure()

Trumpsamerican
    IthacaisGorges_
    _vpl
    lyfehack
        Trumpsamerican
            byanilla
                Trumpsamerican
                    byanilla
                        Trumpsamerican
                            byanilla
                                Trumpsamerican
                                    byanilla
                                        Trumpsamerican
                                            byanilla
                    [deleted]
    mattezai




### Forecasted

In [24]:
# Same conversation, rendering each utterance as its forecast label
# (1.0 = forecast to have a positive score, 0.0 = not).
# `conversation_id` is used instead of the older `root` attribute name.
corpus.get_conversation(corpus.get_utterance('dpn8e4v').conversation_id).print_conversation_structure(lambda utt: str(utt.meta['forecast']))

1.0
    1.0
    1.0
    1.0
        1.0
            0.0
                1.0
                    0.0
                        1.0
                            1.0
                                1.0
                                    1.0
                                        1.0
                                            1.0
                    1.0
    1.0




### Actual

In [25]:
# Same conversation, rendering each utterance as its actual 'pos_score' label
# (1 = score above zero, 0 = otherwise).
# `conversation_id` is used instead of the older `root` attribute name.
corpus.get_conversation(corpus.get_utterance('dpn8e4v').conversation_id).print_conversation_structure(lambda utt: str(utt.meta['pos_score']))

0
    1
    1
    1
        1
            1
                0
                    1
                        0
                            1
                                1
                                    1
                                        1
                                            1
                    1
    1




In [26]:
# Gather, in corpus iteration order, each utterance's forecast and its actual
# 'pos_score' label into two parallel lists.
forecasts, actual = [], []
for utt in corpus.iter_utterances():
    forecasts.append(utt.meta['forecast'])
    actual.append(utt.meta['pos_score'])

In [27]:
# Pair every forecast with its actual label, dropping utterances whose
# forecast is None.
# NOTE: despite the variable's name, each tuple is ordered (forecast, actual),
# i.e. (predicted, true).
y_true_pred = [
    (predicted, observed)
    for predicted, observed in zip(forecasts, actual)
    if predicted is not None
]

In [28]:
import numpy as np
from collections import Counter

In [29]:
# Predicted labels: the first element of each (forecast, actual) pair.
y_pred = np.array([predicted for predicted, _ in y_true_pred])

In [30]:
# True labels: the second element of each (forecast, actual) pair.
y_true = np.array([observed for _, observed in y_true_pred])

In [31]:
# Baseline accuracy of always predicting "positive": with 0/1 labels this is
# simply the fraction of true labels equal to 1.
y_true.mean()

0.9315782256958728

In [32]:
# Accuracy achieved by the forecaster: the fraction of utterances whose
# forecast equals the true label.
(y_true == y_pred).mean()

0.9217331687919923

In [33]:
from sklearn.metrics import confusion_matrix

In [34]:
# Confusion matrix of the forecasts. In scikit-learn's layout, rows are the
# true labels and columns the predicted labels, in sorted label order (0, 1).
confusion_matrix(y_true=y_true, y_pred=y_pred)

array([[  648,  4342],
       [ 1366, 66574]])