In [2]:
from pandas import DataFrame, read_csv
import matplotlib.pyplot as plt
import pandas as pd 
import sys 
import matplotlib 
import os
%matplotlib inline

In [3]:
print('Python version ' + sys.version)
print('Pandas version ' + pd.__version__)
print('Matplotlib version ' + matplotlib.__version__)

Python version 2.7.13 |Anaconda 4.4.0 (64-bit)| (default, May 11 2017, 13:17:26) [MSC v.1500 64 bit (AMD64)]
Pandas version 0.20.1
Matplotlib version 2.0.2


In [None]:
'''Joins Coarse-Discourse annotations with Reddit data via Reddit API.'''
# Copyright 2017 Google Inc.

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import __future__
from __future__ import print_function

import json
import os
import time

import praw

'''
Requirements: PRAW python library - https://praw.readthedocs.io/en/latest/index.html
              current version: 4.4.0

Date: 04/02/2017

This script adds fields to the coarse-discourse dataset for each comment including the 
text of the comment and the author. The information is taken from the Reddit API.

You can augment this script to gather other information about the comment such as upvote 
or downvote count or about the comment author, such as their karma, from the fields
that the Reddit API provides.


Note:

There may be discrepancies due to changes made between when the coarse-discourse
data was collected (August 2016) and when you are accessing the API, such as missing 
comments or comment text.

You should be able to overcome discrepancies by downloading the full Reddit dump from 
the beginning up until 09/2016 found in various places such as: 

https://www.reddit.com/r/datasets/comments/3bxlg7/i_have_every_publicly_available_reddit_comment/
https://archive.org/details/2015_reddit_comments_corpus
https://bigquery.cloud.google.com/dataset/fh-bigquery:reddit_posts

and using that data instead of the Reddit API to collect the comment texts and author names.
'''

# Replace below with information provided to you by Reddit when registering your script
reddit = praw.Reddit(client_id="QutcH0SZqEwjBw",
                     client_secret="BDIt_Tif5Eo5asnYBC-lZdQ8Q5E",
                     user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64)\
                     AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36")

with open('coarse_discourse_dataset.json') as jsonfile:
    lines = jsonfile.readlines()
    dump_with_reddit = open('coarse_discourse_dump_reddit.json', 'w')

    for line in lines:
        reader = json.loads(line)
        print(reader['url'])

        submission = reddit.submission(url=reader['url'])

        # Annotators only annotated the 40 "best" comments determined by Reddit
        submission.comment_sort = 'best'
        submission.comment_limit = 40

        post_id_dict = {}

        for post in reader['posts']:
            post_id_dict[post['id']] = post

        try:
            full_submission_id = 't3_' + submission.id
            if full_submission_id in post_id_dict:
                post_id_dict[full_submission_id]['body'] = submission.selftext

                # For a self-post, this URL will be the same URL as the thread.
                # For a link-post, this URL will be the link that the link-post is linking to.
                post_id_dict[full_submission_id]['url'] = submission.url
                if submission.author:
                    post_id_dict[full_submission_id]['author'] = submission.author.name

            submission.comments.replace_more(limit=0)
            for comment in submission.comments.list():
                full_comment_id = 't1_' + comment.id
                if full_comment_id in post_id_dict:
                    post_id_dict[full_comment_id]['body'] = comment.body
                    if comment.author:
                        post_id_dict[full_comment_id]['author'] = comment.author.name

        except Exception as e:
            print('Error %s' % (e))

        found_count = 0
        for post in reader['posts']:
            if not post.has_key('body'):
                print("Can't find %s in URL: %s" % (post['id'], reader['url']))
            else:
                found_count += 1

        print('Found %s posts out of %s' % (found_count, len(reader['posts'])))

        dump_with_reddit.write(json.dumps(reader) + '\n')

        # To keep within Reddit API limits
        time.sleep(2)


In [None]:
pd.read_json?
# path
jsonloc = r'C:\Users\Sophia\DTI_semifinal\coarse-discourse\coarse_discourse_dataset.json'

# take a look at the json annotation file
df = pd.read_json(jsonloc, lines = True)
df.dtypes
print df.head()
df.columns



In [6]:
# %load prepair_data.py
import pandas as pd
from collections import Counter
import numpy as np
comments = []
labels = []

label_categories = ["question", "answer", "announcement",
                    "appreciation","agreement", "elaboration",
                    "disagreement", "humor", "negativereaction"]

label_map = dict((s, i) for i, s in enumerate(label_categories))

with open('coarse_discourse_dump_reddit.json', 'r') as f:
    for line_idx, line in enumerate(f):
        if line_idx >= 9482: # last line is incomplete
            break

        df = pd.read_json(line.encode('utf-8').strip())
        crt_post_df = df["posts"]
        for post_idx in range(len(crt_post_df)):
            annotation_df = crt_post_df[post_idx]["annotations"]
            # check if all annotation agrees
            c = Counter(annotation_df[i]["main_type"] for i in range(len(annotation_df)))
            vote, count = c.most_common()[0]
            agree = count == 3 and vote in label_map
            if agree:
                if "body" in crt_post_df[post_idx]: # and annotation_df[0]["main_type"] in label_map
                    comments.append(crt_post_df[post_idx]["body"].encode('utf-8'))
                    labels.append(label_map[vote])
            if len(comments) % 1000 == 0:
                print ('...', len(comments), len(labels))
        if line_idx % 1000 == 0:
            print('.')
        

assert len(comments)==len(labels), "number of comments and labels should match"

# random split train and test
perm_idx = np.random.permutation(len(comments))
ntrain = int(0.8*len(comments))
ntest = len(comments) - ntrain
train_idx = perm_idx[:ntrain]
test_idx = perm_idx[ntrain:]

with open('train_comments.csv', 'w') as f:
    for idx in train_idx:
        c = comments[idx]
        c = c.replace('\n', ' ').strip()
        f.write("{},{}\n".format(c, labels[idx]))

with open('test_comments.csv', 'w') as f:
    for idx in test_idx:
        c = comments[idx]
        c = c.replace('\n', ' ').strip()
        f.write("{},{}\n".format(c, labels[idx]))
        
with open('comments.csv', 'w') as f:
    for idx in range(len(comments)):
        c = comments[idx]
        c = c.replace('\n', ' ').strip()
        f.write("{},{}\n".format(c, labels[idx]))

.
('...', 1000, 1000)
('...', 2000, 2000)
('...', 3000, 3000)
('...', 3000, 3000)
('...', 3000, 3000)
('...', 3000, 3000)
('...', 3000, 3000)
('...', 3000, 3000)
('...', 3000, 3000)
('...', 3000, 3000)
('...', 3000, 3000)
('...', 3000, 3000)
('...', 3000, 3000)
('...', 3000, 3000)
('...', 4000, 4000)
('...', 4000, 4000)
('...', 5000, 5000)
('...', 5000, 5000)
('...', 5000, 5000)
('...', 6000, 6000)
('...', 6000, 6000)
('...', 6000, 6000)
('...', 7000, 7000)
.
('...', 8000, 8000)
('...', 8000, 8000)
('...', 8000, 8000)
('...', 9000, 9000)
('...', 9000, 9000)
('...', 9000, 9000)
('...', 9000, 9000)
('...', 9000, 9000)
('...', 9000, 9000)
('...', 9000, 9000)
('...', 10000, 10000)
('...', 11000, 11000)
('...', 11000, 11000)
('...', 11000, 11000)
('...', 11000, 11000)
('...', 12000, 12000)
('...', 13000, 13000)
('...', 13000, 13000)
('...', 14000, 14000)
.
('...', 15000, 15000)
('...', 15000, 15000)
('...', 16000, 16000)
('...', 17000, 17000)
('...', 18000, 18000)
('...', 19000, 19000)
('..