#### Extract comments from Reddit pickles to publish to MTurk:

- Saves it as a CSV in /crowdsourcing/data/csvs/
- Adds the extracted reply_ids to /crowdsourcing/data/reply_ids_extracted.txt | Makes sure that there are no duplicate comments extracted

In [2]:
import pandas as pd
import pickle
import os

from question_extractor import clean_text, extract_questions_regex
# pd.set_option('display.max_colwidth', -1)
# pd.__version__

#### Hyperparameters:

In [4]:
#### Can change these for every run ####
# Both values are also baked into the output CSV filename:
# batch_<BATCH_NUMBER>_<N_SAMPLES * number of subreddits>.csv
BATCH_NUMBER = 3 # Batch number for MTurk
N_SAMPLES = 420 # Number of samples from EACH subreddit (extraction stops for a subreddit once this many rows are accepted)

In [5]:
# Comments longer than 400 characters or shorter than 10 characters would be skipped.
MAX_CHAR_LIMIT = 400
MIN_CHAR_LIMIT = 10

# Questions shorter than 4 characters would be skipped (that's most likely to be an extraction error)
MIN_QUES_LIMIT = 4

# Inspired from Antag-Stats 
# & https://www.vice.com/en_ca/article/8xxymb/here-are-reddits-whiniest-most-low-key-toxic-subreddits
SUBREDDITS = ['The_Donald', 'politics', 'PoliticalDiscussion', 'Conservative',
              'cringepics', 'cringe', '4chan', 'CringeAnarchy', 'KotakuInAction',
              'ImGoingToHellForThis', 'TumblrInAction',
              'nfl', 'sports', 'nba', 'hockey']

# Output path for the CSV:
df_csv_output_fname = '/home/ndg/users/sbagga1/unpalatable-questions/crowdsourcing/data/csvs/batch_'+\
                        str(BATCH_NUMBER)+'_'+str(N_SAMPLES*len(SUBREDDITS))+'.csv'

# Sanity check:
if os.path.exists(df_csv_output_fname):
    print "ERROR: CSV already exists.", df_csv_output_fname
else:
    print "{} will be created in this run.".format(df_csv_output_fname)

/home/ndg/users/sbagga1/unpalatable-questions/crowdsourcing/data/csvs/batch_3_6300.csv will be created in this run.


#### Don't sample duplicate reply IDs and Test Questions:

In [6]:
# Reply IDs already covered:
with open('/home/ndg/users/sbagga1/unpalatable-questions/crowdsourcing/data/reply_ids_extracted.txt', 'r') as f:
    a = f.read().splitlines()
print len(a)
    
# Test Questions:
b = pd.read_csv('/home/ndg/users/sbagga1/unpalatable-questions/crowdsourcing/data/TestQuestions.csv', \
                lineterminator='\n')['reply_id'].tolist()

print "Number of Test Questions: ", len(b)
IDs_COVERED = set(a+b)
print "# Unique comments already extracted from pickles:", len(IDs_COVERED)

5100
Number of Test Questions:  350
# Unique comments already extracted from pickles: 5106


In [None]:
inp_dict = {'question':[], 'comment_text':[], 'reply_text':[], 'comment_id':[], 'reply_id':[], 'subreddit':[]}

for subred in SUBREDDITS:
    counter = 0
    print "Working on {} with counter-value: {}".format(subred, counter)
    with open('/home/ndg/users/sbagga1/unpalatable-questions/pickles/subreddit_interactions_commentsOnly/'+subred+'-comment_replies.pickle', 'rb') as f:
        data = pickle.load(f)
        
    for user_tup, conversation in data.items():
        for interaction in conversation: # interaction is a (comment,reply) tuple so its length is always 2            
            comment_id = interaction[0][0]; comment_text = interaction[0][1]
            reply_id = interaction[1][0]; reply_text = interaction[1][1]
            
            if reply_id in IDs_COVERED: # skip if already covered in a previous batch
                continue
            
            if len(reply_text) > MAX_CHAR_LIMIT or len(comment_text) > MAX_CHAR_LIMIT: # Skip if either the comment/reply is too long  
                continue
                
            if len(reply_text) < MIN_CHAR_LIMIT or len(comment_text) < MIN_CHAR_LIMIT: # Skip if either the comment/reply is too short
                continue

            if '&gt;' in comment_text or '&gt;' in reply_text: # Skip if comments quote other comments in the thread (avoid confusion)
                continue
                
            clean_reply_text = clean_text(reply_text) # removes non-ASCII characters and URLs 
            
            if '?' not in clean_reply_text: # Skip if no question mark present
                continue
            
            try:
                questions = extract_questions_regex(clean_reply_text)
            except:
                print "This didn't work: ", reply_text
                continue
                
            if len(questions) > 1: # Skip if multiple questions present
                continue
            
            # The list can still be empty even though there was '?' because (1) quoting questions (2) questions in brackets
            if len(questions) == 0:
                continue

            q = questions[0].strip()
            if len(q) < MIN_QUES_LIMIT: # Skip if the question extracted is really short
                print "Likely an extraction error: ", q
                continue
            
            # If any of the values are NaN, skip row:
            if type(q) != str or type(comment_text) != str or type(reply_text) != str:
                print "Skipping this row because all elements are not string: ", reply_id, comment_text, reply_text, q
                print type(comment_text), type(reply_text), type(q)            
                continue
            
            # Populate dictionary:
            inp_dict['question'].append(q)
            inp_dict['comment_text'].append(comment_text)
            inp_dict['reply_text'].append(reply_text)
            inp_dict['comment_id'].append(comment_id)
            inp_dict['reply_id'].append(reply_id)
            inp_dict['subreddit'].append(subred)
            
            counter += 1 # only gets incremented if the code makes it this far :D
            if counter == N_SAMPLES:
                break
        if counter == N_SAMPLES:
            break

In [8]:
# Turn into a dataframe:
df = pd.DataFrame.from_dict(inp_dict, orient='columns')

# Preview dataframe:
cols = [u'question', u'reply_text', u'comment_text', u'comment_id', u'reply_id', u'subreddit']
df = df[cols]
print df.shape
print "Any Null Values:", df.isnull().values.any()
df.head(3)

(6300, 6)
Any Null Values: False


Unnamed: 0,question,reply_text,comment_text,comment_id,reply_id,subreddit
0,Hillary is anti-gun?,Hillary is anti-gun?\nhttps://medium.com/@Jean...,Guns will be an important issue in this electi...,d25fh6j,d25j18a,The_Donald
1,So are you telling me that there are not extre...,So are you telling me that there are not extre...,"The issue isn't ""Blacks"", that's the equivalen...",d25nfay,d25nl5t,The_Donald
2,So about the size of Rubio?,So about the size of Rubio?,One quarter of a child voted for Lubio?,d10llre,d10lubn,The_Donald


In [9]:
# Persist the batch: write the CSV and record its reply IDs as extracted.
# Nothing is written when the DataFrame contains NaNs, or when the target CSV already
# exists (re-running would overwrite that batch and append the same reply IDs again).
if df.isnull().values.any(): # Sanity check: no NaN values in DataFrame
    print("ERROR: Null values in DataFrame. CSV not created..")
elif os.path.exists(df_csv_output_fname):
    print("ERROR: CSV already exists. Nothing written: {}".format(df_csv_output_fname))
else:
    # Save to csv for tracking batches. Done first, so that a failed write does not
    # leave reply IDs marked as extracted without a CSV that contains them.
    df.to_csv(df_csv_output_fname, index=False)

    # Append to reply IDs covered (one ID per line):
    with open('/home/ndg/users/sbagga1/unpalatable-questions/crowdsourcing/data/reply_ids_extracted.txt', 'a') as f:
        f.writelines("%s\n" % ID for ID in df['reply_id'].tolist())

## fin.