Below is a script to extract paired reddit comments from May of 2015. Script was built using the SentDex guide, which can be found here. https://pythonprogramming.net/chatbot-deep-learning-python-tensorflow/ 

In [1]:
import sqlite3
import json
from datetime import datetime

sql_transaction = []

connection = sqlite3.connect('2015-05.db')
c = connection.cursor()

def create_table():
    c.execute("CREATE TABLE IF NOT EXISTS parent_reply(parent_id TEXT PRIMARY KEY, comment_id TEXT UNIQUE, parent TEXT, comment TEXT, subreddit TEXT, unix INT, score INT)")

def format_data(data):
    data = data.replace('\n',' newlinechar ').replace('\r',' newlinechar ').replace('"',"'")
    return data

def transaction_bldr(sql):
    global sql_transaction
    sql_transaction.append(sql)
    if len(sql_transaction) > 1000:
        c.execute('BEGIN TRANSACTION')
        for s in sql_transaction:
            try:
                c.execute(s)
            except:
                pass
        connection.commit()
        sql_transaction = []

def sql_insert_replace_comment(commentid,parentid,parent,comment,subreddit,time,score):
    try:
        sql = """UPDATE parent_reply SET parent_id = ?, comment_id = ?, parent = ?, comment = ?, subreddit = ?, unix = ?, score = ? WHERE parent_id =?;""".format(parentid, commentid, parent, comment, subreddit, int(time), score, parentid)
        transaction_bldr(sql)
    except Exception as e:
        print('s0 insertion',str(e))

def sql_insert_has_parent(commentid,parentid,parent,comment,subreddit,time,score):
    try:
        sql = """INSERT INTO parent_reply (parent_id, comment_id, parent, comment, subreddit, unix, score) VALUES ("{}","{}","{}","{}","{}",{},{});""".format(parentid, commentid, parent, comment, subreddit, int(time), score)
        transaction_bldr(sql)
    except Exception as e:
        print('s0 insertion',str(e))

def sql_insert_no_parent(commentid,parentid,comment,subreddit,time,score):
    try:
        sql = """INSERT INTO parent_reply (parent_id, comment_id, comment, subreddit, unix, score) VALUES ("{}","{}","{}","{}",{},{});""".format(parentid, commentid, comment, subreddit, int(time), score)
        transaction_bldr(sql)
    except Exception as e:
        print('s0 insertion',str(e))

def acceptable(data):
    if len(data.split(' ')) > 50 or len(data) < 1:
        return False
    elif len(data) > 1000:
        return False
    elif data == '[deleted]':
        return False
    elif data == '[removed]':
        return False
    else:
        return True

def find_parent(pid):
    try:
        sql = "SELECT comment FROM parent_reply WHERE comment_id = '{}' LIMIT 1".format(pid)
        c.execute(sql)
        result = c.fetchone()
        if result != None:
            return result[0]
        else: return False
    except Exception as e:
        #print(str(e))
        return False

def find_existing_score(pid):
    try:
        sql = "SELECT score FROM parent_reply WHERE parent_id = '{}' LIMIT 1".format(pid)
        c.execute(sql)
        result = c.fetchone()
        if result != None:
            return result[0]
        else: return False
    except Exception as e:
        #print(str(e))
        return False
    
if __name__ == '__main__':
    create_table()
    row_counter = 0
    paired_rows = 0

    with open("C:/Users/mikes/Documents/ChatBot/RC_2015-05/RC_2015-05", buffering=1000) as f:
        for row in f:
            row_counter += 1
            row = json.loads(row)
            parent_id = row['parent_id']
            body = format_data(row['body'])
            created_utc = row['created_utc']
            score = row['score']
            comment_id = row['name']
            subreddit = row['subreddit']
            parent_data = find_parent(parent_id)
            if score >= 2:
                existing_comment_score = find_existing_score(parent_id)
                if existing_comment_score:
                    if score > existing_comment_score:
                        if acceptable(body):
                            sql_insert_replace_comment(comment_id,parent_id,parent_data,body,subreddit,created_utc,score)
                            
                else:
                    if acceptable(body):
                        if parent_data:
                            sql_insert_has_parent(comment_id,parent_id,parent_data,body,subreddit,created_utc,score)
                            paired_rows += 1
                        else:
                            sql_insert_no_parent(comment_id,parent_id,body,subreddit,created_utc,score)
                            
            if row_counter % 100000 == 0:
                print('Total Rows Read: {}, Paired Rows: {}, Time: {}'.format(row_counter, paired_rows, str(datetime.now())))

Total Rows Read: 100000, Paired Rows: 3220, Time: 2018-06-28 19:03:21.247010
Total Rows Read: 200000, Paired Rows: 8064, Time: 2018-06-28 19:03:34.321769
Total Rows Read: 300000, Paired Rows: 13710, Time: 2018-06-28 19:03:48.001520
Total Rows Read: 400000, Paired Rows: 19732, Time: 2018-06-28 19:04:00.726113
Total Rows Read: 500000, Paired Rows: 25657, Time: 2018-06-28 19:04:13.984302
Total Rows Read: 600000, Paired Rows: 31065, Time: 2018-06-28 19:04:26.298576
Total Rows Read: 700000, Paired Rows: 36074, Time: 2018-06-28 19:04:38.050063
Total Rows Read: 800000, Paired Rows: 41589, Time: 2018-06-28 19:04:49.832822
Total Rows Read: 900000, Paired Rows: 47722, Time: 2018-06-28 19:05:04.434174
Total Rows Read: 1000000, Paired Rows: 54085, Time: 2018-06-28 19:05:19.406834
Total Rows Read: 1100000, Paired Rows: 60445, Time: 2018-06-28 19:05:32.254839
Total Rows Read: 1200000, Paired Rows: 66804, Time: 2018-06-28 19:05:44.469770
Total Rows Read: 1300000, Paired Rows: 73208, Time: 2018-06-28 

Total Rows Read: 10400000, Paired Rows: 629019, Time: 2018-06-28 19:25:10.323355
Total Rows Read: 10500000, Paired Rows: 635382, Time: 2018-06-28 19:25:24.444779
Total Rows Read: 10600000, Paired Rows: 641507, Time: 2018-06-28 19:25:37.157651
Total Rows Read: 10700000, Paired Rows: 647309, Time: 2018-06-28 19:25:49.404857
Total Rows Read: 10800000, Paired Rows: 653334, Time: 2018-06-28 19:26:02.288012
Total Rows Read: 10900000, Paired Rows: 659477, Time: 2018-06-28 19:26:14.941103
Total Rows Read: 11000000, Paired Rows: 665659, Time: 2018-06-28 19:26:27.854038
Total Rows Read: 11100000, Paired Rows: 671743, Time: 2018-06-28 19:26:40.186425
Total Rows Read: 11200000, Paired Rows: 677414, Time: 2018-06-28 19:26:52.671155
Total Rows Read: 11300000, Paired Rows: 682966, Time: 2018-06-28 19:27:05.224504
Total Rows Read: 11400000, Paired Rows: 689003, Time: 2018-06-28 19:27:18.986059
Total Rows Read: 11500000, Paired Rows: 695241, Time: 2018-06-28 19:27:32.717216
Total Rows Read: 11600000, P

Total Rows Read: 20500000, Paired Rows: 1252751, Time: 2018-06-28 19:47:35.523418
Total Rows Read: 20600000, Paired Rows: 1259313, Time: 2018-06-28 19:47:48.010086
Total Rows Read: 20700000, Paired Rows: 1265716, Time: 2018-06-28 19:48:02.043728
Total Rows Read: 20800000, Paired Rows: 1272379, Time: 2018-06-28 19:48:14.913217
Total Rows Read: 20900000, Paired Rows: 1278973, Time: 2018-06-28 19:48:28.507752
Total Rows Read: 21000000, Paired Rows: 1285556, Time: 2018-06-28 19:48:42.950277
Total Rows Read: 21100000, Paired Rows: 1291903, Time: 2018-06-28 19:48:56.485071
Total Rows Read: 21200000, Paired Rows: 1297929, Time: 2018-06-28 19:49:09.890188
Total Rows Read: 21300000, Paired Rows: 1303778, Time: 2018-06-28 19:49:23.388146
Total Rows Read: 21400000, Paired Rows: 1310155, Time: 2018-06-28 19:49:37.062878
Total Rows Read: 21500000, Paired Rows: 1316272, Time: 2018-06-28 19:49:50.542085
Total Rows Read: 21600000, Paired Rows: 1322346, Time: 2018-06-28 19:50:03.869765
Total Rows Read:

Total Rows Read: 30500000, Paired Rows: 1866789, Time: 2018-06-28 20:18:04.432132
Total Rows Read: 30600000, Paired Rows: 1872890, Time: 2018-06-28 20:18:16.586958
Total Rows Read: 30700000, Paired Rows: 1879227, Time: 2018-06-28 20:18:28.931331
Total Rows Read: 30800000, Paired Rows: 1885442, Time: 2018-06-28 20:18:41.371386
Total Rows Read: 30900000, Paired Rows: 1891945, Time: 2018-06-28 20:18:54.764805
Total Rows Read: 31000000, Paired Rows: 1898557, Time: 2018-06-28 20:19:09.732441
Total Rows Read: 31100000, Paired Rows: 1905095, Time: 2018-06-28 20:19:24.450506
Total Rows Read: 31200000, Paired Rows: 1911833, Time: 2018-06-28 20:19:39.099600
Total Rows Read: 31300000, Paired Rows: 1918410, Time: 2018-06-28 20:19:52.319959
Total Rows Read: 31400000, Paired Rows: 1924935, Time: 2018-06-28 20:20:06.035275
Total Rows Read: 31500000, Paired Rows: 1931344, Time: 2018-06-28 20:20:20.469801
Total Rows Read: 31600000, Paired Rows: 1937700, Time: 2018-06-28 20:20:33.636084
Total Rows Read:

Total Rows Read: 40500000, Paired Rows: 2483556, Time: 2018-06-28 20:39:36.366871
Total Rows Read: 40600000, Paired Rows: 2489566, Time: 2018-06-28 20:39:48.679778
Total Rows Read: 40700000, Paired Rows: 2495404, Time: 2018-06-28 20:40:01.222848
Total Rows Read: 40800000, Paired Rows: 2501298, Time: 2018-06-28 20:40:13.594353
Total Rows Read: 40900000, Paired Rows: 2507599, Time: 2018-06-28 20:40:25.950729
Total Rows Read: 41000000, Paired Rows: 2513422, Time: 2018-06-28 20:40:38.656845
Total Rows Read: 41100000, Paired Rows: 2518873, Time: 2018-06-28 20:40:51.198260
Total Rows Read: 41200000, Paired Rows: 2524306, Time: 2018-06-28 20:41:03.893826
Total Rows Read: 41300000, Paired Rows: 2530107, Time: 2018-06-28 20:41:16.533577
Total Rows Read: 41400000, Paired Rows: 2536246, Time: 2018-06-28 20:41:28.670378
Total Rows Read: 41500000, Paired Rows: 2542569, Time: 2018-06-28 20:41:41.363190
Total Rows Read: 41600000, Paired Rows: 2549074, Time: 2018-06-28 20:41:53.575672
Total Rows Read:

Total Rows Read: 50500000, Paired Rows: 3095294, Time: 2018-06-28 21:00:26.298466
Total Rows Read: 50600000, Paired Rows: 3101415, Time: 2018-06-28 21:00:38.922474
Total Rows Read: 50700000, Paired Rows: 3107356, Time: 2018-06-28 21:00:50.926156
Total Rows Read: 50800000, Paired Rows: 3113450, Time: 2018-06-28 21:01:02.906261
Total Rows Read: 50900000, Paired Rows: 3119559, Time: 2018-06-28 21:01:14.734869
Total Rows Read: 51000000, Paired Rows: 3125727, Time: 2018-06-28 21:01:26.654222
Total Rows Read: 51100000, Paired Rows: 3132041, Time: 2018-06-28 21:01:38.537674
Total Rows Read: 51200000, Paired Rows: 3138203, Time: 2018-06-28 21:01:50.306541
Total Rows Read: 51300000, Paired Rows: 3144214, Time: 2018-06-28 21:02:02.522094
Total Rows Read: 51400000, Paired Rows: 3150205, Time: 2018-06-28 21:02:14.271604
Total Rows Read: 51500000, Paired Rows: 3156174, Time: 2018-06-28 21:02:26.099700
Total Rows Read: 51600000, Paired Rows: 3161920, Time: 2018-06-28 21:02:37.917224
Total Rows Read: