# Data Acquisition from Reddit

Go to <a href=#bookmark>bookmark</a>

### 2019-06-08 - Goal - Develop End-to-End Data Flow, at least at small scale.
# OR BUST

![](https://images.unsplash.com/photo-1515255384510-23e8b6a6ca3c?ixlib=rb-1.2.1&auto=format&fit=crop&w=1489&q=80)

---

## Libraries

In [1]:
# Install libs on this computer:
# !pip install praw
# !pip install pymongo
# !pip install psycopg2

In [41]:
import os             # file system stuff
import json           # digest json
import time           # To time stuff
import urllib.parse   # percent-encode credentials for DB URLs

import numpy as np    # math and arrays
import pandas as pd   # Dataframes
import praw           # reddit API
import pymongo        # MongoDB

#DATA STORAGE
from sqlalchemy import create_engine # SQL helper
import psycopg2 as psql #PostgreSQL DBs

In [3]:
import helper     # Custom helper functions

---

## 1A Load Reddit keys

Step 3: Create your first Authorized Reddit Instance

In [4]:
# Define path to the Reddit API secret file

# os.path.expanduser('~') resolves the home directory even when the HOME
# environment variable is not set, where os.environ['HOME'] would raise KeyError.
secret_path = os.path.join(os.path.expanduser('~'), '.secret', 'reddit.json')
#secret_path = os.path.join(os.path.expanduser('~'), 'mia/.secret', 'reddit_api.json')

secret_path

'/Users/werlindo/.secret/reddit.json'

#### Define path to AWS PostgreSQL secret

In [5]:
# Define path to the AWS PostgreSQL secret file

# os.path.expanduser('~') resolves the home directory even when the HOME
# environment variable is not set, where os.environ['HOME'] would raise KeyError.
secret_path_aws = os.path.join(os.path.expanduser('~'), '.secret',
                               'aws_ps_flatiron.json')
secret_path_aws

'/Users/werlindo/.secret/aws_ps_flatiron.json'

## 1B Load AWS-PostgreSQL DB keys

#### Load keys

In [6]:
# Read the AWS credentials and unpack the four connection settings in one step
aws_keys = helper.get_keys(secret_path_aws)
user, ps, host, db = (
    aws_keys[field] for field in ('user', 'password', 'host', 'db_name')
)

In [7]:
# Build the connection URL: postgresql://username:password@host/database
# The user name and password are percent-encoded so that reserved characters
# in the credentials (such as '@', ':' or '/') cannot corrupt the URL.
aws_ps_engine = ('postgresql://'
                 + urllib.parse.quote(user, safe='')
                 + ':' + urllib.parse.quote(ps, safe='')
                 + '@' + host + '/' + db)

### Use SQLAlchemy to create PSQL engine

In [8]:
# URL format: dialect+driver://username:password@host:port/database
# Create the SQLAlchemy engine from the connection URL string in aws_ps_engine
sql_alch_engine = create_engine(aws_ps_engine)

## 2 Load keys, Create Reddit Instance

In [9]:
# Load the Reddit API credentials via the project helper
# NOTE(review): presumably get_keys parses the JSON file at secret_path - confirm in helper.py
keys = helper.get_keys(secret_path)

In [10]:
# Create the Reddit client from the loaded credentials
reddit = praw.Reddit(
    client_id=keys['client_id'],
    client_secret=keys['api_key'],
    username=keys['username'],
    password=keys['password'],
    user_agent='reddit_research accessAPI:v0.0.1 (by /u/FlatDubs)',
)

---

## Initialize parameters for this submissions pull

https://en.wikipedia.org/wiki/List_of_Game_of_Thrones_characters

In [46]:
# Subreddit to search
subreddit_nm = 'gameofthrones'

# Search string passed to the subreddit search.
# Earlier multi-term example, kept for reference:
#   query = """
#           "qyburn" OR "yara"
#           """
query = "harry strickland"

# Maximum number of submissions to pull
results_lim = 1

# Destination tables for the submissions and their comments
nm_subs_tbl, nm_comms_tbl = 'got_subs', 'got_comms'

---

# Get subreddit submissions and their comments

## - Obtain Subreddit Instance(s) from your Reddit Instance

In [36]:
def get_subs_df(praw_reddit
                ,subreddit_nm='all'
                ,query=''
                ,results_lim=1000
               ):
    """
    Search one subreddit and tabulate the matching submissions.

    The search is sorted by comment count and restricted to the past month.

    Parameters:
    -----------
    praw_reddit: pre-instantiated praw Reddit class
    subreddit_nm: name of subreddit to search
    query: query string to search on
    results_lim: number of submissions results to return

    Returns:
    --------
    Pandas dataframe with one row per submission and the columns
    'title', 'num_comments', 'upvote_ratio' and 'id'
    """
    # Lazy search generator over the chosen subreddit
    hits = praw_reddit.subreddit(subreddit_nm).search(
        query,
        sort='comments',
        limit=results_lim,
        time_filter='month',
    )

    # One list per output column, filled in submission order
    fields = ('title', 'num_comments', 'upvote_ratio', 'id')
    columns = {field: [] for field in fields}

    for count, submission in enumerate(hits, start=1):
        for field in fields:
            columns[field].append(getattr(submission, field))

        # Progress message every 100 submissions
        if count % 100 == 0:
            print(f'{count} submissions completed')

    return pd.DataFrame(columns)

In [37]:
def get_comms_df(praw_reddit, list_sub_ids=None, max_retries=5, retry_wait=1):
    """
    Fetch the comments of each submission in a list of submission ids
    and store them in a single dataframe.

    Parameters:
    -----------
    praw_reddit: pre-instantiated praw Reddit class
    list_sub_ids: list of submission ids (None or empty gives an empty result)
    max_retries: how many times a failing replace_more() call is retried
                 per submission before the exception is re-raised
    retry_wait: seconds to wait between retries

    Returns:
    --------
    Pandas dataframe of comments with the columns 'comment', 'comment_id'
    and 'sub_id', indexed 0..n-1. It is empty (same columns) when no
    submission ids are given.

    Raises:
    -------
    Whatever replace_more() raised, once max_retries is exhausted.
    """
    col_names = ['comment', 'comment_id', 'sub_id']

    if list_sub_ids is None:
        list_sub_ids = []

    # One dataframe per submission, concatenated once at the end
    comm_dfs = []

    for this_sub_id in list_sub_ids:
        subm = praw_reddit.submission(id=this_sub_id)

        # Expand the "more comments" placeholders, retrying failures a
        # bounded number of times instead of looping forever.
        attempts = 0
        while True:
            try:
                subm.comments.replace_more()
                break
            except Exception:
                attempts += 1
                if attempts > max_retries:
                    raise
                print('Handling replace_more exception')
                time.sleep(retry_wait)

        # Flatten the comment tree once and pull the fields we keep
        comments = subm.comments.list()
        comm_dfs.append(pd.DataFrame({
            'comment': [comment.body for comment in comments],
            'comment_id': [comment.id for comment in comments],
            'sub_id': [this_sub_id] * len(comments),
        }))

    # Nothing requested: return an empty frame with the expected columns
    if not comm_dfs:
        return pd.DataFrame(columns=col_names)

    # Single concat after the loop (avoids re-copying on every iteration)
    return pd.concat(comm_dfs, axis=0).reset_index(drop=True)
   

In [44]:
def get_subred_subs_coms(praw_reddit
                         ,sql_alch_engine 
                         ,subreddit_nm='all'
                         ,query=''
                         ,results_lim=1000
                         ,nm_subs_tbl=None
                         ,nm_comms_tbl=None
                         ):
    """
    Given the name of a subreddit and search terms, get submissions and their 
    related comments and append them to tables in a SQL database.

    Parameters
    ---------
    praw_reddit: pre-instantiated praw Reddit class
    sql_alch_engine: SQLAlchemy engine, for pandas to connect 
    subreddit_nm: name of subreddit to search
    query: query string to search on
    results_lim: number of submissions results to return
    nm_subs_tbl: name of the submissions table
                 (defaults to subreddit_nm + '_subs')
    nm_comms_tbl: name of the comments table
                  (defaults to subreddit_nm + '_comms')

    Returns
    -------
    No return object, but will print progress and a summary
    """
    # Resolve default table names at call time from the subreddit_nm argument.
    # Computing them in the signature would read a global variable once, at
    # definition time, and ignore the value actually passed in.
    if nm_subs_tbl is None:
        nm_subs_tbl = subreddit_nm + '_subs'
    if nm_comms_tbl is None:
        nm_comms_tbl = subreddit_nm + '_comms'

    # Start timing
    start_time = time.time()
    print('Starting: ' + time.ctime(int(start_time)) + '\n')

    # Get Submissions dataframe
    subs_df = get_subs_df(praw_reddit=praw_reddit
                          ,subreddit_nm=subreddit_nm
                          ,query=query
                          ,results_lim=results_lim)

    print("Retrieved submissions.")

    # Get just submissions IDs
    list_sub_ids = subs_df['id'].tolist()

    # Nothing matched: stop before fetching comments or touching the database
    if not list_sub_ids:
        print("No submissions matched the query; nothing was written.")
        return

    # Get comments dataframe
    comms_df = get_comms_df(praw_reddit=praw_reddit
                            ,list_sub_ids=list_sub_ids)

    print("Retrieved comments.")

    # Write dataframes out to SQL DB (rows are appended if the table exists)
    subs_df.to_sql(nm_subs_tbl, con=sql_alch_engine, if_exists='append')
    comms_df.to_sql(nm_comms_tbl, con=sql_alch_engine, if_exists='append')

    # Timing Stuff
    end_time = time.time()
    print('\nFinished: ' + time.ctime(int(end_time)) + '\n')

    mins_to_complete = (end_time - start_time)/60 
    print("It took {:.2f} minutes to complete.".format(mins_to_complete))
    print("There were {} submissions added.".format(subs_df.shape[0]))
    print("There were {:,} comments added.".format(comms_df.shape[0]))

    return

In [45]:
# Run the pull with the configured parameters. The table names are taken from
# nm_subs_tbl / nm_comms_tbl rather than repeated as literals, so editing the
# parameter cell is enough to redirect the output.
get_subred_subs_coms(praw_reddit=reddit
                    ,sql_alch_engine=sql_alch_engine
                    ,subreddit_nm=subreddit_nm
                    ,query=query
                    ,results_lim=results_lim
                    ,nm_subs_tbl=nm_subs_tbl
                    ,nm_comms_tbl=nm_comms_tbl
                    )

Starting: Sun Jun  9 21:25:36 2019

Retrieved submissions.
Retrieved comments.
write to got_subs
write to got_comms

Finished: Sun Jun  9 21:25:39 2019

It took 0.04 minutes to complete.
There were 1 submissions added.
There were 26 comments added.


---

# Results

persons = """
"doran" OR "davos"
"""

persons = """
            "bran" OR 'brandon stark' OR 'jon snow' OR 'jon' 
                         OR 'khaleesi' OR 'dany' OR 'daenerys' OR 'danyris'
          """
          
It took 14.21 minutes to complete.
There were 249 submissions added.
There were 11272 comments added.

persons = """
            "cersei" OR 'tyrion' OR 'sansa' OR 'arya' 
                        OR 'stannis' OR 'varys' OR 'jamie' OR 'brienne'
"""

It took 92.47 minutes to complete.  
There were 246 submissions added.  
There were 65,896 comments added.

persons = """
            "samwell" OR "jorah" OR "theon" OR "hound" OR "littlefinger" 
          """

It took 30.70 minutes to complete.  
There were 246 submissions added.  
There were 30,374 comments added.  

### Bookmark! <a name='bookmark' />

![](https://images.unsplash.com/photo-1534224563519-fea04849cadf?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9&auto=format&fit=crop&w=1350&q=80)

![](https://images.unsplash.com/photo-1553058296-61093581de13?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9&auto=format&fit=crop&w=1351&q=80)

### f. Check that the table was created, or can be appended to.

In [None]:
# Open a direct psycopg2 connection using the AWS credentials loaded earlier
conn = psql.connect(
    host=host,
    port='5432',
    database=db,
    user=user,
    password=ps,
)

In [None]:
# QUERY TO GET LIST OF TABLES
# query = """
#     SELECT * FROM pg_catalog.pg_tables
#     WHERE schemaname = 'public';
# """

In [None]:
# Instantiate a cursor on the open connection; queries are executed through it
cur = conn.cursor()

In [None]:
# Set up query: count the rows in the got_comms table
# NOTE: this rebinds `query`, which the Reddit pull cell also uses for its search string
query = """
    SELECT count(*) ct FROM got_comms;
"""

In [None]:
# Execute the query; the result rows are then read from the cursor
cur.execute(query)

In [None]:
# conn.rollback()

In [None]:
# Check results
# The column names are passed to the constructor instead of being assigned
# afterwards, so this cell also works when the query returns no rows
# (assigning names to a zero-column frame raises a length-mismatch error).
df_clone = pd.DataFrame(
    cur.fetchall(),
    columns=[col.name for col in cur.description],
)

In [None]:
# Commit on the connection now that the results have been read
conn.commit()

In [None]:
# Display the query result
df_clone

In [None]:
# Close the database connection once the check is done
conn.close()