# Data Acquisition from Reddit

Go to <a href=#bookmark>bookmark</a>

### 2019-06-08 - Goal - Develop End-to-End Data Flow, at least at small scale.
# OR BUST

![](https://images.unsplash.com/photo-1515255384510-23e8b6a6ca3c?ixlib=rb-1.2.1&auto=format&fit=crop&w=1489&q=80)

---

## Libraries

In [1]:
# Install libs on this computer:
# !pip install praw
# !pip install pymongo
# !pip install psycopg2

In [2]:
import json           # digest json
import os             # file system stuff
from time import sleep, time # To time stuff / pause between retries
from urllib.parse import quote # percent-encode credentials in DB URLs

import numpy as np    # math and arrays
import pandas as pd   # Dataframes
import praw           # reddit API
import pymongo        # MongoDB

#DATA STORAGE
import psycopg2 as psql #PostgreSQL DBs
from sqlalchemy import create_engine # SQL helper

In [3]:
import helper     # Custom helper functions

---

## 1A Load Reddit keys

Step 3: Create your first Authorized Reddit Instance

In [5]:
# Define path to the JSON file holding the Reddit API credentials.
#
# $HOME is used when it is set, which keeps the path shown in the recorded
# output. When it is missing (os.environ['HOME'] would raise KeyError, e.g. in
# a default Windows shell) fall back to os.path.expanduser('~').
home_dir = os.environ.get('HOME') or os.path.expanduser('~')

secret_path = os.path.join(home_dir, '.secret', 'reddit.json')
#secret_path = os.path.join(home_dir, 'mia/.secret', 'reddit_api.json')

secret_path

'C:\\Users\\werlindo\\.secret\\reddit.json'

#### Define path to the AWS-PostgreSQL secret

In [83]:
# Define path to the JSON file holding the AWS PostgreSQL credentials.
#
# $HOME is used when it is set, which keeps the path shown in the recorded
# output. When it is missing (os.environ['HOME'] would raise KeyError, e.g. in
# a default Windows shell) fall back to os.path.expanduser('~').
home_dir = os.environ.get('HOME') or os.path.expanduser('~')

secret_path_aws = os.path.join(home_dir, '.secret', 'aws_ps_flatiron.json')
secret_path_aws

'C:\\Users\\werlindo\\.secret\\aws_ps_flatiron.json'

## 1B Load AWS-PostgreSQL DB keys

#### Load keys

In [84]:
# Read the AWS PostgreSQL credentials file and unpack the four fields that the
# connection cells use: user name, password, host and database name.
aws_keys = helper.get_keys(secret_path_aws)
user, ps, host, db = (
    aws_keys[field] for field in ('user', 'password', 'host', 'db_name')
)

In [85]:
# Build the connection URL: postgresql://user:password@host/database
# (despite its name, aws_ps_engine is the URL string, not an engine object).
#
# The user name and password are percent-encoded because a raw '@', ':' or '/'
# inside a credential would be read as a URL delimiter and break parsing.
# Credentials made only of unreserved characters come out unchanged.
aws_ps_engine = (
    'postgresql://'
    + quote(user, safe='') + ':' + quote(ps, safe='')
    + '@' + host + '/' + db
)

### Use SQLAlchemy to create PSQL engine

In [271]:
# Create the SQLAlchemy engine used by the to_sql() cells further down.
# URL format: dialect+driver://username:password@host:port/database
# NOTE: aws_ps_engine is the connection URL *string* assembled in the previous cell.
sql_alc_engine = create_engine(aws_ps_engine)

## 2 Load keys, Create Reddit Instance

In [6]:
# Load the Reddit API credentials from the file at secret_path.
# helper.get_keys is a project-local function; the next cell indexes its result
# by 'client_id', 'api_key', 'username' and 'password'.
keys = helper.get_keys(secret_path)

In [7]:
# Authenticated Reddit client built from the credentials loaded into `keys`.
reddit = praw.Reddit(
    client_id=keys['client_id'],
    client_secret=keys['api_key'],
    username=keys['username'],
    password=keys['password'],
    user_agent='reddit_research accessAPI:v0.0.1 (by /u/FlatDubs)',
)

## 3 Obtain a Subreddit Instance(s) from your Reddit Instance

In [23]:
#politics = reddit.subreddit('politics')

#### Instantiate subreddit

In [24]:
# Handle for r/gameofthrones; every search below runs against this subreddit.
got = reddit.subreddit('gameofthrones') #Let's start with got for now. If can dev flow for one, can just dupe for other

## 4 Get subreddit submissions and comments; save to dataframe

#### Initialize parameters for this submissions pull

https://en.wikipedia.org/wiki/List_of_Game_of_Thrones_characters

In [361]:
# ROUND 1
# persons = """"
# doran" OR "davos"
# """

# persons = """
#             "bran" OR 'brandon stark' OR 'jon snow' OR 'jon' 
#                         OR 'khaleesi' OR 'dany' OR 'daenerys' OR 'danyris'
#          """

# persons = """
#             "cersei" OR 'tyrion' OR 'sansa' OR 'arya' 
#                         OR 'stannis' OR 'varys' OR 'jamie' OR 'brienne'
#          """

# persons = """
#             "samwell" OR "jorah" OR "theon" OR "hound" OR "littlefinger" 
#           """

# persons = """
#             "joffrey" OR "sandor" OR "mountain" OR "gregor" OR "baelish" 
#           """

# persons = """
#             "robb" OR "drogo" OR "melisandre" OR "bronn" OR "gilly" OR
#             "ramsey" OR "missandei" OR "gendry" OR "grey worm"
#           """

# persons = """
#             "ned" OR "eddard" OR "catelyn" OR "bronn" OR "torumund" OR
#             "robert" OR "tommen" OR "viserys" OR "margaery"
#           """

# Earlier round, commented out like the rounds above. It used to be a live
# assignment that the next statement overwrote immediately (a dead store).
# persons = """
#     "yara" OR "qyburn"
# """

# Search query for the current pull.
persons = """
    "hodor"
"""

# Passed as `limit` to the subreddit search in the "Execute Search" cell.
results_lim = 1

#### Execute Search

In [362]:
# Start of the wall-clock timer; the "Timing Stuff" cell subtracts this from
# end_time to report how long the search + comment pull + DB writes took.
start_time = time()

In [363]:
# Search the subreddit for submissions matching `persons`, sorted by comments,
# limited to the past month and to at most `results_lim` results.
got_search = got.search(
    persons,
    sort='comments',
    limit=results_lim,
    time_filter='month',
)

# NOTE: got_search is consumed by the loop below. Counting the results first
# (e.g. sum(1 for s in got_search)) would presumably exhaust it - verify
# against the PRAW docs before adding such a count back.

# Column order of the submissions DataFrame. Passing it explicitly keeps the
# columns present even when the search returns nothing.
SUBMISSION_COLUMNS = ['title', 'num_comments', 'upvote_ratio', 'id']

# One record per submission. A single list of dicts replaces four parallel
# lists and a manual counter, so the fields of a submission cannot drift out
# of sync and no global `sub_id` list is left for later cells to trip over.
submission_records = []
for n_done, submission in enumerate(got_search, start=1):
    submission_records.append({
        'title': submission.title,
        'num_comments': submission.num_comments,
        'upvote_ratio': submission.upvote_ratio,
        'id': submission.id,
        # TODO: also capture the submission body (original note:
        # "look at this later! is it comment[0]?")
    })
    # Progress message every 100 submissions.
    if n_done % 100 == 0:
        print(f'{n_done} submissions completed')

df_got = pd.DataFrame(submission_records, columns=SUBMISSION_COLUMNS)

#df_got

In [364]:
df_got

Unnamed: 0,title,num_comments,upvote_ratio,id
0,[SPOILERS] Criticism should be expected - it's...,275,0.9,bqug8m


#### Now loop through each sub and grab its comments

In [365]:
# List to hold one comments DataFrame per submission
comm_dfs = []

# Upper bound on replace_more() attempts per submission, so a persistent
# failure surfaces as an error instead of retrying forever.
MAX_REPLACE_MORE_TRIES = 5

for submission_id in df_got['id']:
    submission = reddit.submission(id=submission_id)

    # Expand the "more comments" placeholders, retrying on failure.
    # The original caught `PossibleExceptions`, a placeholder name that is
    # never defined, and called sleep() without importing it - so any failure
    # became a NameError. A broad `except Exception` is used because the
    # specific network exception classes are not imported in this notebook;
    # the last failure is re-raised once the attempts are used up.
    for attempt in range(1, MAX_REPLACE_MORE_TRIES + 1):
        try:
            submission.comments.replace_more()
            break
        except Exception:
            if attempt == MAX_REPLACE_MORE_TRIES:
                raise
            print('Handling replace_more exception')
            sleep(1)

    # Flatten the comment forest once and build this submission's DataFrame.
    comments = submission.comments.list()
    this_df = pd.DataFrame({
        'comment': [comment.body for comment in comments],
        'comment_id': [comment.id for comment in comments],
        # Repeat the parent submission id so comments can be joined to df_got.
        'sub_id': [submission_id] * len(comments),
    })

    # Add this sub's comments df to list of dfs
    comm_dfs.append(this_df)


#### Put all the comments into common df

In [366]:
# Stack the per-submission comment frames into one frame with a fresh 0..n-1 index.
df_got_comm = pd.concat(comm_dfs, ignore_index=True)

In [369]:
df_got_comm.shape

(270, 3)

In [367]:
df_got_comm.head(20)

Unnamed: 0,comment,comment_id,sub_id
0,I think that your feelings are the feelings of...,eo7zkqb,bqug8m
1,>Why did Brienne choose to become a Kingsguard...,eo8n42y,bqug8m
2,It's rushed but the worst thing is that they a...,eo8kal9,bqug8m
3,Season 7 should have been 10 episodes culminat...,eo85mf6,bqug8m
4,Yep.. and the criticism like this is met by “y...,eo8e6h3,bqug8m
5,The simple fix would have been season 7 at 10 ...,eo9a8nw,bqug8m
6,"the way i see it, Jon’s first vows to the nigh...",eo8kcz8,bqug8m
7,"Couldn't agree more, it's just sad because the...",eo81kwn,bqug8m
8,Totally agree. Instead of forcing actors to go...,eo8pgcr,bqug8m
9,Everybody is so caught up in # of episodes. In...,eo8ymfr,bqug8m


## 5 Save dataframes' contents to PS DB

#### Use `pandas.to_sql` to write the dataframe to the PostgreSQL database, using the SQLAlchemy engine.
    

In [355]:
# df_got.to_sql('got_subs', con=sql_alc_engine, if_exists='append')

# df_got_comm.to_sql('got_comms', con=sql_alc_engine, if_exists='append')

In [356]:
# Append the submissions to the 'got_subs' table through the SQLAlchemy engine.
# if_exists='append' adds rows to an existing table, so re-running this cell
# with the same df_got inserts the same submissions again.
# NOTE(review): `index` is left at its pandas default, which presumably also
# writes the DataFrame index as a column - confirm against the table schema.
df_got.to_sql('got_subs', con=sql_alc_engine, if_exists='append')

In [357]:
# Append the comments to the 'got_comms' table through the SQLAlchemy engine.
# if_exists='append' adds rows to an existing table, so re-running this cell
# with the same df_got_comm inserts the same comments again.
# NOTE(review): `index` is left at its pandas default, which presumably also
# writes the DataFrame index as a column - confirm against the table schema.
df_got_comm.to_sql('got_comms', con=sql_alc_engine, if_exists='append')

In [358]:
# Timing summary: minutes elapsed since start_time, plus row counts of the
# two frames collected by this run.
end_time = time()
mins_to_complete = (end_time - start_time) / 60

n_submissions = len(df_got)
n_comments = len(df_got_comm)

print("It took {:.2f} minutes to complete.".format(mins_to_complete))
print("There were {} submissions added.".format(n_submissions))
print("There were {:,} comments added.".format(n_comments))


It took 1.09 minutes to complete.
There were 1 submissions added.
There were 1,952 comments added.


In [360]:
df_got_comm.head(100)

Unnamed: 0,comment,comment_id,sub_id
0,||E1|E2|E3|E4|E5|E6|E7|E8|E9|E10|\n|:-|:-|:-|:...,enr87fr,bpc20p
1,Seeing disappointing in the same list with epi...,ensmai8,bpc20p
2,">In one word, how would you describe this epis...",enroqew,bpc20p
3,The correlation between average rating and whe...,enrhrz2,bpc20p
4,I'm happy to see that the vast majority agrees...,enrilow,bpc20p
5,The Mad Queen is a bit of a giveaway...,enr9ue9,bpc20p
6,How well shot was this episode: **8.6** \n\nHo...,enrkiep,bpc20p
7,This would've been an all-time episode if it h...,enrkir0,bpc20p
8,It was visually stunning I have to admit.,enrbw08,bpc20p
9,People gave the direction (7.3) and cinematogr...,ens4r6s,bpc20p


### Results

persons = """"
doran" OR "davos"
"""

persons = """
            "bran" OR 'brandon stark' OR 'jon snow' OR 'jon' 
                         OR 'khaleesi' OR 'dany' OR 'daenerys' OR 'danyris'
          """
          
It took 14.21 minutes to complete.
There were 249 submissions added.
There were 11272 comments added.

persons = """
            "cersei" OR 'tyrion' OR 'sansa' OR 'arya' 
                        OR 'stannis' OR 'varys' OR 'jamie' OR 'brienne'
"""

It took 92.47 minutes to complete.  
There were 246 submissions added.  
There were 65,896 comments added.

persons = """
            "samwell" OR "jorah" OR "theon" OR "hound" OR "littlefinger" 
          """

It took 30.70 minutes to complete.  
There were 246 submissions added.  
There were 30,374 comments added.  

persons = """
            "joffrey" OR "sandor" OR "mountain" OR "gregor" OR "baelish" 
          """  
          
It took 24.55 minutes to complete.  
There were 249 submissions added.  
There were 23,146 comments added.

persons = """
            "robb" OR "drogo" OR "melisandre" OR "bronn" OR "gilly" OR
            "ramsey" OR "missandei" OR "gendry" OR "grey worm"
          """
          
It took 23.15 minutes to complete.  
There were 249 submissions added.  
There were 28,920 comments added.  

persons = """
            "ned" OR "eddard" OR "catelyn" OR "bronn" OR "torumund" OR
            "robert" OR "tommen" OR "viserys" OR "margaery"
          """
          
It took 21.79 minutes to complete.  
There were 250 submissions added.  
There were 23,731 comments added.     

### Left off here <a name='bookmark' />

![](https://images.unsplash.com/photo-1534224563519-fea04849cadf?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9&auto=format&fit=crop&w=1350&q=80)

![](https://images.unsplash.com/photo-1553058296-61093581de13?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9&auto=format&fit=crop&w=1351&q=80)

### f. Check that the table was created, or can be appended.

In [304]:
# Setup PSQL connection
conn = psql.connect(
    database=db,
    user=user,
    password=ps,
    host=host,
    port='5432'
)

In [305]:
# QUERY TO GET LIST OF TABLES
# query = """
#     SELECT * FROM pg_catalog.pg_tables
#     WHERE schemaname = 'public';
# """

In [306]:
# Instantiate cursor
cur = conn.cursor()

In [307]:
# Set up query
query = """
    SELECT count(*) ct FROM got_comms;
"""

In [308]:
# Execute the query
cur.execute(query)

In [309]:
# conn.rollback()

In [310]:
# Check results
df_clone = pd.DataFrame(cur.fetchall())
df_clone.columns = [col.name for col in cur.description]

In [311]:
# Commit on the connection. The only statement executed above is a SELECT,
# so this presumably just ends the open transaction - nothing is written here.
conn.commit()

In [312]:
df_clone

Unnamed: 0,ct
0,157253


In [313]:
# Close the psycopg2 connection opened in the "Setup PSQL connection" cell.
conn.close()