## Twitter API

In [7]:
from twython import Twython

import hashlib

# SHA-256 hex digest of the expected computer username. The username typed in
# the next cell must hash to this value, so the plain username never appears
# in the notebook.
COMPUTER_USERNAME_HASH = "327529c973f4a2eca02eda86530afdacbcb2d04af59de078201a6c1d3e1f7af2"

In [None]:
### Ask for the computer username and verify it against the stored hash ###
# No peeking! Did you think I would put my computer's username in the code?
# Only the SHA-256 hex digest of the username (COMPUTER_USERNAME_HASH) is kept
# in the notebook; the typed value must hash to the same digest. The verified
# username is later interpolated into the credential file paths.
computer_username = input("Type in a computer username: ")
hashGen = hashlib.sha256()
hashGen.update(computer_username.encode('utf-8'))
if COMPUTER_USERNAME_HASH != hashGen.hexdigest():
    # The value being validated is a username, not a file path.
    raise ValueError("Incorrect computer username passed to code!")

In [None]:
### Access Twitter credentials at the given file ###
twitter_credentials_fpath = f"/Users/{computer_username}/Documents/TWITTER_API_CREDENTIALS.txt"
# Use a context manager so the file handle is closed even if reading fails.
with open(twitter_credentials_fpath, 'r') as twitter_credentials_file:
    # One entry per line; the entries are picked out by position afterwards.
    twitter_credentials = twitter_credentials_file.read().split("\n")

In [None]:
### Unpack the Twitter credentials by line position ###
# Line 0: app key, line 1: app key secret, line 2: access token,
# line 3: access token secret.
(
    TWITTER_APP_KEY,
    TWITTER_APP_KEY_SECRET,
    TWITTER_ACCESS_TOKEN,
    TWITTER_ACCESS_TOKEN_SECRET,
) = (
    twitter_credentials[0],
    twitter_credentials[1],
    twitter_credentials[2],
    twitter_credentials[3],
)

In [None]:
# Authenticated Twython client built from the credentials unpacked above.
t = Twython(app_key=TWITTER_APP_KEY,
            app_secret=TWITTER_APP_KEY_SECRET,
            oauth_token=TWITTER_ACCESS_TOKEN,
            oauth_token_secret=TWITTER_ACCESS_TOKEN_SECRET)

# Search terms and, position by position, how many results to request for each.
queries = ['#darkpattern', '#darkpatterns', '@darkpatterns']
# Maximum count: 100
query_counts = [10, 10, 5]

### TODO: Use the until parameter to iterate through tweets until a certain date ###

### TODO: Use the mixed, recent, popular parameters to get responses ###

### TODO: parse and store images ###

### NOTE: "No tweets will be found for a date older than one week" ###

for query, cnt in zip(queries, query_counts):
    # Run the search for the current query with its paired result count.
    search = t.search(q=query, count=cnt, lang='en')
    tweets = search['statuses']
    for tweet in tweets:
        # Each tweet is a plain dict, so optional fields are read with .get();
        # 'quoted_status' is expected only on quote tweets.
        quoted = tweet.get('quoted_status')
        if quoted:
            # Show the quoted tweet's id and text with this tweet's counters.
            print(quoted['id_str'], '\n', quoted['text'], tweet['retweet_count'], tweet['favorite_count'], '\n\n\n')
        else:
            print(tweet['id_str'], '\n', tweet['text'], tweet['retweet_count'], tweet['favorite_count'], '\n\n\n')

Link to Twitter API guide: https://developer.twitter.com/en/docs/tweets/search/api-reference/get-search-tweets

Link to StackOverflow for Twitter API: https://stackoverflow.com/questions/14156625/fetching-tweets-with-hashtag-from-twitter-using-python

## Reddit API

In [3]:
#! usr/bin/env python3
import praw
import urllib.request

import pandas as pd

from datetime import datetime
import time
import os.path

import pprint

from textblob import TextBlob

# Pause length for throttling; the only reference in this notebook is a
# commented-out time.sleep(DELAY) in the scraping loop (presumably seconds).
DELAY = 0.5

In [9]:
### Access Reddit credentials at the given file ###
reddit_credentials_fpath = f"/Users/{computer_username}/Documents/REDDIT_API_CREDENTIALS.txt"
# Use a context manager so the file handle is closed even if reading fails.
with open(reddit_credentials_fpath, 'r') as reddit_credentials_file:
    # One entry per line; the entries are picked out by position afterwards.
    reddit_credentials = reddit_credentials_file.read().split("\n")

In [10]:
### Unpack the Reddit credentials by line position ###
# Lines 0-2: script id, secret key, app name.
# Lines 3-4: account user name and login password.
# No peeking! Did you think I would put my Reddit password in the code?
# Username & password are unnecessary for public posts
(
    REDDIT_PERSONAL_USE_SCRIPT,
    REDDIT_SECRET_KEY,
    REDDIT_APP_NAME,
    REDDIT_USER_NAME,
    REDDIT_LOGIN_PASSWORD,
) = (
    reddit_credentials[0],
    reddit_credentials[1],
    reddit_credentials[2],
    reddit_credentials[3],
    reddit_credentials[4],
)

In [11]:
### Build the PRAW client from the unpacked credentials ###
# The app name doubles as the user agent string.
reddit = praw.Reddit(
    client_id=REDDIT_PERSONAL_USE_SCRIPT,
    client_secret=REDDIT_SECRET_KEY,
    user_agent=REDDIT_APP_NAME,
    username=REDDIT_USER_NAME,
    password=REDDIT_LOGIN_PASSWORD,
)

In [12]:
# Look up one known submission through the `reddit` client defined earlier
# and dump every attribute it carries.
submission = reddit.submission(id='39zje0')
# Reading an attribute first makes the lazy submission object load its data.
print(submission.title)
pprint.pprint(submission.__dict__)

reddit will soon only be available over HTTPS
{'_comments': <praw.models.comment_forest.CommentForest object at 0x1a1c7e9550>,
 '_comments_by_id': {'t1_cs7vwlm': Comment(id='cs7vwlm'),
                     't1_cs7xcx2': Comment(id='cs7xcx2'),
                     't1_cs7ykx6': Comment(id='cs7ykx6'),
                     't1_cs81mem': Comment(id='cs81mem'),
                     't1_cs81xp8': Comment(id='cs81xp8'),
                     't1_cs82epc': Comment(id='cs82epc'),
                     't1_cs82kes': Comment(id='cs82kes'),
                     't1_cs83dd8': Comment(id='cs83dd8'),
                     't1_cs83ua8': Comment(id='cs83ua8'),
                     't1_cs83xhc': Comment(id='cs83xhc'),
                     't1_cs846jk': Comment(id='cs846jk'),
                     't1_cs847yp': Comment(id='cs847yp'),
                     't1_cs848n2': Comment(id='cs848n2'),
                     't1_cs84apf': Comment(id='cs84apf'),
                     't1_cs84kz5': Comment(id='cs84kz5'),
   

In [13]:
### Compute date & time from Unix timestamp ###
def get_time_from_unix(timestamp: int) -> str:
    """Format a Unix timestamp as a UTC date-time string.

    Args:
        timestamp: Seconds since the Unix epoch.

    Returns:
        The corresponding UTC time formatted as 'YYYY-MM-DD HH:MM:SS'.
    """
    # Imported here because only `datetime` itself is imported at file level.
    from datetime import timezone

    # An aware UTC datetime replaces the deprecated utcfromtimestamp(); the
    # format string has no zone directive, so the rendered text is the same.
    return datetime.fromtimestamp(timestamp, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S')

In [14]:
### Flairs accepted by the scraper, plus URL markers used to classify media ###
# None is included so that posts without any flair are also accepted.
ALLOWED_FLAIRS = [
    None,
    "Bad Unsubscribe Function",
    "Clickshaming",
    "Dark Pattern",
    "Bait and Switch",
    "Loaded Question",
]
# Substrings looked for in a submission URL to label it as an image or a video.
IMAGE_TAGS = [
    "i.redd.it",
    ".png",
    ".jpg",
    ".gif",
]
VIDEO_TAGS = [
    "v.redd.it",
    ".mp4",
    ".mov",
]

In [17]:
### Defines parsing parameters for a designated subreddit ###
subreddit = reddit.subreddit('assholedesign')
# Options: all, year, day, hour, week, month
# The time filter is passed by keyword so the call does not depend on
# positional-argument order.
top_subreddit = subreddit.top(time_filter='day', limit=1000)
# Directory that receives downloaded media and the output tables.
mainpath = f"/Users/{computer_username}/Desktop/Reddit/"

# Accumulates one dict per accepted submission.
data_rowslist = []
# Columns kept when the rows are turned into a DataFrame.
# NOTE(review): the scraping loop also stores a "Title Language" key in each
# row; it is not listed here, so the DataFrame built with columns=column_list
# drops it. Confirm whether that is intended.
column_list = ["ID", "Title", "Media_Type", "Media_URL", "Date_Time", "Flair",
               "Local_File_Name", "Is_OC", "Score", "Upvote_Ratio"]


### Parses a designated subreddit for certain data ###
# Each accepted submission is appended to a partial TSV as soon as it is
# processed. The context manager closes the file even when a request fails
# part-way through the listing.
with open(mainpath + "DATA_partial.txt", "a") as outfile:
    for submission in top_subreddit:
        # Removes all posts without URLs (i.e. posts without images)
        if submission.is_self:
            continue
        # Removes all posts without the proper flairs
        if submission.link_flair_text not in ALLOWED_FLAIRS:
            continue
        # Removes all posts with low score (might not be "asshole design")
        if submission.upvote_ratio < 0.6:
            continue
        # Determines the title language (recorded only; the English-only
        # filter below is disabled).
        # NOTE(review): detect_language() has been removed from recent TextBlob
        # releases -- confirm the installed version still provides it.
        title_obj = TextBlob(submission.title)
        title_lang = title_obj.detect_language()
        # if title_lang != "en": continue
        # Parses date & time
        submission_datetime = get_time_from_unix(int(float(submission.created_utc)))
        # Decides if media is image or video
        submission_type = "Other"
        submission_url = submission.url
        if any(img_tag in submission_url for img_tag in IMAGE_TAGS):
            submission_type = "Image"
        elif any(vid_tag in submission_url for vid_tag in VIDEO_TAGS):
            submission_type = "Video"
        elif "imgur.com" in submission_url:
            submission_type = "Imgur_Non_Image"

        if submission_type == "Image":
            # Take the extension from the URL with any query string or fragment
            # removed, so "x.jpg?width=640" yields ".jpg" and a URL without an
            # extension yields "" instead of a fragment containing "/".
            url_without_query = submission_url.split('?', 1)[0].split('#', 1)[0]
            submission_localname = mainpath + submission.id + os.path.splitext(url_without_query)[1]
            # Download only once; an existing file keeps its recorded name.
            if not os.path.isfile(submission_localname):
                urllib.request.urlretrieve(submission_url, submission_localname)
            # Drop the leading "/Users/<name>" components from the stored path.
            local_file_name = '/'.join(submission_localname.split('/')[3:])
        else:
            # Nothing is downloaded for non-image posts.
            submission_localname = "NA"
            local_file_name = "NA"

        flair = submission.link_flair_text
        if flair is not None:
            flair = flair.replace('\t', '')

        # Tabs are stripped from free-text fields because rows are written
        # tab-separated. Key order fixes the column order of the partial file.
        rowdict = {}
        rowdict["ID"] = submission.id
        rowdict["Title"] = submission.title.replace('\t', '')
        rowdict["Title Language"] = title_lang
        rowdict["Media_Type"] = submission_type
        rowdict["Media_URL"] = submission_url
        rowdict["Date_Time"] = submission_datetime
        rowdict["Flair"] = flair
        rowdict["Local_File_Name"] = local_file_name
        rowdict["Is_OC"] = submission.is_original_content
        rowdict["Score"] = submission.score
        rowdict["Upvote_Ratio"] = submission.upvote_ratio

        data_rowslist.append(rowdict)

        rowlist = [str(val) for val in rowdict.values()]
        outfile.write('\t'.join(rowlist) + '\n')

        print(f"Current ID: {submission.id}")
        # print(submission.title)
        # time.sleep(DELAY)

Current ID: fs8pcf
Current ID: fske7v
Current ID: fsdf3o
Current ID: fs7nac
Current ID: fsesis
Current ID: fs2if8
Current ID: fsfm2z
Current ID: fs8g9o
Current ID: fslh2b


In [40]:
### Build the final table from the collected rows and write it as a TSV ###
# Only the columns named in column_list are kept, in that order.
data = pd.DataFrame(data=data_rowslist, columns=column_list)
# print(data)
# Tab-separated output without the index column.
data.to_csv(path_or_buf=mainpath + "DATA.txt", sep='\t', index=False)

## MySQL Database API

In [31]:
### Access credentials at the given file ###
# NOTE(review): this path points at the Reddit credentials file, so the MySQL
# password is taken from that file's first line -- confirm this is the
# intended file.
mysql_credentials_fpath = f"/Users/{computer_username}/Documents/REDDIT_API_CREDENTIALS.txt"
# Use a context manager so the file handle is closed even if reading fails.
with open(mysql_credentials_fpath, 'r') as mysql_credentials_file:
    # The password is the first line of the file.
    MYSQL_PASSWORD = mysql_credentials_file.read().split("\n")[0]

In [32]:
import pymysql.cursors
import pymysql

# Open a connection to the local MySQL server, requesting DictCursor as the
# cursor class.
connection = pymysql.connect(
    host='localhost',
    user='user',
    password=MYSQL_PASSWORD,
    db='db',
    charset='utf8mb4',
    cursorclass=pymysql.cursors.DictCursor,
)

try:
    # Step 1: insert a new record.
    with connection.cursor() as cursor:
        sql = "INSERT INTO `users` (`email`, `password`) VALUES (%s, %s)"
        cursor.execute(sql, ('webmaster@python.org', 'very-secret'))

    # Step 2: the connection does not autocommit by default, so the insert
    # must be committed explicitly to be saved.
    connection.commit()

    # Step 3: read a single record back and show it.
    with connection.cursor() as cursor:
        sql = "SELECT `id`, `password` FROM `users` WHERE `email`=%s"
        cursor.execute(sql, ('webmaster@python.org',))
        result = cursor.fetchone()
        print(result)
finally:
    # Always release the connection, whether or not the statements succeeded.
    connection.close()

OperationalError: (1045, "Access denied for user 'user'@'localhost' (using password: YES)")

Link to Guide: https://www.storybench.org/how-to-scrape-reddit-with-python/