In [1]:
import sys
# Add the parent directory to the module search path; presumably this is where
# the project-local modules imported by later cells live — TODO confirm.
sys.path.append("../")

# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before code runs and edits to local .py files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from psaw import PushshiftAPI
from datetime import datetime
from tqdm.notebook import trange, tqdm
import pickle

import config
from utills import extract_twitter_username

In [3]:
# Plotly imports used for the figures in this notebook.
from plotly.offline import init_notebook_mode
import plotly.offline as py
import plotly.graph_objs as go
# Initialise Plotly's notebook mode before any figure is displayed.
init_notebook_mode(connected=True)

Extract Twitter-Reddit Account Pairs from r/TwitterFollowers
===

In [None]:
# Lower bound for the Pushshift queries: 1 Jan 2010 (naive datetime, so it is
# interpreted in the machine's local timezone), as a whole-second epoch string.
query_start = datetime(2010, 1, 1)
after = str(int(query_start.timestamp()))

# Pushshift client used by the submission and comment searches.
api = PushshiftAPI()

In [None]:
# Query r/TwitterFollowers submissions created after `after`, requesting only
# the author, title and url fields.
res = api.search_submissions(
    subreddit='TwitterFollowers',
    filter=['author', 'title', 'url'],
    after=after,
)

# Keep the `d_` record of every submission whose URL mentions twitter.com.
data = [post.d_ for post in tqdm(res) if 'twitter.com' in post.url]

In [None]:
# Query r/TwitterFollowers comments created after `after`, requesting only
# the author and body fields.
res = api.search_comments(
    subreddit='TwitterFollowers',
    filter=['author', 'body'],
    after=after,
)

# Keep the `d_` record of every comment whose body mentions twitter.com.
data_comments = [comment.d_ for comment in tqdm(res) if 'twitter.com' in comment.body]

In [None]:
# Persist both downloads together as one pickled (posts, comments) tuple.
cache_path = '../data/twitter_reddit/r_twitterfollowers.p'
with open(cache_path, 'wb') as cache_file:
    pickle.dump((data, data_comments), cache_file)

In [4]:
# Restore the pickled (posts, comments) tuple.
# NOTE: pickle can execute arbitrary code on load; only open cache files that
# were produced by this notebook.
with open('../data/twitter_reddit/r_twitterfollowers.p', 'rb') as cache_file:
    data, data_comments = pickle.load(cache_file)

In [5]:
# Local import: only this cell needs `timezone`, and the main import cell
# brings in `datetime` alone.
from datetime import timezone

# Overlay one histogram of creation times per record type.
# `created_utc` is an epoch timestamp in UTC, so it is converted with an
# explicit UTC timezone; a bare `datetime.fromtimestamp(ts)` would shift every
# value by the local UTC offset of whichever machine runs the notebook.
fig = go.Figure()
for trace_name, records in (('Posts', data), ('Comments', data_comments)):
    created_at = [
        datetime.fromtimestamp(record['created_utc'], tz=timezone.utc)
        for record in records
    ]
    fig.add_trace(go.Histogram(x=created_at, name=trace_name))
# Last expression of the cell: update_layout's return value is what the
# notebook displays.
fig.update_layout(title="Post and Comment Timestamps")

In [6]:
# Report how many posts and comments were kept.
for label, records in (('Number of Posts:', data), ('Number of Comments:', data_comments)):
    print(label, len(records))

Number of Posts: 1813
Number of Comments: 1612


Download all the content from the identified users
===

In [7]:
import glob
import json
import os
import re
import time

import tweepy

from data_collection_methods import get_all_tweets, reddit_user_comments

In [8]:
# Parallel lists: reddit_usernames[i] is the Reddit author whose post or
# comment yielded the Twitter handle twitter_usernames[i].
reddit_usernames = []
twitter_usernames = []

# Posts are scanned via their 'url' field, comments via their 'body' field;
# posts are processed first, then comments.
for records, text_field in ((data, 'url'), (data_comments, 'body')):
    for record in records:
        handle = extract_twitter_username(record[text_field])
        if not handle:
            continue
        reddit_usernames.append(record['author'])
        twitter_usernames.append(handle)

In [11]:
# Display the sizes of the two parallel lists.
tuple(map(len, (reddit_usernames, twitter_usernames)))

(3410, 3410)

Download Tweets
---

In [None]:
# Build an OAuth handler from the consumer and access credentials kept in
# the config module.
auth = tweepy.OAuthHandler(
    config.TWITTER_CONSUMER_KEY,
    config.TWITTER_CONSUMER_SECRET,
)
auth.set_access_token(
    config.TWITTER_ACCESS_TOKEN,
    config.TWITTER_ACCESS_TOKEN_SECRET,
)

# NOTE: this rebinds `api`, which held the Pushshift client earlier in the
# notebook; from here on `api` is the Tweepy client.
api = tweepy.API(auth)


In [None]:
# Directory holding one <username>.jsonl file per downloaded Twitter account.
# It was referenced but never defined in this notebook; the value matches the
# directory this cell already scanned for finished downloads.
TWITTER_DATA_DIR = '../data/twitter_reddit/twitterfollowers/twitter/'
MIN_TWEETS = 100                   # accounts with fewer kept tweets are not written
RATE_LIMIT_WAIT_SECONDS = 15 * 60  # pause before retrying after a rate-limit error

# Tweepy 3.x raises RateLimitError; Tweepy 4.x replaced it with
# TooManyRequests. Resolve whichever the installed version provides.
rate_limit_error = getattr(tweepy, 'RateLimitError', None) or tweepy.TooManyRequests

# Without the directory every write would fail and be reported as a
# per-user failure.
os.makedirs(TWITTER_DATA_DIR, exist_ok=True)

# Usernames already on disk, taken from the file names without relying on a
# particular path separator.
completed_users = [
    os.path.splitext(os.path.basename(path))[0]
    for path in glob.glob(os.path.join(TWITTER_DATA_DIR, '*.jsonl'))
]
failed = []


for u in twitter_usernames:
    try:
        if u in completed_users or u in failed:
            continue
        print(u)
        # Retry the same user after a rate-limit pause instead of silently
        # moving on to the next one.
        while True:
            try:
                all_tweets = get_all_tweets(api, u)
                break
            except rate_limit_error:
                time.sleep(RATE_LIMIT_WAIT_SECONDS)
        # Keep only statuses that do not carry a `retweeted_status` attribute.
        tweets = [t for t in all_tweets if not hasattr(t, "retweeted_status")]
        print(u, len(all_tweets), len(tweets))
        completed_users.append(u)
        if len(tweets) < MIN_TWEETS:
            continue
        # One JSON-encoded tweet text per line.
        with open(os.path.join(TWITTER_DATA_DIR, u + '.jsonl'), 'w') as f:
            for t in tweets:
                json.dump(t.full_text, f)
                f.write('\n')
    except Exception as e:
        # Best effort: record the failure and continue with the next user.
        print('Failed: ', e)
        failed.append(u)

Download Reddit Comments
---

In [None]:
# Directory holding one <username>.jsonl file per downloaded Reddit account.
# It was referenced but never defined in this notebook; the value matches the
# directory this cell already scanned for finished downloads.
REDDIT_DATA_DIR = '../data/twitter_reddit/twitterfollowers/reddit/'
MIN_COMMENTS = 100  # accounts with fewer comments are not written

# Time window for the comment search: 1 Jan 2010 up to now, as epoch strings.
after = str(int(datetime(2010, 1, 1).timestamp()))
before = str(int(datetime.now().timestamp()))

# Without the directory every write would fail and be reported as a
# per-user failure.
os.makedirs(REDDIT_DATA_DIR, exist_ok=True)

# Usernames already on disk, taken from the file names without relying on a
# particular path separator.
completed_users_reddit = [
    os.path.splitext(os.path.basename(path))[0]
    for path in glob.glob(os.path.join(REDDIT_DATA_DIR, '*.jsonl'))
]
failed_reddit = []


for u in tqdm(reddit_usernames):
    try:
        # Skip users already handled, users that failed earlier in this run,
        # and any name containing 'Bot'.
        if u in completed_users_reddit or u in failed_reddit or 'Bot' in u:
            continue
        print(u)
        all_comments = reddit_user_comments(u, after, before)
        print(u, len(all_comments))
        completed_users_reddit.append(u)
        if len(all_comments) < MIN_COMMENTS:
            continue
        # One JSON-encoded comment per line.
        with open(os.path.join(REDDIT_DATA_DIR, u + '.jsonl'), 'w') as f:
            for c in all_comments:
                json.dump(c, f)
                f.write('\n')
    except Exception as e:
        # Best effort: record the failure and continue with the next user.
        print('Failed: ', e)
        failed_reddit.append(u)