In [1]:
import os
import json
import pandas as pd
import datetime
import dateutil.parser
import numpy as np
import tweepy
import nltk
# Download the VADER lexicon; presumably this is the data file that
# SentimentIntensityAnalyzer loads when it is instantiated - TODO confirm.
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Widen pandas' column display so long text cells are not truncated
# (280 presumably chosen to match the maximum tweet length).
pd.options.display.max_colwidth = 280

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/madisonlindsay/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
def authenticate_api():
    """Create a tweepy API client from credentials held in environment variables.

    The four OAuth 1.0a values are read with ``os.getenv`` from CONSUMER_KEY,
    CONSUMER_SECRET, ACCESS_TOKEN and ACCESS_SECRET, in that order. A variable
    that is not set is handed to tweepy as ``None``; no validation happens here.

    Returns:
        The ``tweepy.API`` object built around a ``tweepy.OAuth1UserHandler``.
    """
    credential_vars = ('CONSUMER_KEY', 'CONSUMER_SECRET', 'ACCESS_TOKEN', 'ACCESS_SECRET')
    # Unpack in declaration order so the handler receives the same four
    # positional arguments: consumer key/secret, then access token/secret.
    handler = tweepy.OAuth1UserHandler(*(os.getenv(name) for name in credential_vars))
    return tweepy.API(handler)

In [3]:
def extract_dict(tweet_json):
    """Flatten one tweet payload into a dict of single-element lists.

    Args:
        tweet_json: mapping for a single tweet. It must contain a ``'user'``
            mapping plus the tweet-level keys listed in ``tweet_fields`` below.

    Returns:
        dict: the selected user fields under ``'user_<field>'`` keys, followed
        by the selected tweet fields under their own names. Every value is
        wrapped in a one-item list, so ``pd.DataFrame(result)`` is a one-row frame.

    Raises:
        KeyError: if any of the selected keys is missing from the payload.
    """
    user_fields = ('id', 'name', 'location', 'followers_count')
    tweet_fields = ('created_at', 'id', 'retweet_count', 'favorite_count',
                    'lang', 'in_reply_to_user_id', 'full_text')

    author = tweet_json['user']
    # User fields are prefixed so the author's 'id' does not collide with the tweet's 'id'.
    row = {'user_' + field: [author[field]] for field in user_fields}
    row.update((field, [tweet_json[field]]) for field in tweet_fields)
    return row

In [4]:
def get_nltk_sentiment(sia, text):
    """Return the 'compound' sentiment score of ``text``.

    Args:
        sia: analyzer object exposing ``polarity_scores(text)`` that returns a
            mapping with a ``'compound'`` entry (e.g. a SentimentIntensityAnalyzer).
        text: the string to score.

    Returns:
        The value stored under ``'compound'`` in the analyzer's score mapping.
    """
    scores = sia.polarity_scores(text)
    return scores['compound']

In [5]:
def get_df(tweets, add_sentiment=True):
    """Build one DataFrame from an iterable of tweet objects.

    Each tweet's raw payload (its ``_json`` attribute) is flattened with
    ``extract_dict`` into a one-row frame, and the rows are stacked in input order.

    Args:
        tweets: iterable of objects exposing a ``_json`` mapping. May be empty.
        add_sentiment: when True, add a ``'positivity'`` column holding the
            ``get_nltk_sentiment`` score of each row's ``'full_text'``.

    Returns:
        pandas.DataFrame with a fresh 0..n-1 index, one row per tweet. For empty
        input the frame has no rows; it has a single empty float
        ``'positivity'`` column when ``add_sentiment`` is True, so callers that
        summarise that column still work.
    """
    frames = [pd.DataFrame(extract_dict(tweet._json)) for tweet in tweets]

    if not frames:
        # pd.concat raises ValueError on an empty list, which a search with
        # zero hits would otherwise trigger.
        empty = pd.DataFrame()
        if add_sentiment:
            empty['positivity'] = pd.Series(dtype=float)
        return empty

    # ignore_index gives a clean RangeIndex up front, so the sentiment column is
    # never assigned onto the duplicated all-zero index of the one-row frames.
    df = pd.concat(frames, ignore_index=True)
    if add_sentiment:
        # Build the analyzer once and reuse it for every row.
        sia = SentimentIntensityAnalyzer()
        df['positivity'] = df['full_text'].apply(lambda text: get_nltk_sentiment(sia, text))
    return df

In [6]:
# Shared tweepy client used by the cells below; authenticate_api() reads the
# OAuth credentials from environment variables.
api = authenticate_api()

In [7]:
# tweets = api.home_timeline(count=10, exclude_replies=False, include_entities=False, tweet_mode='extended')
# get_df(tweets)

# Twitter user IDs for reference (usable as `user_id=` in the API calls):
#   The Onion     = 14075928
#   The Economist = 5988062
#   NYT           = 807095

### Methods
* `get_retweeter_ids(id, *, count, cursor, stringify_ids)`
* `get_retweets`
* `get_status` 
* `search_tweets(q, *, geocode, lang, locale, result_type, count, until, since_id, max_id, include_entities)`
    * query syntax: https://developer.twitter.com/en/docs/twitter-api/v1/rules-and-filtering/search-operators
* `get_follower_ids(*, user_id, screen_name, cursor, stringify_ids, count)`
    * returns at most 5,000 IDs per call; use `cursor` to page through the rest
* `search_users(q, *, page, count, include_entities)`
* `get_user(*, user_id, screen_name, include_entities)`
* `get_blocked_ids(*, stringify_ids, cursor)` or `get_blocks(*, include_entities, skip_status, cursor)` for user objects
* `available_trends()` and `closest_trends()`
* `search_full_archive(label, query, *, tag, fromDate, toDate, maxResults, next)` — TODO: check whether my API access level includes this endpoint

In [8]:
# Search English-language tweets matching "war" and summarise their sentiment
# scores. `count=100` is an upper bound: the API may return fewer results.
war = api.search_tweets("war", lang='en', count=100, tweet_mode='extended')
war_df = get_df(war)
war_df['positivity'].describe()

count    86.000000
mean     -0.265505
std       0.492529
min      -0.957100
25%      -0.669600
50%      -0.401900
75%       0.000000
max       0.812600
Name: positivity, dtype: float64

In [9]:
# Same summary for English-language tweets matching "pumpkin", as a contrast
# to the "war" search.
pumpkin = api.search_tweets("pumpkin", lang='en', count=100, tweet_mode='extended')
pumpkin_df = get_df(pumpkin)
pumpkin_df['positivity'].describe()

count    100.000000
mean       0.232420
std        0.401028
min       -0.802000
25%        0.000000
50%        0.273000
75%        0.467575
max        0.931200
Name: positivity, dtype: float64

In [10]:
# Fetch follower IDs for one account. Per the Methods notes above, a single
# call returns only 5000 IDs, so this is presumably just the first page.
followers = api.get_follower_ids(user_id=14075928)  # the onion