In [1]:
import os
import json
import pandas as pd
import datetime
import dateutil.parser
import numpy as np
import tweepy
import nltk
#nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer

# Widen pandas' column display to 280 characters so long tweet text is not truncated.
pd.set_option('display.max_colwidth', 280)

# NLTK's English stop words plus the empty string; presumably the '' entry lets
# tokenize() discard the empty tokens that re.split can produce -- confirm.
stop_words = [*nltk.corpus.stopwords.words('english'), '']

In [2]:
def authenticate_api():
    """Build a tweepy ``API`` client from credentials stored in the environment.

    Reads ``CONSUMER_KEY``, ``CONSUMER_SECRET``, ``ACCESS_TOKEN`` and
    ``ACCESS_SECRET`` with ``os.getenv`` (a missing variable is passed on as
    ``None``) and hands them, in that order, to ``tweepy.OAuth1UserHandler``.

    Returns:
        The ``tweepy.API`` instance wrapping that handler.
    """
    env_names = ('CONSUMER_KEY', 'CONSUMER_SECRET', 'ACCESS_TOKEN', 'ACCESS_SECRET')
    credentials = [os.getenv(name) for name in env_names]
    handler = tweepy.OAuth1UserHandler(*credentials)
    return tweepy.API(handler)

In [3]:
def extract_dict(tweet_json):
    """Pick the fields of interest out of a tweet's JSON payload.

    Args:
        tweet_json: mapping with a nested ``'user'`` mapping plus the
            tweet-level keys listed below.

    Returns:
        A dict whose values are one-element lists (so it can be passed straight
        to ``pd.DataFrame`` as a single row). User fields are stored under
        ``'user_<field>'`` keys and come first, followed by the tweet fields.

    Raises:
        KeyError: if any expected key is absent.
    """
    user_fields = ('id', 'name', 'location', 'followers_count')
    tweet_fields = ('created_at', 'id', 'retweet_count', 'favorite_count',
                    'lang', 'in_reply_to_user_id', 'full_text')

    author = tweet_json['user']
    row = {'user_' + field: [author[field]] for field in user_fields}
    row.update({field: [tweet_json[field]] for field in tweet_fields})
    return row

In [4]:
def get_nltk_sentiment(sia, text):
    """Return the ``'compound'`` entry of ``sia.polarity_scores(text)``.

    Args:
        sia: object exposing ``polarity_scores(text)`` that returns a mapping
            (in this notebook, a ``SentimentIntensityAnalyzer``).
        text: the string to score.
    """
    scores = sia.polarity_scores(text)
    return scores['compound']

In [5]:
def clean_tweet(words):
    """Normalise raw tweet text before tokenising or sentiment scoring.

    Steps, in order:

    1. Replace ``-`` and newlines with a space and ``%`` with `` percent``.
    2. Remove retweet prefixes of the form ``RT @<handle>:``.
    3. Cut the text at the first occurrence of ``https``; everything from the
       link onwards is discarded and trailing whitespace before it is stripped.

    Args:
        words: the tweet text.

    Returns:
        The cleaned text. A tweet consisting only of a link yields ``''``.
    """
    words = words.replace('-', ' ').replace('\n', ' ').replace('%', ' percent')
    words = re.sub("RT @(.*?):", "", words)  # remove RT
    http_loc = words.find('https')
    # str.find returns -1 when absent, so 0 is a valid hit (text starts with the link).
    if http_loc >= 0:
        # Strip whitespace rather than blindly dropping one character, which
        # would eat real text when the link is not preceded by a space.
        return words[:http_loc].rstrip()
    return words

In [6]:
def tokenize(words):
    """Lower-case ``words``, split on runs of non-word characters, drop stop words.

    Args:
        words: the text to tokenize.

    Returns:
        List of tokens that are not in the module-level ``stop_words`` list.
    """
    # Raw string: '\W' in a plain literal is an invalid escape sequence and
    # triggers a warning on recent Python versions; the pattern is unchanged.
    tokens = re.split(r'\W+', words.lower())
    return [w for w in tokens if w not in stop_words]

In [7]:
def remove_punctuation(words):
    """Return ``words`` with every character found in ``string.punctuation`` removed."""
    kept = []
    for ch in words:
        if ch in string.punctuation:
            continue
        kept.append(ch)
    return "".join(kept)

In [8]:
def get_df(tweets, add_sentiment=True):
    """Turn a collection of tweet objects into one DataFrame with derived columns.

    Args:
        tweets: iterable of objects exposing a ``_json`` mapping that
            ``extract_dict`` can read.
        add_sentiment: when true, add a ``positivity`` column holding the
            compound score from ``SentimentIntensityAnalyzer`` for each
            cleaned tweet.

    Returns:
        An empty ``DataFrame`` (no columns) when ``tweets`` is empty; otherwise
        one row per tweet with the ``extract_dict`` fields plus ``clean_tweet``,
        ``tokens`` and, optionally, ``positivity``, on a fresh 0..n-1 index.
    """
    frames = [pd.DataFrame(extract_dict(status._json)) for status in tweets]
    if not frames:
        return pd.DataFrame()

    df = pd.concat(frames)
    df['clean_tweet'] = df['full_text'].apply(clean_tweet)
    # Punctuation is stripped from the cleaned text before tokenising.
    df['tokens'] = df['clean_tweet'].apply(remove_punctuation).apply(tokenize)

    if add_sentiment:
        # One analyzer instance is shared across all rows.
        sia = SentimentIntensityAnalyzer()
        df['positivity'] = df['clean_tweet'].apply(
            lambda text: get_nltk_sentiment(sia, text)
        )
    return df.reset_index(drop=True)

In [9]:
# Shared tweepy client used by the cells below.
api = authenticate_api()

In [10]:
# Request count=100 statuses from the authenticated user's home timeline
# (replies kept, entities omitted, full text) and load them into a DataFrame.
tweets = api.home_timeline(
    count=100,
    exclude_replies=False,
    include_entities=False,
    tweet_mode='extended',
)
tweet_df = get_df(tweets)
#tweet_df

# NYT = 807095

In [11]:
# tf idf
def prep_inv_doc(x):
    """Analyzer for the TF-IDF vectorizer: clean, strip punctuation, then tokenize.

    Args:
        x: raw tweet text.

    Returns:
        The list of tokens produced by ``tokenize``.
    """
    cleaned = clean_tweet(x)
    unpunctuated = remove_punctuation(cleaned)
    return tokenize(unpunctuated)

# Passing a callable as `analyzer` makes the vectorizer use prep_inv_doc to turn
# each raw tweet into its list of tokens.
inv_doc = TfidfVectorizer(analyzer=prep_inv_doc)
# Fit on the raw tweet text; prep_inv_doc does the cleaning itself.
inv_doc_output = inv_doc.fit_transform(tweet_df['full_text'])
# Densify the result into a DataFrame; columns are left as integer positions.
id_df = pd.DataFrame(inv_doc_output.toarray())

### Methods
* `get_retweeter_ids(id, *, count, cursor, stringify_ids)`
* `get_retweets`
* `get_status` 
* `search_tweets(q, *, geocode, lang, locale, result_type, count, until, since_id, max_id, include_entities)`
    * query syntax: https://developer.twitter.com/en/docs/twitter-api/v1/rules-and-filtering/search-operators
* `get_follower_ids(*, user_id, screen_name, cursor, stringify_ids, count)`
    * returns 5000 at a time
* `search_users(q, *, page, count, include_entities)`
* `get_user(*, user_id, screen_name, include_entities)`
* `get_blocked_ids(*, stringify_ids, cursor)` or `get_blocks(*, include_entities, skip_status, cursor)` for user objects
* `available_trends()` and `closest_trends()`
* `search_full_archive(label, query, *, tag, fromDate, toDate, maxResults, next)` — **TODO:** confirm whether my API access level allows this endpoint
* `user_timeline(*, user_id, screen_name, since_id, count, max_id, trim_user, exclude_replies, include_rts)` — returns the 20 most recent statuses posted by the authenticating user or by the specified user

**Emoji positivity scores**
* Take the mean or median positivity of the tweets that contain a given emoji
* Caveat: sarcasm is difficult to detect, so these scores can be misleading

In [24]:
# Mean compound sentiment of English tweets returned by a search for the skull emoji.
skull = api.search_tweets(
    "💀",
    tweet_mode='extended',
    count=100,
    lang='en',
)
skull_df = get_df(skull)
skull_df['positivity'].mean()

-0.020708

In [25]:
# Mean compound sentiment of English tweets returned by a search for the heart emoji.
heart = api.search_tweets(
    "❤️",
    tweet_mode='extended',
    count=100,
    lang='en',
)
heart_df = get_df(heart)
heart_df['positivity'].mean()

0.3864837209302326

In [26]:
# Mean compound sentiment of English tweets returned by a search for the vomiting-face emoji.
puke = api.search_tweets(
    "🤮",
    tweet_mode='extended',
    count=100,
    lang='en',
)
puke_df = get_df(puke)
puke_df['positivity'].mean()

-0.10854400000000002

In [27]:
# Mean compound sentiment of English tweets returned by a search for "war".
war = api.search_tweets(
    "war",
    tweet_mode='extended',
    count=100,
    lang='en',
)
war_df = get_df(war)
war_df['positivity'].mean()

-0.36406206896551724

In [28]:
# Mean compound sentiment of English tweets returned by a search for "pumpkin".
pumpkin = api.search_tweets(
    "pumpkin",
    tweet_mode='extended',
    count=100,
    lang='en',
)
pumpkin_df = get_df(pumpkin)
pumpkin_df['positivity'].mean()

0.24945656565656565

In [126]:
# get_df(api.search_tweets("alaskan klee kai", tweet_mode='extended', count=100, lang='en'))

### Network

In [14]:
import networkx as nx
from tqdm import tqdm

In [15]:
# Root account of the follower network (stars reporter); the cell displays its id.
base_node = api.get_user(screen_name="DavidCastilloAC")
base_node.id

330764365

In [16]:
# Use the id from the base_node lookup instead of repeating it as a literal,
# so changing the base account only requires editing one cell.
followers = api.get_follower_ids(user_id=base_node.id)
print(len(followers))

1225


In [46]:
# Median positivity of each follower's timeline, with failures tracked separately.
median_positivity = []  # one median per follower whose timeline yielded tweets
errors = []             # follower ids whose fetch or processing raised
no_tweets = []          # follower ids whose timeline produced no rows
for follower in tqdm(followers):
    try:
        follower_tweets = get_df(api.user_timeline(
            user_id=follower, include_rts=False, tweet_mode='extended'
        ))
    except Exception:
        # Best-effort: note the failure and skip this follower. Without the
        # `continue`, the code below would reuse the previous follower's frame
        # (or raise NameError on the first iteration) and record a wrong median.
        # `Exception` rather than a bare except so Ctrl-C still stops the loop.
        errors.append(follower)
        continue
    if follower_tweets.shape[0] == 0:
        no_tweets.append(follower)
        continue
    median_positivity.append(follower_tweets['positivity'].median())
    # second_degree = api.get_follower_ids(user_id=follower)

100%|██████████| 1214/1214 [10:21<00:00,  1.95it/s]


In [17]:
def get_user_positivity(user_id):
    """Return the median ``positivity`` of a user's timeline tweets.

    Fetches the timeline through the module-level ``api`` client (retweets
    excluded, extended text) and processes it with ``get_df``.

    Args:
        user_id: id of the account whose timeline is fetched.

    Returns:
        The median of the ``positivity`` column, or ``np.nan`` when the fetch
        or processing raises or when no tweets come back.
    """
    try:
        follower_tweets = get_df(api.user_timeline(
            user_id=user_id, include_rts=False, tweet_mode='extended'
        ))
    except Exception:
        # Best-effort per user, but let KeyboardInterrupt/SystemExit propagate
        # so a long .apply() over many users can still be interrupted.
        return np.nan
    if follower_tweets.shape[0] == 0:
        return np.nan
    return follower_tweets['positivity'].median()

In [18]:
# One row per follower id; timestamps printed before and after show how long
# the per-follower lookups take.
follower_df = pd.DataFrame({'follower': followers})
print(datetime.datetime.now())
follower_df['median_positivity'] = follower_df['follower'].apply(get_user_positivity)
print(datetime.datetime.now())

2022-10-26 17:06:19.433879
2022-10-26 17:14:26.491735


In [19]:
# Median of the per-follower medians, ignoring followers with no score.
follower_df['median_positivity'].dropna().median()

0.0

In [15]:
#temp_df = get_df(api.user_timeline(user_id=330764365, include_rts=False, tweet_mode='extended'))