# Week 2. Day 2. Exercises from Chapter 5 of FSStDS. 
## Fundamentals of Social Data Science. MT 2022

Within your study pod discuss the following questions. Please submit an individual assignment by 12:30pm Wednesday, October 18, 2022 on Canvas. 

In [8]:
import pandas as pd 
import json
import numpy as np

# Exercise 1. Twitter merging 

I have provided two tables: `dalle2_oct18_2022_tweets.csv` and `dalle2_oct18_2022_users.csv`. You can see how these tweets were collected in the Appendix to this assignment. It's a simple pull of only 100 tweets. To continue this pull would require paging (another day). For now, let's focus on merging. Please merge these two tables. 

Some tips: 
- Ensure that you keep all the tweets.
- Ensure that the names which might overlap (hint...`id`) are given descriptive suffixes.
- Your resulting df should still have 100 rows. 

In [9]:
# Exercise 1 below here 

tweets = []
for entry in json.load(open('../data/dalle2_oct18_2022_tweets.json')):
    temp = {}
    temp['text'] = entry['text']
    temp['id'] = entry['author_id']
    temp['tweet_id'] = entry['id']
    try:
        temp['hashtags'] = [hash['tag'] for hash in entry['entities']['hashtags']]
    except:
        temp['hashtags'] = []
    try:
        temp['num_mentions'] = len(entry['entities']['mentions'])
    except:
        temp['num_mentions'] = 0
    try:
        temp['retweets'] = entry['public_metrics']['retweet_count']
        temp['likes'] = entry['public_metrics']['like_count']
    except:
        temp['retweets'] = 0
        temp['likes'] = 0
    tweets.append(temp)

tweets_df = pd.DataFrame(tweets)

# [['text', 'author_id', 'id']]
users = []
for entry in json.load(open('../data/dalle2_oct18_2022_users.json')):
    temp = {}
    temp['username'] = entry['username']
    temp['id'] = entry['id']
    temp['followers'] = entry['public_metrics']['followers_count']
    users.append(temp)

users_df = pd.DataFrame(users)

merge_df = tweets_df.merge(users_df,how="inner",on="id")

print(len(tweets_df),len(users_df),len(merge_df))
# Should be 100 79 100

100 79 100


# Exercise 2. Twitter analytics 

Split the data into two groups: 
- Those with more than 1000 followers and those with less
- Compare the two groups. Which group has more tweets and _proportionately_ more @mentions in their tweets.
    
> Note: Getting the @mentions can be done cheap and easy (search for @ symbol) or more robust and with a little more difficulty (look in the entities.mentions column and wrangle the dictionary)

In [10]:
# Exercise 2. Answer below here

over1k = merge_df[merge_df['followers'] >= 1000]
under1k = merge_df[merge_df['followers'] < 1000]

len_over1k = len(over1k)
len_under1k = len(under1k)
over1k_nummention = len(over1k[over1k['num_mentions'] > 0]) 
under1k_nummention = len(under1k[under1k['num_mentions'] > 0]) 

print(len_under1k)

print(f"The percentage of tweets from those with over 1k followers that have mentions is  {over1k_nummention / len_over1k:0.1%}",
      f"The percentage of tweets from those with under 1k followers that have mentions is  {under1k_nummention / len_under1k:0.1%}")

# Should be 29 for over1k and 71 for under1k
# And therefore should be 24.1% and 11.3% respectively.

71
The percentage of tweets from those with over 1k followers that have mentions is  24.1% The percentage of tweets from those with under 1k followers that have mentions is  11.3%


# Exercise 3. Grouping the data

Group the data by Author and build a table that reports the max, min, and average for both  `public_metrics.retweet_count` and `public_metrics.like_count`. 

In [11]:
# Exercise 3. Answer below here

merge_df.groupby('username').agg({'retweets': ['mean', 'min', 'max'],
                                  'likes': ['mean', 'min', 'max']})


Unnamed: 0_level_0,retweets,retweets,retweets,likes,likes,likes
Unnamed: 0_level_1,mean,min,max,mean,min,max
username,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
0xAdventuresAI,0.0,0,0,0.0,0,0
3ter_314,0.0,0,0,0.0,0,0
AiCollectionart,0.0,0,0,1.0,1,1
AnimalsLab,1.0,1,1,4.0,4,4
ArcturusV18,0.0,0,0,1.0,1,1
...,...,...,...,...,...,...
triflingtree,0.0,0,0,1.0,0,2
vandaloruins,4.0,4,4,10.0,10,10
wlabdl2,0.0,0,0,2.0,2,2
wrAIter_pl,0.0,0,0,1.0,1,1


# Exercise 4. Twitter Reshaping

Create a long `DataFrame` of tweet_ids, author_ids, and hash_tags. That is, one row per hashtag rather than one per tweet. Report the length of this `DataFrame` and the `value_counts()` of the top 10 hashtags.

In [12]:
has_hash_df = merge_df[merge_df['hashtags'].str.len() > 0]

hash_list = []
for hash in np.concatenate(has_hash_df['hashtags'].tolist()):
    temp = {}
    temp_df = has_hash_df[has_hash_df['hashtags'].map(lambda x: hash in x)]
    temp['hashtag'] = hash
    temp['tweet_ids'] = ','.join(temp_df['tweet_id'].tolist())
    temp['author_ids'] = ','.join(temp_df['id'].tolist())
    hash_list.append(temp)

hash_df = pd.DataFrame(hash_list)
hash_df

Unnamed: 0,hashtag,tweet_ids,author_ids
0,MedTwitter,1582307720451207168,517339521
1,dalle2,"1582307720451207168,1582307372231720960,158230...","517339521,1368908006692372483,3236609925,78074..."
2,dalle2,"1582307720451207168,1582307372231720960,158230...","517339521,1368908006692372483,3236609925,78074..."
3,novelai,1582306680523481088,3236609925
4,stablediffusionart,"1582306680523481088,1582290549914357761,158228...","3236609925,1564566449804410881,157692012485817..."
...,...,...,...
594,dalle2,"1582307720451207168,1582307372231720960,158230...","517339521,1368908006692372483,3236609925,78074..."
595,dalle,"1582303240544481280,1582288143856062464,158228...","780746497017032704,780746497017032704,78074649..."
596,midjourney,"1582306680523481088,1582301722310942720,158229...","3236609925,274979139,1560769761734234113,15607..."
597,aiart,"1582306680523481088,1582295222108983297,158229...","3236609925,1387419633788063746,138741963378806..."


In [13]:
hash_df['hashtag'].value_counts().head(10)

dalle2             77
ai                 25
aiart              25
dalle              23
stablediffusion    22
midjourney         22
digitalart         20
AIart              14
aiartist           13
aiartcommunity     12
Name: hashtag, dtype: int64

# Appendix: How I pre-processed the data (See Chapter 7) 




In [14]:
import os
import requests
import dotenv

ENV_PATH = f"..{os.sep}.env"
dotenv.load_dotenv(ENV_PATH) # This will refresh the environment variables
print(len(os.environ.get('TWITTER_BEARER_TOKEN')))

TypeError: object of type 'NoneType' has no len()

In [None]:
URL = "https://api.twitter.com/2/tweets/search/all"

BEARER = os.environ["TWITTER_BEARER_TOKEN"]
headers = {"Authorization": f"Bearer {BEARER}"}

QUERY = "(dalle2) -is:retweet"
MAX_RESULTS = 100 

params={"query": QUERY,
        "max_results":MAX_RESULTS}

params['expansions'] = "author_id,geo.place_id"
params['tweet.fields'] = "entities,public_metrics"
params['user.fields'] = "id,username,name,description,public_metrics"
params['place.fields'] = "id,country,country_code,full_name"

response = requests.get(URL, headers=headers, params=params)

assert response.status_code == 200, \
    f"Code {response.status_code}. See error: {response.json()}"

tweets = response.json()
print(tweets.keys())

dict_keys(['data', 'includes', 'meta'])


In [None]:
import json 

json.dump(tweets['data'], 
          open("dalle2_oct18_2022_tweets.json",'w')) 

json.dump(tweets['includes']['users'],
          open("dalle2_oct18_2022_users.json",'w')) 