### Functions for later...

In [303]:
import datetime
import os
import sys
sys.path.insert(0,"../code")

import altair as alt
alt.data_transformers.disable_max_rows()
import pandas as pd

from collections import Counter, defaultdict
from copy import deepcopy

from nltk.corpus import stopwords
s_words = set(stopwords.words())

from ct_helpers import ct_get_lists, download_posts
from fb_model import FbIgPost


def clean_text(text):
    """
    A convenience function for cleantext.clean because it has an ugly amount
    of parameters.
    """
    return clean(
        text,
        fix_unicode=True,  # fix various unicode errors
        to_ascii=True,  # transliterate to closest ASCII representation
        lower=True,  # lowercase text
        no_line_breaks=True,  # fully strip line breaks as opposed to only normalizing them
        no_urls=True,  # replace all URLs with a special token
        no_emoji=True,  # remove emojis
        no_emails=True,  # replace all email addresses with a special token
        no_phone_numbers=True,  # replace all phone numbers with a special token
        no_numbers=False,  # replace all numbers with a special token
        no_digits=False,  # replace all digits with a special token
        no_currency_symbols=False,  # replace all currency symbols with a special token
        no_punct=True,  # remove punctuations
        replace_with_punct="",  # instead of removing punctuations you may replace them
        replace_with_url="",
        replace_with_email="",
        replace_with_phone_number="<PHONE>",
        replace_with_number="<NUMBER>",
        replace_with_digit="0",
        replace_with_currency_symbol="<CUR>",
        lang="en",  # set to 'de' for German special handling
    )

---

# Exploring musical artists Facebook data

In this notebook, we'll walk through how to download and explore posts from a specified list of Facebook accounts. This list will have to be created first with the Facebook-owned insights tool, [CrowdTangle](https://crowdtangle.com/).

Note: There is no cost for using CrowdTangle, but it is only available to select Facebook publishing partners and academics. Check out their [FAQs](https://help.crowdtangle.com/en/collections/41331-faqs-and-troubleshooting) for more information.

### Part 1: Exploring the dashboard.
1. Lists/searches
2. Intelligence
3. Live displays

### Part 2: Two ways to get the data from the dashboard

1. Using the dashboard
2. Using a little code (API)

## Part 3: Exploring the posts

1. **Types of posts sent**
2. **Words used most**
3. **Most basic interactions**:
    1. likes
    2. shares
    3. comments

4. **Most emojis**:
    1. wows
    2. hahas
    3. sad
    4. angry
    5. thankful
    6. care
5. **Trajectory of most popular post (likes)**

In [242]:
post_objs[0].post_object['statistics']['actual']

{'likeCount': 8968,
 'shareCount': 10084,
 'commentCount': 2010,
 'loveCount': 31128,
 'wowCount': 121,
 'hahaCount': 16,
 'sadCount': 8,
 'angryCount': 0,
 'thankfulCount': 0,
 'careCount': 663}

### Get the data for all musical artists

In [234]:
# Load the CT API token
api_token = os.environ.get("CLASS_CT_TOKEN")

# Returns all lists in the dashboard
list_records = ct_get_lists(api_token)

my_list_name = "Musical Artists for Social Media Manipulation"

# Extract the list ID number for that matches my list name
for record in list_records["result"]['lists']:
    id_number = record["id"]
    list_title = record["title"]
    
    if list_title == my_list_name:
        print("Found the list ID number.")
        break

print(f"List ID #: {id_number}")

Found the list ID number.
List ID #: 1732484


In [239]:
# Set up the time between which we'll retrieve data

num_days_back = 30*6

end_dt = datetime.datetime.today().date()
end_time = end_dt.strftime("%Y-%m-%d")
start_time = (
    end_dt - datetime.timedelta(days=num_days_back)
).strftime("%Y-%m-%d")

print(f"We're going to search {num_days_back} days in the past")
print(f"Start time: {start_time}")
print(f"End time  : {end_time}")

We're going to search 180 days in the past
Start time: 2022-04-30
End time  : 2022-10-27


In [240]:
posts = download_posts(
    crowdtangle_list_id=id_number, 
    start=start_time,
    end=end_time,
    max_queries=100,
    api_token=api_token
)

Successful first call.
Setting first_call = False
Found next page: https://api.crowdtangle.com/posts?token=cYbBOStNtq0Hhg4p3DBFiycwNZmcP3GrQ2ONgNbG&sortBy=date&endDate=2022-10-27&startDate=2022-04-30&listIds=1732484&searchField=TEXT_FIELDS_AND_IMAGE_TEXT&count=100&includeHistory=true&offset=100
	|--> 2022-10-11 18:00:19 - 2022-10-26 23:05:36: 100 posts.
Total posts collected: 100
Found next page: https://api.crowdtangle.com/posts?token=cYbBOStNtq0Hhg4p3DBFiycwNZmcP3GrQ2ONgNbG&sortBy=date&endDate=2022-10-27&startDate=2022-04-30&listIds=1732484&searchField=TEXT_FIELDS_AND_IMAGE_TEXT&count=100&includeHistory=true&offset=200
	|--> 2022-09-14 14:09:01 - 2022-10-11 11:57:32: 100 posts.
Total posts collected: 200
Found next page: https://api.crowdtangle.com/posts?token=cYbBOStNtq0Hhg4p3DBFiycwNZmcP3GrQ2ONgNbG&sortBy=date&endDate=2022-10-27&startDate=2022-04-30&listIds=1732484&searchField=TEXT_FIELDS_AND_IMAGE_TEXT&count=100&includeHistory=true&offset=300
	|--> 2022-08-19 17:34:15 - 2022-09-14

In [None]:
for post in posts:
    fb_post = FbIgPost(post)
    print(fb_post.get_link_to_post())

In [243]:
post_objs = [FbIgPost(post) for post in posts] 

In [468]:
text_data = []
performance_data = []

for post in post_objs:
    
    # Handle all posts
    base_record = {
        "id" : post.get_post_ID(),
        "username" : post.get_value(['account','name']),
        "date" : post.post_object["date"],
        "link" : post.get_link_to_post()
    }
    text_record = deepcopy(base_record)
    stat_record = deepcopy(base_record)
    
    
    text_info = {
        "text" : post.get_text(),
        "post_type" : post.get_post_type(),
    }
    text_record.update(text_info)
    text_data.append(text_record)
    
    
    stat_info = post.get_value(["statistics", "actual"])
    stat_record.update(stat_info)
    performance_data.append(stat_record)

{'actual': {'likeCount': 2675,
  'shareCount': 276,
  'commentCount': 172,
  'loveCount': 3204,
  'wowCount': 4,
  'hahaCount': 2,
  'sadCount': 1,
  'angryCount': 0,
  'thankfulCount': 0,
  'careCount': 97},
 'expected': {'likeCount': 5960,
  'shareCount': 639,
  'commentCount': 392,
  'loveCount': 6406,
  'wowCount': 12,
  'hahaCount': 8,
  'sadCount': 3,
  'angryCount': 2,
  'thankfulCount': 0,
  'careCount': 166}}

In [469]:
stats_df = pd.DataFrame.from_records(performance_data)
stats_df

Unnamed: 0,id,username,date,link,likeCount,shareCount,commentCount,loveCount,wowCount,hahaCount,sadCount,angryCount,thankfulCount,careCount
0,28635|675821577247495,Coldplay,2022-10-26 23:05:36,https://www.facebook.com/100044589752285/posts...,9172,10136,2028,31654,121,16,8,0,0,681
1,8316|681572333337247,Eminem,2022-10-26 21:07:38,https://www.facebook.com/100044534585203/posts...,16987,3611,1554,13879,63,18,4,4,0,600
2,28591|704699161019315,David Guetta,2022-10-26 17:00:12,https://www.facebook.com/100044377118271/posts...,669,16,155,203,5,0,0,0,0,9
3,28635|675650017264651,Coldplay,2022-10-26 16:48:03,https://www.facebook.com/100044589752285/posts...,4879,212,421,3432,7,3,4,0,0,91
4,8317|658904715607111,Rihanna,2022-10-26 16:10:43,https://www.facebook.com/100044627640367/posts...,44110,24807,5583,59671,1090,88,10,4,0,1173
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
844,34676|554861322676940,The Weeknd,2022-05-02 01:36:42,https://www.facebook.com/100044592302176/posts...,6076,1015,721,11441,30,6,0,4,0,420
845,8317|533366314827619,Rihanna,2022-05-02 01:34:05,https://www.facebook.com/100044627640367/posts...,28172,890,1237,11491,141,31,7,6,0,363
846,34676|554602799369459,The Weeknd,2022-05-01 17:21:33,https://www.facebook.com/100044592302176/posts...,10318,1015,1062,15544,48,10,15,1,0,549
847,8317|533102684853982,Rihanna,2022-05-01 15:59:07,https://www.facebook.com/100044627640367/posts...,14747,611,574,5497,30,55,4,4,0,215


In [470]:
stats_df['pd_date'] = pd.to_datetime(stats_df.date)

In [471]:
stats_df

Unnamed: 0,id,username,date,link,likeCount,shareCount,commentCount,loveCount,wowCount,hahaCount,sadCount,angryCount,thankfulCount,careCount,pd_date
0,28635|675821577247495,Coldplay,2022-10-26 23:05:36,https://www.facebook.com/100044589752285/posts...,9172,10136,2028,31654,121,16,8,0,0,681,2022-10-26 23:05:36
1,8316|681572333337247,Eminem,2022-10-26 21:07:38,https://www.facebook.com/100044534585203/posts...,16987,3611,1554,13879,63,18,4,4,0,600,2022-10-26 21:07:38
2,28591|704699161019315,David Guetta,2022-10-26 17:00:12,https://www.facebook.com/100044377118271/posts...,669,16,155,203,5,0,0,0,0,9,2022-10-26 17:00:12
3,28635|675650017264651,Coldplay,2022-10-26 16:48:03,https://www.facebook.com/100044589752285/posts...,4879,212,421,3432,7,3,4,0,0,91,2022-10-26 16:48:03
4,8317|658904715607111,Rihanna,2022-10-26 16:10:43,https://www.facebook.com/100044627640367/posts...,44110,24807,5583,59671,1090,88,10,4,0,1173,2022-10-26 16:10:43
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
844,34676|554861322676940,The Weeknd,2022-05-02 01:36:42,https://www.facebook.com/100044592302176/posts...,6076,1015,721,11441,30,6,0,4,0,420,2022-05-02 01:36:42
845,8317|533366314827619,Rihanna,2022-05-02 01:34:05,https://www.facebook.com/100044627640367/posts...,28172,890,1237,11491,141,31,7,6,0,363,2022-05-02 01:34:05
846,34676|554602799369459,The Weeknd,2022-05-01 17:21:33,https://www.facebook.com/100044592302176/posts...,10318,1015,1062,15544,48,10,15,1,0,549,2022-05-01 17:21:33
847,8317|533102684853982,Rihanna,2022-05-01 15:59:07,https://www.facebook.com/100044627640367/posts...,14747,611,574,5497,30,55,4,4,0,215,2022-05-01 15:59:07


In [406]:
max_likes = stats_df.groupby(['username'])['likeCount'].max().to_frame("count").reset_index()
max_likes["type"] = "likes"

mean_likes = stats_df.groupby(['username'])['likeCount'].mean().to_frame("count").reset_index()
mean_likes["type"] = "likes"

max_comments = stats_df.groupby(['username'])['commentCount'].max().to_frame("count").reset_index()
max_comments["type"] = "comments"

mean_comments = stats_df.groupby(['username'])['commentCount'].mean().to_frame("count").reset_index()
mean_comments["type"] = "comments"

In [467]:
post.get_link_to_post()

'https://www.facebook.com/100044372291649/posts/565748651580873'

In [407]:
max_mean_df = pd.concat([
    max_likes,
    mean_likes,
    max_comments,
    mean_comments
])

In [408]:
max_mean_df

Unnamed: 0,username,count,type
0,Bad Bunny,89518.0,likes
1,Coldplay,135965.0,likes
2,David Guetta,97425.0,likes
3,Doja Cat,7950.0,likes
4,Drake,58786.0,likes
5,Dua Lipa,145908.0,likes
6,Ed Sheeran,193194.0,likes
7,Eminem,683056.0,likes
8,Harry Styles,281233.0,likes
9,Imagine Dragons,79675.0,likes


In [415]:
stats_df

Unnamed: 0,id,username,date,likeCount,shareCount,commentCount,loveCount,wowCount,hahaCount,sadCount,angryCount,thankfulCount,careCount,pd_date,date_str
0,28635|675821577247495,Coldplay,2022-10-26 23:05:36,9172,10136,2028,31654,121,16,8,0,0,681,2022-10-26 23:05:36,2022_10
1,8316|681572333337247,Eminem,2022-10-26 21:07:38,16987,3611,1554,13879,63,18,4,4,0,600,2022-10-26 21:07:38,2022_10
2,28591|704699161019315,David Guetta,2022-10-26 17:00:12,669,16,155,203,5,0,0,0,0,9,2022-10-26 17:00:12,2022_10
3,28635|675650017264651,Coldplay,2022-10-26 16:48:03,4879,212,421,3432,7,3,4,0,0,91,2022-10-26 16:48:03,2022_10
4,8317|658904715607111,Rihanna,2022-10-26 16:10:43,44110,24807,5583,59671,1090,88,10,4,0,1173,2022-10-26 16:10:43,2022_10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
844,34676|554861322676940,The Weeknd,2022-05-02 01:36:42,6076,1015,721,11441,30,6,0,4,0,420,2022-05-02 01:36:42,2022_5
845,8317|533366314827619,Rihanna,2022-05-02 01:34:05,28172,890,1237,11491,141,31,7,6,0,363,2022-05-02 01:34:05,2022_5
846,34676|554602799369459,The Weeknd,2022-05-01 17:21:33,10318,1015,1062,15544,48,10,15,1,0,549,2022-05-01 17:21:33,2022_5
847,8317|533102684853982,Rihanna,2022-05-01 15:59:07,14747,611,574,5497,30,55,4,4,0,215,2022-05-01 15:59:07,2022_5


In [474]:
stats_df.sum()

id               28635|6758215772474958316|68157233333724728591...
username         ColdplayEminemDavid GuettaColdplayRihannaSAM S...
date             2022-10-26 23:05:362022-10-26 21:07:382022-10-...
link             https://www.facebook.com/100044589752285/posts...
likeCount                                                 21387049
shareCount                                                 2420461
commentCount                                               1807959
loveCount                                                 21491979
wowCount                                                    102121
hahaCount                                                   102934
sadCount                                                     79696
angryCount                                                    6229
thankfulCount                                                    0
careCount                                                   624845
dtype: object

In [473]:
bars = alt.Chart(stats_df, title="Single Most Liked Post per Artist").mark_bar().encode(
    x = alt.X("max(likeCount):Q"),
    y = alt.Y(
        "username:N",
        axis = alt.Axis(
            title = None),
        sort="-x"
    ),
    color = alt.Color(
        "max(likeCount):N",
        legend=None
    ),
    href="link:N"
).properties(width=600, height=500)

text = alt.Chart(stats_df).mark_text(dx=30, fontSize=16).encode(
    x = alt.X("max(likeCount):Q"),
    y = alt.Y(
        "username:N",
        axis = alt.Axis(
            title = None),
        sort="-x"
    ),
    text = alt.Text(
        "max(likeCount):N",
        format = ","
    )
).properties(width=600, height=500)


alt.layer(bars,text).configure_axis(
    labelFontSize=16,
    titleFontSize=16
).configure_title(fontSize=18)

In [458]:

alt.Chart(stats_df).mark_bar().encode(
    y = alt.Y("username:N", sort="-x"),
    x = alt.X("max(likeCount):Q"),
    color = alt.Color("username:N", legend=None)
)

In [459]:

alt.Chart(stats_df).mark_bar().encode(
    y = alt.Y("username:N", sort="-x"),
    x = alt.X("mean(likeCount):Q"),
    color = alt.Color("username:N", legend=None)
)

In [442]:
max_likes_per_user = stats_df.groupby(["username"])["likeCount"].max().to_frame("max_likes").reset_index()
max_likes_per_user

Unnamed: 0,username,max_likes
0,Bad Bunny,89518
1,Coldplay,135965
2,David Guetta,97425
3,Doja Cat,7950
4,Drake,58786
5,Dua Lipa,145908
6,Ed Sheeran,193194
7,Eminem,683056
8,Harry Styles,281233
9,Imagine Dragons,79675


In [443]:
num_posts = Counter(stats_df["username"])

In [444]:
num_posts_df = pd.DataFrame.from_dict(num_posts,orient='index').reset_index().rename(columns={0:"num_posts", "index":'username'})

In [445]:
num_posts_df

Unnamed: 0,username,num_posts
0,Coldplay,127
1,Eminem,54
2,David Guetta,41
3,Rihanna,12
4,SAM SMITH,138
5,Bad Bunny,11
6,Imagine Dragons,99
7,Harry Styles,64
8,Dua Lipa,125
9,Taylor Swift,7


In [452]:
merged = max_likes_per_user.merge(num_posts_df, on= "username")#.melt(id_vars="username")

In [453]:
merged

Unnamed: 0,username,max_likes,num_posts
0,Bad Bunny,89518,11
1,Coldplay,135965,127
2,David Guetta,97425,41
3,Doja Cat,7950,1
4,Drake,58786,3
5,Dua Lipa,145908,125
6,Ed Sheeran,193194,67
7,Eminem,683056,54
8,Harry Styles,281233,64
9,Imagine Dragons,79675,99


In [455]:
alt.Chart(merged).mark_point().encode(
    x = alt.X("max_likes:Q"),
    y = alt.Y("num_posts:Q")
)

In [405]:
stats_df.query()

Unnamed: 0,id,username,date,likeCount,shareCount,commentCount,loveCount,wowCount,hahaCount,sadCount,angryCount,thankfulCount,careCount,pd_date,date_str
0,28635|675821577247495,Coldplay,2022-10-26 23:05:36,9172,10136,2028,31654,121,16,8,0,0,681,2022-10-26 23:05:36,2022_10
1,8316|681572333337247,Eminem,2022-10-26 21:07:38,16987,3611,1554,13879,63,18,4,4,0,600,2022-10-26 21:07:38,2022_10
2,28591|704699161019315,David Guetta,2022-10-26 17:00:12,669,16,155,203,5,0,0,0,0,9,2022-10-26 17:00:12,2022_10
3,28635|675650017264651,Coldplay,2022-10-26 16:48:03,4879,212,421,3432,7,3,4,0,0,91,2022-10-26 16:48:03,2022_10
4,8317|658904715607111,Rihanna,2022-10-26 16:10:43,44110,24807,5583,59671,1090,88,10,4,0,1173,2022-10-26 16:10:43,2022_10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
844,34676|554861322676940,The Weeknd,2022-05-02 01:36:42,6076,1015,721,11441,30,6,0,4,0,420,2022-05-02 01:36:42,2022_5
845,8317|533366314827619,Rihanna,2022-05-02 01:34:05,28172,890,1237,11491,141,31,7,6,0,363,2022-05-02 01:34:05,2022_5
846,34676|554602799369459,The Weeknd,2022-05-01 17:21:33,10318,1015,1062,15544,48,10,15,1,0,549,2022-05-01 17:21:33,2022_5
847,8317|533102684853982,Rihanna,2022-05-01 15:59:07,14747,611,574,5497,30,55,4,4,0,215,2022-05-01 15:59:07,2022_5


In [362]:
post.post_object["date"]

'2022-04-30 14:27:37'

In [391]:
stats_df['date_str'] = (stats_df.pd_date.dt.year).astype(str) + "_" + (stats_df.pd_date.dt.month).astype(str)

In [392]:
stats_df

Unnamed: 0,id,username,date,likeCount,shareCount,commentCount,loveCount,wowCount,hahaCount,sadCount,angryCount,thankfulCount,careCount,pd_date,date_str
0,28635|675821577247495,Coldplay,2022-10-26 23:05:36,9172,10136,2028,31654,121,16,8,0,0,681,2022-10-26 23:05:36,2022_10
1,8316|681572333337247,Eminem,2022-10-26 21:07:38,16987,3611,1554,13879,63,18,4,4,0,600,2022-10-26 21:07:38,2022_10
2,28591|704699161019315,David Guetta,2022-10-26 17:00:12,669,16,155,203,5,0,0,0,0,9,2022-10-26 17:00:12,2022_10
3,28635|675650017264651,Coldplay,2022-10-26 16:48:03,4879,212,421,3432,7,3,4,0,0,91,2022-10-26 16:48:03,2022_10
4,8317|658904715607111,Rihanna,2022-10-26 16:10:43,44110,24807,5583,59671,1090,88,10,4,0,1173,2022-10-26 16:10:43,2022_10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
844,34676|554861322676940,The Weeknd,2022-05-02 01:36:42,6076,1015,721,11441,30,6,0,4,0,420,2022-05-02 01:36:42,2022_5
845,8317|533366314827619,Rihanna,2022-05-02 01:34:05,28172,890,1237,11491,141,31,7,6,0,363,2022-05-02 01:34:05,2022_5
846,34676|554602799369459,The Weeknd,2022-05-01 17:21:33,10318,1015,1062,15544,48,10,15,1,0,549,2022-05-01 17:21:33,2022_5
847,8317|533102684853982,Rihanna,2022-05-01 15:59:07,14747,611,574,5497,30,55,4,4,0,215,2022-05-01 15:59:07,2022_5


In [395]:
mean_stats_df = stats_df.groupby(["username","date_str"])["likeCount"].mean().reset_index()

In [380]:
alt.Chart(stats_df).mark_point().encode(
    x = alt.X("pd_date:T"),
    y = alt.Y("likeCount:Q", scale=alt.Scale(type='log')),
    color = alt.Color("username:N"),
    shape = alt.Shape("username:N"),
    tooltip = [
        alt.Tooltip('username:N', title="User"),
        alt.Tooltip('pd_date:T', title="Date", format="%b %d"),
        alt.Tooltip('likeCount:N', title="# Likes", format=","),
    ]
).properties(
    width=700,
    height=500
).configure_axis(
    titleFontSize=15,
    labelFontSize=15
)

In [398]:
mean_stats_df

Unnamed: 0,username,date_str,likeCount
0,Bad Bunny,2022_10,848.000000
1,Bad Bunny,2022_5,41839.250000
2,Bad Bunny,2022_6,19395.000000
3,Bad Bunny,2022_7,34930.000000
4,Bad Bunny,2022_8,11459.000000
...,...,...,...
69,The Weeknd,2022_5,20782.750000
70,The Weeknd,2022_6,30076.666667
71,The Weeknd,2022_7,14192.750000
72,The Weeknd,2022_8,8304.857143


In [348]:
text_df = pd.DataFrame.from_records(text_data)
text_df

Unnamed: 0,id,username,text,post_type
0,28635|675821577247495,Coldplay,The Astronaut // 28 October // 10월 28일 💜🧑‍🚀,native_video
1,8316|681572333337247,Eminem,"“I’m still standing here screaming ""f*** the F...",native_video
2,28591|704699161019315,David Guetta,Maybe you don’t know about that but my clip of...,status
3,28635|675650017264651,Coldplay,"Show #55 Buenos Aires, Argentina OCT 28 / 29⚡️...",photo
4,8317|658904715607111,Rihanna,lift me up 10.28.22 http://rihanna.lnk.to/lift...,native_video
...,...,...,...,...
844,34676|554861322676940,The Weeknd,created this version of “often” with MIKE DEAN...,native_video
845,8317|533366314827619,Rihanna,how sexy is this?!! 🤎 #FENTYPARFUM fentybeauty...,native_video
846,34676|554602799369459,The Weeknd,EUROPE can you hear me ?! 👀,native_video
847,8317|533102684853982,Rihanna,wake up and smell the billboard NYC!!!! a wise...,photo


In [285]:
post_type_counts = text_df.post_type.value_counts().to_frame("count").reset_index().rename(
        columns={"index": "post_type"}
    )

In [286]:
post_type_counts['percentage'] = (post_type_counts['count'] / len(posts)) * 100
post_type_counts['proportion'] = post_type_counts['count'] / len(posts)
post_type_counts

Unnamed: 0,post_type,count,percentage,proportion
0,photo,508,59.8351,0.598351
1,native_video,296,34.864547,0.348645
2,live_video_scheduled,19,2.237927,0.022379
3,status,9,1.060071,0.010601
4,link,6,0.706714,0.007067
5,youtube,6,0.706714,0.007067
6,live_video_complete,3,0.353357,0.003534
7,video,2,0.235571,0.002356


In [287]:
bars = alt.Chart(post_type_counts, title="Post Type Breakdown").mark_bar().encode(
    x = alt.X("count:Q"),
    y = alt.Y(
        "post_type:N",
        axis = alt.Axis(
            title = None),
        sort="-x"
    ),
    color = alt.Color(
        "post_type:N",
        legend=None
    ),
    tooltip = [
        alt.Tooltip("post_type:N", title = "Post Type"),
        alt.Tooltip("count:Q", title = "# of Posts"),
        alt.Tooltip("percentage:Q", title = "% of Posts", format=".2f"),
    ]
).properties(width=600,height=200)

text = alt.Chart(post_type_counts).mark_text(dx=30, fontSize=16).encode(
    x = alt.X("count:Q"),
    y = alt.Y(
        "post_type:N",
        axis = alt.Axis(
            title = None),
        sort="-x"
    ),
    text = alt.Text(
        "proportion:N",
        format = ".2%"
    )
).properties(width=600)


alt.layer(bars,text).configure_axis(
    labelFontSize=16,
    titleFontSize=16
).configure_title(fontSize=18)

### Clean text to get word counts...

In [330]:
for post in text_df.text:
    print(post, "\n")
    print("-"*50)
    

The Astronaut // 28 October // 10월 28일 💜🧑‍🚀 

--------------------------------------------------
“I’m still standing here screaming "f*** the Free World."" #8Mile 20th Anniversary Deluxe Edition of the soundtrack drops on streaming Friday! 

--------------------------------------------------
Maybe you don’t know about that but my clip of “Just A Little More Love” has been remastered! Check it out here: https://www.youtube.com/watch?v=e72Y5u-r3u8 

--------------------------------------------------
Show #55 Buenos Aires, Argentina OCT 28 / 29⚡️ Watch the show in cinemas worldwide - info / tickets at www.coldplaycinema.live #ColdplayBuenosAires #Coldplay #MusicOfTheSpheresWorldTour #ColdplayLiveBroadcast 

--------------------------------------------------
lift me up 10.28.22 http://rihanna.lnk.to/liftmeup 

--------------------------------------------------
Constantly in awe of you Cat Burns ❤️ It's an honour to have you joining me on my GLORIA tour 💛 Tickets go on sale tomorrow 10am bs

Beautiful night with beautiful friends xx Harris Reed #AnnaWintour 

--------------------------------------------------
I’ve put off explaining this record as long as I could. What can be said about grief? There is no word or sentence to explain what accompanies the loss of someone you love. Mercury act 1 was primarily focused on the shock of losing a loved one. The immediate feeling of emptiness. Something that was of great worth to you is now gone, and the pain left in its wake is overwhelming. Do you attempt to fill it with something else? Do you try to forget it? You can’t forget it. You can’t replace it. Mercury act 2 focuses primarily on life after loss. What now? Life must continue. You have obligations to fulfill. People to take care of. A job. A life of your own. I imagine those who have passed on want nothing more than for us to carry on with more presence of mind. More gratitude. More living! Mercury act 2 is about living. Not forgetting, but rather - carrying on with a new 

In [331]:
text_df['clean_text'] = text_df['text'].map(clean_text)

In [332]:
for post in text_df.clean_text:
    print(post, "\n")
    print("-"*50)

the astronaut 28 october 10weol 28il 

--------------------------------------------------
im still standing here screaming f the free world 8mile 20th anniversary deluxe edition of the soundtrack drops on streaming friday 

--------------------------------------------------
maybe you dont know about that but my clip of just a little more love has been remastered check it out here 

--------------------------------------------------
show 55 buenos aires argentina oct 28 29 watch the show in cinemas worldwide info tickets at coldplaybuenosaires coldplay musicofthespheresworldtour coldplaylivebroadcast 

--------------------------------------------------
lift me up 102822 

--------------------------------------------------
constantly in awe of you cat burns its an honour to have you joining me on my gloria tour tickets go on sale tomorrow 10am bst 

--------------------------------------------------
countdown starts now savagexfentyshow vol 4 110922 savage x fenty amazon prime video amaz


--------------------------------------------------
the walk to the stage for the second night of coldplayfrankfurt 

--------------------------------------------------
rock werchter belgium nathan dobbelaere 

--------------------------------------------------
rock werchter belgium nateconcertphotography 

--------------------------------------------------
 

--------------------------------------------------
celebrating london pride with the magical xx jwanderson 

--------------------------------------------------
 

--------------------------------------------------
tour life all summer mu tsu tu 2 19 

--------------------------------------------------
new sharks and mercury act 2 merch available to shop now on our store 

--------------------------------------------------
pride x jw anderson jwanderson jonathananderson 

--------------------------------------------------
living the fantasy 

--------------------------------------------------
to celebrate 50 years of london pride 

In [289]:
user_word_counter = defaultdict(Counter)

for idx, row in text_df.iterrows():
    user = row['username']
    clean_text = row['clean_text'].split()
    
    for word in clean_text:
        if word not in s_words:
            user_word_counter[user][word] += 1

In [290]:
user_word_count_data = []

for user, data in user_word_counter.items():
    
    for word, count in data.items():
        user_word_count_data.append({
            "user" : user,
            "word" : word,
            "count" : count
        })

In [301]:
user_word_count_df = pd.DataFrame.from_records(user_word_count_data)
user_word_count_df

Unnamed: 0,user,word,count
0,Coldplay,astronaut,2
1,Coldplay,28,13
2,Coldplay,october,11
3,Coldplay,10weol,1
4,Coldplay,28il,1
...,...,...,...
5581,Doja Cat,motion,1
5582,Doja Cat,picture,1
5583,Doja Cat,soundtrack,1
5584,Doja Cat,listen,1


In [328]:
hover = alt.selection_single(
    on='mouseover',  # select on mouseover
    nearest=True,    # select nearest point to mouse cursor
    empty='none'     # empty selection should match nothing
)

click = alt.selection_multi(
    empty='none' # empty selection matches no points
)


plot = alt.Chart(
    title="Words Most-commonly Used by an Artist"
).mark_circle(size=50, opacity=.5).encode(
    x = alt.X(
        "user:N", title = None,
        axis = alt.Axis(labelAngle=45)
    ),
    y = alt.Y("count:Q"),
)

base = plot.transform_filter(
    hover | click # filter to points in either selection
)


alt.layer(
    plot.add_selection(hover).add_selection(click),
    base.mark_point(size=100, stroke='firebrick', strokeWidth=2),
    base.mark_text(dx=4, dy=-8, align='left', stroke='white', strokeWidth=3, fontSize=15).encode(text='word:N'),
    base.mark_text(dx=4, dy=-8, align='left', fontSize=15).encode(text='word:N'),
    data=user_word_count_df
).properties(
    width=800,
    height=450
).configure_axis(
    labelFontSize=15,
    titleFontSize=15
).configure_title(fontSize=18)
