### Functions for later...

In [1]:
import datetime
import os
import sys
sys.path.insert(0,"../code")

import altair as alt
alt.data_transformers.disable_max_rows()
import pandas as pd

from collections import Counter, defaultdict
from copy import deepcopy

from nltk.corpus import stopwords
s_words = set(stopwords.words())

from ct_helpers import ct_get_lists, download_posts
from fb_model import FbIgPost

from cleantext import clean


def clean_text(text):
    """
    Normalize `text` with cleantext.clean using one fixed option set, so
    callers do not have to repeat its long keyword list.

    Per the option descriptions below: unicode is repaired and transliterated
    to ASCII, the text is lowercased, line breaks, punctuation and emojis are
    stripped, URLs and e-mail addresses are replaced by an empty string and
    phone numbers by "<PHONE>". Numbers, digits and currency symbols are
    left untouched.
    """
    # Which normalizations are switched on / off
    switches = dict(
        fix_unicode=True,           # fix various unicode errors
        to_ascii=True,              # transliterate to closest ASCII representation
        lower=True,                 # lowercase text
        no_line_breaks=True,        # fully strip line breaks as opposed to only normalizing them
        no_urls=True,               # replace all URLs with a special token
        no_emoji=True,              # remove emojis
        no_emails=True,             # replace all email addresses with a special token
        no_phone_numbers=True,      # replace all phone numbers with a special token
        no_numbers=False,           # numbers are kept as-is
        no_digits=False,            # digits are kept as-is
        no_currency_symbols=False,  # currency symbols are kept as-is
        no_punct=True,              # remove punctuations
    )

    # Tokens substituted for whatever the switches above replace
    replacements = dict(
        replace_with_punct="",      # instead of removing punctuations you may replace them
        replace_with_url="",
        replace_with_email="",
        replace_with_phone_number="<PHONE>",
        replace_with_number="<NUMBER>",
        replace_with_digit="0",
        replace_with_currency_symbol="<CUR>",
        lang="en",                  # set to 'de' for German special handling
    )

    return clean(text, **switches, **replacements)

---

#  🎤 🎸 🥁 🎹 🎺 Exploring musical artists' Facebook data 🎤 🎸 🥁 🎹 🎺

In this notebook, we'll walk through how to download and explore posts from a specified list of Facebook accounts. This list will have to be created first with the Facebook-owned insights tool, [CrowdTangle](https://crowdtangle.com/).

Note: There is no cost for using CrowdTangle, but it is only available to select Facebook publishing partners and academics. Check out their [FAQs](https://help.crowdtangle.com/en/collections/41331-faqs-and-troubleshooting) for more information.

---

### [Take me to the CrowdTangle Dashboard](https://apps.crowdtangle.com/socialmediamanipulationclass)
### Part 1: Exploring the dashboard.
1. Lists/searches
2. Intelligence
3. Live displays


## Part 2: Candy Trivia + Exploring the posts (w/ some code)

## Get the data for all musical artists
---

### Get the list ID

In [2]:
# Load the CT API token from the environment (it is never hard-coded here)
api_token = os.environ.get("CLASS_CT_TOKEN")
if not api_token:
    raise RuntimeError("Environment variable CLASS_CT_TOKEN is not set.")

# Returns all lists in the dashboard
list_records = ct_get_lists(api_token)

my_list_name = "Musical Artists for Social Media Manipulation"

# Find the ID number of the list whose title matches my list name.
# id_number is only set on an exact title match, so a missing list can never
# silently fall through with the ID of some unrelated list.
id_number = None
for record in list_records["result"]['lists']:
    if record["title"] == my_list_name:
        id_number = record["id"]
        print("Found the list ID number.")
        break

if id_number is None:
    raise LookupError(f"No CrowdTangle list titled {my_list_name!r} was found.")

print(f"List ID #: {id_number}")

Found the list ID number.
List ID #: 1732484


### Set start and end date

In [3]:
# Define the download window: from `num_days_back` days ago through today,
# both rendered as YYYY-MM-DD strings.

num_days_back = 30*6

date_fmt = "%Y-%m-%d"
end_dt = datetime.date.today()
start_dt = end_dt - datetime.timedelta(days=num_days_back)

end_time = end_dt.strftime(date_fmt)
start_time = start_dt.strftime(date_fmt)

print(
    f"We're going to search {num_days_back} days in the past",
    f"Start time: {start_time}",
    f"End time  : {end_time}",
    sep="\n",
)

We're going to search 180 days in the past
Start time: 2022-05-13
End time  : 2022-11-09


In [4]:
# Download the posts of our list that fall inside the date window.
# NOTE(review): the progress log printed by this call contains the full
# request URL, API token included — clear this cell's output before sharing.
posts = download_posts(
    crowdtangle_list_id=id_number, #<---- The number associated with our list
    start=start_time,
    end=end_time,
    max_queries=100,               #<---- Make no more than this number of queries
    api_token=api_token            #<---- API token for access
)

Successful first call.
Setting first_call = False
Found next page: https://api.crowdtangle.com/posts?token=<REDACTED>&sortBy=date&endDate=2022-11-09&startDate=2022-05-13&listIds=1732484&searchField=TEXT_FIELDS_AND_IMAGE_TEXT&count=100&includeHistory=true&offset=100
	|--> 2022-10-25 19:19:12 - 2022-11-08 17:10:41: 100 posts.
Total posts collected: 100
Found next page: https://api.crowdtangle.com/posts?token=<REDACTED>&sortBy=date&endDate=2022-11-09&startDate=2022-05-13&listIds=1732484&searchField=TEXT_FIELDS_AND_IMAGE_TEXT&count=100&includeHistory=true&offset=200
	|--> 2022-10-08 04:02:12 - 2022-10-25 19:10:57: 100 posts.
Total posts collected: 200
Found next page: https://api.crowdtangle.com/posts?token=<REDACTED>&sortBy=date&endDate=2022-11-09&startDate=2022-05-13&listIds=1732484&searchField=TEXT_FIELDS_AND_IMAGE_TEXT&count=100&includeHistory=true&offset=300
	|--> 2022-09-12 20:19:12 - 2022-10-07

# Candy Trivia!
# 🍫🍬🍫🍬🍫🍬🍫🍬🍫🍬🍫🍬🍫🍬🍫🍫🍬🍫🍬🍫🍬🍫🍬🍫🍬🍫🍬🍫🍬🍫

![display image](../gifs/despicable-me-candy.gif)
![display image](../gifs/homer-sweet-sweet-candy.gif)
![display image](../gifs/psycho-oprah-candy.gif)

# 🍫🍬🍫🍬🍫🍬🍫🍬🍫🍬🍫🍬🍫🍬🍫🍫🍬🍫🍬🍫🍬🍫🍬🍫🍬🍫🍬🍫🍬🍫

- ## What artist earned the **most interactions overall**?
- ## What type of interaction typically contributes the most to an artist's interactions?
- ## Artist with the most emojis total for ➡️ ❤️😡😆
    - Which artist earned the most **love** emojis (total)?
    - Which artist earned the most **angry** emojis (total)?
    - Which artist earned the most **haha** emojis (total)?
- ## Artist with the most emojis from a single post ➡️ ❤️😡😆
    - Which artist earned the most **love** emojis in a (single post)?
    - Which artist earned the most **angry** emojis in a (single post)?
    - Which artist earned the most **haha** emojis in a (single post)?

---
---
---
---

### Scoring system

- Each team gets to submit one answer per question
    - Each answer is provided with a point wager between 1-10
- Correct answers earn those points (and team members get to choose one candy per person)
- Incorrect answers lose those points (and get no candy)


### What does the data look like?

In [5]:
# Check which kind of container the download returned
type(posts)

list

In [6]:
# Number of posts that were collected
len(posts)

891

In [9]:
# Look at the first raw post record to see which fields are available
posts[0]

{'platformId': '100044534585203_690864309074716',
 'platform': 'Facebook',
 'date': '2022-11-08 17:10:41',
 'updated': '2022-11-09 20:23:14',
 'type': 'native_video',
 'message': "🍝 🗽 #8Mile 20th Anniversary pop-up with Mom's Spaghetti Detroit is coming to NYC Nov 10-20 at the Shopify NY space in SoHo - RSVP to get in at momsxshopify.com/ 🚨",
 'expandedLinks': [{'original': 'https://www.facebook.com/eminem/videos/512162880798693/',
   'expanded': 'https://www.facebook.com/eminem/videos/512162880798693/'}],
 'link': 'https://www.facebook.com/eminem/videos/512162880798693/',
 'postUrl': 'https://www.facebook.com/100044534585203/posts/690864309074716',
 'subscriberCount': 94802712,
 'score': -5.291709016988529,
 'media': [{'type': 'video',
   'url': 'https://video-sea1-1.xx.fbcdn.net/v/t42.1790-2/315031557_673995004399571_7582144638479440643_n.mp4?_nc_cat=100&ccb=1-7&_nc_sid=985c63&efg=eyJybHIiOjU1MCwicmxhIjo1MTIsInZlbmNvZGVfdGFnIjoic3ZlX3NkIn0%3D&_nc_ohc=mrqXqzf4Qh8AX9cJFxn&rl=550&vabr=3

### `FbIgPost` will make handling the data a bit easier

In [10]:
# Wrap every raw post record in an FbIgPost object
post_objs = list(map(FbIgPost, posts))

In [11]:
# Type of a single raw (unwrapped) post record
type(posts[0])

dict

In [12]:
# Type of a single wrapped post
type(post_objs[0])

fb_model.FbIgPost

### Organize the data a little bit...

In [13]:
# Split every post into two flat records that share the same identifying
# fields: one carrying its text, the other its interaction statistics.
text_data, performance_data = [], []

for fb_post in post_objs:

    # Identifying fields common to both records
    shared_fields = {
        "id" : fb_post.get_post_ID(),
        "username" : fb_post.get_value(['account','name']),
        "date" : fb_post.post_object["date"],
        "link" : fb_post.get_link_to_post()
    }

    # Text record: shared fields + the post's text and post type
    text_row = deepcopy(shared_fields)
    text_row["text"] = fb_post.get_text()
    text_row["post_type"] = fb_post.get_post_type()
    text_data.append(text_row)

    # Statistics record: shared fields + everything under statistics -> actual
    stats_row = deepcopy(shared_fields)
    stats_row.update(fb_post.get_value(["statistics", "actual"]))
    performance_data.append(stats_row)

In [14]:
# One row per post: identifying fields plus one column per interaction count
stats_df = pd.DataFrame.from_records(performance_data)
stats_df

Unnamed: 0,id,username,date,link,likeCount,shareCount,commentCount,loveCount,wowCount,hahaCount,sadCount,angryCount,thankfulCount,careCount
0,8316|690864309074716,Eminem,2022-11-08 17:10:41,https://www.facebook.com/100044534585203/posts...,3723,348,455,2302,24,30,5,0,0,95
1,8317|668842161280033,Rihanna,2022-11-08 17:06:31,https://www.facebook.com/100044627640367/posts...,97539,11125,4227,68057,702,109,35,14,0,1525
2,47259|708250887330648,SAM SMITH,2022-11-08 16:24:58,https://www.facebook.com/100044372291649/posts...,8825,1017,1378,8027,90,678,41,32,0,183
3,8316|690418029119344,Eminem,2022-11-08 02:55:06,https://www.facebook.com/100044534585203/posts...,36732,10026,2609,30968,271,40,9,6,0,875
4,11084|673043404193274,Drake,2022-11-07 22:54:47,https://www.facebook.com/100044628600301/posts...,7705,363,677,3685,8,15,2,2,0,125
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,2470898|549233869904744,Bad Bunny,2022-05-14 16:05:02,https://www.facebook.com/100044544064759/posts...,57593,10317,5673,126481,422,638,20,33,0,3892
887,2470898|549173639910767,Bad Bunny,2022-05-14 14:30:40,https://www.facebook.com/100044544064759/posts...,2253,180,127,4397,4,40,4,3,0,225
888,28635|559655572197430,Coldplay,2022-05-13 21:54:55,https://www.facebook.com/100044589752285/posts...,19367,3016,1308,18688,99,14,16,0,0,549
889,47259|575265377295867,SAM SMITH,2022-05-13 16:57:33,https://www.facebook.com/100044372291649/posts...,3637,121,170,3294,4,5,0,0,0,96


# Q1: What artist earned the **most interactions overall**?

In [15]:
# Print each distinct artist name once, in order of first appearance
for user in stats_df.username.unique():
    print(user)

Eminem
Rihanna
SAM SMITH
Drake
Justin Bieber
Coldplay
David Guetta
Dua Lipa
Harry Styles
Ed Sheeran
Imagine Dragons
Taylor Swift
Bad Bunny
The Weeknd
Kanye West


In [16]:
# Interaction columns that are added up into the per-artist totals.
# NOTE(review): loveCount, angryCount and thankfulCount exist in stats_df but
# are not listed here, so every "total" derived from this list excludes
# them — confirm that this is intentional.
responses = [
    "likeCount",
    "shareCount",
    "commentCount",
    "wowCount", 
    "hahaCount",
    "sadCount", 
    "careCount"
]

# Sum each interaction column per artist (one row per username)
total_response_counts = stats_df.groupby('username')[responses].sum().reset_index()
total_response_counts.head(3)

Unnamed: 0,username,likeCount,shareCount,commentCount,wowCount,hahaCount,sadCount,careCount
0,Bad Bunny,325799,56771,269419,1395,6096,412,41990
1,Coldplay,2764730,325267,211209,20820,4168,3277,50832
2,David Guetta,411249,82528,39698,2661,6729,306,5174


In [17]:
# Collapse the interaction columns into a single grand total per artist
total_interactions = pd.DataFrame({
    "counts": total_response_counts[responses].sum(axis=1),
    "username": total_response_counts["username"],
})

In [18]:
# Both layers share the data, the axes (artists sorted by descending total)
# and the plot size.
base = alt.Chart(total_interactions).encode(
    x = alt.X("counts:Q", title="Total interactions"),
    y = alt.Y("username:N", sort="-x", title=None)
).properties(
    width=700,
    height=400
)

# Layer 1: one bar per artist (this layer also carries the chart title)
bars = base.mark_bar(color='firebrick').properties(
    title="Most interactions total 🤑"
)

# Layer 2: the comma-formatted total written at the end of each bar
text = base.mark_text(align='left', fontWeight="bold", fontSize=15).encode(
    text = alt.Text("counts:N", format=',')
)

# Stack the layers and set the font sizes
alt.layer(bars,text).configure_axis(
    titleFontSize=15,
    labelFontSize=15
).configure_legend(
    titleFontSize=15,
    labelFontSize=15
).configure_title(
    fontSize=20,
)

# Q2: What type of interaction typically contributes the most to an artist's interactions?

In [19]:
# Print the interaction types that go into the totals
for interaction in responses:
    print(interaction)

likeCount
shareCount
commentCount
wowCount
hahaCount
sadCount
careCount


In [21]:
# Per-artist sums of each interaction type
total_response_counts

Unnamed: 0,username,likeCount,shareCount,commentCount,wowCount,hahaCount,sadCount,careCount
0,Bad Bunny,325799,56771,269419,1395,6096,412,41990
1,Coldplay,2764730,325267,211209,20820,4168,3277,50832
2,David Guetta,411249,82528,39698,2661,6729,306,5174
3,Drake,183337,15978,17282,406,1968,97,3452
4,Dua Lipa,5389411,319325,202555,16811,10491,15564,187492
5,Ed Sheeran,1380091,84487,90587,3823,24695,1008,20437
6,Eminem,2469712,233548,180954,11243,8532,729,40131
7,Harry Styles,1543508,417098,233771,13076,5450,1321,53973
8,Imagine Dragons,859844,64847,54049,6498,1049,12753,20164
9,Justin Bieber,3793633,135901,161027,9126,11320,16851,88639


In [22]:
# Share of each interaction type within an artist's total interactions.
# The row totals are taken over the numeric `responses` columns only: summing
# the whole frame would also sweep in the string `username` column, which
# only works where pandas silently drops non-numeric columns and raises a
# TypeError on newer pandas versions.
row_totals = total_response_counts[responses].sum(axis=1)
props = total_response_counts[responses].divide(row_totals, axis=0)
props['username'] = total_response_counts['username'].copy()

In [23]:
# Per-artist proportion of each interaction type
props

Unnamed: 0,likeCount,shareCount,commentCount,wowCount,hahaCount,sadCount,careCount,username
0,0.464179,0.080884,0.383852,0.001988,0.008685,0.000587,0.059825,Bad Bunny
1,0.817894,0.096224,0.062482,0.006159,0.001233,0.000969,0.015038,Coldplay
2,0.749982,0.150504,0.072396,0.004853,0.012271,0.000558,0.009436,David Guetta
3,0.823912,0.071805,0.077665,0.001825,0.008844,0.000436,0.015513,Drake
4,0.877519,0.051993,0.032981,0.002737,0.001708,0.002534,0.030528,Dua Lipa
5,0.859801,0.052636,0.056436,0.002382,0.015385,0.000628,0.012732,Ed Sheeran
6,0.838655,0.079307,0.061448,0.003818,0.002897,0.000248,0.013628,Eminem
7,0.6805,0.18389,0.103065,0.005765,0.002403,0.000582,0.023796,Harry Styles
8,0.843643,0.063625,0.053031,0.006376,0.001029,0.012513,0.019784,Imagine Dragons
9,0.899712,0.032231,0.03819,0.002164,0.002685,0.003996,0.021022,Justin Bieber


In [24]:
# Reshape to long form: one row per (artist, interaction type) pair
props_long = props.melt(id_vars='username')

# Hover details: the exact share and which interaction it belongs to
share_tooltip = [
    alt.Tooltip('value:Q', title="Percent", format=".2%"),
    alt.Tooltip('variable:N', title="Interaction")
]

# Horizontal bars per artist, colored by interaction type
alt.Chart(props_long).mark_bar().encode(
    x = alt.X(
        "value:Q",
        title="Percent of all interactions",
        axis=alt.Axis(format="%"),
        scale=alt.Scale(domain=[0,1])
    ),
    y = alt.Y("username", title=None),
    color = alt.Color(
        "variable:N",
        title="Interaction type",
        sort="-x",
        legend=alt.Legend(orient='bottom')
    ),
    tooltip = share_tooltip
).properties(
    title = "Interaction percentage breakdown",
    width=800,
    height=400
).configure_axis(
    titleFontSize=15,
    labelFontSize=15
).configure_legend(
    titleFontSize=15,
    labelFontSize=16
).configure_title(
    fontSize=20,
)

# Q3: Artist with the most emojis total for ➡️➡️    ❤️😡😆
- Which artist earned the most **love** emojis (total)?
- Which artist earned the most **angry** emojis (total)?
- Which artist earned the most **haha** emojis (total)?


In [26]:
# Per-artist totals for the three reactions of interest; each result is a
# two-column frame (username, count).
love_total, angry_total, haha_total = (
    stats_df.groupby("username")[reaction_col].sum().to_frame("count").reset_index()
    for reaction_col in ("loveCount", "angryCount", "hahaCount")
)


## Q3.1 Total loves

In [25]:
# Print each distinct artist name once, in order of first appearance
for user in stats_df.username.unique():
    print(user)

Eminem
Rihanna
SAM SMITH
Drake
Justin Bieber
Coldplay
David Guetta
Dua Lipa
Harry Styles
Ed Sheeran
Imagine Dragons
Taylor Swift
Bad Bunny
The Weeknd
Kanye West


In [27]:
# Data and axes shared by both layers (artists sorted by descending total)
love_base = alt.Chart(love_total).encode(
    x = alt.X("count:Q", title = "Total loves"),
    y = alt.Y("username:N", sort="-x", title=None),
)

# Bar layer; it carries the title and the plot size
bars = love_base.mark_bar(color='forestgreen').properties(
    title="Total ❤️ for each artist",
    width=700,
    height=400
)

# Label layer: the total number of loves written next to each bar
text = love_base.mark_text(align='left', fontSize=15).encode(
    text = alt.Text("count:Q", format=",")
)

# Combine the layers and set the font sizes
alt.layer(bars, text).configure_axis(
    titleFontSize=15,
    labelFontSize=15
).configure_title(fontSize=20)

## Q3.2 Total Angry

In [28]:
# Print each distinct artist name once, in order of first appearance
for user in stats_df.username.unique():
    print(user)

Eminem
Rihanna
SAM SMITH
Drake
Justin Bieber
Coldplay
David Guetta
Dua Lipa
Harry Styles
Ed Sheeran
Imagine Dragons
Taylor Swift
Bad Bunny
The Weeknd
Kanye West


In [29]:
# Bar chart of total angry reactions per artist, largest total first
bars = alt.Chart(
    angry_total,
    title="Total 😡 for each artist"
).mark_bar(color='red').encode(
    x = alt.X("count:Q", title = "Total angry"),
    y = alt.Y("username:N", sort="-x", title=None),
).properties(
    width=700,
    height=400
)

# Add the total number of angry reactions for each artist as a text label
text = alt.Chart(angry_total).mark_text(align='left', fontSize=15).encode(
    x = alt.X("count:Q", title = "Total angry"),
    y = alt.Y("username:N", sort="-x", title=None),
    text = alt.Text("count:Q", format=",")
)

# Put them together and configure some aesthetics
alt.layer(bars, text).configure_axis(
    titleFontSize=15,
    labelFontSize=15
).configure_title(fontSize=20)

## Q3.3 Total haha

In [30]:
# Print each distinct artist name once, in order of first appearance
for user in stats_df.username.unique():
    print(user)

Eminem
Rihanna
SAM SMITH
Drake
Justin Bieber
Coldplay
David Guetta
Dua Lipa
Harry Styles
Ed Sheeran
Imagine Dragons
Taylor Swift
Bad Bunny
The Weeknd
Kanye West


In [31]:
# Bar chart of total haha reactions per artist, largest total first
bars = alt.Chart(
    haha_total,
    title="Total 😆 for each artist"
).mark_bar(color='blue').encode(
    x = alt.X("count:Q", title = "Total haha"),
    y = alt.Y("username:N", sort="-x", title=None),
).properties(
    width=700,
    height=400
)

# Add the total number of haha reactions for each artist as a text label
text = alt.Chart(haha_total).mark_text(align='left', fontSize=15).encode(
    x = alt.X("count:Q", title = "Total haha"),
    y = alt.Y("username:N", sort="-x", title=None),
    text = alt.Text("count:Q", format=",")
)

# Put them together and configure some aesthetics
alt.layer(bars, text).configure_axis(
    titleFontSize=15,
    labelFontSize=15
).configure_title(fontSize=20)

# Q4 Artist with the most emojis from a single post ➡️ ❤️😡😆
- Which artist earned the most **love** emojis in a (single post)?
- Which artist earned the most **angry** emojis in a (single post)?
- Which artist earned the most **haha** emojis in a (single post)?

In [32]:
# How many of the highest-ranking posts to keep per reaction
top_count = 10

# For each reaction: sort all posts by that count (descending), keep the
# first `top_count` rows and renumber them from 0.
toploved, topangry, tophaha = (
    stats_df.sort_values(reaction_col, ascending=False).head(top_count).reset_index(drop=True)
    for reaction_col in ("loveCount", "angryCount", "hahaCount")
)

In [None]:
# Print the link of each of the top loved posts, in ranked order.
# Iterating over the column (instead of indexing positions 0..top_count-1)
# avoids an IndexError when fewer than `top_count` posts are available.
print(f"Top {top_count} Loved Posts:")
for link in toploved["link"].head(top_count):
    print(link)

In [None]:
# Print the link of each of the top angry posts, in ranked order.
# Iterating over the column (instead of indexing positions 0..top_count-1)
# avoids an IndexError when fewer than `top_count` posts are available.
print(f"Top {top_count} Angry Posts:")
for link in topangry["link"].head(top_count):
    print(link)

In [None]:
# Print the link of each of the top haha posts, in ranked order.
# The links are read from `tophaha` (this cell previously listed the angry
# posts again). Iterating over the column also avoids an IndexError when
# fewer than `top_count` posts are available.
print(f"Top {top_count} Haha Posts:")
for link in tophaha["link"].head(top_count):
    print(link)

## Q4.1 Which artist earned the most ❤️ in a (single post)?

In [33]:
# Print each distinct artist name once, in order of first appearance
for user in stats_df.username.unique():
    print(user)

Eminem
Rihanna
SAM SMITH
Drake
Justin Bieber
Coldplay
David Guetta
Dua Lipa
Harry Styles
Ed Sheeran
Imagine Dragons
Taylor Swift
Bad Bunny
The Weeknd
Kanye West


In [34]:
# One dot per top loved post: love count on x, the posting artist on y.
# The post's link is bound to the href channel and the love count is shown
# as a tooltip.
alt.Chart(
    toploved,
    title = "Top 🔟 ❤️ posts"
).mark_circle(size=100, color='black').encode(
    x = alt.X("loveCount:Q"),
    y = alt.Y("username:N", sort='-x', title=None),
    color = alt.Color("username:N", sort='-x', legend=None),
    href = alt.Href("link:N"),
    tooltip = [
        alt.Tooltip("loveCount:Q", format=",")
    ]
).properties(
    width=700,
#     height=300   (explicit height currently disabled)
).configure_axis(
    titleFontSize=15,
    labelFontSize=15
).configure_title(fontSize=20)

## Q4.2 Which artist earned the most 😡 in a (single post)?

In [35]:
# Print each distinct artist name once, in order of first appearance
for user in stats_df.username.unique():
    print(user)

Eminem
Rihanna
SAM SMITH
Drake
Justin Bieber
Coldplay
David Guetta
Dua Lipa
Harry Styles
Ed Sheeran
Imagine Dragons
Taylor Swift
Bad Bunny
The Weeknd
Kanye West


In [36]:
# One dot per top angry post: angry count on x, the posting artist on y.
# The post's link is bound to the href channel and the angry count is shown
# as a tooltip.
alt.Chart(
    topangry,
    title = "Top 🔟 😡 posts"
).mark_circle(size=100, color='black').encode(
    x = alt.X("angryCount:Q"),
    y = alt.Y("username:N", sort='-x', title=None),
    color = alt.Color("username:N", sort='-x', legend=None),
    href = alt.Href("link:N"),
    tooltip = [
        alt.Tooltip("angryCount:Q", format=",")
    ]
).properties(
    width=700,
#     height=300   (explicit height currently disabled)
).configure_axis(
    titleFontSize=15,
    labelFontSize=15
).configure_title(fontSize=20)

## Q4.3 Which artist earned the most 😆 in a (single post)?

In [37]:
# Print each distinct artist name once, in order of first appearance
for user in stats_df.username.unique():
    print(user)

Eminem
Rihanna
SAM SMITH
Drake
Justin Bieber
Coldplay
David Guetta
Dua Lipa
Harry Styles
Ed Sheeran
Imagine Dragons
Taylor Swift
Bad Bunny
The Weeknd
Kanye West


In [38]:
# One dot per top haha post: haha count on x, the posting artist on y.
# The post's link is bound to the href channel and the haha count is shown
# as a tooltip.
alt.Chart(
    tophaha,
    title = "Top 🔟 😆 posts"
).mark_circle(size=100, color='black').encode(
    x = alt.X("hahaCount:Q"),
    y = alt.Y("username:N", sort='-x', title=None),
    color = alt.Color("username:N", sort='-x', legend=None),
    href = alt.Href("link:N"),
    tooltip = [
        alt.Tooltip("hahaCount:Q", format=",")
    ]
).properties(
    width=700,
#     height=300   (explicit height currently disabled)
).configure_axis(
    titleFontSize=15,
    labelFontSize=15
).configure_title(fontSize=20)

# Bonus question: What type of post is shared the most?

### Options:
- `photo`
- `native_video`
- `live_video_scheduled`
- `status`
- `link`
- `youtube`
- `live_video_complete`
- `video`

In [None]:
# One row per post: identifying fields plus the post's text and post type
text_df = pd.DataFrame.from_records(text_data)
text_df

In [None]:
# Number of posts per post type, as a two-column frame (post_type, count)
post_type_counts = (
    text_df["post_type"]
    .value_counts()
    .rename_axis("post_type")
    .reset_index(name="count")
)

In [None]:
# Express each post type's count relative to the number of downloaded posts,
# once as a percentage (0-100) and once as a proportion (0-1).
n_posts = len(posts)
share = post_type_counts['count'] / n_posts

post_type_counts['percentage'] = share * 100
post_type_counts['proportion'] = share
post_type_counts

In [None]:
# Data and axes shared by both layers (post types sorted by descending count)
type_base = alt.Chart(post_type_counts).encode(
    x = alt.X("count:Q"),
    y = alt.Y(
        "post_type:N",
        axis = alt.Axis(title = None),
        sort="-x"
    ),
)

# Bar layer: one colored bar per post type, with details on hover
bars = type_base.mark_bar().encode(
    color = alt.Color("post_type:N", legend=None),
    tooltip = [
        alt.Tooltip("post_type:N", title = "Post Type"),
        alt.Tooltip("count:Q", title = "# of Posts"),
        alt.Tooltip("percentage:Q", title = "% of Posts", format=".2f"),
    ]
).properties(
    title="Post Type Breakdown",
    width=600,
    height=200
)

# Label layer: the share of all posts, shifted right of each bar's end
text = type_base.mark_text(dx=30, fontSize=16).encode(
    text = alt.Text("proportion:N", format = ".2%")
).properties(width=600)

# Combine the layers and set the font sizes
alt.layer(bars,text).configure_axis(
    labelFontSize=16,
    titleFontSize=16
).configure_title(fontSize=18)

# Multiple choice bonus question: What are artists typically talking about when they post (based on word count)?
- A) Other celebrities
- B) Themselves
- C) Promoting their music/tour

### Clean text to get word counts...

In [None]:
# Print the raw text of every post, separated by a dashed rule
for post in text_df.text:
    print(post, "\n")
    print("-"*50)
    

In [None]:
# Run every post's text through clean_text() and store the result in a new column
text_df['clean_text'] = text_df['text'].map(clean_text)

In [None]:
# Print the cleaned text of every post, separated by a dashed rule
for post in text_df.clean_text:
    print(post, "\n")
    print("-"*50)

### Count the number of times each word is used by each artist...

In [None]:
# Count, per artist, how often each non-stopword word occurs in their
# cleaned post text.
user_word_counter = defaultdict(Counter)

for _, row in text_df.iterrows():
    user = row['username']
    # Deliberately NOT named `clean_text`: that name belongs to the
    # clean_text() function, and overwriting it here would make any later
    # re-run of the cleaning step fail.
    words = row['clean_text'].split()

    for word in words:
        if word not in s_words:
            user_word_counter[user][word] += 1

# Flatten the nested counters into one record per (artist, word) pair
user_word_count_data = [
    {"user" : artist, "word" : word, "count" : count}
    for artist, word_counts in user_word_counter.items()
    for word, count in word_counts.items()
]

user_word_count_df = pd.DataFrame.from_records(user_word_count_data)
user_word_count_df

### Most used words

In [None]:
# Selection driven by hovering over a point
hover = alt.selection_single(
    on='mouseover',  # select on mouseover
    nearest=True,    # select nearest point to mouse cursor
    empty='none'     # empty selection should match nothing
)

# Selection driven by clicking on points
click = alt.selection_multi(
    empty='none' # empty selection matches no points
)


# Base scatter: one circle per (artist, word), positioned by how often the
# artist used the word. No data is attached here; it is supplied to the
# layered chart below.
plot = alt.Chart(
    title="Words Most-commonly Used by an Artist"
).mark_circle(size=50, opacity=.5).encode(
    x = alt.X(
        "user:N", title = None,
        axis = alt.Axis(labelAngle=45)
    ),
    y = alt.Y("count:Q"),
)

# Same chart restricted to the points that are currently selected
base = plot.transform_filter(
    hover | click # filter to points in either selection
)


# Layers, bottom to top: all points (with both selections attached), an
# outline around the selected points, then the word label drawn twice —
# first with a thick white stroke, then as plain text on top of it.
alt.layer(
    plot.add_selection(hover).add_selection(click),
    base.mark_point(size=100, stroke='firebrick', strokeWidth=2),
    base.mark_text(dx=4, dy=-8, align='left', stroke='white', strokeWidth=3, fontSize=15).encode(text='word:N'),
    base.mark_text(dx=4, dy=-8, align='left', fontSize=15).encode(text='word:N'),
    data=user_word_count_df
).properties(
    width=800,
    height=450
).configure_axis(
    labelFontSize=15,
    titleFontSize=15
).configure_title(fontSize=18)
