### Functions for later...

In [1]:
import datetime
import os
import sys
sys.path.insert(0,"../code")

import altair as alt
alt.data_transformers.disable_max_rows()
import pandas as pd

from collections import Counter, defaultdict
from copy import deepcopy

from nltk.corpus import stopwords
s_words = set(stopwords.words())

from ct_helpers import ct_get_lists, download_posts
from fb_model import FbIgPost

from cleantext import clean


def clean_text(text):
    """
    A convenience function for cleantext.clean because it has an ugly amount
    of parameters.
    """
    return clean(
        text,
        fix_unicode=True,  # fix various unicode errors
        to_ascii=True,  # transliterate to closest ASCII representation
        lower=True,  # lowercase text
        no_line_breaks=True,  # fully strip line breaks as opposed to only normalizing them
        no_urls=True,  # replace all URLs with a special token
        no_emoji=True,  # remove emojis
        no_emails=True,  # replace all email addresses with a special token
        no_phone_numbers=True,  # replace all phone numbers with a special token
        no_numbers=False,  # replace all numbers with a special token
        no_digits=False,  # replace all digits with a special token
        no_currency_symbols=False,  # replace all currency symbols with a special token
        no_punct=True,  # remove punctuations
        replace_with_punct="",  # instead of removing punctuations you may replace them
        replace_with_url="",
        replace_with_email="",
        replace_with_phone_number="<PHONE>",
        replace_with_number="<NUMBER>",
        replace_with_digit="0",
        replace_with_currency_symbol="<CUR>",
        lang="en",  # set to 'de' for German special handling
    )

---

#  🎤 🎸 🥁 🎹 🎺 Exploring musical artists Facebook data 🎤 🎸 🥁 🎹 🎺

In this notebook, we'll walk through how to download and explore posts from a specified list of Facebook accounts. This list will have to be created first with the Facebook-owned insights tool, [CrowdTangle](https://crowdtangle.com/).

Note: There is no cost for using CrowdTangle, but it is only available to select Facebook publishing partners and academics. Check out their [FAQs](https://help.crowdtangle.com/en/collections/41331-faqs-and-troubleshooting) for more information.

---

### [Take me to the CrowdTangle Dashboard](https://apps.crowdtangle.com/socialmediamanipulationclass)
### Part 1: Exploring the dashboard.
1. Lists/searches
2. Intelligence
3. Live displays

---

### Part 2: Two ways to get the data from the dashboard

1. Using the dashboard
2. Using a little code (API) <--- preferred 🤘

## Part 3: Exploring the posts

1. **Types of posts sent**
2. **Words used most**
3. **Most basic interactions**:
    1. likes
    2. shares
    3. comments

4. **Most emojis**:
    1. wows
    2. hahas
    3. sad
    4. angry
    5. thankful
    6. care
5. **Trajectory of most popular post (likes)**

## Get the data for all musical artists
---

### 1. Get the list ID

In [2]:
# Load the CT API token
api_token = os.environ.get("CLASS_CT_TOKEN")

# Returns all lists in the dashboard
list_records = ct_get_lists(api_token)

my_list_name = "Musical Artists for Social Media Manipulation"

# Extract the list ID number for that matches my list name
for record in list_records["result"]['lists']:
    id_number = record["id"]
    list_title = record["title"]
    
    if list_title == my_list_name:
        print("Found the list ID number.")
        break

print(f"List ID #: {id_number}")

Found the list ID number.
List ID #: 1732484


### Set start and end date

In [3]:
# Set up the time between which we'll retrieve data

num_days_back = 30*6

end_dt = datetime.datetime.today().date()
end_time = end_dt.strftime("%Y-%m-%d")
start_time = (
    end_dt - datetime.timedelta(days=num_days_back)
).strftime("%Y-%m-%d")

print(f"We're going to search {num_days_back} days in the past")
print(f"Start time: {start_time}")
print(f"End time  : {end_time}")

We're going to search 180 days in the past
Start time: 2022-05-12
End time  : 2022-11-08


In [4]:
posts = download_posts(
    crowdtangle_list_id=id_number, #<---- The number associated with out list
    start=start_time,
    end=end_time,
    max_queries=100,               #<---- Make no more than this number of queries
    api_token=api_token            #<---- API token for access
)

Successful first call.
Setting first_call = False
Found next page: https://api.crowdtangle.com/posts?token=cYbBOStNtq0Hhg4p3DBFiycwNZmcP3GrQ2ONgNbG&sortBy=date&endDate=2022-11-08&startDate=2022-05-12&listIds=1732484&searchField=TEXT_FIELDS_AND_IMAGE_TEXT&count=100&includeHistory=true&offset=100
	|--> 2022-10-25 16:16:15 - 2022-11-07 22:54:47: 100 posts.
Total posts collected: 100
Found next page: https://api.crowdtangle.com/posts?token=cYbBOStNtq0Hhg4p3DBFiycwNZmcP3GrQ2ONgNbG&sortBy=date&endDate=2022-11-08&startDate=2022-05-12&listIds=1732484&searchField=TEXT_FIELDS_AND_IMAGE_TEXT&count=100&includeHistory=true&offset=200
	|--> 2022-10-07 16:25:20 - 2022-10-25 15:57:22: 100 posts.
Total posts collected: 200
Found next page: https://api.crowdtangle.com/posts?token=cYbBOStNtq0Hhg4p3DBFiycwNZmcP3GrQ2ONgNbG&sortBy=date&endDate=2022-11-08&startDate=2022-05-12&listIds=1732484&searchField=TEXT_FIELDS_AND_IMAGE_TEXT&count=100&includeHistory=true&offset=300
	|--> 2022-09-11 22:29:00 - 2022-10-07

# Candy Trivia!
# 🍫🍬🍫🍬🍫🍬🍫🍬🍫🍬🍫🍬🍫🍬🍫🍫🍬🍫🍬🍫🍬🍫🍬🍫🍬🍫🍬🍫🍬🍫

![display image](../gifs/despicable-me-candy.gif)
![display image](../gifs/homer-sweet-sweet-candy.gif)
![display image](../gifs/psycho-oprah-candy.gif)

# 🍫🍬🍫🍬🍫🍬🍫🍬🍫🍬🍫🍬🍫🍬🍫🍫🍬🍫🍬🍫🍬🍫🍬🍫🍬🍫🍬🍫🍬🍫

- ## What artist earned the **most interactions overall**?
- ## What type of interaction typically contributes the most to an artists interactions?
- ## Artist with the most emojis total for ➡️ ❤️😡😆
    - Which artist earned the most **love** emojis (total)?
    - Which artist earned the most **angry** emojis (total)?
    - Which artist earned the most **haha** emojis (total)?
- ## Artist with the most emojis from a single post ➡️ ❤️😡😆
    - Which artist earned earned the most **love** in a (single post)?
    - Which artist earned the most **angry** emojis in a (single post)?
    - Which artist earned the most **haha** emojis in a (single post)?

---
---
---
---

### What does the data look like?

In [5]:
type(posts)

list

In [6]:
len(posts)

884

In [7]:
posts[0]

{'platformId': '100044628600301_673043404193274',
 'platform': 'Facebook',
 'date': '2022-11-07 22:54:47',
 'updated': '2022-11-08 23:20:57',
 'type': 'link',
 'title': 'Drake & 21 Savage - Privileged Rappers',
 'caption': 'drake.lnk.to',
 'description': 'Go to Drake & 21 Savage - Privileged Rappers.',
 'message': 'Privileged Rappers on ColorsxStudios 21 Savage Drake.lnk.to/Colors',
 'expandedLinks': [{'original': 'Drake.lnk.to/Colors',
   'expanded': 'https://drake.lnk.to/Colors'},
  {'original': 'https://drake.lnk.to/Colors',
   'expanded': 'https://drake.lnk.to/Colors'}],
 'link': 'https://drake.lnk.to/Colors',
 'postUrl': 'https://www.facebook.com/100044628600301/posts/673043404193274',
 'subscriberCount': 52720023,
 'score': 3.230905861456483,
 'media': [{'type': 'photo',
   'url': 'https://external-sea1-1.xx.fbcdn.net/emg1/v/t13/4964197033023031853?url=https%3A%2F%2Flinkstorage.linkfire.com%2Fmedialinks%2Fimages%2F4c7abf91-9d3c-486b-9d12-64b6ea078dd6%2Fartwork-600x315.jpg&fb_obo=

### `FbIgPost` will make handling the data a bit easier

In [8]:
post_objs = [FbIgPost(post) for post in posts] 

In [9]:
type(posts[0])

dict

In [10]:
type(post_objs[0])

fb_model.FbIgPost

### Organize the data a little bit...

In [11]:
text_data = []
performance_data = []

for post in post_objs:
    
    # Handle all posts
    base_record = {
        "id" : post.get_post_ID(),
        "username" : post.get_value(['account','name']),
        "date" : post.post_object["date"],
        "link" : post.get_link_to_post()
    }
    text_record = deepcopy(base_record)
    stat_record = deepcopy(base_record)
    
    
    text_info = {
        "text" : post.get_text(),
        "post_type" : post.get_post_type(),
    }
    text_record.update(text_info)
    text_data.append(text_record)
    
    
    stat_info = post.get_value(["statistics", "actual"])
    stat_record.update(stat_info)
    performance_data.append(stat_record)

In [12]:
stats_df = pd.DataFrame.from_records(performance_data)
stats_df

Unnamed: 0,id,username,date,link,likeCount,shareCount,commentCount,loveCount,wowCount,hahaCount,sadCount,angryCount,thankfulCount,careCount
0,11084|673043404193274,Drake,2022-11-07 22:54:47,https://www.facebook.com/100044628600301/posts...,6767,339,509,3277,6,14,2,0,0,106
1,47259|707711170717953,SAM SMITH,2022-11-07 21:23:22,https://www.facebook.com/100044372291649/posts...,1691,21,820,1168,7,26,2,0,0,37
2,8317|668159511348298,Rihanna,2022-11-07 20:44:30,https://www.facebook.com/100044627640367/posts...,4281,246,469,3003,31,10,3,4,0,127
3,8317|668072371357012,Rihanna,2022-11-07 17:35:36,https://www.facebook.com/100044627640367/posts...,30017,3399,2076,26111,135,52,6,3,0,836
4,8320|711910050291299,Justin Bieber,2022-11-07 15:22:44,https://www.facebook.com/100044169322901/posts...,87968,4973,2604,65898,80,213,42,21,0,2123
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
879,29867|541338304025412,Ed Sheeran,2022-05-13 09:17:51,https://www.facebook.com/100044477407527/posts...,2697,126,331,1478,5,2,2,0,0,56
880,47259|574810227341382,SAM SMITH,2022-05-12 23:58:03,https://www.facebook.com/100044372291649/posts...,5918,124,194,4788,3,4,0,0,0,92
881,8320|576446687170970,Justin Bieber,2022-05-12 15:09:35,https://www.facebook.com/100044169322901/posts...,81560,3531,4740,51221,123,190,25,13,0,1776
882,9097|554756472688697,Harry Styles,2022-05-12 15:05:13,https://www.facebook.com/100044630460255/posts...,68461,10384,5725,141100,92,50,5,4,0,2340


# Q1: What artist earned the **most interactions overall**?

In [13]:
# These are the columns we want to count
responses = [
    "likeCount",
    "shareCount",
    "commentCount",
    "wowCount", 
    "hahaCount",
    "sadCount", 
    "careCount"
]

# Calculate the total interactions per emoji option
total_response_counts = stats_df.groupby('username')[responses].sum().reset_index()
total_response_counts.head(2)

Unnamed: 0,username,likeCount,shareCount,commentCount,wowCount,hahaCount,sadCount,careCount
0,Bad Bunny,325762,56770,269417,1395,6096,412,41987
1,Coldplay,2757537,324622,210862,20796,4160,3277,50690


In [14]:
# Take the total across the rows for each artist...
total_interactions = pd.DataFrame(total_response_counts[responses].sum(axis=1), columns=['counts'])
total_interactions["username"] = total_response_counts['username']

In [15]:
bars = alt.Chart(
    total_interactions,
    title="Most interactions total 🤑"
).mark_bar(color='firebrick').encode(
    x = alt.X("counts:Q", title="Total interactions"),
    y = alt.Y("username:N", sort="-x", title=None)
).properties(
    width=700,
    height=400
)


text = alt.Chart(
    total_interactions
).mark_text(align='left', fontWeight="bold", fontSize=15).encode(
    x = alt.X("counts:Q", title="Total interactions"),
    y = alt.Y("username:N", sort="-x", title=None),
    text = alt.Text("counts:N", format=',')
).properties(
    width=700,
    height=400
)

alt.layer(bars,text).configure_axis(
    titleFontSize=15,
    labelFontSize=15
).configure_legend(
    titleFontSize=15,
    labelFontSize=15
).configure_title(
    fontSize=20,
)

# Q2: What type of interaction typically contributes the most to an artists interactions? 

In [16]:
props = total_response_counts[responses].divide(total_response_counts.sum(axis=1), axis=0)
props['username'] = total_response_counts['username'].copy()

In [17]:
props

Unnamed: 0,likeCount,shareCount,commentCount,wowCount,hahaCount,sadCount,careCount,username
0,0.464155,0.080887,0.383873,0.001988,0.008686,0.000587,0.059824,Bad Bunny
1,0.817788,0.096271,0.062534,0.006167,0.001234,0.000972,0.015033,Coldplay
2,0.749833,0.15057,0.072434,0.004867,0.012305,0.000561,0.009429,David Guetta
3,0.824717,0.071054,0.078001,0.00179,0.00854,0.000435,0.015463,Drake
4,0.877517,0.051978,0.03299,0.002738,0.001708,0.002535,0.030533,Dua Lipa
5,0.859979,0.052544,0.056385,0.002381,0.015336,0.000622,0.012753,Ed Sheeran
6,0.840653,0.077252,0.061575,0.003792,0.00293,0.000247,0.01355,Eminem
7,0.684366,0.181576,0.101647,0.005592,0.002335,0.000562,0.023923,Harry Styles
8,0.843657,0.06362,0.053018,0.006377,0.001029,0.012513,0.019784,Imagine Dragons
9,0.89945,0.03232,0.038494,0.002148,0.002673,0.003922,0.020992,Justin Bieber


In [18]:
alt.Chart(
    props.melt(id_vars='username'),
    title = "Interaction percentage breakdown"
).mark_bar().encode(
    x = alt.X("value:Q", title="Percent of all interactions", axis=alt.Axis(format="%"), scale=alt.Scale(domain=[0,1])),
    y = alt.Y("username", title=None),
    color = alt.Color("variable:N", title="Interaction type", sort="-x", legend=alt.Legend(orient='bottom')),
    tooltip = [
        alt.Tooltip('value:Q', title="Percent", format=".2%"),
        alt.Tooltip('variable:N', title="Interaction")
    ]
).properties(
    width=800,
    height=400
).configure_axis(
    titleFontSize=15,
    labelFontSize=15
).configure_legend(
    titleFontSize=15,
    labelFontSize=16
).configure_title(
    fontSize=20,
)

# Q3: Artist with the most emojis total for ➡️➡️    ❤️😡😆
- Which artist earned the most **love** emojis (total)?
- Which artist earned the most **angry** emojis (total)?
- Which artist earned the most **haha** emojis (total)?


In [19]:
love_total = stats_df.groupby("username")['loveCount'].sum().to_frame("count").reset_index()
angry_total = stats_df.groupby("username")['angryCount'].sum().to_frame("count").reset_index()
haha_total = stats_df.groupby("username")['hahaCount'].sum().to_frame("count").reset_index()


## Q3.1 Total loves

In [20]:
# Create bar chart
bars = alt.Chart(
    love_total,
    title="Total ❤️ for each artist"
).mark_bar(color='forestgreen').encode(
    x = alt.X("count:Q", title = "Total loves"),
    y = alt.Y("username:N", sort="-x", title=None),
).properties(
    width=700,
    height=400
)

# Add the total number of loves for each artist
text = alt.Chart(love_total).mark_text(align='left', fontSize=15).encode(
    x = alt.X("count:Q", title = "Total loves"),
    y = alt.Y("username:N", sort="-x", title=None),
    text = alt.Text("count:Q", format=",")
)

# Put them together and configure some aesthetics
alt.layer(bars, text).configure_axis(
    titleFontSize=15,
    labelFontSize=15
).configure_title(fontSize=20)

## Q3.2 Total Angry

In [21]:
# Create bar chart
bars = alt.Chart(
    angry_total,
    title="Total 😡 for each artist"
).mark_bar(color='red').encode(
    x = alt.X("count:Q", title = "Total angry"),
    y = alt.Y("username:N", sort="-x", title=None),
).properties(
    width=700,
    height=400
)

# Add the total number of loves for each artist
text = alt.Chart(angry_total).mark_text(align='left', fontSize=15).encode(
    x = alt.X("count:Q", title = "Total angry"),
    y = alt.Y("username:N", sort="-x", title=None),
    text = alt.Text("count:Q", format=",")
)

# Put them together and configure some aesthetics
alt.layer(bars, text).configure_axis(
    titleFontSize=15,
    labelFontSize=15
).configure_title(fontSize=20)

## Q3.3 Total haha

In [22]:
# Create bar chart
bars = alt.Chart(
    haha_total,
    title="Total 😆 for each artist"
).mark_bar(color='blue').encode(
    x = alt.X("count:Q", title = "Total haha"),
    y = alt.Y("username:N", sort="-x", title=None),
).properties(
    width=700,
    height=400
)

# Add the total number of loves for each artist
text = alt.Chart(haha_total).mark_text(align='left', fontSize=15).encode(
    x = alt.X("count:Q", title = "Total haha"),
    y = alt.Y("username:N", sort="-x", title=None),
    text = alt.Text("count:Q", format=",")
)

# Put them together and configure some aesthetics
alt.layer(bars, text).configure_axis(
    titleFontSize=15,
    labelFontSize=15
).configure_title(fontSize=20)

# Q4 Artist with the most emojis from a single post ➡️ ❤️😡😆
- Which artist earned earned the most **love** in a (single post)?
- Which artist earned the most **angry** emojis in a (single post)?
- Which artist earned the most **haha** emojis in a (single post)?

In [23]:
top_count = 10

toploved = stats_df.sort_values("loveCount", ascending=False).head(top_count).reset_index(drop=True)
topangry = stats_df.sort_values("angryCount", ascending=False).head(top_count).reset_index(drop=True)
tophaha = stats_df.sort_values("hahaCount", ascending=False).head(top_count).reset_index(drop=True)

In [24]:
print(f"Top {top_count} Loved Posts:")
for rank in range(0, top_count):
    print(toploved.iloc[rank].link)

Top 10 Loved Posts:
https://www.facebook.com/100044454818615/posts/626387172186438
https://www.facebook.com/100044454818615/posts/674405764051245
https://www.facebook.com/100044630460255/posts/605056527658691
https://www.facebook.com/100044454818615/posts/665918961566592
https://www.facebook.com/100044454818615/posts/665841494907672
https://www.facebook.com/100044169322901/posts/640902684058703
https://www.facebook.com/100044534585203/posts/627281388766342
https://www.facebook.com/100044454818615/posts/665790151579473
https://www.facebook.com/100044627640367/posts/633885244775725
https://www.facebook.com/100044630460255/posts/560069035490774


In [25]:
print(f"Top {top_count} Angry Posts:")
for rank in range(0, top_count):
    print(topangry.iloc[rank].link)

Top 10 Angry Posts:
https://www.facebook.com/100044544064759/posts/1859068070956987
https://www.facebook.com/100044095313280/posts/656554609157710
https://www.facebook.com/100044169322901/posts/707251264090511
https://www.facebook.com/100044169322901/posts/666315758184062
https://www.facebook.com/100044095313280/posts/688090589337445
https://www.facebook.com/100044544064759/posts/1258675544875971
https://www.facebook.com/100044169322901/posts/707288210753483
https://www.facebook.com/100044534585203/posts/603323677828780
https://www.facebook.com/100044169322901/posts/640902684058703
https://www.facebook.com/100044183001943/posts/684345519714875


In [26]:
print(f"Top {top_count} Haha Posts:")
for rank in range(0, top_count):
    print(topangry.iloc[rank].link)

Top 10 Haha Posts:
https://www.facebook.com/100044544064759/posts/1859068070956987
https://www.facebook.com/100044095313280/posts/656554609157710
https://www.facebook.com/100044169322901/posts/707251264090511
https://www.facebook.com/100044169322901/posts/666315758184062
https://www.facebook.com/100044095313280/posts/688090589337445
https://www.facebook.com/100044544064759/posts/1258675544875971
https://www.facebook.com/100044169322901/posts/707288210753483
https://www.facebook.com/100044534585203/posts/603323677828780
https://www.facebook.com/100044169322901/posts/640902684058703
https://www.facebook.com/100044183001943/posts/684345519714875


## Q4.1 Which artist earned the most ❤️ in a (single post)?

In [27]:
alt.Chart(
    toploved,
    title = "Top 🔟 ❤️ posts"
).mark_circle(size=100, color='black').encode(
    x = alt.X("loveCount:Q"),
    y = alt.Y("username:N", sort='-x', title=None),
    color = alt.Color("username:N", sort='-x', legend=None),
    href = alt.Href("link:N"),
    tooltip = [
        alt.Tooltip("loveCount:Q", format=",")
    ]
).properties(
    width=700,
#     height=300
).configure_axis(
    titleFontSize=15,
    labelFontSize=15
).configure_title(fontSize=20)

## Q4.2 Which artist earned earned the most 😡 in a (single post)?

In [28]:
alt.Chart(
    topangry,
    title = "Top 🔟 😡 posts"
).mark_circle(size=100, color='black').encode(
    x = alt.X("angryCount:Q"),
    y = alt.Y("username:N", sort='-x', title=None),
    color = alt.Color("username:N", sort='-x', legend=None),
    href = alt.Href("link:N"),
    tooltip = [
        alt.Tooltip("angryCount:Q", format=",")
    ]
).properties(
    width=700,
#     height=300
).configure_axis(
    titleFontSize=15,
    labelFontSize=15
).configure_title(fontSize=20)

## Q4.3 Which artist earned earned the most 😆 in a (single post)?

In [29]:
alt.Chart(
    tophaha,
    title = "Top 🔟 😆 posts"
).mark_circle(size=100, color='black').encode(
    x = alt.X("hahaCount:Q"),
    y = alt.Y("username:N", sort='-x', title=None),
    color = alt.Color("username:N", sort='-x', legend=None),
    href = alt.Href("link:N"),
    tooltip = [
        alt.Tooltip("hahaCount:Q", format=",")
    ]
).properties(
    width=700,
#     height=300
).configure_axis(
    titleFontSize=15,
    labelFontSize=15
).configure_title(fontSize=20)

# Bonus question: What type of post is shared the most?

### Options:
- `photo`
- `native_video`
- `live_video_scheduled`
- `status`
- `link`
- `youtube`
- `live_video_complete`
- `video`

In [32]:
text_df = pd.DataFrame.from_records(text_data)
text_df

Unnamed: 0,id,username,date,link,text,post_type
0,11084|673043404193274,Drake,2022-11-07 22:54:47,https://www.facebook.com/100044628600301/posts...,Privileged Rappers on ColorsxStudios 21 Savage...,link
1,47259|707711170717953,SAM SMITH,2022-11-07 21:23:22,https://www.facebook.com/100044372291649/posts...,💚 what shall I call them? X,photo
2,8317|668159511348298,Rihanna,2022-11-07 20:44:30,https://www.facebook.com/100044627640367/posts...,we bringing you a wholeeee new vibe this year!...,native_video
3,8317|668072371357012,Rihanna,2022-11-07 17:35:36,https://www.facebook.com/100044627640367/posts...,volume 4 loading!! #SAVAGEXFENTYSHOW Savage X ...,native_video
4,8320|711910050291299,Justin Bieber,2022-11-07 15:22:44,https://www.facebook.com/100044169322901/posts...,"Photo dump, love you guyth , mith u gettyimage...",photo
...,...,...,...,...,...,...
879,29867|541338304025412,Ed Sheeran,2022-05-13 09:17:51,https://www.facebook.com/100044477407527/posts...,France! Leto's 2step remix is out now https://...,native_video
880,47259|574810227341382,SAM SMITH,2022-05-12 23:58:03,https://www.facebook.com/100044372291649/posts...,And Just Like That… ❤️,photo
881,8320|576446687170970,Justin Bieber,2022-05-12 15:09:35,https://www.facebook.com/100044169322901/posts...,📷: Evan Paterakis,photo
882,9097|554756472688697,Harry Styles,2022-05-12 15:05:13,https://www.facebook.com/100044630460255/posts...,Harry Styles x Zane Lowe. May 16th.,native_video


In [33]:
post_type_counts = text_df.post_type.value_counts().to_frame("count").reset_index().rename(
        columns={"index": "post_type"}
    )

In [34]:
post_type_counts['percentage'] = (post_type_counts['count'] / len(posts)) * 100
post_type_counts['proportion'] = post_type_counts['count'] / len(posts)
post_type_counts

Unnamed: 0,post_type,count,percentage,proportion
0,photo,530,59.954751,0.599548
1,native_video,303,34.276018,0.34276
2,live_video_scheduled,22,2.488688,0.024887
3,status,9,1.0181,0.010181
4,link,7,0.791855,0.007919
5,youtube,7,0.791855,0.007919
6,live_video_complete,3,0.339367,0.003394
7,video,3,0.339367,0.003394


In [35]:
bars = alt.Chart(post_type_counts, title="Post Type Breakdown").mark_bar().encode(
    x = alt.X("count:Q"),
    y = alt.Y(
        "post_type:N",
        axis = alt.Axis(
            title = None),
        sort="-x"
    ),
    color = alt.Color(
        "post_type:N",
        legend=None
    ),
    tooltip = [
        alt.Tooltip("post_type:N", title = "Post Type"),
        alt.Tooltip("count:Q", title = "# of Posts"),
        alt.Tooltip("percentage:Q", title = "% of Posts", format=".2f"),
    ]
).properties(width=600,height=200)

text = alt.Chart(post_type_counts).mark_text(dx=30, fontSize=16).encode(
    x = alt.X("count:Q"),
    y = alt.Y(
        "post_type:N",
        axis = alt.Axis(
            title = None),
        sort="-x"
    ),
    text = alt.Text(
        "proportion:N",
        format = ".2%"
    )
).properties(width=600)


alt.layer(bars,text).configure_axis(
    labelFontSize=16,
    titleFontSize=16
).configure_title(fontSize=18)

# Multiple choice bonus question: What are artists typically talking about when they post (based on word count)?
- A) Other celebrities
- B) Themselves
- C) Promoting their music/tour

### Clean text to get word counts...

In [36]:
for post in text_df.text:
    print(post, "\n")
    print("-"*50)
    

Privileged Rappers on ColorsxStudios 21 Savage Drake.lnk.to/Colors Drake & 21 Savage - Privileged Rappers Go to Drake & 21 Savage - Privileged Rappers. 

--------------------------------------------------
💚 what shall I call them? X 

--------------------------------------------------
we bringing you a wholeeee new vibe this year!! #SAVAGEXFENTYSHOW VOL. 4 

--------------------------------------------------
volume 4 loading!! #SAVAGEXFENTYSHOW Savage X Fenty Amazon Prime Video 

--------------------------------------------------
Photo dump, love you guyth , mith u gettyimages® Dimitrios Kambouris 

--------------------------------------------------
Show #62 Buenos Aires, Argentina 📷 @tim_toda #ColdplayBuenosAires #Coldplay #MusicOfTheSpheresWorldTour 

--------------------------------------------------
Feels like it was a day ago (or not) 😂 (repost from Bob Sinclar) M I'M DAVID GLETY ME I'M SINCLAR! 

--------------------------------------------------
GLORIA 💛 January 27th Pre-order t


--------------------------------------------------
HOLD ON FOR DEAR LIFE // From the sick and twisted minds of Sam Levinson and The Weeknd, starring Lily-Rose Depp. #THEIDOL is coming soon to HBO Max. 

--------------------------------------------------
got a few tricks up my sleeve 

--------------------------------------------------
My Universe Stade de France, Paris 16 July 2022 #ColdplayParis #BTS 💜 

--------------------------------------------------
WE IN NEW YORK BABY 🗽🗽🗽 

--------------------------------------------------
🔥 The Weeknd had an amazing show last night in Philly!
📷: Hyghly Alleyne 

--------------------------------------------------
Los sueños siempre se pueden hacer realidad.
Recuerdan al niño que buscaba Abel? Bueno pues hoy está a su lado y este niño disfrutará del show como nadie.
Te admiramos Abel, eres el mejor 💕
The Weeknd #theweeknd 

--------------------------------------------------
Philadelphia … we finally did it. thank you for helping me ring in the 

In [37]:
text_df['clean_text'] = text_df['text'].map(clean_text)

In [38]:
for post in text_df.clean_text:
    print(post, "\n")
    print("-"*50)

privileged rappers on colorsxstudios 21 savage drakelnktocolors drake 21 savage privileged rappers go to drake 21 savage privileged rappers 

--------------------------------------------------
what shall i call them x 

--------------------------------------------------
we bringing you a wholeeee new vibe this year savagexfentyshow vol 4 

--------------------------------------------------
volume 4 loading savagexfentyshow savage x fenty amazon prime video 

--------------------------------------------------
photo dump love you guyth mith u gettyimagesr dimitrios kambouris 

--------------------------------------------------
show 62 buenos aires argentina timtoda coldplaybuenosaires coldplay musicofthespheresworldtour 

--------------------------------------------------
feels like it was a day ago or not repost from bob sinclar m im david glety me im sinclar 

--------------------------------------------------
gloria january 27th preorder the album on the official store 

-------------


--------------------------------------------------
to our dear friends in italy come join dan online for an exclusive live discussion about the new album mercury all the info about this event in partnership with mondadori store virgin records and virgin radio is right here imagine dragons imagine dragons incontra dan reynolds in un evento online esclusivo 

--------------------------------------------------
south africa finally were coming presale july 6 all tix on sale july 9 imaginedragonssa23 imagine dragons mercury world tour feb1capetownza 1 feb cape town za dhl stadium feb 4 johannesburg za fnb stadium inaginedraconsmusiccom 

--------------------------------------------------
mercury acts 1 2 out this friday watch the full trailer on youtube imagine dragons mercury act 2 trailer go to imagine dragons mercury act 2 trailer 

--------------------------------------------------
this probably should have happened a while ago behind the scenes fromthed2thelbc video shoot gallery on t

### Count the number of times each word is used by each artist...

In [39]:
user_word_counter = defaultdict(Counter)

for idx, row in text_df.iterrows():
    user = row['username']
    clean_text = row['clean_text'].split()
    
    for word in clean_text:
        if word not in s_words:
            user_word_counter[user][word] += 1

user_word_count_data = []

for user, data in user_word_counter.items():
    
    for word, count in data.items():
        user_word_count_data.append({
            "user" : user,
            "word" : word,
            "count" : count
        })

user_word_count_df = pd.DataFrame.from_records(user_word_count_data)
user_word_count_df

Unnamed: 0,user,word,count
0,Drake,privileged,3
1,Drake,rappers,3
2,Drake,colorsxstudios,1
3,Drake,21,6
4,Drake,savage,6
...,...,...,...
5886,The Weeknd,collaborate,1
5887,The Weeknd,charitable,1
5888,The Weeknd,efforts,1
5889,The Weeknd,provide,1


### Most used words

In [40]:
hover = alt.selection_single(
    on='mouseover',  # select on mouseover
    nearest=True,    # select nearest point to mouse cursor
    empty='none'     # empty selection should match nothing
)

click = alt.selection_multi(
    empty='none' # empty selection matches no points
)


plot = alt.Chart(
    title="Words Most-commonly Used by an Artist"
).mark_circle(size=50, opacity=.5).encode(
    x = alt.X(
        "user:N", title = None,
        axis = alt.Axis(labelAngle=45)
    ),
    y = alt.Y("count:Q"),
)

base = plot.transform_filter(
    hover | click # filter to points in either selection
)


alt.layer(
    plot.add_selection(hover).add_selection(click),
    base.mark_point(size=100, stroke='firebrick', strokeWidth=2),
    base.mark_text(dx=4, dy=-8, align='left', stroke='white', strokeWidth=3, fontSize=15).encode(text='word:N'),
    base.mark_text(dx=4, dy=-8, align='left', fontSize=15).encode(text='word:N'),
    data=user_word_count_df
).properties(
    width=800,
    height=450
).configure_axis(
    labelFontSize=15,
    titleFontSize=15
).configure_title(fontSize=18)
