In [1]:
import re
from collections import Counter
from itertools import chain
from more_itertools import chunked

import pandas
import tweepy
import english_words

from handle_map import handle_map

In [2]:
# Read the API bearer token from a local file so it never appears in the notebook.
# The file is closed deterministically, and surrounding whitespace (e.g. the trailing
# newline most editors append) is stripped because it is not part of the token.
with open("bearer_token.txt") as token_file:
    bearer_token = token_file.read().strip()

# Thread whose tweets are being tallied.
CONVERSATION_ID = 1567540300331077632

client = tweepy.Client(bearer_token=bearer_token)
paginator = tweepy.Paginator(client.search_recent_tweets, query="to: arcmusicfest", tweet_fields="conversation_id")
# Look at up to 2000 search results and keep only those belonging to the target conversation.
tweets = [tweet for tweet in paginator.flatten(limit=2000) if tweet.conversation_id == CONVERSATION_ID]

In [3]:
# Save the tweets to a text file (one per line, internal whitespace collapsed to single
# spaces) in case the API rate-limits us later.
# The encoding is explicit because tweets routinely contain emoji and other non-ASCII
# text that the platform default encoding may be unable to represent.
with open("tweets.txt", "w", encoding="utf-8") as f:
    f.writelines(re.sub(r'\s+', ' ', tweet.text) + "\n" for tweet in tweets)

In [4]:
# lower-case, normalize white-space, remove punctuation, and replace @tagged artists with their names
def normalize_string(s, handle_map=None):
    """Return a canonical, lower-cased form of a tweet's text.

    The text is lower-cased, the literal "@arcmusicfest" is removed, every character
    that is not a word character, whitespace or "@" is replaced by a space, and runs
    of whitespace are collapsed to a single space (leading/trailing space stripped).
    Finally every key of ``handle_map`` found in the text is replaced by its value.

    Args:
        s: Raw text to normalize.
        handle_map: Optional mapping of substring -> replacement (e.g. an @handle to
            the artist's written-out name). ``None`` or an empty mapping skips the
            replacement step. Keys are matched against the already lower-cased text.

    Returns:
        The normalized string.
    """
    text = s.lower().replace("@arcmusicfest", "")
    text = re.sub(r'[^\w\s@]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    if not handle_map:
        return text
    # Replace the longest keys first: otherwise a key that is a prefix of another
    # (e.g. "@fred" vs "@fredagain") would rewrite part of the longer one and the
    # longer key would never match.
    for handle in sorted(handle_map, key=len, reverse=True):
        text = text.replace(handle, handle_map[handle])
    return text

In [5]:
# consider every run of 1, 2, or 3 consecutive words as a possible artist name
def runs_of(arr, size):
    """Return every contiguous slice of ``arr`` that is ``size`` items long.

    Slices are returned in order of their starting position. If ``arr`` is shorter
    than ``size`` the result is an empty list.
    """
    last_start = len(arr) - size
    windows = []
    for start in range(last_start + 1):
        windows.append(arr[start:start + size])
    return windows

def possible_artist_names(s):
    """List every 1-, 2- and 3-word phrase in ``s`` as a candidate artist name.

    The text is normalized first (without handle replacement), words that look like
    @handles are dropped, and the remaining words are joined into space-separated
    phrases: all single words first, then all pairs, then all triples.
    """
    plain_words = []
    for token in normalize_string(s).split():
        if re.match(twitter_handle_regex, token) is None:
            plain_words.append(token)

    candidates = []
    for width in (1, 2, 3):
        candidates.extend(' '.join(run) for run in runs_of(plain_words, width))
    return candidates

In [6]:
# Pattern for an @handle: "@", one or more letters, then one or more letters, digits,
# "-" or "_" (the whole handle is capture group 1). The lookbehind only lets the "@"
# match at the start of the string or directly after a character that is not a
# letter, digit, "-", "_" or ".".
twitter_handle_regex = r"(?<=^|(?<=[^a-zA-Z0-9-_\.]))(@[A-Za-z]+[A-Za-z0-9-_]+)"

In [7]:
# Rank the most frequently @tagged handles; their written-out names are then added
# to handle_map.py by hand.
tagged_artists = (
    handle.lower()
    for tweet in tweets
    for handle in re.findall(twitter_handle_regex, normalize_string(tweet.text))
)
handle_counts = Counter(tagged_artists)
handle_frame = pandas.DataFrame.from_dict(handle_counts, orient="index")
# Column 0 holds the mention count; keep the 40 most-mentioned handles.
top_tagged_artists = handle_frame.sort_values(by=0, ascending=False)[:40]
# for artist in top_tagged_artists.index:
#     print(artist)

In [8]:
# Build the set of candidate artist names.
# normalize_string() replaces every handle_map key with its value before votes are
# counted, so the *values* are what can appear in the counted text. Seeding with them
# lets an artist who is only ever @tagged (never written out) become a candidate.
# The keys are kept as well so the result is a superset of the previous candidate set.
candidate_names = chain(
    handle_map.keys(),
    handle_map.values(),
    *(possible_artist_names(tweet.text) for tweet in tweets),
)
# Drop very short strings (with "kx5" as the one allowed 3-character name) and
# anything that is an ordinary English word.
artists = set(
    artist for artist in candidate_names
    if (len(artist) > 3 or artist == "kx5") and artist not in english_words.english_words_lower_set
)

In [9]:
# Count, for every candidate, the number of tweets whose normalized text contains it
# (substring match, at most one vote per tweet).
# Each tweet is normalized once up front instead of once per (artist, tweet) pair;
# the normalization does not depend on the artist, so the counts are unchanged.
normalized_tweets = [normalize_string(tweet.text, handle_map) for tweet in tweets]
votes_by_artist = {
    artist: sum(artist in text for text in normalized_tweets)
    for artist in artists
}
vote_counts = (
    pandas.DataFrame
    .from_dict(votes_by_artist, orient="index", columns=["votes"])
    .sort_values(by="votes", ascending=False)
)

In [10]:
# Treat a candidate as a real artist name only if it has more than 5 votes and is
# not contained in another surviving candidate (e.g. drop "i hate" but keep
# "i hate models").
frequent_artists = set(vote_counts.loc[vote_counts["votes"] > 5].index)
artists = []
for candidate in frequent_artists:
    is_fragment = False
    for other in frequent_artists:
        if candidate != other and candidate in other:
            is_fragment = True
            break
    if not is_fragment:
        artists.append(candidate)

In [11]:
# Show the ten most-voted artists among the de-duplicated candidates.
(
    vote_counts
    .filter(items=artists, axis=0)
    .sort_values(by="votes", ascending=False)
    .head(10)
)

Unnamed: 0,votes
amelie lens,34
i hate models,32
john summit,30
michael bibi,27
eli brown,25
tale of us,20
fred again,20
artbat,16
pawsa,16
jamie jones,16
