# Retrieving Mastodon datasets.

Retrieving Mastodon datasets for sentiment analysis on specific topics.

Written by Luc Bijl.

Retrieving Mastodon key from credentials file.

In [None]:
# Read the Mastodon access token from the credentials file. The file is
# expected to contain a line of the form ``mastodon-key=<token>``.
mastodon_key = None

with open("../.credentials", "r") as file:
    for line in file:
        # Match on the full ``mastodon-key=`` marker so that a line which only
        # mentions the key name (e.g. a comment) cannot trigger an IndexError.
        if 'mastodon-key=' in line:
            mastodon_key = line.split('mastodon-key=', 1)[1].strip()
            break

# Fail here with a clear message instead of a NameError further down when the
# key is first used.
if mastodon_key is None:
    raise ValueError("No 'mastodon-key=' entry found in ../.credentials")

Initializing the Mastodon API.

In [None]:
import pandas as pd
from mastodon import Mastodon

# Base URL of the Mastodon instance that the API client talks to.
api_base_url = "https://mastodon.social"

# API client, authenticated with the access token read from the credentials
# file.
mastodon = Mastodon(
    access_token=mastodon_key,
    api_base_url=api_base_url,
)

Converting the retrieval start and end time period to timestamps.

In [None]:
from datetime import datetime

# Boundaries of the retrieval window, written as "YYYY-MM-DD HH:MM:SS".
start_date_string = '2023-10-17 00:00:00'
end_date_string = '2023-10-21 00:00:00'

# Parse both boundaries with the same format and truncate the resulting
# POSIX timestamps to whole seconds.
start_timestamp, end_timestamp = (
    int(datetime.strptime(boundary, "%Y-%m-%d %H:%M:%S").timestamp())
    for boundary in (start_date_string, end_date_string)
)

Querying all English-language messages posted after 2023-10-16 and before 2023-10-22; the search topic is appended to this query during retrieval.

In [None]:
# Base search query shared by every topic: a language filter followed by the
# date bounds, separated by single spaces.
query = (
    'language:en'
    ' after:2023-10-16'
    ' before:2023-10-22'
)

Specifying the chosen search topics.

In [None]:
# Search topics; each entry is appended to the base query and searched
# separately.
topics = ['topic 1', 'topic 2']

Obtaining all toots for every topic, and adding the dataframes of every topic to a dictionary.

In [None]:
# Maximum number of toots collected per topic.
limit = 1000

# Maps each topic to a DataFrame with the columns 'date', 'id' and 'content'.
dataframes = {}

for topic in topics:

    # Pagination cursor. None on the first request, so that the search starts
    # from the newest results; afterwards the ID of the last toot of the
    # previous page.
    max_id = None
    dates = []
    ids = []
    contents = []

    while len(dates) < limit:

        # The API only returns statuses with an ID lower than `max_id`, so the
        # cursor is passed on unchanged (subtracting 1 from the initial None
        # raised a TypeError on the very first request).
        toots = mastodon.search_v2(query + ' ' + topic, result_type='statuses', max_id=max_id)['statuses']

        # No further results: pagination is finished.
        if not toots:
            break

        # Safeguard against an endless loop when the cursor does not advance.
        if toots[-1].id == max_id:
            break

        # Inspect every toot of the page, including the first one.
        for toot in toots:
            date = int(toot.created_at.timestamp())

            # Keep only toots inside the configured retrieval window.
            if start_timestamp <= date <= end_timestamp:
                dates.append(datetime.utcfromtimestamp(date))
                ids.append(toot.id)
                contents.append(toot.content)

                # Never collect more than `limit` toots per topic.
                if len(dates) >= limit:
                    break

        max_id = toots[-1].id

    data = {'date': dates, 'id': ids, 'content': contents}
    df_toots = pd.DataFrame(data)
    dataframes[topic] = df_toots

# Display the result in the notebook.
dataframes

Saving the dataframes dictionary in the datasets directory.

In [None]:
import pickle 

# Serialize the topic -> DataFrame dictionary into the datasets directory;
# the file is opened in binary mode because pickle produces bytes.
with open('../datasets/mastodon.pkl', 'wb') as output:
    pickle.dump(dataframes, output)