# Fediporter

Jupyter notebook to migrate content like Tweets or Mastodon posts to Mastodon.

Only works with instances that are patched to allow backdated posts through the API. More info: https://github.com/lucahammer/fediporter


In [None]:
from pathlib import Path
from tqdm.notebook import tqdm
import datetime
import json
import re
import requests

In [None]:
# Load the notebook settings (instance URL, API tokens, data locations).
with open('config.json', 'r') as f:
    config = json.load(f)

API_BASE_URL = config['mastodon_url']
MASTODON_BEARER = config['mastodon_bearer']
TWITTER_BEARER = config['twitter_bearer']

# Authorization header for Mastodon API requests made at notebook level
# (the credential check below and the media uploads in the posting loop).
HEADERS = {'Authorization': f'Bearer {MASTODON_BEARER}'}

DATA_DIR = config['data_dir'] # Unzipped twitter data export
media_dir_backup = config['media_dir_backup'] # media folder of twitter data export
media_dir = config['media_dir'] # media folder of https://github.com/timhutton/twitter-archive-parser

# Test Mastodon bearer token
url = f"{API_BASE_URL}/api/v1/apps/verify_credentials"
r = requests.get(url, headers=HEADERS)
print(r.text)

In [None]:
def post_status(data):
    """Create a status on the Mastodon instance and return the API answer.

    Args:
        data: Form fields for the new status, passed to ``requests.post``
            as the ``data`` argument.

    Returns:
        The decoded JSON body of the response.
    """
    auth_header = {'Authorization': f'Bearer {MASTODON_BEARER}'}
    endpoint = f"{API_BASE_URL}/api/v1/statuses"
    response = requests.post(endpoint, data=data, headers=auth_header)
    return response.json()


def load_tweets(data_dir=None):
    """Load all Tweets from the ``tweets.js`` file of a data export.

    Args:
        data_dir: Directory that contains ``tweets.js``. Defaults to the
            notebook-level ``DATA_DIR`` setting. A trailing path separator
            is optional.

    Returns:
        A list of tweet dicts (the ``'tweet'`` value of every entry in the
        file), sorted ascending by their numeric ``id``.
    """
    if data_dir is None:
        data_dir = DATA_DIR
    with open(Path(data_dir) / "tweets.js", 'r', encoding='utf8') as f:
        raw = f.read()
    # The file is a JavaScript assignment; strip the prefix to get plain JSON.
    raw = raw.replace("window.YTD.tweets.part0 = ", "")
    tweets = [entry['tweet'] for entry in json.loads(raw)]
    # ids are compared as integers so '9' sorts before '10'.
    return sorted(tweets, key=lambda d: int(d['id']))


def to_timestamp(created_at):
    """Convert a ``created_at`` string into an ISO 8601 timestamp.

    Args:
        created_at: Date string in the form ``'%a %b %d %X %z %Y'``,
            e.g. ``'Wed Oct 10 20:19:24 +0000 2018'``.

    Returns:
        The same moment as an ISO 8601 string with second precision and
        UTC offset, e.g. ``'2018-10-10T20:19:24+00:00'``.
    """
    parsed = datetime.datetime.strptime(created_at, '%a %b %d %X %z %Y')
    return parsed.isoformat(timespec='seconds')


def replace_urls(tweet):
    """Return the Tweet text with shortened links replaced by their targets.

    Args:
        tweet: Tweet dict. The text is read from ``'full_text'`` if present,
            otherwise from ``'text'``. Each entry of ``tweet['entities']['urls']``
            (if any) supplies a ``'url'`` to look for and the
            ``'expanded_url'`` to put in its place.

    Returns:
        The text with every listed ``url`` replaced by its ``expanded_url``.
    """
    if 'full_text' in tweet:
        text = tweet['full_text']
    else:
        text = tweet['text']
    if 'entities' in tweet and 'urls' in tweet['entities']:
        for url in tweet['entities']['urls']:
            # str.replace returns a new string; it must be assigned back.
            text = text.replace(url['url'], url['expanded_url'])
    return text


def replace_usernames(text):
    """Append ``@twitter.com`` to every ``@handle`` found in ``text``.

    A handle is ``@`` followed by 1-15 letters, digits or underscores, where
    the ``@`` is not directly preceded by a word character (so the ``@`` in
    ``a@b.com`` is left alone). A colon directly after the handle is kept
    behind the inserted suffix.
    """
    handle_pattern = r"(\B\@[A-Za-z0-9_]{1,15})(\:)?"
    with_domain = r"\1@twitter.com\2"
    return re.sub(handle_pattern, with_domain, text)


def tweet_to_toot(tweet):
    """Build the status payload for a Tweet.

    Args:
        tweet: Tweet dict; ``'created_at'`` and ``'lang'`` are required in
            addition to whatever ``replace_urls`` reads.

    Returns:
        A dict with the keys ``'status'`` (text with links expanded and
        handles rewritten), ``'visibility'`` (always ``'public'``),
        ``'created_at'`` (ISO timestamp) and ``'language'``.
    """
    status_text = replace_urls(tweet)
    status_text = replace_usernames(status_text)
    return {
        'status': status_text,
        'visibility': 'public',
        'created_at': to_timestamp(tweet['created_at']),
        'language': tweet['lang'],
    }


def retrieve_alt_texts(tweet_ids):
    """Fetch media alt texts for the given Tweets from the Twitter API.

    Every alt text found is stored in the notebook-level dict ``alt_texts``
    under the media's ``media_key``.

    Args:
        tweet_ids: Tweet ids as strings; they are comma-joined into the
            ``ids`` query parameter of a single request.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    twitter_url = "https://api.twitter.com/2/tweets"
    twitter_headers = {"Authorization": f"Bearer {TWITTER_BEARER}"}
    twitter_params = {'ids': ','.join(tweet_ids),
                      'tweet.fields': 'text,attachments,entities',
                      'expansions': 'attachments.media_keys',
                      'media.fields': 'alt_text'
                      }
    resp = requests.get(
        twitter_url, headers=twitter_headers, params=twitter_params)
    # Surface auth / rate-limit failures instead of failing later on a
    # missing key in the error body.
    resp.raise_for_status()
    resp_json = resp.json()

    # 'includes' / 'media' may be missing from the answer (presumably when
    # none of the requested Tweets is still available); treat that as
    # "no media" rather than crashing the whole batch run.
    for media in resp_json.get('includes', {}).get('media', []):
        if 'alt_text' in media:
            alt_texts[media['media_key']] = media['alt_text']


def add_alt_texts(tweets, start=50100):
    """Collect alt texts for all Tweets with media into ``alt_texts``.

    Looks for Tweets that have a ``'media'`` entry in their entities, asks
    the Twitter API for their alt texts in batches of 100 ids and lets
    ``retrieve_alt_texts`` add them to the notebook-level dict ``alt_texts``.

    Args:
        tweets: List of tweet dicts.
        start: Index of the first Tweet to consider. The default keeps the
            offset that used to be hard-coded here; pass ``0`` to process
            the whole list.
    """
    tweets_with_media = [tweet for tweet in tweets[start:]
                         if 'media' in tweet.get('entities', {})]
    print(f'Found {len(tweets_with_media)} Tweets with media attachements.')
    tweet_ids = [str(tweet['id']) for tweet in tweets_with_media]
    batches = [tweet_ids[idx:idx+100] for idx in range(0, len(tweet_ids), 100)]

    for batch in tqdm(batches):
        retrieve_alt_texts(batch)

    print(f'Found {len(alt_texts)} alt texts.')


def retrieve_rt_texts(tweet_ids):
    """Fetch the full text of Retweets for the given ids from the Twitter API.

    For every returned Retweet whose referenced Tweet is part of the answer,
    the text ``"<RT prefix>: <full text>\\n<link to the original>"`` (links
    expanded, handles rewritten) is stored in the notebook-level dict
    ``rt_texts`` under the Retweet's id. Retweets whose original is missing
    from the answer are skipped.

    Args:
        tweet_ids: Tweet ids as strings; they are comma-joined into the
            ``ids`` query parameter of a single request.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    twitter_url = "https://api.twitter.com/2/tweets"
    twitter_headers = {"Authorization": f"Bearer {TWITTER_BEARER}"}
    twitter_params = {'ids': ','.join(tweet_ids),
                      'tweet.fields': 'text,referenced_tweets,entities',
                      'expansions': 'referenced_tweets.id'
                      }
    resp = requests.get(
        twitter_url, headers=twitter_headers, params=twitter_params)
    # Surface auth / rate-limit failures instead of failing later on a
    # missing key in the error body.
    resp.raise_for_status()
    resp_json = resp.json()

    # Index the included Tweets by id once instead of scanning the list for
    # every Retweet; setdefault keeps the first entry per id.
    originals = {}
    for included in resp_json.get('includes', {}).get('tweets', []):
        originals.setdefault(included['id'], included)

    for tweet in resp_json.get('data', []):
        references = tweet.get('referenced_tweets')
        if not references:
            continue
        rt = originals.get(references[0]['id'])
        if rt is None:
            # The referenced Tweet is not in the answer; leave rt_texts
            # without an entry so the posting loop uses the archived text.
            continue
        # Text before the first colon, i.e. "RT @name" for a Retweet.
        prefix = tweet['text'].split(':')[0]
        screen_name = prefix.split('@')[-1]
        text = replace_urls(rt)
        text = f"{prefix}: {text}\nhttps://twitter.com/{screen_name}/status/{rt['id']}"
        text = replace_usernames(text)
        rt_texts[tweet['id']] = text


def add_full_RT_texts(tweets):
    """Collect the full texts of truncated Retweets into ``rt_texts``.

    A Tweet counts as a truncated Retweet when its ``'full_text'`` starts
    with ``'RT @'`` and ends with ``'…'``. Their ids are sent to
    ``retrieve_rt_texts`` in batches of 100, which fills the notebook-level
    dict ``rt_texts``.

    Args:
        tweets: List of tweet dicts, each with ``'full_text'`` and ``'id'``.
    """
    truncated_retweets = [
        tweet for tweet in tweets
        if tweet['full_text'].startswith('RT @')
        and tweet['full_text'].endswith('…')
    ]
    print(f'Found {len(truncated_retweets)} truncated Retweets.')
    tweet_ids = [str(tweet['id']) for tweet in truncated_retweets]
    batches = [tweet_ids[idx:idx+100] for idx in range(0, len(tweet_ids), 100)]

    for batch in tqdm(batches):
        retrieve_rt_texts(batch)

    print(f'Collected {len(rt_texts)} full texts for Retweets.')


In [None]:
# Read all Tweets from the export; the bare expression below shows how many
# were loaded as the cell output.
tweets = load_tweets()
len(tweets)


In [None]:
# media_key -> alt text; filled by retrieve_alt_texts via add_alt_texts.
# Re-running this cell starts again from an empty dict.
alt_texts = {}
add_alt_texts(tweets)


In [None]:
# Retweet id -> full status text; filled by retrieve_rt_texts via
# add_full_RT_texts. Re-running this cell starts again from an empty dict.
rt_texts = {}
add_full_RT_texts(tweets)


In [None]:
# Tweet id -> id of the status posted for it; filled by the posting loop and
# used there to skip already-posted Tweets and to link replies in threads.
ids_dict = {}


In [None]:
# Post every Tweet as a status: Retweets as text only, own Tweets with their
# media and with thread links to previously posted statuses.

# Authorization header for the media uploads below (same token that
# post_status uses for the statuses themselves).
HEADERS = {'Authorization': f'Bearer {MASTODON_BEARER}'}

# NOTE(review): the start offset is hard-coded - presumably the position at
# which an earlier run stopped; adjust it before running on other data.
for tweet in tqdm(tweets[25218+4828:]):

    if tweet['id'] in ids_dict:
        # was already posted, we can skip it
        continue

    if tweet['full_text'].startswith('RT @'):
        # Retweets are often truncated and full data needs to be retrieved from the API
        if rt_texts.get(tweet['id']):
            toot = {'status': rt_texts.get(tweet['id']),
                    'visibility': 'public',
                    'created_at': to_timestamp(tweet['created_at']),
                    'language': tweet['lang']
                    }
        else:
            # no full text collected, fall back to the archived text
            toot = tweet_to_toot(tweet)
        posted = post_status(toot)
        ids_dict[tweet['id']] = posted['id']
        continue

    toot = tweet_to_toot(tweet)
    if 'media' in tweet['entities']:
        # upload media to append to the post
        media_ids = []
        for media in tweet['extended_entities']['media']:
            file_name = f"{tweet['id']}-{media['media_url_https'].split('/')[-1]}"
            image_path = f"{media_dir}{file_name}"
            if not Path(image_path).is_file():
                image_path = f"{media_dir_backup}{file_name}"
                if not Path(image_path).is_file():
                    # file is in neither media folder, post without it
                    continue
            # read inside a context manager so the handle is closed again
            with open(image_path, 'rb') as media_file:
                data = media_file.read()
            url = f"{API_BASE_URL}/api/v2/media"
            files = {
                'file': (image_path, data, 'application/octet-stream')}
            # alt_texts is keyed by media key, built here as '3_' + media id
            description = alt_texts.get('3_' + media['id'])
            if description:
                values = {'description': description}
                r = requests.post(url, files=files,
                                  data=values, headers=HEADERS)
            else:
                r = requests.post(url, files=files, headers=HEADERS)
            json_data = r.json()
            media_ids.append(json_data['id'])
            # the media link in the text is redundant once the file is attached
            toot['status'] = toot['status'].replace(media['url'], '')
        toot['media_ids[]'] = media_ids
    # NOTE(review): 'luca' is hard-coded - presumably the screen name of the
    # migrated account; confirm before reusing with another account.
    if 'in_reply_to_screen_name' in tweet and tweet['in_reply_to_screen_name'] == 'luca':
        # if Tweet is part of a thread, get ID of previous post
        try:
            toot['in_reply_to_id'] = ids_dict.get(
                tweet['in_reply_to_status_id'])
        except KeyError:
            # Tweet has no 'in_reply_to_status_id'; show it and post unthreaded
            print(tweet)
    # print(tweet)
    # print(toot)
    posted = post_status(toot)
    # print(posted)
    ids_dict[tweet['id']] = posted['id']


In [None]:
# Persist the Tweet id -> posted status id mapping as JSON.
with open('ids_dict.txt', 'w') as out_file:
    json.dump(ids_dict, out_file)


In [None]:
# Persist the collected alt texts as JSON.
with open('alt_texts.txt', 'w') as out_file:
    json.dump(alt_texts, out_file)


In [None]:
# Persist the collected full Retweet texts as JSON.
with open('rt_texts.txt', 'w') as out_file:
    json.dump(rt_texts, out_file)


# Todos

- [ ] change ids and alt_texts storage from dict to file
- [ ] check alt texts (first ones are broken and need to be replaced)
- [ ] check images (first ones weren't attached to posts and need to be replaced)
- [x] add Retweets (get full text from API and post them; haven't been posted yet)
- [ ] fix videos (they weren't uploaded, but linked. :( )
- [ ] replace self-quotes with self-posts (update url in posts)
- [ ] import mastodon data
