In [1]:
import pandas as pd
import os
import re

# Set variables

In [2]:
# Date from which on the tweets are selected
date_parameter_tweet_selecton = pd.to_datetime('2021-09-26') # Voting: '2021-09-26', Official Start: '2021-10-26'

# Date from which on accounts should tweeted at least once, otherwise they are droped
date_parameter_account_selection = pd.to_datetime('2021-08-27') # 30 days prior voting

# Import of intermediate files

In [3]:
# mdb_list
mdb_list = pd.read_csv('../data/intermediate/mdb_list/mdb_list.csv', sep=";")

# mdb_twitter_list
mdb_twitter_list = pd.read_csv('../data/intermediate/mdb_twitter_list/mdb_twitter_list.csv', sep=";", na_values="NA")

# tweet_list
tweet_list = pd.DataFrame()
pattern = re.compile("tweet_list_*")
directory = "../data/intermediate/tweet_list/"

for filename in os.listdir(directory):
    if pattern.match(filename):
        filepath = directory + filename
        user_tweet_list = pd.read_csv(filepath, sep=";", lineterminator='\n')
        tweet_list = pd.concat([tweet_list, user_tweet_list], axis=0)
        
# user_list
user_list = pd.read_csv('../data/intermediate/user_list/user_list.csv', sep=";", na_values="", dtype={'twitter_id': str})
user_list_secondary = pd.read_csv('../data/intermediate/user_list/user_list_secondary.csv', sep=";", na_values="", dtype={'twitter_id': str})
user_list = pd.concat([user_list, user_list_secondary], axis=0, ignore_index=True)

# user_friendships
user_friendships = pd.read_csv('../data/intermediate/user_friendships/user_friendships.csv', sep=";", na_values="")
user_friendships_secondary = pd.read_csv('../data/intermediate/user_friendships/user_friendships_secondary.csv', sep=";", na_values="")
user_friendships = pd.concat([user_friendships, user_friendships_secondary], axis=0, ignore_index=True)

# Create final _mdb_list.csv_

In [4]:
mdb_list.to_csv('../data/processed/mdb_list.csv', sep=";", index=False)

# Create final _user_list.csv_

In [5]:
# Clean Up of user_list (remove NA and duplicates)
user_list = user_list[user_list['twitter_id'].notnull()] # Remove accounts with no twitter account found
user_list = user_list.drop_duplicates(subset=['twitter_id']) # case Knut Gerschau (has a Twitter Account now "GerschauKnut" but is not included in analysis)

# Add Alias accounts to user_list
user_list = user_list.append({'bundestag_id': int(11004245),
                              'nachname': 'Baerbock',
                              'vorname': 'Annalena',
                              'wp': int(20),
                              'fraktion': 'Fraktion BÜNDNIS 90/DIE GRÜNEN',
                              'twitter_handle': 'AnnalenaBaerbockAlias',
                              'account_type': 'person'}, ignore_index=True)

user_list = user_list.append({'bundestag_id': int(11003231),
                              'nachname': 'Scholz',
                              'vorname': 'Olaf',
                              'wp': int(20),
                              'fraktion': 'Fraktion der Sozialdemokratischen Partei Deutschland',
                              'twitter_handle': 'OlafScholzAlias',
                              'account_type': 'person'}, ignore_index=True)

  user_list = user_list.append({'bundestag_id': int(11004245),
  user_list = user_list.append({'bundestag_id': int(11003231),


### Identify accounts were not all tweets were scraped (because of high twitter usage)

In [6]:
user_list1 = user_list.dropna(subset=['twitter_id'])

def findOldestTweet(twitter_handle):
    user_tweets = tweet_list[tweet_list['twitter_handle'] == twitter_handle]
    return pd.to_datetime(user_tweets.iloc[-1]['tweet_created_at'])

for index, contents in user_list1.iterrows():
    tweets_downloaded = tweet_list[tweet_list['twitter_handle'] == contents['twitter_handle']].shape[0]
    
    if tweets_downloaded > 0:
        dateOldestDownloadedTweet = pd.to_datetime(findOldestTweet(contents['twitter_handle'])).tz_localize(None)
        
        if dateOldestDownloadedTweet > date_parameter_tweet_selecton:
            
            if tweets_downloaded > 100:
                percentage_downloaded = tweet_list[tweet_list['twitter_handle'] == contents['twitter_handle']].shape[0]/contents['tweet_count']
                
                if percentage_downloaded < 0.9:
                    print()
                    print(contents['twitter_handle'])

                    print('Official Tweet count:', contents['tweet_count'])
                    print('Tweets downloaded:', tweets_downloaded)
                    print('Percantage Downloaded', round(percentage_downloaded*100,2), '%')

                    print('Oldest Tweet:', findOldestTweet(contents['twitter_handle']))
                    print('Date Parameter:', date_parameter_tweet_selecton)


MatthiasHauer
Official Tweet count: 18080.0
Tweets downloaded: 3200
Percantage Downloaded 17.7 %
Oldest Tweet: 2021-11-23 05:25:58+00:00
Date Parameter: 2021-09-26 00:00:00

SaraNanni
Official Tweet count: 17017.0
Tweets downloaded: 3200
Percantage Downloaded 18.8 %
Oldest Tweet: 2021-10-04 07:54:50+00:00
Date Parameter: 2021-09-26 00:00:00


### Find inactive accounts

In [7]:
def findLatestTweet(twitter_handle):
    user_tweets = tweet_list[tweet_list['twitter_handle'] == twitter_handle]
    return pd.to_datetime(user_tweets.iloc[0]['tweet_created_at'])

inactiveUsers = []

i = 0
for index, contents in user_list1.iterrows():
    tweets_downloaded = tweet_list[tweet_list['twitter_handle'] == contents['twitter_handle']].shape[0]
    
    if tweets_downloaded > 0:
        dateLatestTweet = pd.to_datetime(findLatestTweet(contents['twitter_handle'])).tz_localize(None)
        if dateLatestTweet < date_parameter_account_selection:
            inactiveUsers.append(contents['twitter_handle'])
            print(contents['twitter_handle'], '\tLatest Tweet:', findLatestTweet(contents['twitter_handle']))
            i += 1
    else:
        inactiveUsers.append(contents['twitter_handle'])
        print(contents['twitter_handle'], '\tLatest Tweet: Never tweeted')
        i += 1
        
print("\nn of inactive accounts", i)

AnetteKramme 	Latest Tweet: 2020-06-21 07:45:09+00:00
ToniHofreiter 	Latest Tweet: 2013-10-29 11:06:47+00:00
CarstenMuellers 	Latest Tweet: 2009-07-08 15:50:13+00:00
GerdesMdB 	Latest Tweet: Never tweeted
VolkmarKlein 	Latest Tweet: 2021-06-11 15:33:50+00:00
MdBMonstadt 	Latest Tweet: 2019-10-04 13:08:55+00:00
FechnerJohannes 	Latest Tweet: Never tweeted
UweFe 	Latest Tweet: 2009-06-15 15:27:15+00:00
KoobMar 	Latest Tweet: Never tweeted
JanMetzler 	Latest Tweet: 2019-09-10 11:37:28+00:00
MittagSusanne 	Latest Tweet: Never tweeted
WilfriedOellers 	Latest Tweet: 2019-05-09 17:25:05+00:00
ruetzelbernd 	Latest Tweet: 2020-09-15 15:30:16+00:00
UdoSchiefner 	Latest Tweet: 2020-06-18 08:33:38+00:00
NAltenkamp 	Latest Tweet: Never tweeted
Philipp_Amthor 	Latest Tweet: Never tweeted
AlexGauland 	Latest Tweet: 2018-07-10 13:19:11+00:00
Jochen_Haug 	Latest Tweet: 2021-05-19 12:58:25+00:00
Huber_AfD 	Latest Tweet: 2017-10-12 00:37:06+00:00
JoernKoenigAfD 	Latest Tweet: 2019-06-25 17:48:25+00:00
NB

### Build dataframe with all relevant users

In [8]:
"""
# MDBs that deleted Twitter account
mdb_deleted_twitter = user_list[user_list['twitter_id'].isnull()]

# Remove accounts that were deleted
final_user_list = pd.concat([user_list, mdb_deleted_twitter]).drop_duplicates(keep=False)
"""

# Remove accounts that are inactive
user_list = user_list[~user_list.twitter_handle.isin(inactiveUsers)]

# Remove duplicate (Knut Gerschau has the wrong twitter_handle / the one of Knut Abraham)
user_list = user_list.drop_duplicates(subset='twitter_handle', keep='first')

user_list.to_csv('../data/processed/user_list.csv', index=False, decimal=',', sep=";", float_format='%.0f')

# Create final _user_friendships.csv_

In [9]:
# Clean Up of user_friendships
user_friendships = user_friendships[user_friendships['following'] == True] # Show only Edges
user_friendships = user_friendships.drop_duplicates(subset=['source_screen_name', 'source_id', 'target_screen_name', 'target_id', 'following']) # Remove duplicates

# Create Alias for Baerbock and Scholz
user_friendships_secondary = user_friendships.replace(to_replace='ABaerbock', value='AnnalenaBaerbockAlias')
user_friendships_secondary = user_friendships_secondary.replace(to_replace='ABaerbockArchiv', value='AnnalenaBaerbockAlias')
user_friendships_secondary = user_friendships_secondary.replace(to_replace='OlafScholz', value='OlafScholzAlias')
user_friendships_secondary = user_friendships_secondary.replace(to_replace='Bundeskanzler', value='OlafScholzAlias')
user_friendships_secondary = user_friendships_secondary.drop_duplicates(subset=['source_screen_name', 'target_screen_name', 'following']) # Remove duplicates

user_friendships = pd.concat([user_friendships, user_friendships_secondary], ignore_index=True).drop_duplicates()

# Filter out users in user_friendships that do not appear in user_list
user_friendships = user_friendships[user_friendships['source_screen_name'].isin(user_list['twitter_handle'])]
user_friendships = user_friendships[user_friendships['target_screen_name'].isin(user_list['twitter_handle'])]

# Remove self-loops
user_friendships = user_friendships[user_friendships['source_screen_name'] != user_friendships['target_screen_name']]

user_friendships.to_csv('../data/processed/user_friendships.csv', index=False, decimal=',', sep=";", float_format='%.0f')

# Create final _tweet_list.csv_

In [10]:
"""
# Create tweets for alias accounts (keep original tweets)
tweet_list_secondary = tweet_list
# display(tweet_list_secondary)

# Create Alias for Baerbock and Scholz
tweet_list_secondary = tweet_list_secondary.replace(to_replace='ABaerbock', value='AnnalenaBaerbockAlias')
tweet_list_secondary = tweet_list_secondary.replace(to_replace='ABaerbockArchiv', value='AnnalenaBaerbockAlias')
tweet_list_secondary = tweet_list_secondary.replace(to_replace='OlafScholz', value='OlafScholzAlias')
tweet_list_secondary = tweet_list_secondary.replace(to_replace='Bundeskanzler', value='OlafScholzAlias')

tweet_list = pd.concat([tweet_list, tweet_list_secondary], ignore_index=True).drop_duplicates()

print(tweet_list.shape)

"""

# Filter out tweets of users that do not appear in user_list
tweet_list = tweet_list[tweet_list['twitter_handle'].isin(user_list['twitter_handle'])]

tweet_list.to_csv('../data/processed/tweet_list.csv', index=False, decimal=',', sep=";", float_format='%.0f')

# Create final retweet_list.csv

In [11]:
# Create a dataframe with all retweets in the tweet_list dataframe
retweet_tweet_list = tweet_list[tweet_list['tweet_referenced_tweet_type'] == "retweeted"]

# Create a dataframe with all retweets that are related to tweets in the tweet_list dataframe
relevant_retweet_tweet_list = pd.merge(retweet_tweet_list, tweet_list, left_on='tweet_referenced_tweet_id', right_on='tweet_id')
relevant_retweet_tweet_list.drop(['twitter_id_x',\
    'tweet_conversation_id_x',\
    'tweet_in_reply_to_user_id_x',\
    'tweet_lang_x',\
    'tweet_possibly_sensitive_x',\
    'tweet_retweet_count_x',\
    'tweet_reply_count_x',\
    'tweet_like_count_x',\
    'tweet_quote_count_x',\
    'tweet_reply_settings_x',\
    'tweet_referenced_tweet_id_x',\
    'tweet_referenced_tweet_id_x',\
    'tweet_source_x',\
    'api_call_x',\
    'twitter_id_y',\
    'tweet_conversation_id_y',\
    'tweet_in_reply_to_user_id_y',\
    'tweet_lang_y',\
    'tweet_possibly_sensitive_y',\
    'tweet_retweet_count_y',\
    'tweet_reply_count_y',\
    'tweet_like_count_y',\
    'tweet_quote_count_y',\
    'tweet_reply_settings_y',\
    'tweet_source_y',\
    'tweet_referenced_tweet_id_y',\
    'tweet_referenced_tweet_type_y',\
    'api_call_y'], axis=1, inplace=True)
relevant_retweet_tweet_list.rename(columns={"tweet_author_id_x": "retweeter_twitter_id ",\
    "tweet_created_at_x": "retweet_created_at",\
    "twitter_handle_x": "retweeter_twitter_handle",\
    "tweet_id_x": "retweet_tweet_id",\
    "tweet_text_x": "retweet_text",\
    "twitter_handle_y": "author_twitter_handle",\
    "tweet_id_y": "tweet_id",\
    "tweet_text_y": "tweet_text",\
    "tweet_author_id_y": "author_twitter_id",\
    "tweet_created_at_y": "tweet_created_at"}, inplace=True)

# Store results
relevant_retweet_tweet_list.to_csv('../data/processed/retweet_list.csv', index=False, decimal=',', sep=";", float_format='%.0f')

# Create final quote_list.csv

In [12]:
# Create a dataframe with all quotes in the tweet_list dataframe
quote_tweet_list = tweet_list[tweet_list['tweet_referenced_tweet_type'] == "quoted"]

# Create a dataframe with all quotes that are related to tweets in the tweet_list dataframe
relevant_quote_tweet_list = pd.merge(quote_tweet_list, tweet_list, left_on='tweet_referenced_tweet_id', right_on='tweet_id')
relevant_quote_tweet_list.drop(['twitter_id_x',\
    'tweet_conversation_id_x',\
    'tweet_in_reply_to_user_id_x',\
    'tweet_lang_x',\
    'tweet_possibly_sensitive_x',\
    'tweet_retweet_count_x',\
    'tweet_reply_count_x',\
    'tweet_like_count_x',\
    'tweet_quote_count_x',\
    'tweet_reply_settings_x',\
    'tweet_referenced_tweet_id_x',\
    'tweet_referenced_tweet_id_x',\
    'tweet_source_x',\
    'api_call_x',\
    'twitter_id_y',\
    'tweet_conversation_id_y',\
    'tweet_in_reply_to_user_id_y',\
    'tweet_lang_y',\
    'tweet_possibly_sensitive_y',\
    'tweet_retweet_count_y',\
    'tweet_reply_count_y',\
    'tweet_like_count_y',\
    'tweet_quote_count_y',\
    'tweet_reply_settings_y',\
    'tweet_source_y',\
    'tweet_referenced_tweet_id_y',\
    'tweet_referenced_tweet_type_y',\
    'api_call_y'], axis=1, inplace=True)
relevant_quote_tweet_list.rename(columns={"tweet_author_id_x": "quoter_twitter_id ",\
    "tweet_created_at_x": "quote_created_at",\
    "twitter_handle_x": "quoter_twitter_handle",\
    "tweet_id_x": "quote_tweet_id",\
    "tweet_text_x": "quote_text",\
    "twitter_handle_y": "author_twitter_handle",\
    "tweet_id_y": "tweet_id",\
    "tweet_text_y": "tweet_text",\
    "tweet_author_id_y": "author_twitter_id",\
    "tweet_created_at_y": "tweet_created_at"}, inplace=True)

# Store results
relevant_quote_tweet_list.to_csv('../data/processed/quote_list.csv', index=False, decimal=',', sep=";", float_format='%.0f')