In [276]:
import pandas as pd
import seaborn as sns
import os
import re

In [277]:
# Import Files
# Both inputs are ';'-separated CSVs read with the same options.
csv_options = {'sep': ";", 'na_values': ""}
mdb_list = pd.read_csv('../data/processed/mdb_list/mdb_list.csv', **csv_options)
# twitter_id is read as a string rather than letting pandas infer a numeric dtype.
user_list = pd.read_csv(
    '../data/processed/user_list/user_list.csv',
    dtype={'twitter_id': str},
    **csv_options,
)

### Find all MDBs without a Twitter account

In [278]:
# MDBs without a Twitter account: left-join the MdB list with the user list on
# bundestag_id and keep the rows that have no twitter_id after the join.
mdb_without_twitter = (
    mdb_list
    .merge(user_list, how='left', on='bundestag_id')
    .loc[lambda joined: joined['twitter_id'].isna()]
)

# MDBs that deleted Twitter account: user_list rows whose twitter_id is missing.
twitter_id_missing = user_list['twitter_id'].isna()
mdb_deleted_twitter = user_list[twitter_id_missing]

In [279]:
# Load every tweet file found in the tweet_list directory into one DataFrame.
# NOTE: read as a regex, "tweet_list_*" means "tweet_list" followed by zero or
# more underscores, so with match() it selects every file name that starts
# with "tweet_list".
pattern = re.compile("tweet_list_*")
directory = "../data/processed/tweet_list/"

# Read each file once and concatenate a single time; concatenating inside the
# loop copies all previously loaded rows again on every iteration.
# sorted() makes the load order (and therefore the row order of tweet_list)
# reproducible, because os.listdir returns names in arbitrary order.
user_tweet_frames = [
    pd.read_csv(os.path.join(directory, filename), sep=";", lineterminator='\n')
    for filename in sorted(os.listdir(directory))
    if pattern.match(filename)
]

# pd.concat raises on an empty list, so fall back to an empty frame when no
# file matched.
tweet_list = pd.concat(user_tweet_frames, axis=0) if user_tweet_frames else pd.DataFrame()

### Find starting point of tweet selection

In [280]:
# Voting: '2021-09-26'
# Official Start: '2021-10-26'
date_parameter = '2021-09-26'


def _percent_of(part, whole):
    """Return `part` as a percentage of `whole`, rounded to two decimals."""
    return round(part / whole * 100, 2)


# Parse the creation timestamps in place, then keep the tweets created on or
# after date_parameter.
tweet_list['tweet_created_at'] = pd.to_datetime(tweet_list['tweet_created_at'])
created_in_period = tweet_list['tweet_created_at'] >= date_parameter
tweet_list1 = tweet_list[created_in_period]

n_scraped = tweet_list.shape[0]
n_selected = tweet_list1.shape[0]
print('Tweet Count until 2022-05-28:', n_scraped)
print('Tweet Count since', date_parameter, ':', n_selected, '(', _percent_of(n_selected, n_scraped), '% of total amount of scraped tweets )')

# Quotes / retweets, and the subset of them whose referenced tweet is itself
# part of the selected tweets (inner join on the referenced tweet id).
reference_type = tweet_list1['tweet_referenced_tweet_type']
quote_tweet_list = tweet_list1[reference_type == "quoted"]
retweet_tweet_list = tweet_list1[reference_type == "retweeted"]
relevant_quote_tweet_list = quote_tweet_list.merge(tweet_list1, left_on='tweet_referenced_tweet_id', right_on='tweet_id')
relevant_retweet_tweet_list = retweet_tweet_list.merge(tweet_list1, left_on='tweet_referenced_tweet_id', right_on='tweet_id')

print('\n### Tweet Type Distribution ###')
type_counts = (
    ('Normal Tweet:', tweet_list1[reference_type.isnull()].shape[0]),
    ('Reply:', tweet_list1[reference_type == "replied_to"].shape[0]),
    ('Quote:', quote_tweet_list.shape[0]),
    ('Retweet:', retweet_tweet_list.shape[0]),
)
for label, count in type_counts:
    print(label, count, '(', _percent_of(count, n_selected), '% )')

relevant_sets = (
    ('Relevant Quote Count since', relevant_quote_tweet_list),
    ('Relevant Retweet Count since', relevant_retweet_tweet_list),
)
for label, relevant in relevant_sets:
    n_relevant = relevant.shape[0]
    print(label, date_parameter, ':', n_relevant, '(', _percent_of(n_relevant, n_selected), '% )')

Tweet Count until 2022-05-28: 1024967
Tweet Count since 2021-09-26 : 211851 ( 20.67 % of total amount of scraped tweets )

### Tweet Type Distribution ###
Normal Tweet: 55553 ( 26.22 % )
Reply: 53303 ( 25.16 % )
Quote: 21019 ( 9.92 % )
Retweet: 81976 ( 38.7 % )
Relevant Quote Count since 2021-09-26 : 749 ( 0.35 % )
Relevant Retweet Count since 2021-09-26 : 3504 ( 1.65 % )


### Identify accounts where not all tweets were scraped (because of high Twitter usage)

In [281]:
# Cut-off used by the completeness check below: the voting day.
date_parameter = pd.Timestamp('2021-09-26')

# Only users that have a twitter_id.
user_list1 = user_list[user_list['twitter_id'].notna()]
   
def findOldestTweet(twitter_handle):
    """Return the creation time of the oldest downloaded tweet of a user.

    Parameters
    ----------
    twitter_handle : str
        Value compared against the 'twitter_handle' column of the global
        ``tweet_list`` frame.

    Returns
    -------
    pandas.Timestamp
        The smallest 'tweet_created_at' among the user's rows, or ``NaT`` when
        the user has no rows in ``tweet_list``.
    """
    user_tweets = tweet_list[tweet_list['twitter_handle'] == twitter_handle]
    # Take the minimum instead of the last row so the result does not depend
    # on the order in which the tweets happen to be stored.
    return pd.to_datetime(user_tweets['tweet_created_at']).min()

# Report accounts whose download looks incomplete: the oldest downloaded tweet
# is newer than date_parameter, more than 100 tweets were downloaded, and less
# than 90 % of the account's official tweet count was retrieved.
for _, user_row in user_list1.iterrows():
    handle = user_row['twitter_handle']
    tweets_downloaded = tweet_list[tweet_list['twitter_handle'] == handle].shape[0]

    # Accounts without any downloaded tweet cannot be checked.
    if tweets_downloaded <= 0:
        continue

    oldest_tweet = findOldestTweet(handle)
    # Drop the timezone so the value can be compared with the naive date_parameter.
    dateOldestDownloadedTweet = pd.to_datetime(oldest_tweet).tz_localize(None)

    # The conditions are negated with `not (...)` so that a missing timestamp
    # or tweet count is skipped, exactly as a failed positive test would be.
    if not (dateOldestDownloadedTweet > date_parameter):
        continue
    if not (tweets_downloaded > 100):
        continue

    percentage_downloaded = tweets_downloaded / user_row['tweet_count']
    if not (percentage_downloaded < 0.9):
        continue

    print()
    print(handle)

    print('Official Tweet count:', user_row['tweet_count'])
    print('Tweets downloaded:', tweets_downloaded)
    print('Percantage Downloaded', round(percentage_downloaded*100,2), '%')

    print('Oldest Tweet:', oldest_tweet)
    print('Date Parameter:', date_parameter)


MatthiasHauer
Official Tweet count: 18080.0
Tweets downloaded: 3200
Percantage Downloaded 17.7 %
Oldest Tweet: 2021-11-23 05:25:58+00:00
Date Parameter: 2021-09-26 00:00:00

SaraNanni
Official Tweet count: 17017.0
Tweets downloaded: 3200
Percantage Downloaded 18.8 %
Oldest Tweet: 2021-10-04 07:54:50+00:00
Date Parameter: 2021-09-26 00:00:00


### Find inactive accounts

In [282]:
# Accounts whose latest tweet is older than this date count as inactive.
date_parameter = pd.Timestamp('2021-08-27')  # 30 days before the voting day (2021-09-26)

def findLatestTweet(twitter_handle):
    """Return the creation time of the most recent downloaded tweet of a user.

    Parameters
    ----------
    twitter_handle : str
        Value compared against the 'twitter_handle' column of the global
        ``tweet_list`` frame.

    Returns
    -------
    pandas.Timestamp
        The largest 'tweet_created_at' among the user's rows, or ``NaT`` when
        the user has no rows in ``tweet_list``.
    """
    user_tweets = tweet_list[tweet_list['twitter_handle'] == twitter_handle]
    # Take the maximum instead of the first row so the result does not depend
    # on the order in which the tweets happen to be stored.
    return pd.to_datetime(user_tweets['tweet_created_at']).max()

# Collect the handles of inactive accounts: accounts without any downloaded
# tweet, and accounts whose latest tweet is older than date_parameter.
inactiveUsers = []
i = 0  # number of inactive accounts reported

for _, user_row in user_list1.iterrows():
    handle = user_row['twitter_handle']
    tweets_downloaded = tweet_list[tweet_list['twitter_handle'] == handle].shape[0]

    if tweets_downloaded > 0:
        latest_tweet = findLatestTweet(handle)
        # Drop the timezone so the value can be compared with the naive date_parameter.
        dateLatestTweet = pd.to_datetime(latest_tweet).tz_localize(None)
        if not (dateLatestTweet < date_parameter):
            # Account tweeted on or after the cut-off: still active.
            continue
        report_args = ('Latest Tweet:', latest_tweet)
    else:
        report_args = ('Latest Tweet: Never tweeted',)

    inactiveUsers.append(handle)
    print()
    print(handle)
    print(*report_args)
    i += 1

print("\nn of inactive accounts", i)


AnetteKramme
Latest Tweet: 2020-06-21 07:45:09+00:00

ToniHofreiter
Latest Tweet: 2013-10-29 11:06:47+00:00

CarstenMuellers
Latest Tweet: 2009-07-08 15:50:13+00:00

GerdesMdB
Latest Tweet: Never tweeted

VolkmarKlein
Latest Tweet: 2021-06-11 15:33:50+00:00

MdBMonstadt
Latest Tweet: 2019-10-04 13:08:55+00:00

FechnerJohannes
Latest Tweet: Never tweeted

UweFe
Latest Tweet: 2009-06-15 15:27:15+00:00

KoobMar
Latest Tweet: Never tweeted

JanMetzler
Latest Tweet: 2019-09-10 11:37:28+00:00

MittagSusanne
Latest Tweet: Never tweeted

WilfriedOellers
Latest Tweet: 2019-05-09 17:25:05+00:00

ruetzelbernd
Latest Tweet: 2020-09-15 15:30:16+00:00

UdoSchiefner
Latest Tweet: 2020-06-18 08:33:38+00:00

NAltenkamp
Latest Tweet: Never tweeted

Philipp_Amthor
Latest Tweet: Never tweeted

AlexGauland
Latest Tweet: 2018-07-10 13:19:11+00:00

Jochen_Haug
Latest Tweet: 2021-05-19 12:58:25+00:00

Huber_AfD
Latest Tweet: 2017-10-12 00:37:06+00:00

JoernKoenigAfD
Latest Tweet: 2019-06-25 17:48:25+00:00

N

### Build dataframe with all relevant users

In [285]:
# Remove accounts that were deleted (user_list rows without a twitter_id).
# Filtering on twitter_id directly replaces the former
# concat + drop_duplicates(keep=False) anti-join, which would also have
# discarded every copy of a row that occurs more than once in user_list.
final_user_list = user_list[user_list['twitter_id'].notna()]

# Remove accounts that are inactive
final_user_list = final_user_list[~final_user_list['twitter_handle'].isin(inactiveUsers)]

# Remove duplicate (Knut Gerschau has the wrong twitter_handle / the one of Knut Abraham)
final_user_list = final_user_list.drop_duplicates(subset='twitter_handle', keep='first')

# Write the remaining users as a ';'-separated CSV; floats are written without decimals.
final_user_list.to_csv('../data/processed/user_list/user_list_active.csv', index=False, decimal=',', sep=";", float_format='%.0f')