In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import GetOldTweets3 as got
import tweepy
import requests
import bs4

import datetime
import json

%matplotlib inline

In [17]:
def get_tweets(username, since_date='2018-01-01', until_date='now'):

    '''Fetch the tweets a user posted within a date interval.

    Args:
        username (str): Account whose tweets are requested
        since_date (str): Start date of the interval, handed to setSince
            (the default suggests the format YYYY-MM-DD)
        until_date (str): End date of the interval, handed to setUntil; the
            placeholder 'now' is replaced by the current local date
            formatted as YYYY-MM-DD

    Returns:
        tweets: List of GetOldTweets3.models.Tweet.Tweet objects
    '''

    # resolve the 'now' placeholder to today's date
    end_date = (
        datetime.datetime.now().strftime('%Y-%m-%d')
        if until_date == 'now'
        else until_date
    )

    # assemble the query one setter at a time
    criteria = got.manager.TweetCriteria()
    criteria = criteria.setUsername(username)
    criteria = criteria.setSince(since_date)
    criteria = criteria.setUntil(end_date)
    criteria = criteria.setTopTweets(False)

    # run the query and hand back whatever the tweet manager collected
    return got.manager.TweetManager.getTweets(criteria)

In [18]:
# fetch the tweets of the account 'strawlab'; no dates are passed, so the
# interval falls back to get_tweets' defaults (since 2018-01-01 until 'now')
tweets = get_tweets('strawlab')

In [19]:
def tweet_to_dict(tweet):

    '''Build a plain dictionary from a GetOldTweets3 Tweet object.

    All fields are copied unchanged except 'date', which is rendered as a
    string in the format YYYY-MM-DD-HH-MM-SS.

    Args:
        tweet: GetOldTweets3.models.Tweet.Tweet object

    Returns:
        tweet_dict: Dictionary representation of the Tweet object
    '''

    # attributes copied verbatim, split around 'date' to keep the key order
    fields_before_date = ('id', 'permalink', 'username', 'to', 'text')
    fields_after_date = ('retweets', 'favorites', 'mentions', 'hashtags', 'geo')

    record = {}
    for field in fields_before_date:
        record[field] = getattr(tweet, field)

    # the timestamp is the only value that gets converted
    record['date'] = tweet.date.strftime('%Y-%m-%d-%H-%M-%S')

    for field in fields_after_date:
        record[field] = getattr(tweet, field)

    return record

In [20]:
# convert every Tweet object into its dictionary representation
tweets_dict = [tweet_to_dict(tweet) for tweet in tweets]

In [139]:
def get_API():

    '''Creates a Twitter API client authenticated as the App "getMdBList".

    The consumer key and the consumer secret are read from the first and
    second line of the file "app_data" in the working directory.

    Returns:
        api: Twitter API
    '''

    # read access key and secret from the external file
    with open("app_data", "r") as credentials_file:
        key_line = credentials_file.readline()
        secret_line = credentials_file.readline()

    # drop the trailing line break of each line
    consumer_key = key_line.splitlines()[0]
    consumer_secret = secret_line.splitlines()[0]

    # app-only authentication, then wrap it in the API client
    return tweepy.API(tweepy.AppAuthHandler(consumer_key, consumer_secret))


def retrieve_accounts_bundestag(api):

    '''Collects the members of the Twitter list "MdB (Bundestag)" by "wahl_beobachter"
    https://twitter.com/i/lists/912241909002833921

    Args:
        api: tweepy API object; its list_members method is paged through

    Returns:
        members: List of members
    '''

    # cursor over the members of the list with the fixed id
    member_cursor = tweepy.Cursor(api.list_members, list_id=912241909002833921)

    # exhaust the cursor so all members end up in one list
    members = list(member_cursor.items())

    return members


# set up the authenticated Twitter API client
api = get_API()

# fetch the Bundestag members with Twitter accounts from the list
# https://twitter.com/i/lists/912241909002833921
accounts_bundestag = retrieve_accounts_bundestag(api)

# replace each tweepy User object by its `_json` attribute (dict form)
accounts_bundestag = [user._json for user in accounts_bundestag]

In [144]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
import time

def scrape_bundestag_website():

    '''Scrapes the Bundestag website and retrieves the HTML code of a list view of all members.

    Opens https://www.bundestag.de/abgeordnete in a Safari WebDriver session,
    waits up to 10 seconds for the list view button to become clickable,
    clicks it, waits another 3 seconds and parses the resulting page source.
    The browser session is always closed, even if a step fails.

    Returns:
        soup: Beautiful Soup object with the HTML source code for the list view

    Raises:
        IndexError: If the list view button cannot be found on the page
    '''

    URL = "https://www.bundestag.de/abgeordnete"

    # the list view button, used both for waiting and for clicking
    list_view_xpath = '//*[@id="mod525246"]/div[2]/header/div[2]/div/form/a'

    driver = webdriver.Safari()

    # try/finally so the browser is closed even when scraping fails midway
    try:
        driver.maximize_window()
        driver.get(URL)

        # ensure list view button is clickable; on timeout we still try to
        # click below (best effort), which fails if the button is missing
        try:
            WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, list_view_xpath)))
        except TimeoutException:
            print('Page timed out after 10 secs.')

        # click list view button; find_elements(By.XPATH, ...) replaces the
        # find_elements_by_xpath helper that newer Selenium releases removed
        list_view_button = driver.find_elements(By.XPATH, list_view_xpath)[0]
        list_view_button.click()

        # ensure everything is properly loaded
        time.sleep(3)

        # parse HTML
        soup = bs4.BeautifulSoup(driver.page_source, 'html5lib')
    finally:
        driver.quit()

    return soup


def soup_to_members(soup):

    '''Retrieves the Bundestag members and their parties from the HTML page source code.

    A party text may carry a footnote marker. It is normalised to
    "<first line of the party text> <marker>" with these markers:
        *    ausgeschieden / left parliament
        **   verstorben / deceased
        ***  Mandat abgelehnt / mandate rejected

    Args:
        soup: BeautifulSoup object of the Bundestag website

    Returns:
        names: List of names of the Bundestag members
        party: List of party affiliations, in the same order as names
    '''

    # Longest marker first: '*' is a substring of '**' and '***', so testing
    # it first would label every marked member with a single asterisk.
    status_markers = ('***', '**', '*')

    # go to corresponding div in the HTML page source code of the list view
    table = soup.find('div', attrs={'class': 'bt-2col col-xs-12'})

    # loop through all divs, get name and party for each person
    divs_person = table.find_all('div', attrs={'class': 'bt-teaser-person'})

    names = []
    party = []
    for div in divs_person:

        n = div.div.h3.text.strip()
        p = div.div.p.text.strip()

        for marker in status_markers:
            if marker in p:
                # keep only the party name from the first line; asterisks
                # already on that line are dropped so the marker is not doubled
                p = p.splitlines()[0].rstrip('* ') + ' ' + marker
                break

        names.append(n)
        party.append(p)

    return names, party
    

# fetch the list view of all members (this opens a Safari WebDriver session)
soup = scrape_bundestag_website()
# extract the member names and their party affiliations from the parsed page
names, party = soup_to_members(soup)