In [2]:
import pandas as pd
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup

In [3]:
df = pd.read_pickle("./df_clubs_overview.pkl")
df['CLUB_NAME'].unique()

array(['FC Schalke 04', 'Preußen Münster', '1.FC Saarbrücken',
       'Hamburger SV', '1.FC Kaiserslautern', 'Hertha BSC', '1.FC Köln',
       'Karlsruher SC', '1.FC Nürnberg', 'Meidericher SV',
       'Borussia Dortmund', 'TSV 1860 München', 'Eintracht Braunschweig',
       'VfB Stuttgart', 'Eintracht Frankfurt', 'SV Werder Bremen',
       'Borussia Neunkirchen', 'Hannover 96', 'FC Bayern München',
       'SC Tasmania 1900 Berlin (-1973)', 'Borussia Mönchengladbach',
       'Fortuna Düsseldorf', 'Rot-Weiss Essen', 'Alemannia Aachen',
       'MSV Duisburg', 'Kickers Offenbach', 'Rot-Weiß Oberhausen',
       'Arminia Bielefeld', 'VfL Bochum', 'Wuppertaler SV Borussia',
       'SC Fortuna Köln', 'Tennis Borussia Berlin', 'Bayer 05 Uerdingen',
       'FC St. Pauli', 'SV Darmstadt 98', 'Bayer 04 Leverkusen',
       'SV Waldhof Mannheim', 'FC 08 Homburg', 'SV Blau-Weiß Berlin',
       'Stuttgarter Kickers', 'SG Wattenscheid 09', 'FC Hansa Rostock',
       'SG Dynamo Dresden', 'VfB Leipzig',

In [4]:
# Matching clubs to their cities 

bundesliga_vereine_orte = {
    'FC Schalke 04': 'Gelsenkirchen',
    'Preußen Münster': 'Münster',
    '1.FC Saarbrücken': 'Saarbrücken',
    'Hamburger SV': 'Hamburg',
    '1.FC Kaiserslautern': 'Kaiserslautern',
    'Hertha BSC': 'Berlin',
    '1.FC Köln': 'Köln',
    'Karlsruher SC': 'Karlsruhe',
    '1.FC Nürnberg': 'Nürnberg',
    'Meidericher SV': 'Duisburg',
    'Borussia Dortmund': 'Dortmund',
    'TSV 1860 München': 'München',
    'Eintracht Braunschweig': 'Braunschweig',
    'VfB Stuttgart': 'Stuttgart',
    'Eintracht Frankfurt': 'Frankfurt',
    'SV Werder Bremen': 'Bremen',
    'Borussia Neunkirchen': 'Neunkirchen',
    'Hannover 96': 'Hannover',
    'FC Bayern München': 'München',
    'SC Tasmania 1900 Berlin (-1973)': 'Berlin',
    'Borussia Mönchengladbach': 'Mönchengladbach',
    'Fortuna Düsseldorf': 'Düsseldorf',
    'Rot-Weiss Essen': 'Essen',
    'Alemannia Aachen': 'Aachen',
    'MSV Duisburg': 'Duisburg',
    'Kickers Offenbach': 'Offenbach',
    'Rot-Weiß Oberhausen': 'Oberhausen',
    'Arminia Bielefeld': 'Bielefeld',
    'VfL Bochum': 'Bochum',
    'Wuppertaler SV Borussia': 'Wuppertal',
    'SC Fortuna Köln': 'Köln',
    'Tennis Borussia Berlin': 'Berlin',
    'Bayer 05 Uerdingen': 'Krefeld',
    'FC St. Pauli': 'Hamburg',
    'SV Darmstadt 98': 'Darmstadt',
    'Bayer 04 Leverkusen': 'Leverkusen',
    'SV Waldhof Mannheim': 'Mannheim',
    'FC 08 Homburg': 'Homburg',
    'SV Blau-Weiß Berlin': 'Berlin',
    'Stuttgarter Kickers': 'Stuttgart',
    'SG Wattenscheid 09': 'Bochum',
    'FC Hansa Rostock': 'Rostock',
    'SG Dynamo Dresden': 'Dresden',
    'VfB Leipzig': 'Leipzig',
    'SC Freiburg': 'Freiburg',
    'KFC Uerdingen 05': 'Krefeld',
    'VfL Wolfsburg': 'Wolfsburg',
    'SpVgg Unterhaching': 'Unterhaching',
    'SSV Ulm 1846': 'Ulm',
    'FC Energie Cottbus': 'Cottbus',
    '1.FSV Mainz 05': 'Mainz',
    'TSG 1899 Hoffenheim': 'Hoffenheim',
    'FC Augsburg': 'Augsburg',
    'SpVgg Greuther Fürth': 'Fürth',
    'SC Paderborn 07': 'Paderborn',
    'FC Ingolstadt 04': 'Ingolstadt',
    'RasenBallsport Leipzig': 'Leipzig',
    '1.FC Union Berlin': 'Berlin'
}

In [32]:
# Base URL of Transfermarkt Webpage
BASE_URL = 'https://www.wetterkontor.de'

# header config for Browser setup
headers = {'User-Agent': 
           'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}


def weather_url(date):
    """
    Reads table 

    Args:
        date (int): Accept date in form of yyyymmdd starting from 20110101

    Returns:
        response object: Provides methods and attributes to access the data returned by the HTTP request
    """
    url = urljoin(BASE_URL, f'/de/wetter/deutschland/extremwerte.asp?id={date}')
    response = requests.get(url, headers=headers)
    print(url)
    return response




def split_list_into_pairs(input_list):
    result_list = [input_list[i:i+6] for i in range(0, len(input_list), 6)]
    return result_list



def dict_to_df(col_name_ls, value_ls):
    """
    Reads the column names and their values in form of lists. Store it into one dictionary to convert it to a Dataframe format 
    
    Args:
        col_name_ls (list['str']): list of column names
        value_ls (list('list')): list of lists with the values for each column

    Returns:
        df (DataFrame):  
    """
    dict = {}
    for enum in range(len(col_name_ls)):
        dict[col_name_ls[enum]] = value_ls[enum]
    df = pd.DataFrame(dict)
    return df

In [102]:
def weather_data(date, weatherstation_param):

    response = weather_url(20110101)
    soup = BeautifulSoup(response.content, 'lxml')
    
    # Get weather station
    weather_stations = soup.find_all('td', class_="uk-text-left")
    weather_stations_list = [station.text.strip() for station in weather_stations]

    # Get weather data as dictionary
    weather_data = soup.find_all('td', class_="uk-text-center")
    weather_data_list = [w_data.text.strip() for w_data in weather_data]
    weather_data_list_splitted = split_list_into_pairs(weather_data_list)
    weather_data_list_splitted


    keys = ['MIN_TEMP', 'MAX_TEMP', 'MIN_5_CM_NIGHT', 'SNOW_HEIGHT', 'WIND', 'RAIN']
    weather_dict = {key: None for key in keys}

    for i, key in zip(range(0,7), keys):
        ls = []
        for j in range(len(weather_data_list_splitted)):
            value = weather_data_list_splitted[j][i]
            ls.append(value)
            
        weather_dict[key] = ls


    # Get sunshine duration    
    sunshine_duration_h = soup.find_all('td', class_="td_beo_r")
    sunshine_duration_h_list = [duration.text.strip() for duration in sunshine_duration_h]
    sunshine_duration_h_list


    col_name_ls = ['WEATHER_STATION', 'MIN_TEMP_C', 'MAX_TEMP_C', 'MIN_5_CM_NIGHT', 'SNOW_HEIGHT_cm', 'WIND', 'RAIN_l/m2', 'SUNSHINE_DURATION_h']
    value_ls = [weather_stations_list, weather_dict['MIN_TEMP'], weather_dict['MAX_TEMP'], weather_dict['MIN_5_CM_NIGHT'], weather_dict['SNOW_HEIGHT'], weather_dict['WIND'], weather_dict['RAIN'], sunshine_duration_h_list]
    df = dict_to_df(col_name_ls, value_ls)
    df = df[df['WEATHER_STATION'] == weatherstation_param]

    df.drop(['MIN_5_CM_NIGHT'], axis = 1, inplace=True)
    df.drop(['WIND'], axis = 1, inplace=True)


    return df

In [103]:
weather_data(20220101, 'Berlin/Tegel')

https://www.wetterkontor.de/de/wetter/deutschland/extremwerte.asp?id=20110101


Unnamed: 0,WEATHER_STATION,MIN_TEMP_C,MAX_TEMP_C,SNOW_HEIGHT_cm,RAIN_l/m2,SUNSHINE_DURATION_h
14,Berlin/Tegel,1,36,0,0,2
