In [None]:
import pandas as pd
import numpy as np
import re
import requests
import time

from bs4 import BeautifulSoup

def moveSibling(tag, number):
    """Advance ``number`` siblings forward from ``tag`` and return the result.

    Parameters
    ----------
    tag : object exposing a ``next_sibling`` attribute (e.g. a bs4 element)
        Starting node.
    number : int
        How many times to follow ``next_sibling``. Values below 1 return
        ``tag`` unchanged.

    Returns
    -------
    The node reached after ``number`` steps, or ``None`` if the chain of
    siblings ends before that many steps could be taken.
    """
    steps_left = number
    while steps_left >= 1:
        if tag is None:
            # Ran off the end of the sibling chain: report "nothing there"
            # instead of failing with an AttributeError on None.
            return None
        # ``next_sibling`` is the current Beautiful Soup 4 name (the old
        # ``nextSibling`` spelling is deprecated) and matches the attribute
        # used by the rest of this notebook.
        tag = tag.next_sibling
        steps_left -= 1
    return tag

In [None]:
# Column layouts and the (initially empty) DataFrames that collect the
# downloaded data.

# Game information: identifying columns first, then ten statistics for the
# home team ("H"), the same ten for the away team ("A"), the attendance and
# six "Last6-n" columns.
_team_stat_names = ['Goals', 'Shots', 'Passes', 'PassQu', 'Posses',
                    'Chall', 'Fouls', 'Offside', 'Corners', 'Chances']
matches_columns = ['Season', 'Gameday', 'Link', 'TeamH', 'TeamA']
matches_columns.extend(name + 'H' for name in _team_stat_names)
matches_columns.extend(name + 'A' for name in _team_stat_names)
matches_columns.append('Attendance')
matches_columns.extend('Last6-%d' % n for n in range(1, 7))
matches_df = pd.DataFrame(columns=matches_columns)

# Goal information; can be linked with the game information through "Link".
goals_columns = 'Season Gameday Link Score Minute Player'.split()
goals_df = pd.DataFrame(columns=goals_columns)

# Starting 11: the game link followed by eleven zero-padded player columns
# for the home side and eleven for the away side.
lineup_columns = ['Link']
for _side in ('Home', 'Away'):
    lineup_columns += ['%s_Player%02d' % (_side, slot) for slot in range(1, 12)]
lineup_df = pd.DataFrame(columns=lineup_columns)

# Scrape every gameday of the requested season(s) from kicker.de.
# For each game found on a gameday page this appends
#   * one row of match statistics to matches_df,
#   * one row with both starting elevens to lineup_df,
#   * one row per goal to goals_df.
# The code below is written so that it runs on both Python 2 and Python 3.
for year in range(2014,2015):
    for gameday in range(1,35):
        # Write URL to access kicker gameday website
        url = "http://www.kicker.de/news/fussball/bundesliga/spieltag/1-bundesliga/"
        url = url + str(year) + "-" + str(year+1)[2:4] + "/"
        url = url + str(gameday) + "/0/spieltag.html"

        # Avoid overloading with requests and pause if necessary.
        # Only request errors trigger a retry; anything else (in particular
        # a KeyboardInterrupt) propagates so the loop can be interrupted.
        file = None
        while file is None:
            try:
                file = requests.get(url)
            except requests.exceptions.RequestException:
                time.sleep(0.1)

        # Download and parse page
        main = BeautifulSoup(file.content, 'html.parser')

        # Games are listed in a table
        main_matches = main.find('table',
                                 {'class': 'tStat tab1-bundesliga',
                                  'summary': 'Begegnungen'})

        # Find all links of class "link." These refer to the game analyses.
        # In the analyses, we find (most of) the game stats
        for game in main_matches.find_all('a', {'class' : 'link'}):
            game_url = game.get('href')
            encoded_in_url = game_url.split("/")

            # Game ID is the article ID for kicker, as unique ID for game
            game_id = encoded_in_url[8][0:7]

            # Game URL encodes the teams which are playing
            teams = encoded_in_url[9][13:-5].split("_")
            teams = [teams[i][teams[i].rfind('-')+1:] for i in range(2)]

            # There is a different webpage for historic info on the teams.
            # We only have to swap out part of the url
            game_url = "http://www.kicker.de" + game_url
            hist_file = requests.get(game_url.replace("spielanalyse",
                                                      "direktvergleichliga"))
            hist_soup = BeautifulSoup(hist_file.content, 'html.parser')

            # Can't use "Direktvergleich" because it includes all matches
            # up to date of download and not to date of that game.
            # hist = hist_soup.find('table', {'class': 'tStat',
            #                                 'summary': 'Direktvergleich'})
            # record = map(int, map(str.strip, str(hist.find('td', {'class': "alignright spielinfo_wert"}).text).split('/')))

            hist = hist_soup.find('table',
                                  {'class': 'tStat tab1-bundesliga',
                                   'summary': 'Begegnungen'})

            # Empty list for the results of previous matchups of the teams
            results = []
            try:
                for g in hist.find_all('td', {'class': 'alignleft nowrap'}):
                    link = g.next_sibling.next_sibling.find('a').get('href')
                    encoded_in_url = link.split("/")

                    # Date of matches in list of all matchups of these teams
                    res_year = int(encoded_in_url[6][0:4])
                    res_gday = int(encoded_in_url[7])

                    # Only include games prior to this one!
                    if res_year < year or (res_year == year and res_gday < gameday):
                        result = g.text
                        # Text before "(" is "<goals>:<goals>"; keep the
                        # goal difference only.
                        result = [int(part) for part in
                                  result[0:result.find('(')].strip().split(":")]
                        result = result[0]-result[1]
                        # If the historical game was Team B - Team A
                        # instead of Team A - Team B today, then flip result
                        if link.find(teams[0] + ".html") != -1:
                            result = -result
                        results.append(result)

                    # Only interested in the last 6
                    if len(results) >= 6:
                        break
            except Exception:
                # A missing or malformed history table must not drop the
                # whole game from every output table: keep whatever results
                # were parsed so far and let the NaN padding below fill the
                # remaining slots.
                pass

            # If there were fewer than 6 games, fill up with NaN
            results += [np.nan] * (6 - len(results))


            # Open website with statistics from the game and parse.
            game_file = requests.get(game_url)
            game_soup = BeautifulSoup(game_file.content, 'html.parser')

            # Error handling if website is malformatted or other:
            # re-download up to three times until the lineup table shows up.
            error_count = 0
            while game_soup.find('table', {'class': 'tStat',
                                           'summary': 'Vereinsliste'}) is None:
                game_file = requests.get(game_url)
                game_soup = BeautifulSoup(game_file.content, 'html.parser')
                error_count += 1

                if error_count == 3:
                    print("Error on game " + str(game_id))
                    break

            # Download lineups
            # Error handling again; if problems with lineup, write NaN and continue
            if error_count < 3:
                both_teams = game_soup.find('table', {'class': 'tStat',
                                                      'summary': 'Vereinsliste'})
                lineup = []
                for one_team in both_teams.find_all('div',
                                                    {'class': 'aufstellung_team'}):
                    for player in one_team.find_all('a',
                                                    {'class': 'link_noicon'}):
                        lineup.append(player.get('href').split("/")[8])
                        # Make sure to only download the first 11,
                        # the Starting 11, for each team
                        if len(lineup) % 11 == 0:
                            break
                lineup = [int(game_id)] + [int(player_id) for player_id in lineup]
            else:
                lineup = [int(game_id)] + [np.nan for _ in range(22)]

            # pd.concat instead of DataFrame.append, which was removed in
            # pandas 2.0.
            lineup_df = pd.concat([lineup_df,
                                   pd.DataFrame(data=[lineup],
                                                columns=lineup_columns)],
                                  ignore_index=True)

            # Download main stats and save in a list.
            stats = game_soup.find('table', {'class': 'tStat tStatKarten',
                                             'summary': 'Berufungen'})
            # Cells with one of these texts are left out. Written with
            # escapes so the literals are unicode on Python 2 and 3 alike
            # ("\xe4" is the a-umlaut).
            skipped_stats = (u'angekommene P\xe4sse', u'Ecken',
                             u'Fehlp\xe4sse', u'Gefoult worden')
            team1_stats = []
            team2_stats = []
            try:
                for stat in stats.find_all('td', {'class': "alignleft first"}):
                    if stat.text not in skipped_stats:
                        # Strip "%" (and backslash) characters from the value.
                        team1_stats.append(stat.next_sibling.next_sibling.text.strip("\\%"))
            except Exception:
                # No usable stats table: keep only the goals, read from the
                # score board, and leave the other seven stats as NaN.
                team1_stats = [np.nan for _ in range(8)]
                team1_stats[0] = int(game_soup.find('div', {'id': 'ovBoardExtMainH'}).text)
            try:
                for stat in stats.find_all('td', {'class': "alignright last"}):
                    if stat.text not in skipped_stats:
                        team2_stats.append(stat.previous_sibling.previous_sibling.text.strip("\\%"))
            except Exception:
                team2_stats = [np.nan for _ in range(8)]
                team2_stats[0] = int(game_soup.find('div', {'id': 'ovBoardExtMainA'}).text)


            # For whatever reason, some stats are stored in a different table.
            # Grab this and add to list.
            for value in game_soup.findAll('div',
                                           id=['ctl00_PlaceHolderHalf_ctl03_kickerDaten',
                                               'ctl00_PlaceHolderHalf_ctl04_kickerDaten']):
                corners = value.find('div', {'class': 'ecken'}).findNext('div').text.split(':')
                team1_stats.append(int(corners[0]))
                team2_stats.append(int(corners[1]))
                chances = value.find('div', {'class': 'chancen'}).findNext('div').text.split(':')
                team1_stats.append(int(chances[0]))
                team2_stats.append(int(chances[1]))

            # Each team owns ten stat columns in matches_columns. Pad short
            # lists here so that a missing corners/chances box cannot shift
            # the attendance and Last6 values into the wrong columns.
            team1_stats += [np.nan] * (10 - len(team1_stats))
            team2_stats += [np.nan] * (10 - len(team2_stats))

            # Download number of people in attendance.
            # Reset per game so a page without the attendance box yields NaN
            # instead of the previous game's value (or a NameError).
            attendance = np.nan
            for value in game_soup.findAll('div', id=['ctl00_PlaceHolderHalf_ctl03_zuschauer',
                                                      'ctl00_PlaceHolderHalf_ctl04_zuschauer']):
                attendance = value.find('div', {'class': 'wert'}).text
                if attendance.find('(') > 0:
                    # "Error" handling if stadium is sold out
                    attendance = attendance[0:attendance.find('(')]
                attendance = int(attendance.strip())


            # Write stats for game into dataframe
            feed = [year] + [gameday] + [game_id] + teams + \
                    team1_stats + team2_stats + [attendance] + results
            feed += [np.nan] * (len(matches_columns) - len(feed))
            matches_df = pd.concat([matches_df,
                                    pd.DataFrame(data=[feed],
                                                 columns=matches_columns)],
                                   ignore_index=True)

            # Find the scorers in the game and download the info
            goal_list = []
            goals_cards = game_soup.find('table', {'class': 'tStat', 'summary': 'Tore & Karten'})
            goals = goals_cards.find('div', {'class': 'tore_karten'})
            goals = goals.find_all('div', {'class': 'kompletteZeile'})

            for goal in goals:
                goal_text = goal.text
                goal_link = goal.find('a')
                player = goal_link.get('href').split("/")[8]

                # Score difference after this goal. NOTE: only the single
                # character on each side of the first ":" is read, so
                # two-digit scores are not handled.
                pos = goal_text.find(":")
                score = int(goal_text[pos-1:pos])-int(goal_text[pos+1:pos+2])
                pos = goal_text.find(".,")
                # "Error" handling for when a goal is scored in overtime. Treat as 45' and 90'.
                if pos < 0:
                    pos = goal_text.find(". +")
                minute = goal_text[0:pos]
                pos = minute.find("(")
                minute = int(minute[pos+1:])

                goal_list.append([int(field) for field in
                                  [year, gameday, game_id, score, minute, player]])

            # If goals fell, write info into goal dataframe.
            if len(goal_list) > 0:
                goals_df = pd.concat([goals_df,
                                      pd.DataFrame(data=goal_list,
                                                   columns=goals_columns)],
                                     ignore_index=True)
                
def _cast_int_columns(df):
    """Return a copy of ``df`` with every column converted to int where possible.

    Columns whose values cannot be converted (text, NaN, ...) are left
    exactly as they are.
    """
    converted = df.copy()
    for col in converted.columns:
        try:
            # Plain column assignment replaces the column, dtype included.
            # Assigning through ``.loc[:, col]`` writes into the existing
            # column in place on pandas 2.x and would keep the old dtype.
            converted[col] = converted[col].astype(int)
        except (ValueError, TypeError):
            # Not an integer column (e.g. team names or NaN placeholders).
            continue
    return converted


# Convert the scraped values to integers wherever that is possible.
matches_df = _cast_int_columns(matches_df)

# Goal difference from the home team's point of view.
matches_df["GD"] = matches_df["GoalsH"] - matches_df["GoalsA"]

# Reorder the columns so that the home and away value of each statistic
# sit next to each other, with the goal difference right after the teams.
matches_columns_ro = ['Season', 'Gameday', 'Link', 'TeamH', 'TeamA', 'GD',
                      'GoalsH', 'GoalsA', 'ShotsH', 'ShotsA', 'PassesH',
                      'PassesA', 'PassQuH', 'PassQuA', 'PossesH', 'PossesA',
                      'ChallH', 'ChallA', 'FoulsH', 'FoulsA', 'OffsideH', 'OffsideA',
                      'CornersH', 'CornersA', 'ChancesH', 'ChancesA', 'Attendance',
                      'Last6-1', 'Last6-2', 'Last6-3', 'Last6-4', 'Last6-5', 'Last6-6']
matches_df = matches_df.loc[:, matches_columns_ro]

goals_df = _cast_int_columns(goals_df)
lineup_df = _cast_int_columns(lineup_df)

# Write the three tables to CSV files in the current working directory.
matches_df.to_csv("Matches_2014.csv")
goals_df.to_csv("Goals_2014.csv")
lineup_df.to_csv("Lineup_2014.csv")

In [None]:
# Code to download the table/standings at any given game day.
# Intentionally left commented out: it is easier to calculate the standings on the fly.

# main_standings = main.find('table', {'class': 'tStat', 'summary': 'Tabelle'})
# main_standings = main_standings.find_all('tr', {'class': ['fest ', 'fest alt']})

# for team in main_standings:
#     # Start with the link for the team as our anchor
#     current_tag = team
#     # Split up the URL that encodes the team name & ID, and then separate the team ID from the teamname
#     team_id = int(current_tag.get('href').split('/')[7].split('-')[-1])
#     # Move to the win-draw-loss record
#     current_tag = current_tag.parent.parent.parent
#     current_tag = moveSibling(current_tag, 8)
#     rec_w = int(current_tag.text)
#     current_tag = moveSibling(current_tag, 2)
#     rec_d = int(current_tag.text)
#     current_tag = moveSibling(current_tag, 2)
#     rec_l = int(current_tag.text)
#     current_tag = moveSibling(current_tag, 4)
#     goals = map(int, current_tag.text.split(":"))

#     print [team_id] + [rec_w] + [rec_d] + [rec_l] + goals