In [1]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException

import os
import json
from urllib.request import urlopen, Request

import sys
import time

from unidecode import unidecode

import pandas as pd
import numpy as np
from pandas import DataFrame, Series
import sqlite3

from datetime import datetime
from difflib import SequenceMatcher

In [None]:
%load_ext autoreload
%autoreload 2

In [2]:
import scrape_selenium_soccerway as utils

In [3]:

#convert string of type x% to a float 
def perc2float(string):
    s =string.strip() 
    if s[-1]=='%':
        return float(s.strip("%"))/100
    return string

#converts date string of type dd-mon-YY to datetime
def str2date(string):
    helper = { 'Sep':9, 'Aug':8, 'Jul':7, 'Jun':6, 'May':5, 'Apr':4, 'Mar':3, 'Feb':2, 'Jan':1, 'Oct':10, 'Nov':11, 'Dec': 12 }
    s = string.split('-')
    return(datetime(int(''.join(['20',s[2]])), helper[s[1]], int(s[0])))

#converts date string of type dd-mon-YY to datetime
def str2date2(string):
    helper = { 'Sep':9, 'Aug':8, 'Jul':7, 'Jun':6, 'May':5, 'Apr':4, 'Mar':3, 'Feb':2, 'Jan':1, 'Oct':10, 'Nov':11, 'Dec': 12 }
    s = string.split(' ')
    return(datetime(int(s[2]), helper[s[1]], int(s[0])))

#
def dist(string1, string2):
    match = SequenceMatcher(None, string1, string2).find_longest_match(0,len(string1), 0, len(string2))
    return match.size

#
def map_names(list1, list2):
    if len(list1) != len(list2):
        print('lists do not have equal length')
        return None
    list12 = [x[0] for x in list1]
    indices = [np.argmax([dist(s2, s) for s2 in list12]) for s in list2]
    mapping = {}
    for i,ix in enumerate(indices):
        mapping[list2[i]] = list1[ix][1]
    return mapping

def find_patch_id(date):
    dates2patch = {datetime(2017,8,1): 158835,   
                datetime(2017,2,22): 158647,   
                datetime(2016,8,1): 158466,   
                datetime(2016,2,19): 158278,   
                datetime(2015,8,1): 158103,   
                datetime(2015,2,20): 157914,   
                datetime(2014,8,1): 157739,   
                datetime(2014,2,21): 157550,   
                datetime(2013,8,1): 157376,   
                datetime(2013,2,22): 157186,   
                datetime(2012,8,1): 157011,   
                datetime(2012,2,22): 156820,   
                datetime(2011,8,1): 156644,   
                datetime(2011,2,22): 156455,   
                datetime(2010,8,1): 156279,   
                datetime(2010,2,22): 156090,   
                datetime(2009,8,1): 155914,   
                datetime(2009,2,22): 155725,   
                datetime(2008,8,1): 155549}
    
    
    for d in dates2patch.keys():
        if date < d:
            continue
        else:
            return dates2patch[d]
    print('none found')
    return None
    

## players: list of players, first half of team 1, second half of team 2
## teams: teamIDs as seen in database
## database: name of database file
## c: sqlite3 cursor. 
## Either c or database must have a value. 
def player_name2playerID(players, teams, database=None, c=None, out=False):
    if not c:
        con = sqlite3.connect(database)
        c = con.cursor()
    query0 = [x for x in c.execute('SELECT name,playerID FROM player WHERE teamID IN ('+str(teams[0])+')')]
    query1 = [x for x in c.execute('SELECT name,playerID FROM player WHERE teamID IN ('+str(teams[1])+')')]
    if not c:
        con.close()
    playerIDs = []
    
    if len((players[0]+players[1])) != 36:
        print(len(players))
#         print('not the right amount of players provided, sorry')
#         return None
    
    homep = utils.match_club_names(query0, players[0])
    awayp = utils.match_club_names(query1, players[1])
    if out:
        for i in homep:
            print(i, ':', homep[i])
        for i in awayp:
            print(i, ':', awayp[i])
        
    for player in players[0]:
        playerIDs.append(homep[player][1])
    playerIDs += ['0']*(18-len(playerIDs))

    for player in players[1]:
        playerIDs.append(awayp[player][1])
    playerIDs += ['0']*(36-len(playerIDs))
    
    
    return playerIDs

In [4]:
#scrapes a match report from whoscored.com to a row in the match table
def get_match_stats(url, database, driver, team_mapping, ligaID, sleep_time = 0.35):
    
    # team_match_stats_box(home, away):  Shots, Shots on target, pass%, aerial duel %, dribbles, tackles, possesion%
    team_match_stats = []
    
    #store shot statistics(home, away): total shots, aus dem spiel, ruhender ball, konter, elfmeter, 
    #                                   eigentor, total shots, total goals, conversion rate
    attempts = []

    #store passing statistics(home, away): total passes, crosses, through balls, long balls, 
    #                                    short passes, total passes, average pass streak
    passes = []

    #store fouling statistics(home, away): total cards, cards cuz foul, cards cuz unproffesional, 
    #                                    cards cuz dive, cards cuz other, red cards, yellow cards,
    #                                    cards per foul, total fouls
    fouls = []

    #get player names
    names = []

    #get player names simplified (ohne umlaute, akzente, etc)
    names_simple = []

#     #adblock with add_extension
#     chop = webdriver.ChromeOptions()
#     chop.add_extension('/home/mace/Downloads/Adblock-Plus_v1.13.3.crx')
#     with webdriver.Chrome('/home/mace/Downloads/chromedriver', chrome_options=chop) as driver:    
    #load website
    driver.get(url)
    time.sleep(sleep_time)
    
    ## select meta data
    homeID,awayID = [team_mapping[x.text] for x pca vs ldain driver.find_elements_by_class_name('team-link ')]
    date = str2date(driver.find_element_by_xpath('//div[@id="match-header"]//div[position()=3]/dl/dd[position()=2]').text.split(' ')[1])
    matchID = url.split('/')[4]

    #select general team match stats
    for x in driver.find_elements_by_xpath('//div[@id = "match-report-team-statistics"]//span[contains(@class,"stat-value")]/span'):
        team_match_stats.append(perc2float(x.text))

    #scrape shot statistics
    for x in driver.find_elements_by_xpath('//div[@id = "live-goals"]//span[contains(@class,"stat-value")]/span'):
        attempts.append(perc2float(x.text))

    driver.execute_script('window.scrollBy(0,600)')

    #select passing statistics
    driver.find_element_by_xpath("//a[@href='#live-passes']").click()
    time.sleep(sleep_time)

    #scrape passing statistics
    for x in driver.find_elements_by_xpath('//div[@id = "live-passes"]//span[contains(@class,"stat-value")]/span'):
        passes.append(x.text)

    #select fouling statistics
    driver.find_element_by_xpath("//a[@href='#live-aggression']").click()
    time.sleep(sleep_time)

    #scrape fouling statistics
    for x in driver.find_elements_by_xpath('//div[@id = "live-aggression"]//span[contains(@class,"stat-value")]/span'):
        fouls.append(x.text)

    #go to player stats/site
    driver.get(url.replace('MatchReport', 'LiveStatistics'))
    time.sleep(sleep_time)
    #get player names
    #repeat looking until list is populated to give site time to load
    while len(names)< 36 or names[0] == '0':
        names = []
        names_simple = []
        k=driver.find_elements_by_xpath("//div[@id='live-player-home-stats']//a[@class='player-link']")
        for x in k:
            names.append(x.text.split('(')[0].strip())

        for x in k:
            names_simple.append(unidecode(x.text.split('(')[0].strip()))
            
        names += ['0']*(18-len(names))
        names_simple+= ['0']*(18-len(names_simple))
    
        k=driver.find_elements_by_xpath("//div[@id='live-player-away-stats']//a[@class='player-link']")
        for x in k:
            names.append(x.text.split('(')[0].strip())

        for x in k:
            names_simple.append(unidecode(x.text.split('(')[0].strip()))

        names += ['0']*(36-len(names))
        names_simple += ['0']*(36-len(names_simple))
     
    
    with sqlite3.connect(database) as connection:
        cursor = connection.cursor()
        onfield = player_name2playerID(names_simple, [homeID, awayID], c=cursor)
        match_stats = onfield+attempts[14:16]+team_match_stats+attempts[2:12]+attempts[16:]+passes[:10] + passes[12:]+fouls[-2:]+fouls[12:14]+fouls[10:12]+fouls[2:10]
        cursor.execute('INSERT OR IGNORE INTO match VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)', (matchID, ligaID, date, url, homeID, awayID, *match_stats))
    
    

SyntaxError: invalid syntax (<ipython-input-4-04bd97400892>, line 35)

In [5]:
len([229923158647, 198202158647, 193886158647, 150418158647, 115533158647, 193886158647, 203030158647, 192984158647, 192565158647, 195363158647, 164994158647, 177553158647, 212245158647, 198118158647, 164994158647, 115533158647, 115533158647, 115533158647, 188545158647, 188545158647, 237595158647, 167495158647, 231228158647, 121939158647, 190483158647, 156616158647, 9014158647, 156616158647, 231228158647, 231228158647, 9014158647, 178603158647, 45197158647, 189596158647, 168607158647, 190483158647])

36

In [6]:
import glob

csv_files_dict = {14:glob.glob('soccer_odds/eng*'),
                  13:glob.glob('soccer_odds/epl_*'),
                  19:glob.glob('soccer_odds/bun*'),
                  20:glob.glob('soccer_odds/2bun*'),
                  53:glob.glob('soccer_odds/lali*'),
                  31:glob.glob('soccer_odds/seri*'),
                  16:glob.glob('soccer_odds/ligue*'),
                  10:glob.glob('soccer_odds/ere*'),
                  68:glob.glob('soccer_odds/superli*'),
                  60:glob.glob('soccer_odds/E2*'),
                  61:glob.glob('soccer_odds/E3*')}
                  

def date_conv(x):
    if len(str(x).split('/')) <3:
        return None
    d, m, y = str(x).split('/')
    return datetime(*[int(x) for x in [''.join(['20',y]), m, d]])

In [7]:
url_base = 'http://de.soccerway.com/national/germany/bundesliga/20162017/regular-season/r35823/matches/'
con = sqlite3.connect('soccer.sqlite')
c = con.cursor()

In [8]:
[x for x in c.execute('select match.ligaID, league.name, count(*) from league join match on match.ligaID = league.leagueID group by match.ligaID')]

[(10, 'Eredevise', 2195),
 (13, 'Premier League', 3033),
 (14, 'English Championship', 3978),
 (16, 'Ligue 1', 2514),
 (19, 'Bundesliga', 2387),
 (20, '2. Bundesliga', 2194),
 (31, 'Serie A', 2726),
 (53, 'La Liga', 3040),
 (60, 'English League One', 4401),
 (68, 'Super Lig', 850)]

In [9]:
chop = webdriver.ChromeOptions()
chop.add_extension('/home/mace/Downloads/Adblock-Plus_v1.13.3.crx')
driver = webdriver.Chrome('/home/mace/Downloads/chromedriver', chrome_options=chop)
driver2 = webdriver.Chrome('/home/mace/Downloads/chromedriver', chrome_options=chop)

In [11]:
c.execute('select * from match')
match_cols =[desc[0] for desc in c.description]

# url_base = 'http://de.soccerway.com/national/england/league-one/20162017/regular-season/r36641/matches/?ICID=PL_3N_02'
url_base = 'http://de.soccerway.com/national/england/league-two/20162017/regular-season/r36644/matches/?ICID=PL_3N_02'

# url_base = 'http://de.soccerway.com/national/netherlands/eredivisie/20162017/regular-season/r36385/matches/?ICID=PL_3N_02'

ligaID = 61
odds_data = pd.concat((pd.read_csv(f, dtype={'Date':str, 'AwayTeam':str, 'HomeTeam':str}) for f in csv_files_dict[ligaID]))
odds_data['Date'] = odds_data['Date'].apply(date_conv)
odds_data.rename(columns={'AS':'AShot', 'HS':'HShot', 'BbAv>2.5':'BbAvo25', 'BbAv<2.5':'BbAvu25', 'BbMx>2.5':'BbMxo25', 'BbMx<2.5':'BbMxu25' }, inplace=True)
tm2 = {v[1]:k for k,v in utils.match_club_names([x for x in c.execute('SELECT DISTINCT name, sofifaTeamID FROM team WHERE ligaID=? GROUP BY name', (ligaID,))],list( odds_data['AwayTeam'].dropna().unique())).items()}
driver.get(url_base)
# this can be activated to skip pages in the beginnning
# for _ in range(2):
#     driver.find_element_by_css_selector('.previous').click()
#     time.sleep(0.5)
for select in range(2,10):
    soccerway_teams = []
    while True:
        time.sleep(1.3)
        matches = driver.find_elements_by_css_selector('.score-time.score')
        for match in matches:
            url = match.find_element_by_tag_name('a').get_attribute('href')
            url_ = url.split('/')
            match_dict = dict.fromkeys(match_cols, -1)

            print(url)
            date = datetime(*[int(x) for x in url_[4:7]])
            patch = find_patch_id(date)

            
            
#             TODO select team name by title if errors occur due to incomplete team names
            for teamb in driver.find_elements_by_xpath('//td[@class="team team-b "]/a'):
                soccerway_teams.append(teamb.get_attribute('href').split('/')[5].replace('-', ' '))
            for teamb in driver.find_elements_by_xpath('//td[@class="team team-a "]/a'):
                soccerway_teams.append(teamb.get_attribute('href').split('/')[5].replace('-', ' '))
               
            soccerway_teams = list(set(filter(None,soccerway_teams)))
            if len(soccerway_teams) > 15:
                tm_raw = utils.match_club_names([name for name in c.execute('SELECT name, teamID FROM team WHERE patchID IS '+str(find_patch_id(date))+' AND ligaID IS '+str(ligaID))], soccerway_teams)
                tm = {key:val[1] for key,val in zip(tm_raw.keys(), tm_raw.values())}
            matchID = url_[11]
            #todo map names team_mapping = map_names(stuffstuff)
            homeID, awayID = [tm[x] for x in [' '.join(url_[9].split('-')),' '.join(url_[10].split('-'))]]
            driver2.get(url)
            time.sleep(0.6)
            names = [[],[]]
            
            for p in driver2.find_elements_by_xpath('//div[@class="container left"]//tr'):
                try:
                    jersey = p.find_element_by_class_name('shirtnumber').text
                    name = ' '.join(p.find_element_by_tag_name('a').get_attribute('href').split('/')[4].split('-'))
                    names[0].append((name, jersey))
                except NoSuchElementException:
                    pass
                
            for p in driver2.find_elements_by_xpath('//div[@class="container right"]//tr'):
                try:
                    jersey = p.find_element_by_class_name('shirtnumber').text
                    name = ' '.join(p.find_element_by_tag_name('a').get_attribute('href').split('/')[4].split('-'))
                    names[1].append((name, jersey))
                except NoSuchElementException:
                    pass

            pid = utils.match_player([[x for x in c.execute('SELECT name, playerID, jersey from player WHERE teamID IS ?', (homeID, ))],[x for x in c.execute('SELECT name, playerID, jersey from player WHERE teamID IS ?', (awayID, ))]], names)
            match_dict['patch'] = patch
            match_dict['matchID'] = matchID
            match_dict['date'] = date
            match_dict['ligaID'] = ligaID
            match_dict['homeID'] = homeID
            match_dict['awayID'] = awayID
            match_dict['url'] = url
            #die Daten aus pandas auslesen und ins data_dict uebertragen
            
            game = odds_data[(odds_data['Date']==date) & (odds_data['HomeTeam']== tm2[int(homeID/1000000)]) & (odds_data['AwayTeam']==tm2[int(awayID/1000000)]) ]
            if game.shape[0] == 0:
                continue
            for col in match_cols[42:]:
                if col in game.columns:
                    match_dict[col] = game[col].iloc([0])[0]
            for i, p in enumerate(match_cols[6:42]):
                match_dict[p] = pid[i]

            match_values =  [match_dict[x] for x in match_cols]
            try:
                c.execute('INSERT OR IGNORE INTO match VALUES('+','.join(["?" for x in match_cols ])+')', (*match_values,))
                con.commit()
            except sqlite3.OperationalError:
                print('INSERT OR IGNORE INTO match VALUES('+','.join(["?" for x in match_cols ])+')', (*[match_dict[x] for x in match_cols]))
        print('=============Page Done============')
        try:
            driver.find_element_by_css_selector('.previous.disabled')
            break
        except NoSuchElementException:
            driver.find_element_by_css_selector('.previous').click()

    s = Select(driver.find_element_by_id('season_id_selector'))
    s.select_by_index(select)
    driver.find_elements_by_xpath('//*[@id="submenu"]//li')[1].click()

http://de.soccerway.com/matches/2017/04/22/england/league-two/crewe-alexandra-fc/leyton-orient-fc/2248351/?ICID=PL_MS_01
http://de.soccerway.com/matches/2017/04/22/england/league-two/wycombe-wanderers-fc/doncaster-rovers-fc/2248359/?ICID=PL_MS_02
http://de.soccerway.com/matches/2017/04/22/england/league-two/hartlepool-united-fc/barnet-fc/2248354/?ICID=PL_MS_03
http://de.soccerway.com/matches/2017/04/22/england/league-two/exeter-city-fc/morecambe-fc/2248352/?ICID=PL_MS_04
http://de.soccerway.com/matches/2017/04/22/england/league-two/newport-county-afc/accrington-stanley-fc/2248356/?ICID=PL_MS_05
http://de.soccerway.com/matches/2017/04/22/england/league-two/crawley-town-football-club/carlisle-united-fc/2248350/?ICID=PL_MS_06
http://de.soccerway.com/matches/2017/04/29/england/league-two/cambridge-united-fc/crawley-town-football-club/2248362/?ICID=PL_MS_07
http://de.soccerway.com/matches/2017/04/29/england/league-two/barnet-fc/grimsby-town-fc/2248361/?ICID=PL_MS_08
http://de.soccerway.com/

TimeoutException: Message: timeout
  (Session info: chrome=62.0.3202.62)
  (Driver info: chromedriver=2.31.488763 (092de99f48a300323ecf8c2a4e2e7cab51de5ba8),platform=Linux 4.9.0-4-amd64 x86_64)


In [11]:
c.execute('select * from match')
match_cols =[desc[0] for desc in c.description]

# url_base = 'http://de.soccerway.com/national/england/league-one/20162017/regular-season/r36641/matches/?ICID=PL_3N_02'
url_base = 'http://de.soccerway.com/national/england/league-two/20162017/regular-season/r36644/matches/?ICID=PL_3N_02

# url_base = 'http://de.soccerway.com/national/netherlands/eredivisie/20162017/regular-season/r36385/matches/?ICID=PL_3N_02'

ligaID = 61
odds_data = pd.concat((pd.read_csv(f, dtype={'Date':str, 'AwayTeam':str, 'HomeTeam':str}) for f in csv_files_dict[ligaID]))
odds_data['Date'] = odds_data['Date'].apply(date_conv)
odds_data.rename(columns={'AS':'AShot', 'HS':'HShot', 'BbAv>2.5':'BbAvo25', 'BbAv<2.5':'BbAvu25', 'BbMx>2.5':'BbMxo25', 'BbMx<2.5':'BbMxu25' }, inplace=True)
tm2 = {v[1]:k for k,v in utils.match_club_names([x for x in c.execute('SELECT DISTINCT name, sofifaTeamID FROM team WHERE ligaID=? GROUP BY name', (ligaID,))],list( odds_data['AwayTeam'].dropna().unique())).items()}
driver.get(url_base)
# this can be activated to skip pages in the beginnning
# for _ in range(2):
#     driver.find_element_by_css_selector('.previous').click()
#     time.sleep(0.5)
for select in range(2,10):
    soccerway_teams = []
    while True:
        time.sleep(2.3)
        matches = driver.find_elements_by_css_selector('.score-time.score')
        for match in matches:
            url = match.find_element_by_tag_name('a').get_attribute('href')
            url_ = url.split('/')
            match_dict = dict.fromkeys(match_cols, -1)

            print(url)
            date = datetime(*[int(x) for x in url_[4:7]])
            patch = find_patch_id(date)

            
            
#             TODO select team name by title if errors occur due to incomplete team names
            for teamb in driver.find_elements_by_xpath('//td[@class="team team-b "]/a'):
                soccerway_teams.append(teamb.get_attribute('href').split('/')[5].replace('-', ' '))
            for teamb in driver.find_elements_by_xpath('//td[@class="team team-a "]/a'):
                soccerway_teams.append(teamb.get_attribute('href').split('/')[5].replace('-', ' '))
               
            soccerway_teams = list(set(filter(None,soccerway_teams)))
            if len(soccerway_teams) > 15:
                tm_raw = utils.match_club_names([name for name in c.execute('SELECT name, teamID FROM team WHERE patchID IS '+str(find_patch_id(date))+' AND ligaID IS '+str(ligaID))], soccerway_teams)
                tm = {key:val[1] for key,val in zip(tm_raw.keys(), tm_raw.values())}
            matchID = url_[11]
            #todo map names team_mapping = map_names(stuffstuff)
            homeID, awayID = [tm[x] for x in [' '.join(url_[9].split('-')),' '.join(url_[10].split('-'))]]
            driver2.get(url)
            time.sleep(0.6)
            names = [[],[]]
            
            for p in driver2.find_elements_by_xpath('//div[@class="container left"]//tr'):
                try:
                    jersey = p.find_element_by_class_name('shirtnumber').text
                    name = ' '.join(p.find_element_by_tag_name('a').get_attribute('href').split('/')[4].split('-'))
                    names[0].append((name, jersey))
                except NoSuchElementException:
                    pass
                
            for p in driver2.find_elements_by_xpath('//div[@class="container right"]//tr'):
                try:
                    jersey = p.find_element_by_class_name('shirtnumber').text
                    name = ' '.join(p.find_element_by_tag_name('a').get_attribute('href').split('/')[4].split('-'))
                    names[1].append((name, jersey))
                except NoSuchElementException:
                    pass

            pid = utils.match_player([[x for x in c.execute('SELECT name, playerID, jersey from player WHERE teamID IS ?', (homeID, ))],[x for x in c.execute('SELECT name, playerID, jersey from player WHERE teamID IS ?', (awayID, ))]], names)
            match_dict['patch'] = patch
            match_dict['matchID'] = matchID
            match_dict['date'] = date
            match_dict['ligaID'] = ligaID
            match_dict['homeID'] = homeID
            match_dict['awayID'] = awayID
            match_dict['url'] = url
            #die Daten aus pandas auslesen und ins data_dict uebertragen
            
            game = odds_data[(odds_data['Date']==date) & (odds_data['HomeTeam']== tm2[int(homeID/1000000)]) & (odds_data['AwayTeam']==tm2[int(awayID/1000000)]) ]
            if game.shape[0] == 0:
                continue
            for col in match_cols[42:]:
                if col in game.columns:
                    match_dict[col] = game[col].iloc([0])[0]
            for i, p in enumerate(match_cols[6:42]):
                match_dict[p] = pid[i]

            match_values =  [match_dict[x] for x in match_cols]
            try:
                c.execute('INSERT OR IGNORE INTO match VALUES('+','.join(["?" for x in match_cols ])+')', (*match_values,))
                con.commit()
            except sqlite3.OperationalError:
                print('INSERT OR IGNORE INTO match VALUES('+','.join(["?" for x in match_cols ])+')', (*[match_dict[x] for x in match_cols]))
        print('=============Page Done============')
        try:
            driver.find_element_by_css_selector('.previous.disabled')
            break
        except NoSuchElementException:
            driver.find_element_by_css_selector('.previous').click()

    s = Select(driver.find_element_by_id('season_id_selector'))
    s.select_by_index(select)
    driver.find_elements_by_xpath('//*[@id="submenu"]//li')[1].click()

SyntaxError: EOL while scanning string literal (<ipython-input-11-2c2103d3a1d8>, line 5)

In [12]:
c.execute('select * from match')
match_cols =[desc[0] for desc in c.description]

# url_base = 'http://de.soccerway.com/national/england/league-one/20162017/regular-season/r36641/matches/?ICID=PL_3N_02'
#url_base = 'http://de.soccerway.com/national/england/league-two/20162017/regular-season/r36644/matches/?ICID=PL_3N_02

url_base = 'http://de.soccerway.com/national/netherlands/eredivisie/20162017/regular-season/r36385/matches/?ICID=PL_3N_02'

ligaID = 10
odds_data = pd.concat((pd.read_csv(f, dtype={'Date':str, 'AwayTeam':str, 'HomeTeam':str}) for f in csv_files_dict[ligaID]))
odds_data['Date'] = odds_data['Date'].apply(date_conv)
odds_data.rename(columns={'AS':'AShot', 'HS':'HShot', 'BbAv>2.5':'BbAvo25', 'BbAv<2.5':'BbAvu25', 'BbMx>2.5':'BbMxo25', 'BbMx<2.5':'BbMxu25' }, inplace=True)
tm2 = {v[1]:k for k,v in utils.match_club_names([x for x in c.execute('SELECT DISTINCT name, sofifaTeamID FROM team WHERE ligaID=? GROUP BY name', (ligaID,))],list( odds_data['AwayTeam'].dropna().unique())).items()}
driver.get(url_base)
# this can be activated to skip pages in the beginnning
# for _ in range(2):
#     driver.find_element_by_css_selector('.previous').click()
#     time.sleep(0.5)
for select in range(2,10):
    soccerway_teams = []
    while True:
        time.sleep(2.3)
        matches = driver.find_elements_by_css_selector('.score-time.score')
        for match in matches:
            url = match.find_element_by_tag_name('a').get_attribute('href')
            url_ = url.split('/')
            match_dict = dict.fromkeys(match_cols, -1)

            print(url)
            date = datetime(*[int(x) for x in url_[4:7]])
            patch = find_patch_id(date)

            
            
#             TODO select team name by title if errors occur due to incomplete team names
            for teamb in driver.find_elements_by_xpath('//td[@class="team team-b "]/a'):
                soccerway_teams.append(teamb.get_attribute('href').split('/')[5].replace('-', ' '))
            for teamb in driver.find_elements_by_xpath('//td[@class="team team-a "]/a'):
                soccerway_teams.append(teamb.get_attribute('href').split('/')[5].replace('-', ' '))
               
            soccerway_teams = list(set(filter(None,soccerway_teams)))
            if len(soccerway_teams) > 15:
                tm_raw = utils.match_club_names([name for name in c.execute('SELECT name, teamID FROM team WHERE patchID IS '+str(find_patch_id(date))+' AND ligaID IS '+str(ligaID))], soccerway_teams)
                tm = {key:val[1] for key,val in zip(tm_raw.keys(), tm_raw.values())}
            matchID = url_[11]
            #todo map names team_mapping = map_names(stuffstuff)
            homeID, awayID = [tm[x] for x in [' '.join(url_[9].split('-')),' '.join(url_[10].split('-'))]]
            driver2.get(url)
            time.sleep(0.6)
            names = [[],[]]
            
            for p in driver2.find_elements_by_xpath('//div[@class="container left"]//tr'):
                try:
                    jersey = p.find_element_by_class_name('shirtnumber').text
                    name = ' '.join(p.find_element_by_tag_name('a').get_attribute('href').split('/')[4].split('-'))
                    names[0].append((name, jersey))
                except NoSuchElementException:
                    pass
                
            for p in driver2.find_elements_by_xpath('//div[@class="container right"]//tr'):
                try:
                    jersey = p.find_element_by_class_name('shirtnumber').text
                    name = ' '.join(p.find_element_by_tag_name('a').get_attribute('href').split('/')[4].split('-'))
                    names[1].append((name, jersey))
                except NoSuchElementException:
                    pass

            pid = utils.match_player([[x for x in c.execute('SELECT name, playerID, jersey from player WHERE teamID IS ?', (homeID, ))],[x for x in c.execute('SELECT name, playerID, jersey from player WHERE teamID IS ?', (awayID, ))]], names)
            match_dict['patch'] = patch
            match_dict['matchID'] = matchID
            match_dict['date'] = date
            match_dict['ligaID'] = ligaID
            match_dict['homeID'] = homeID
            match_dict['awayID'] = awayID
            match_dict['url'] = url
            #die Daten aus pandas auslesen und ins data_dict uebertragen
            
            game = odds_data[(odds_data['Date']==date) & (odds_data['HomeTeam']== tm2[int(homeID/1000000)]) & (odds_data['AwayTeam']==tm2[int(awayID/1000000)]) ]
            if game.shape[0] == 0:
                continue
            for col in match_cols[42:]:
                if col in game.columns:
                    match_dict[col] = game[col].iloc([0])[0]
            for i, p in enumerate(match_cols[6:42]):
                match_dict[p] = pid[i]

            match_values =  [match_dict[x] for x in match_cols]
            try:
                c.execute('INSERT OR IGNORE INTO match VALUES('+','.join(["?" for x in match_cols ])+')', (*match_values,))
                con.commit()
            except sqlite3.OperationalError:
                print('INSERT OR IGNORE INTO match VALUES('+','.join(["?" for x in match_cols ])+')', (*[match_dict[x] for x in match_cols]))
        print('=============Page Done============')
        try:
            driver.find_element_by_css_selector('.previous.disabled')
            break
        except NoSuchElementException:
            driver.find_element_by_css_selector('.previous').click()

    s = Select(driver.find_element_by_id('season_id_selector'))
    s.select_by_index(select)
    driver.find_elements_by_xpath('//*[@id="submenu"]//li')[1].click()

http://de.soccerway.com/matches/2017/04/16/netherlands/eredivisie/stichting-fc-groningen/bvo-fc-zwolle/2240540/?ICID=PL_MS_01
http://de.soccerway.com/matches/2017/04/16/netherlands/eredivisie/feyenoord-rotterdam-nv/stichting-fc-utrecht/2240539/?ICID=PL_MS_02
http://de.soccerway.com/matches/2017/04/16/netherlands/eredivisie/afc-ajax/sportclub-heerenveen/2240541/?ICID=PL_MS_03
http://de.soccerway.com/matches/2017/04/21/netherlands/eredivisie/sportclub-heerenveen/sbv-willem-ii-tilburg/2240542/?ICID=PL_MS_04
http://de.soccerway.com/matches/2017/04/22/netherlands/eredivisie/nijmegen-eendracht-combinatie/sbv-excelsior/2240543/?ICID=PL_MS_05
http://de.soccerway.com/matches/2017/04/22/netherlands/eredivisie/stichting-az/stichting-fc-twente-65/2240946/?ICID=PL_MS_06
http://de.soccerway.com/matches/2017/04/22/netherlands/eredivisie/bvo-fc-zwolle/stichting-heracles-almelo/2240544/?ICID=PL_MS_07
http://de.soccerway.com/matches/2017/04/22/netherlands/eredivisie/stichting-go-ahead-eagles/stichting-f

TypeError: 'NoneType' object is not subscriptable