# Import Libraries

In [1]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import pandas as pd
import time
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
from sklearn import preprocessing
import pickle
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Get Info's

## Get comment's

In [2]:
def get_comments(url: str) -> pd.DataFrame:
    """
    :param url: the url from foxsports site with the commentaries
    :return: return a Data Frame after concat the input df and info's of url
    """
    # make a get in url of game and save the page source to work
    dict_names = {'1. FC Köln': 'FC Koln', '1. FC Union Berlin': 'Union Berlin', '1899 Hoffenheim': 'Hoffenheim',
                  'Bayer Leverkusen': 'Bayer Leverkusen', 'Bayern Munich': 'Bayern Munich',
                  'Borussia Dortmund': 'Dortmund',
                  'Eintracht Frankfurt': 'Eintracht Frankfurt', 'FC Augsburg': 'Augsburg', 'FC Schalke 04': 'Schalke',
                  'FSV Mainz 05': 'Mainz', 'Fortuna Düsseldorf': 'Dusseldorf', 'Hertha BSC Berlin': 'Hertha Berlin',
                  'Mönchengladbach': 'B. Monchengladbach', 'RB Leipzig': 'RB Leipzig', 'SC Freiburg': 'Freiburg',
                  'SC Paderborn': 'Paderborn', 'VfL Wolfsburg': 'Wolfsburg', 'Werder Bremen': 'Werder Bremen'}
    path_crome = 'C:/Users/pedro/selenium/chromedriver.exe'
    driver = webdriver.Chrome(executable_path=path_crome)
    driver.get(url)
    WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, '//*[@id="fullCommentary"]')))
    element = driver.find_element_by_id('fullCommentary')
    driver.execute_script("arguments[0].click();", element)
    html = driver.page_source
    # read the html with BeautifulSoup and get the name of teams
    soup = BeautifulSoup(html, 'lxml')
    home = soup.find_all('span', {'class': 'wisbb_bsName'})[0].text
    home = dict_names[home]
    away = soup.find_all('span', {'class': 'wisbb_bsName'})[1].text
    away=dict_names[away]
    home_goals_final = int(soup.find_all('td', {'class': 'wisbb_bsTotal'})[0].text.strip())
    away_goals_final = int(soup.find_all('td', {'class': 'wisbb_bsTotal'})[1].text.strip())
    # save all commentaries in one list to work, and set the home image to select what team is the commentary
    commentaries = soup.find('table', {'class': 'wisbb_bsCPbpSmallTable'}).find_all('tr')
    img_home = commentaries[0].find('img')['src']
    # make empty lists, and work in each commentary to save commentary, team, and time of commentary
    urls, time, team, comm = [], [], [], []
    for comments in commentaries:
        urls.append(url)
        time_aux = comments.find('td', {'class': 'wisbb_bsSoccerPbpTimeCol'}).text.replace("'", '').split('+')
        time.append(sum([int(item) for item in time_aux]))
        team.append(home if comments.find('img')['src'] == img_home else away)
        comm.append(comments.find('span', {'class': 'wisbb_bsSoccerPbpDesc'}).text)
    infos = {'url': urls,'home_team':home, 'away_team':away, 'time': time, 'team': team,
             'home_goals_final': home_goals_final, 'away_goals_final' : away_goals_final, 'comm': comm}
    # save infos in Data Frame and concat with previous Data Frame
    df_aux = pd.DataFrame(infos)
    driver.quit()
    return df_aux

## Get statistcs

In [3]:
def get_stats_game(url: str) -> pd.DataFrame:
    """
    With the selenium web driver, open a url get stats of a game and append in Data Frame the stats of game
    :param driver: Web driver from selenium
    :param url: link with de stats we want to scraping
    :param df: Data Frame with storage values
    :return: Data frame append new values
    """
    # make a driver to create a section were we going to work, and get the page source
    path_crome = 'C:/Users/pedro/selenium/chromedriver.exe'
    driver = webdriver.Chrome(executable_path=path_crome)
    driver.get(url)
    WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, '//*[@id="summary-content"]/div[1]/div[3]/div[2]/span[1]')))
    time.sleep(2)
    html = driver.page_source
    soup = BeautifulSoup(html, 'lxml')
    # get infos with main page
    infos = {}
    infos['url'] = url
    infos['round'] = int(soup.find('span', {'class': 'description__country'}).text.split(' ')[-1])
    infos['home_team'] = soup.find('div', {'class': 'tname__text'}).text.strip()
    infos['away_team'] = soup.find_all('div', {'class': 'tname__text'})[1].text.strip()
    infos['goals_home_final'] = int(soup.find('span', {'class': 'scoreboard'}).text)
    infos['goals_away_final'] = int(soup.find_all('span', {'class': 'scoreboard'})[1].text)
    infos['final_result'] = 'Away' if infos['goals_home_final'] < infos['goals_away_final'] else 'Home' if infos[
                                                                                                               'goals_home_final'] > \
                                                                                                           infos[
                                                                                                               'goals_away_final'] else 'Draw'
    infos['goals_home_1half'] = int(soup.find('span', {'class': 'p1_home'}).text)
    infos['goals_away_1half'] = int(soup.find('span', {'class': 'p1_away'}).text)
    infos['1half_result'] = 'Away' if infos['goals_home_1half'] < infos['goals_away_1half'] else 'Home' if infos[
                                                                                                               'goals_home_1half'] > \
                                                                                                           infos[
                                                                                                               'goals_away_1half'] else 'Draw'
    # change page to statistic in first half, and get info's
    WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="a-match-statistics"]'))).click()
    WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="statistics-1-statistic"]/span/a'))).click()
    time.sleep(2)
    # get body info with text and make a list by info
    html_code = driver.find_element_by_tag_name("body").text.split('\n')
    # set first info Ball Possession and, make a while to end into last info dangerous_attacks
    i = 0
    x = html_code.index('Ball possession')
    stat = html_code[x + i * 3].lower().replace(' ', '_')
    while stat != 'dangerous_attacks':
        stat = html_code[x + i * 3].lower().replace(' ', '_')
        if stat not in ['throw_ins', 'completed_passes']:
            infos[stat + 'home'] = int(html_code[x + i * 3 - 1].replace('%', ''))
            infos[stat + 'away'] = int(html_code[x + i * 3 + 1].replace('%', ''))
        i += 1
    # make a aux Data Frame, concat the data to previous Data Frame and return a Data Frame
    df_aux = pd.DataFrame(infos, index=[0])
    if 'yellow_cardshome' not in list(df_aux.columns):
        df_aux['yellow_cardshome'] = 0
        df_aux['yellow_cardsaway'] = 0
    if 'red_cardshome' not in list(df_aux.columns):
        df_aux['red_cardshome'] = 0
        df_aux['red_cardsaway'] = 0
    driver.quit()
    return df_aux

# Transform data

## Transform comments

In [4]:
def transform_comments(df):
    df.drop(columns=['url'], inplace= True)
    with open ('C:/Users/pedro/Projetos/bundesbet/models/commentaries/words.p', 'rb') as fp:
        my_words = pickle.load(fp)
    X = list(df['comm'])
    dataprep = Pipeline([('count_vectorizer', CountVectorizer(ngram_range=(1,3), min_df=1, stop_words=my_words))])
    n_com=5
    pipeline = Pipeline([
        ('dataprep', dataprep),
        ('topic_modelling', LatentDirichletAllocation(n_components=n_com, random_state=42,n_jobs=-1))])
    pipeline.fit(X)
    topic_values = pipeline.transform(X)
    df['labels'] = topic_values.argmax(axis=1)
    def who_wins(row):
        result = 'Away' if row['home_goals_final'] < row['away_goals_final'] else 'Home' if row['home_goals_final'] >row['away_goals_final'] else 'Draw'
        return result
    df['result']= df.apply(lambda row: who_wins(row), axis=1)
    df.drop(columns=['home_goals_final','away_goals_final'], inplace=True)
    X=pd.get_dummies(df, columns=['labels'])
    df_group=X.groupby(['team','result']).sum().reset_index()
    df_group.drop(columns=['time'],inplace= True)
    df_home=pd.DataFrame(df_group.iloc[1,:]).T
    home_columns= ['labels_'+str(i)+'_home' for i in np.arange(0,n_com)]
    home_columns=['team_home','result']+home_columns
    df_home.columns=home_columns
    df_away=pd.DataFrame(df_group.iloc[0,:]).T
    away_columns= ['labels_'+str(i)+'_away' for i in np.arange(0,n_com)]
    away_columns=['team_away','result']+away_columns
    df_away.columns=away_columns
    df_result=pd.merge(left=df_home, right=df_away, on=['result'])
    def func(x):
        """
        Applying a function that change values of a Draw Match to 0, when Home team won to 1 and Away team won to -1.
        """
        if x == 'Draw':
            return 0
        elif x == 'Home':
            return 1
        else:
            return -1
    def apply_func(dataframe, column):
        dataframe[column] = dataframe[column].apply(func)
    apply_func(df_result, 'result')
    df_final=df_result.copy()
    le = preprocessing.LabelEncoder()
    df_result[['team_home','team_away']] = df_result[['team_home','team_away']].apply(le.fit_transform)
    model_lda = pickle.load(open('C:/Users/pedro/Projetos/bundesbet/models/commentaries/ldamodel.sav', 'rb'))
    X=df_result.drop(columns=['result'])
    proba= model_lda.predict_proba(X)
    df_final['proba_home']=proba[:,2]
    df_final['proba_draw']=proba[:,1]
    df_final['proba_away']=proba[:,0]
    return df_final

## Transform stats

In [5]:
def transform_stats(df):
    def func(x):
        """
        Applying a function that change values of a Draw Match to 0, when Home team won to 1 and Away team won to -1.
        """
        if x == 'Draw':
            return 0
        elif x == 'Home':
            return 1
        else:
            return -1

    def apply_func(dataframe, column_list):
        for column in column_list:
            dataframe[column] = dataframe[column].apply(func)
    df_aux=df.copy()
    df_aux.drop(columns=['url','round','final_result','goals_home_final', 'goals_away_final'], inplace= True)
    list_result = ['1half_result']
    apply_func(df_aux, list_result)
    scaler = StandardScaler()
    le = LabelEncoder()
    df_aux[['home_team','away_team']] = df[['home_team','away_team']].apply(le.fit_transform)
    model_stats = pickle.load(open('C:/Users/pedro/Projetos/bundesbet/models/game stats/game_stats_model.sav', 'rb'))
    X=df_aux
    df['proba_stats_home']=model_stats.predict_proba(X)[:,2]
    df['proba_stats_draw']=model_stats.predict_proba(X)[:,1]
    df['proba_stats_away']=model_stats.predict_proba(X)[:,0]
    return df

# Stak Models

In [6]:
def stak(comments,stats):
    columns=['home_team','away_team','proba_stats_home', 'proba_stats_draw', 'proba_stats_away']
    stats=stats.loc[:,columns]
    columns=['team_home','team_away','proba_home','proba_draw','proba_away']
    comments=comments.loc[:,columns]
    df=pd.concat([comments,stats], axis=1)
    X_proba=df[['proba_draw','proba_stats_draw']]
    model_draw = pickle.load(open('C:/Users/pedro/Projetos/bundesbet/models/stack/draw_model.sav', 'rb'))
    draw_proba=model_draw.predict_proba(X_proba)[:,1]
    X_proba=df[['proba_home','proba_stats_home']]
    model_home = pickle.load(open('C:/Users/pedro/Projetos/bundesbet/models/stack/home_model.sav', 'rb'))
    home_proba=model_home.predict_proba(X_proba)[:,1]
    X_proba=df[['proba_away','proba_stats_away']]
    model_away = pickle.load(open('C:/Users/pedro/Projetos/bundesbet/models/stack/away_model.sav', 'rb'))
    away_proba=model_away.predict_proba(X_proba)[:,1]
    infos={}
    infos[df['home_team'].values[0]]=home_proba
    infos['draw']=draw_proba
    infos[df['away_team'].values[0]]=away_proba
    return (pd.DataFrame(infos).round(2)*100)

# Function main

In [7]:
def find_stats(url_comments,url_stats):
    comments_raw=get_comments(url_comments)
    stats_raw=get_stats_game(url_stats)
    comments=transform_comments(comments_raw)
    stats=transform_stats(stats_raw)
    return stak(comments,stats)

In [8]:
url_stats='https://www.scoreboard.com/game/I9hb3B4S/#match-summary'
url_comments='https://www.foxsports.com/soccer/boxscore?id=63353'
find_stats(url_comments,url_stats)

Unnamed: 0,Dortmund,draw,Schalke
0,72.0,9.0,36.0
