# Pronosoft Football Data - Webscrape

---

---

# Import Lib

In [9]:
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as req

import sys
import os
from os import path
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import csv

import datetime as dt
import ipywidgets as widgets
from IPython.display import display

---

# Paths

In [2]:
# Folder holding the scraped data, relative to the notebook's working directory.
# Built with path.join so that no backslash ends up inside a string literal
# ('\D' and '\p' are invalid escape sequences) and so that the separator
# matches the operating system the notebook runs on.
folder_data = path.join('..', 'Data')

# CSV file every scraped match is appended to.
csv_name = path.join(folder_data, 'pronosoft_data.csv')

---

# Functions

Parse Page

In [3]:
# REQUESTS ACCESS & PARSES PAGE

def parse_page(url):
    """Download ``url`` and return its content parsed as HTML.

    Parameters
    ----------
    url : str
        Address of the page to fetch.

    Returns
    -------
    BeautifulSoup
        Parse tree built from the downloaded bytes with the "html.parser"
        backend.

    Notes
    -----
    Errors raised while opening or reading the URL are not caught here and
    propagate to the caller.
    """
    # The context manager closes the connection even when read() raises,
    # which a close() call placed after read() does not guarantee.
    with req(url) as response:
        page_html = response.read()

    # Turn the raw bytes into a navigable tree of tags.
    return soup(page_html, "html.parser")

Extract Info

In [4]:
# TAKES IN VALUES FROM PRONOSOFT

def get_info(page_soup, pick_date):
    """Collect the complete match rows found on one day's page.

    Parameters
    ----------
    page_soup : parsed page
        Tree to search; every ``div`` with class "dev-desktop-league-comp"
        is treated as one league section.
    pick_date : str
        Date label stored as the first field of every returned row.

    Returns
    -------
    list of list
        One entry per match accepted by ``check_info_complete``:
        ``[pick_date, league] + get_info_clean(cells)``.
    """
    rows = []

    for section in page_soup.find_all("div", {"class": "dev-desktop-league-comp"}):
        # The league name is the text of the section's first <h3>.
        league_name = section.find_all("h3")[0].get_text()

        # The first <tr> is sliced off (presumably the table's header row).
        for table_row in section.table.tbody.find_all("tr")[1:]:
            cells = [cell.get_text() for cell in table_row.find_all("td")]

            # Rows missing odds or a final score are dropped.
            if not check_info_complete(cells):
                continue

            rows.append([pick_date, league_name] + get_info_clean(cells))

    return rows

Check Info Complete

In [5]:
# CHECK INFO IS COMPLETE (Prob., odds and scores)

def check_info_complete(info):
    """Tell whether a scraped match row can be parsed by ``get_info_clean``.

    Parameters
    ----------
    info : list of str
        Cell texts of one match row. Indices 2, 3, 4, 6 and 7 hold the
        "<probability>%<odds>" cells and index 9 holds the score.

    Returns
    -------
    bool
        ``False`` when the row has fewer than 10 cells, when any
        probability/odds cell is empty, starts with '-' (no value) or ends
        with '%' (probability without odds), or when the match is flagged
        'Annulé', 'Reporté' or has the placeholder score '?-?'.
        ``True`` otherwise.
    """
    # A row that cannot hold the score column at index 9 is unusable.
    if len(info) < 10:
        return False

    # Every probability/odds cell is parsed the same way downstream, so all
    # five are validated, not just the draw (3) and "over" (7) columns.
    for idx in (2, 3, 4, 6, 7):
        cell = info[idx]
        # "not cell" guards the [0] / [-1] look-ups against empty strings.
        if not cell or cell[0] == '-' or cell[-1] == '%':
            return False

    # Cancelled or postponed matches and matches without a final score.
    if info[9] == 'Annulé' or info[-1] in ('Reporté', '?-?'):
        return False

    return True

Clean Up Info

In [6]:
def _split_teams(fixture):
    """Split a "Home - Away" label into ``(home, away)``."""
    # Splitting on the spaced separator keeps hyphenated team names
    # (e.g. "Paris Saint-Germain") in one piece.
    if ' - ' in fixture:
        home, away = fixture.split(' - ', 1)
        return home, away

    # Fallback for labels without the spaced separator: split on every '-'
    # and drop the character next to it on each side.
    parts = fixture.split('-')
    return parts[0][:-1], parts[1][1:]


def _split_prob_odds(cell):
    """Split a "<percent>%<odds>" cell into ``[probability, odds]``."""
    parts = cell.split('%')
    # The percentage becomes a fraction rounded to two decimals.
    probability = np.round(0.01 * int(parts[0]), 2)
    # The odds stay a string; only the decimal comma is replaced by a dot.
    odds = parts[1].replace(',', '.')
    return [probability, odds]


def get_info_clean(info):
    """Convert the raw cell texts of one match row into clean fields.

    Parameters
    ----------
    info : list of str
        Cell texts of a row accepted by ``check_info_complete``:
        0 time ("20h45"), 1 "Home - Away", 2-4 probability/odds for team 1,
        draw and team 2, 5 predicted result, 6-7 probability/odds for under
        and over, 8 predicted under/over, 9 score ("2-1").

    Returns
    -------
    list
        17 values: time ("20:45"), team 1 name, team 2 name, then a
        (probability, odds) pair for each of team 1, draw and team 2, the
        result prediction, a (probability, odds) pair for each of under and
        over, the under/over prediction, and the two scores. Probabilities
        are floats in [0, 1] terms (percent / 100), odds and scores are
        strings, and a prediction given as '-' becomes ``None``.

    Raises
    ------
    ValueError, IndexError
        When a cell does not have the expected layout.
    """
    # Match time
    info_clean = [info[0].replace('h', ':')]

    # Team names
    info_clean.extend(_split_teams(info[1]))

    # Separate prob. & odds: team 1, draw, team 2
    for idx in (2, 3, 4):
        info_clean.extend(_split_prob_odds(info[idx]))

    # Result prediction ('-' means no prediction was given)
    info_clean.append(None if info[5] == '-' else info[5])

    # Separate prob. & odds: under, over
    for idx in (6, 7):
        info_clean.extend(_split_prob_odds(info[idx]))

    # Under/over prediction ('-' means no prediction was given)
    info_clean.append(None if info[8] == '-' else info[8])

    # Scores
    score_parts = info[9].split('-')
    info_clean.append(score_parts[0])
    info_clean.append(score_parts[1])

    return info_clean

Save to CSV

In [7]:
# SAVE CSV
# Under / Over is a bet on whether the sum of the two scores will be under or over 2.5

def save_info_csv(day_scrape):
    """Append one day's match rows to the CSV file named by ``csv_name``.

    The file is created with a header row when it does not exist yet. Rows
    are appended only when ``day_scrape`` is non-empty and the date of its
    first row is not already present in the file's ``date`` column.

    Parameters
    ----------
    day_scrape : list of list
        Match rows whose values follow the order of the CSV columns; the
        first value of each row is the date.
    """
    field_names = [
        "date", "league", "time",
        "team_1_name", "team_2_name",
        "team_1_prob", "team_1_bet_odds",
        "nul_prob", "nul_bet_odds",
        "team_2_prob", "team_2_bet_odds",
        "prediction_team_pronosoft",
        "under_prob", "under_bet_odds",
        "over_prob", "over_bet_odds",
        "prediction_uo_pronosoft",
        "team_1_score", "team_2_score",
    ]

    # First use: write a file that contains only the header row.
    if not path.exists(csv_name):
        pd.DataFrame(columns = field_names).to_csv(csv_name, index=False)

    # Distinct dates already stored in the file.
    stored_dates = np.unique(pd.read_csv(csv_name)['date'])

    # Nothing to write: empty scrape, or this day is already in the file.
    if len(day_scrape) == 0 or day_scrape[0][0] in stored_dates:
        return

    with open(csv_name, 'a', newline = '', encoding = 'utf-8') as f:
        csv.writer(f).writerows(day_scrape)

Webscrape

In [8]:
def webscrape(date_range):
    """Scrape every date of ``date_range`` and store the results in the CSV.

    For each date the day's page is downloaded with ``parse_page``, its
    matches are extracted with ``get_info`` and saved with ``save_info_csv``.
    A progress bar widget is displayed and advanced after each date; the
    first and last dates, and dates whose first two characters are '01' or
    '15', are also printed as milestones.

    Parameters
    ----------
    date_range : list of str
        Dates to scrape, e.g. the output of ``get_date_range``. The URL is
        built from the first five and the last two characters of each date.
    """
    total = len(date_range)
    base_url = 'http://www.pronosoft.com/fr/bookmakers/pronostics/'

    print('\nPronosoft Web-Scraping Application\n')
    print('\nYou are accessing ...\n')
    print('   Website:  Pronosoft')
    print('   URL:      http://www.pronosoft.com/fr/bookmakers/pronostics/\n')
    print('\nProcessing & Scrapping ... ')

    progress_bar = widgets.FloatProgress(min = 0, max = total, bar_style = 'info')
    progress_bar.layout.margin = '24px'
    display(progress_bar)

    print('   ', end = '')

    # "done" counts from 1 so it can be written to the progress bar as is.
    for done, date in enumerate(date_range, start = 1):
        day_url = base_url + date[:5] + '-20' + date[-2:] + '/'
        page_soup = parse_page(day_url)

        is_boundary = done == 1 or done == total
        is_milestone = date[:2] in ('01', '15')
        if is_boundary or is_milestone:
            print(date + ' | ', end = '')

        save_info_csv(get_info(page_soup, date))

        progress_bar.value = done

    print('\n')

Get Date Range

In [9]:
# DATE FORMAT: reverses the '-'-separated parts, i.e. Day-Month-Year (dmy) <-> Year-Month-Day (ymd)

def date_format_revert(date):
    """Reverse the order of the three '-'-separated parts of ``date``.

    'dd-mm-yyyy' becomes 'yyyy-mm-dd' and the other way round. A string that
    does not split into exactly three parts raises ``TypeError``.
    """
    pieces = date.split('-')
    pieces.reverse()
    return '%s-%s-%s' % tuple(pieces)

In [10]:
# DATE RANGE: gives all dates from start_date to end_date (both included), as 'dd-mm-yyyy' strings

def get_date_range(start_date, end_date):
    """List every calendar day from ``start_date`` to ``end_date``.

    Parameters
    ----------
    start_date, end_date : str
        Bounds written as 'dd-mm-yyyy'.

    Returns
    -------
    list of str
        One 'dd-mm-yyyy' string per day produced by ``pd.date_range``
        between the two bounds.
    """
    # pandas receives the bounds in year-first order.
    first_day = date_format_revert(start_date)
    last_day = date_format_revert(end_date)

    days = []
    for stamp in pd.date_range(start = first_day, end = last_day):
        # str(date) is year-first; flip it back to day-first.
        days.append(date_format_revert(str(stamp.date())))

    return days

Get Last Scraped Date

In [11]:
def get_last_scrape_date(file_csv):
    """Print and return the date stored in the last row of ``file_csv``.

    Parameters
    ----------
    file_csv : str
        Path of the CSV file written by ``save_info_csv``.

    Returns
    -------
    str or None
        Value of the ``date`` column in the last row, or ``None`` when the
        file is missing, completely empty, or contains only the header row.
        The last row is the most recent date only if rows were appended in
        chronological order.
    """
    try:
        df = pd.read_csv(file_csv)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # No file yet, or a zero-byte file pandas cannot parse.
        print('\nPronosoft data not yet scrapped\n')
        return None

    # A header-only file (created when a scrape returned no match) has no
    # last row to read; indexing [-1] on it would raise IndexError.
    if df.empty:
        print('\nPronosoft data not yet scrapped\n')
        return None

    most_recent_date = df['date'].tolist()[-1]
    print('\nLast scrape on %s' % most_recent_date, '\n')

    return most_recent_date

---

---

# Web-Scrape

In [12]:
# Date stored in the last row of the CSV (None when the file does not exist yet).
most_recent_date = get_last_scrape_date(csv_name)


Last scrape on 22-08-2021 



In [13]:
# Web Scrape range of dates | First possible date : 01-10-2018 
# Both bounds are 'dd-mm-yyyy' strings and are included in the range.

start_date = '19-08-2021'
end_date = '22-08-2021'

# One 'dd-mm-yyyy' string per calendar day between the two bounds.
dates = get_date_range(start_date, end_date)

# Download each day's page and append its matches to the CSV;
# a day whose date is already in the CSV is not written again.
webscrape(dates)


Pronosoft Web-Scraping Application


You are accessing ...

   Website:  Pronosoft
   URL:      http://www.pronosoft.com/fr/bookmakers/pronostics/


Processing & Scrapping ... 


FloatProgress(value=0.0, bar_style='info', layout=Layout(margin='24px'), max=4.0)

   19-08-2021 | 22-08-2021 | 



---

# Web-Scrape | with widgets

In [14]:
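# NOTE: this interactive variant is currently disabled (every line is commented out).
# The widget output displayed under this cell appears to come from an earlier run, not from this code.
# output = widgets.Output()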

# @output.capture(clear_output = False, wait = True) 
# def sayHello(b):
#     start_date_str = date_format_revert(str(start_date.value))
#     end_date_str = date_format_revert(str(end_date.value))
    
#     dates = get_date_range(start_date_str, end_date_str)
    
#     print('\n')
#     webscrape(dates)
    
    

# print("\nHello, please enter a range of dates to webscrape:\n\n")

# start_date = widgets.DatePicker(description='Start Date', disabled=False)
# end_date = widgets.DatePicker(description='End Date', disabled=False)
# display(widgets.HBox([start_date, end_date]))

# print('\n')
# run_button = widgets.Button(description = 'Continue')
# run_button.style.button_color = 'mediumaquamarine'
# run_button.on_click(sayHello)
# display(run_button)

# output


Hello, please enter a range of dates to webscrape:




HBox(children=(DatePicker(value=None, description='Start Date'), DatePicker(value=None, description='End Date'…





Button(description='Continue', style=ButtonStyle(button_color='mediumaquamarine'))

Output()

---

---