In [1]:
# Program to find out whether best-of-five-set matches are important:
#1) Scrape all data from Wikipedia since 1970 (note: from 1970-1989, the Australian Open didn't have enough participants
# to work with my scraping template below).
#2) Data to get:
#	- Save each tournament to a new (csv) file.
#	- Each tournament should have:
#		- Every game saved separately:
#			- Each game should have year, tournament, round, opponents, score, ratings 
#			  of match (all saved in a line). Score as a list.

In [2]:
import requests
from bs4 import BeautifulSoup
from itertools import zip_longest
from time import sleep
from csv import reader, writer
import os

In [3]:
#Folder where to save all the slam csv files.
#os.path.join(..., "") appends the platform's own path separator, so the later
#`directory+tour_name+".csv"` concatenation lands inside the folder on every OS
#(on Windows this is the same "Slam_Data_2\\" string as before).
directory=os.path.join("Slam_Data_2","")
#exist_ok=True lets this cell be re-run without raising FileExistsError.
os.makedirs(directory,exist_ok=True)

In [4]:
#First get webpage:
all_tour_url="https://en.wikipedia.org/wiki/Wikipedia:WikiProject_Tennis/Grand_Slam_Project"
base_url="https://en.wikipedia.org/"

###### Get all the tournaments url extensions to add onto base_url
response=requests.get(all_tour_url,timeout=30) #timeout so a stalled connection cannot hang the notebook
response.raise_for_status() #stop on an HTTP error instead of parsing an error page
soup=BeautifulSoup(response.text,"html.parser")
tennis_rows=soup.find_all(class_="wikitable")[1].find_all("tr") #rows of the second "wikitable" on the page
tour_urls_l=[] #href of every slam link, in page order
tour_urls_l_name=[] #label for each href, same order and length as tour_urls_l
tour_urls={} #label -> href (a repeated label overwrites the earlier entry)
for row in tennis_rows[11:]: #11 is the 1970s, so 12=80,13=90,14=00,15=10,16=20
	decade=row.find_all("td")[1:11] #cells 1..10 of the row are iterated as the years of the decade
	for year in decade:
		slams_year=year.find_all("a")
		for slam in slams_year:
			#Label is the first word of the link's title plus the link text (e.g. "1970 a" in the run output).
			slam_name=slam["title"].split(" ")[0]+" "+slam.get_text()
			tour_urls[slam_name]=slam["href"]
			tour_urls_l.append(slam["href"])
			tour_urls_l_name.append(slam_name)

In [5]:
##### Get data from tournament webpage, save it to csv, repeat in for loop

def _read_set_scores(cell):
	"""Collect the consecutive integer set scores starting at table cell `cell`.

	Walks `next_sibling` from `cell`, converting each element's text with int().
	Stops at the first element whose text is not an integer, or when the row has
	no further siblings. Returns the scores as a list of ints (possibly empty).
	"""
	#NOTE(review): the whole cell text goes through int(), so any extra digits in a
	#cell (e.g. a tie-break annotation) would become part of the number - confirm.
	scores=[]
	while cell is not None: #next_sibling is None past the last cell; stop instead of raising AttributeError.
		try:
			scores.append(int(cell.get_text()))
		except ValueError: #text is not a number: no more sets for this player.
			break
		cell=cell.next_sibling #next_sibling goes to next set.
	return scores

def _player_fields(flag):
	"""Return [rating text, player name, list of set scores] for one flagicon element."""
	player_cell=flag.find_parent("td") #move up to the table cell holding the flag
	rating=player_cell.previous_sibling.get_text() #rating is read from the sibling just before that cell
	player_name=flag.find("a").parent.parent.find_all('a')[1].get_text() #second link two levels above the flag's link
	return [rating,player_name,_read_set_scores(player_cell.next_sibling)]

#tour_urls_l and tour_urls_l_name are appended to in lockstep, so plain zip pairs them exactly.
for tour_url, tour_name in zip(tour_urls_l,tour_urls_l_name):
	print(f"Getting data from the slam {tour_name}.....")
	#Join with exactly one "/" so the request does not go to "https://en.wikipedia.org//wiki/...".
	response=requests.get(base_url.rstrip("/")+"/"+tour_url.lstrip("/"),timeout=30)
	response.raise_for_status() #stop on an HTTP error instead of parsing an error page
	soup=BeautifulSoup(response.text,"html.parser")
	brackets=soup.find_all("tbody") #every table body on the page, in document order

	##### Put all data for one match (tournament, round, then rating/name/score per player) in one list
	all_matches_l=[]

	##### First, do the final bracket:
	table_index=3 #Most pages, the scores are in the fourth table. Also used below to locate the section brackets.
	final_brac=brackets[table_index]
	if len(final_brac.find_all("tr"))<10: #In case the table is not the 4th but the 5th.
		table_index=4
		final_brac=brackets[table_index]
	names=final_brac.find_all(class_="flagicon") #One flag icon per player entry.
	final_match_count=len(names[0::2]) #two consecutive flag icons make one match
	for i in range(final_match_count):
		match_l=[tour_name]
		#Round label is derived purely from the match's position in document order.
		if (final_match_count+1)/2-1==i: #selects final match
			match_l.append("Finals")
		elif i%2==0: #Quarterfinal matches
			match_l.append("Quarterfinals")
		else: #Semifinal matches
			match_l.append("Semifinals")
		for flag in names[i*2:i*2+2]: #the two players of match i
			match_l.extend(_player_fields(flag))
		all_matches_l.append(match_l)

	######### Do all the other brackets.
	for brac in brackets[table_index+1:table_index+9]: #The eight tables following the final bracket.
		p_name_and_score=[] #all player flag icons of this bracket, in document order
		for line in brac.find_all("tr")[2:]: #scores and names start the third line.
			p_name_and_score.extend(line.find_all(class_="flagicon"))
		if len(p_name_and_score)<17:#NOTE: If there are not enough flagicons, this stops processing further brackets.
			break
		p_name_sorted=[]
		for i, player in enumerate(p_name_and_score): #loop to rearrange player data so that index 0 vs 1, 2 vs 3, etc.
			if (i+1)%4==0:
				#NOTE(review): reads one element ahead, so a list whose length is a multiple of 4 raises IndexError here.
				p_name_sorted.append(p_name_and_score[i+1]) #swap index 3 and 4, 7 and 8, etc.
				p_name_sorted.append(player)
			elif i%4==0 and i!=0:
				pass #already emitted by the swap in the previous iteration
			else:
				p_name_sorted.append(player)
		section_match_count=len(p_name_sorted[0::2])
		for i, player in enumerate(p_name_sorted[0::2]): #take every second player to organize by match round
			match_l=[tour_name]
			#Round label is derived purely from the match's position in the rearranged list.
			if (section_match_count+1)/2-1==i:
				match_l.append("Fourth Round")
			elif i%2==0:
				match_l.append("First Round")
			elif (i+1)%2==0 and (i+1)%4!=0:
				match_l.append("Second Round")
			else:
				match_l.append("Third Round")
			for p in (player,p_name_sorted[i*2+1]): #the player and the opponent stored right after them
				match_l.extend(_player_fields(p))
			all_matches_l.append(match_l)#put all the match details into a list with all the other matches.

	##### Write all the matches in a slam to a .csv file:
	#newline="" is what the csv module expects; without it Windows output gets a blank row after every record.
	with open(directory+tour_name+".csv", "w", encoding="utf-8", newline="") as file:
		writer(file).writerows(all_matches_l)

	sleep_time=4 # Don't scrape Wikipedia too fast or you'll be banned!
	print(f"Sleeping for {sleep_time} seconds...")
	sleep(sleep_time)

Getting data from the slam 1970 a.....
Sleeping for 4 seconds...
Getting data from the slam 1970 f.....
Sleeping for 4 seconds...
Getting data from the slam 1970 w.....
Sleeping for 4 seconds...
Getting data from the slam 1970 u.....
Sleeping for 4 seconds...
Getting data from the slam 1971 a.....
Sleeping for 4 seconds...
Getting data from the slam 1971 f.....
Sleeping for 4 seconds...
Getting data from the slam 1971 w.....
Sleeping for 4 seconds...
Getting data from the slam 1971 u.....
Sleeping for 4 seconds...
Getting data from the slam 1972 a.....
Sleeping for 4 seconds...
Getting data from the slam 1972 f.....
Sleeping for 4 seconds...
Getting data from the slam 1972 w.....
Sleeping for 4 seconds...
Getting data from the slam 1972 u.....
Sleeping for 4 seconds...
Getting data from the slam 1973 a.....
Sleeping for 4 seconds...
Getting data from the slam 1973 f.....
Sleeping for 4 seconds...
Getting data from the slam 1973 w.....
Sleeping for 4 seconds...
Getting data from the sla

Sleeping for 4 seconds...
Getting data from the slam 2001 u.....
Sleeping for 4 seconds...
Getting data from the slam 2002 a.....
Sleeping for 4 seconds...
Getting data from the slam 2002 f.....
Sleeping for 4 seconds...
Getting data from the slam 2002 w.....
Sleeping for 4 seconds...
Getting data from the slam 2002 u.....
Sleeping for 4 seconds...
Getting data from the slam 2003 a.....
Sleeping for 4 seconds...
Getting data from the slam 2003 f.....
Sleeping for 4 seconds...
Getting data from the slam 2003 w.....
Sleeping for 4 seconds...
Getting data from the slam 2003 u.....
Sleeping for 4 seconds...
Getting data from the slam 2004 a.....
Sleeping for 4 seconds...
Getting data from the slam 2004 f.....
Sleeping for 4 seconds...
Getting data from the slam 2004 w.....
Sleeping for 4 seconds...
Getting data from the slam 2004 u.....
Sleeping for 4 seconds...
Getting data from the slam 2005 a.....
Sleeping for 4 seconds...
Getting data from the slam 2005 f.....
Sleeping for 4 seconds...