In [60]:
import html

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

#Create request and get the data from the websites
#Every request is sent with a browser-like User-Agent header
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}

#Looking on the Transfermarkt website, there are 10 pages related to Ronaldo's scored penalties
noPages = 10
pageBaseURL = "https://www.transfermarkt.com/cristiano-ronaldo/elfmetertore/spieler/8198/ajax/yw1/saison_id//wettbewerb_id//plus/1/page/"
#pages is a list of URLs, one per result page: pageBaseURL + "1", pageBaseURL + "2" and so on.
pages = [pageBaseURL + str(i+1) for i in range(noPages)]

#pageSoups holds the parsed HTML of each page, in the same order as pages
pageSoups = np.zeros(noPages, dtype=object)
for no, page in enumerate(pages):
    #A timeout stops a stalled connection from hanging the notebook indefinitely
    pageTree = requests.get(page, headers=headers, timeout=30)
    #Fail here with the HTTP status instead of parsing an error page and
    #crashing later when the expected table is missing
    pageTree.raise_for_status()
    pageSoups[no] = BeautifulSoup(pageTree.content, 'html.parser')
    


In [114]:
#Extract Data from the soup
#One list per table column; every scraped table row adds one entry to each list
Season = []
Competition = []
Club = []
Date = []
HomeTeam = []
Final_result = []
AwayTeam = []
Minute = []
ScoreAfterPK = []
Goalkeeper = []

#allData lists the column lists in the order the cells are read from each row
allData = [Season,
    Competition,
    Club,
    Date,
    HomeTeam,
    Final_result,
    AwayTeam,
    Minute,
    ScoreAfterPK,
    Goalkeeper]


#First we need to refine the HTML to the tables we want
#Each entry is the list of <table class="items"> elements found on one page
tableSoups = [pageSoup.find_all('table', 'items') for pageSoup in pageSoups]

#now get the data from the table
for pageNo, table in enumerate(tableSoups, start=1):
    #An empty list means the page contained no 'items' table at all
    if not table:
        raise ValueError("No 'items' table found on page {}".format(pageNo))

    #The table variable contains a list. You need to call .tbody on its [0] member, not on the entire thing.
    body = table[0].tbody

    #We need to go into the tr rows and get the td column data for each row
    for row in body.contents:
        #Text nodes made only of whitespace (such as '\n') sit between the tags. Skip them
        if isinstance(row, str) and not row.strip():
            continue

        #now get the column data from the row, again dropping whitespace-only text nodes
        columns = [column for column in row.contents
                   if not (isinstance(column, str) and not column.strip())]

        #Check the cell count before touching any list, so a malformed row
        #can never leave the column lists with different lengths
        if len(columns) != len(allData):
            raise ValueError(
                "Not all arrays have been updated: expected {} columns, got {}".format(
                    len(allData), len(columns)))

        for dataList, column in zip(allData, columns):
            dataList.append(column)
        


In [115]:
#Now we need to refine the data so that every entry is a plain string instead of an HTML node

#There are two ways to refine: take the text of the cell, or take the alt text of the image inside it

def strRefine(data):
    """Return a new list holding the ``.string`` attribute of each entry in ``data``.

    The entries are expected to expose a ``.string`` attribute; the input
    is not modified, and the output keeps the input order.
    """
    return [entry.string for entry in data]

def imgRefine(data):
    """Return the ``alt`` text of the first image in each entry of ``data``.

    Each entry is rendered with ``str()`` and the text between the first
    ``<img alt="`` and the next ``" class`` is extracted, so ``alt`` has to
    be the first attribute of the ``<img>`` tag. If no ``" class`` follows,
    everything after the opening marker is returned.

    HTML entities in the extracted text are decoded, so a rendered
    ``A &amp; B`` is returned as ``A & B``.

    Raises:
        IndexError: if an entry does not contain ``<img alt="``.
    """
    temp = []
    for d in data:
        #Change the entry into a string, and then split that string to get
        #the text between <img alt=" and " class
        altText = str(d).split('<img alt="', 1)[1].split('" class', 1)[0]
        #The rendered markup escapes characters such as & as entities; decode them
        temp.append(html.unescape(altText))
    return temp

#Columns whose value is the text of the cell
Season, Date, Final_result, Minute, ScoreAfterPK, Goalkeeper = [
    strRefine(cells)
    for cells in (Season, Date, Final_result, Minute, ScoreAfterPK, Goalkeeper)
]

#Columns whose value is the alt text of the image inside the cell
Competition, Club, HomeTeam, AwayTeam = [
    imgRefine(cells)
    for cells in (Competition, Club, HomeTeam, AwayTeam)
]


In [118]:
#Collect the refined columns under their column names, in output order
d = dict([
    ('Season', Season),
    ('Competition', Competition),
    ('Club', Club),
    ('Date', Date),
    ('HomeTeam', HomeTeam),
    ('Final_result', Final_result),
    ('AwayTeam', AwayTeam),
    ('Minute', Minute),
    ('ScoreAfterPK', ScoreAfterPK),
    ('Goalkeeper', Goalkeeper),
])

#Build the pandas DataFrame and save it as a csv
df = pd.DataFrame(data=d)
df.to_csv(path_or_buf="Ronaldo Penalties Scored.csv")