### 1.4. Python Basics for Web Scraping

In [3]:
states = ['California','Texas','Florida','New York']

display(states[0])

'California'

In [5]:
for state in states:
    if state == 'Florida':
        print(state)

Florida


In [7]:
# Will open and close file (r,w)
with open('test.txt','w') as file:
    file.write('Data successfully scraped!')

In [14]:
import pandas as pd

population = [39613493,29730311,21944577,19299981]

# Create dict from 2 lists
dict_states = {'States':states,'Population':population}
display(dict_states)

# Create dataframe from dict
df_states = pd.DataFrame.from_dict(dict_states)
display(df_states)

{'States': ['California', 'Texas', 'Florida', 'New York'],
 'Population': [39613493, 29730311, 21944577, 19299981]}

Unnamed: 0,States,Population
0,California,39613493
1,Texas,29730311
2,Florida,21944577
3,New York,19299981


In [16]:
# Creates states.csv
df_states.to_csv('states.csv',index=False)

In [19]:
new_list = [2,4,5,'California']

for element in new_list:
    try:
        print(element/2)
    except:
        print('The element is not a number')

1.0
2.0
2.5
The element is not a number


In [23]:
n = 4
while n > 0:
    print(n)
    n = n - 1
    
    if n==2:
        break
print('Message')

4
3
Message


### 2.10. Beautiful Soup Basics

In [27]:
import requests
from bs4 import BeautifulSoup

# Fetch the pages
result = requests.get('http://www.google.com')

# Page content
content = result.text

# Create soup
soup = BeautifulSoup(content, 'lxml')

In [37]:
# # Find elements with beautiful soup
# soup.find(id='specific_id')

# # Find by class name
# soup.find('tag', class='class_name')

In [30]:
html_example = """
<article class = "main-article">
    ><h1> Titanic (1997)</h1> title
    <p class="'plot"> 84 years later...</p> description
    <div class="'full script"> 13 meters. You ... </div>
</article>
"""

In [36]:
soup = BeautifulSoup(html_example, 'html.parser')
soup.find('article', {'class': 'main-article'})

<article class="main-article">
    &gt;<h1> Titanic (1997)</h1> title
    <p class="'plot"> 84 years later...</p> description
    <div class="'full script"> 13 meters. You ... </div>
</article>

In [41]:
soup.find('h1')

<h1> Titanic (1997)</h1>

In [48]:
# #How to import normally
# website = 'https://subslikescript.com/movie/Titanic-120338'
# result = requests.get(website)
# content = result.text
# soup = BeautifulSoup(content, 'lxml')

# Alternatively, download the html code from sources to continue
with open('Titanic-120338.html') as file:
    html = file.read()

soup = BeautifulSoup(html, 'html.parser')
print(soup.prettify)

<bound method Tag.prettify of <!DOCTYPE html>

<html dir="ltr" lang="en">
<head>
<!-- Global site tag (gtag.js) - Google Analytics -->
<script async="" src="https://www.googletagmanager.com/gtag/js?id=UA-120598793-1"></script>
<script>
  window.dataLayer = window.dataLayer || [];
  function gtag(){dataLayer.push(arguments);}
  gtag('js', new Date());

  gtag('config', 'UA-120598793-1');
</script>
<meta charset="utf-8"/>
<title>Titanic (1997) Movie Script  | Subs like Script</title>
<meta content="Read Titanic Movie Script. Created from srt subtitles. " name="description"/>
<meta content="transcript, movie, subtitles, scripts, film, video, media, subs, srt " name="keywords"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<meta content="index, follow" name="robots"/>
<link href="/favicon.ico" rel="shortcut icon"/>
<link href="/styles/main.css" media="all" rel="stylesheet" type="text/css"/>
<script async="" data-ad-client="ca-pub-6250492176235895" src="https://page

In [55]:
# Narrow down to main article, and get the title
box = soup.find('article', {'class': 'main-article'})
title = box.find('h1').get_text()
display(title)

'Titanic (1997) - full transcript'

In [62]:
# Strip the str based on separators
transcript = box.find('div', {'class':'full-script'}).get_text(strip=True,separator=' ')

import re

# Print the first 3 sentences
sentences = re.split(r'[.!?]', transcript)
print('. '.join(sentences[:3]) + '.')

13 meters.  You should see it.  Okay, take her up and over the bow rail.


In [61]:
sentences = transcript.split('. ')
print('. '.join(sentences[:3]) + '.')

13 meters. You should see it. Okay, take her up and over the bow rail.


In [64]:
with open(f'{title}.txt','w') as file: #title + '.txt'
    file.write(transcript)

### 3.15. Scraping multiple links

In [72]:
with open('subslikescript_movies.html') as file:
    html = file.read()

soup = BeautifulSoup(html, 'html.parser')
box = soup.find('article', {'class': 'main-article'})

In [81]:
# Print only the first a href element
box.find('a', href=True)

<a href="movie/Blind_Detective-2332707" title="Read transcript of Movie 'Blind Detective'"><li>Blind Detective (2013)</li></a>

In [82]:
# Get only the url link from each href
links = []
for link in box.find_all('a', href=True):
    links.append(link['href'])

display(links)

['movie/Blind_Detective-2332707',
 'movie/Killer_Workout-91339',
 'movie/Budapest_Noir-5161018',
 'movie/Firefly-3582840',
 'movie/Warriors_of_the_Rainbow_Seediq_Bale_II-4164468',
 'movie/Sex_Appeal-91927',
 'movie/Lonesome-15258032',
 'movie/Corri_uomo_corri-62825',
 'movie/Faraway-18747542',
 'movie/The_Treasure_of_Swamp_Castle-137226',
 'movie/Fantastic_Girls-4028826',
 'movie/Meet_Me_in_the_Bathroom-16378298',
 'movie/Shot-301171',
 'movie/LasseMajas_detektivbyr_-_Stella_Nostra-4397382',
 'movie/In_Real_Life-1087453',
 'movie/The_Earth_Dies_Screaming-58050',
 'movie/The_Mystery_of_Anthrax_Island-19863080',
 'movie/Men_of_Plastic-23157348',
 'movie/Ela_Veezha_Poonchira-15516546',
 'movie/Married_by_Mistake-21403538',
 'movie/The_Mask_of_the_Red_Death_Part_2-21443400',
 'movie/A_spol_macskak-67816',
 'movie/0815_-_In_der_Heimat-47789',
 'movie/Level_Five-116866',
 'movie/Mothman-1514425',
 'movie/My_Favorite_Christmas_Tree-19895682',
 'movie/Mistletoe_Match-18559372',
 'movie/Adieu_P

In [97]:
# for link in links:
#     root = 'https://subslikescript.com'
#     website = f'{root}/{link}'
#     result = requests.get(website)
#     content = result.text
#     soup = BeautifulSoup(content, 'lxml')

#     title = box.find('h1').get_text()
#     transcript = box.find('div', {'class':'full-script'}).get_text(strip=True,separator=' ')
#     with open(f'{title}.txt','w') as file: #title + '.txt'
#         file.write(transcript)

### 4.18. Xpath

Xpath syntax
//tagName[@AttributeName="Value"]

Xpath expression
//tagName[(expression_1) and (expression_2)]

Special characters
* /: select the children from the node set on the left side of this character
* // specifies that the node should be in the document
* . specifies the current context should be used
* .. refers to a parent node
* * selects all elements or attributes regardless of name
* @ select an attribute
* () grouping an xpath expression
* [n] indicates that a node with index "n" should be selected

### 4.25. Selenium

Other capabilities
* dropdowns
* logins
* waits

In [85]:
# pip install selenium==3.141

import selenium
print(selenium.__version__)

4.9.1


In [101]:
from selenium import webdriver

# Opens up automatic web driver
website = 'https://www.adamchoi.co.uk/overs/detailed'
path = r'C:\Users\mhuh22\Documents\Python\chromedriver_win32\chromedriver' # Specify location of chromedriver
driver = webdriver.Chrome(path)
driver.get(website)

# Find the element in xpath using tag_name[@event="text"]
all_matches_button = driver.find_element_by_xpath('//label[@analytics-event="All matches"]')
all_matches_button.click() # Clicks the button

matches = driver.find_elements_by_tag_name('tr')

date = []
home_team = []
score = []
away_team = []

for match in matches:
    date.append(match.find_element_by_xpath('./td[1]').text)
    home_team.append(match.find_element_by_xpath('./td[2]').text)
    score.append(match.find_element_by_xpath('./td[3]').text)
    away_team.append(match.find_element_by_xpath('./td[4]').text)
    print(match.text)

# driver.quit()

05-08-2022 Crystal Palace 0 - 2 Arsenal
13-08-2022 Arsenal 4 - 2 Leicester
20-08-2022 Bournemouth 0 - 3 Arsenal
27-08-2022 Arsenal 2 - 1 Fulham
31-08-2022 Arsenal 2 - 1 Aston Villa
04-09-2022 Man United 3 - 1 Arsenal
18-09-2022 Brentford 0 - 3 Arsenal
01-10-2022 Arsenal 3 - 1 Tottenham
09-10-2022 Arsenal 3 - 2 Liverpool
16-10-2022 Leeds 0 - 1 Arsenal
23-10-2022 Southampton 1 - 1 Arsenal
30-10-2022 Arsenal 5 - 0 Nott'm Forest
06-11-2022 Chelsea 0 - 1 Arsenal
12-11-2022 Wolves 0 - 2 Arsenal
26-12-2022 Arsenal 3 - 1 West Ham
31-12-2022 Brighton 2 - 4 Arsenal
03-01-2023 Arsenal 0 - 0 Newcastle
15-01-2023 Tottenham 0 - 2 Arsenal
22-01-2023 Arsenal 3 - 2 Man United
04-02-2023 Everton 1 - 0 Arsenal
11-02-2023 Arsenal 1 - 1 Brentford
15-02-2023 Arsenal 1 - 3 Man City
18-02-2023 Aston Villa 2 - 4 Arsenal
25-02-2023 Leicester 0 - 1 Arsenal
01-03-2023 Arsenal 4 - 0 Everton
04-03-2023 Arsenal 3 - 2 Bournemouth
12-03-2023 Fulham 0 - 3 Arsenal
19-03-2023 Arsenal 4 - 1 Crystal Palace
01-04-2023 Arsen

22-10-2022 Everton 3 - 0 Crystal Palace
29-10-2022 Crystal Palace 1 - 0 Southampton
06-11-2022 West Ham 1 - 2 Crystal Palace
12-11-2022 Nott'm Forest 1 - 0 Crystal Palace
26-12-2022 Crystal Palace 0 - 3 Fulham
31-12-2022 Bournemouth 0 - 2 Crystal Palace
04-01-2023 Crystal Palace 0 - 4 Tottenham
15-01-2023 Chelsea 1 - 0 Crystal Palace
18-01-2023 Crystal Palace 1 - 1 Man United
21-01-2023 Crystal Palace 0 - 0 Newcastle
04-02-2023 Man United 2 - 1 Crystal Palace
11-02-2023 Crystal Palace 1 - 1 Brighton
18-02-2023 Brentford 1 - 1 Crystal Palace
25-02-2023 Crystal Palace 0 - 0 Liverpool
04-03-2023 Aston Villa 1 - 0 Crystal Palace
11-03-2023 Crystal Palace 0 - 1 Man City
15-03-2023 Brighton 1 - 0 Crystal Palace
19-03-2023 Arsenal 4 - 1 Crystal Palace
01-04-2023 Crystal Palace 2 - 1 Leicester
09-04-2023 Leeds 1 - 5 Crystal Palace
15-04-2023 Southampton 0 - 2 Crystal Palace
22-04-2023 Crystal Palace 0 - 0 Everton
25-04-2023 Wolves 2 - 0 Crystal Palace
29-04-2023 Crystal Palace 4 - 3 West Ham
0

15-02-2023 Arsenal 1 - 3 Man City
18-02-2023 Nott'm Forest 1 - 1 Man City
25-02-2023 Bournemouth 1 - 4 Man City
04-03-2023 Man City 2 - 0 Newcastle
11-03-2023 Crystal Palace 0 - 1 Man City
01-04-2023 Man City 4 - 1 Liverpool
08-04-2023 Southampton 1 - 4 Man City
15-04-2023 Man City 3 - 1 Leicester
26-04-2023 Man City 4 - 1 Arsenal
30-04-2023 Fulham 1 - 2 Man City
03-05-2023 Man City 3 - 0 West Ham
06-05-2023 Man City 2 - 1 Leeds
14-05-2023 Everton ? - ? Man City
07-08-2022 Man United 1 - 2 Brighton
13-08-2022 Brentford 4 - 0 Man United
22-08-2022 Man United 2 - 1 Liverpool
27-08-2022 Southampton 0 - 1 Man United
01-09-2022 Leicester 0 - 1 Man United
04-09-2022 Man United 3 - 1 Arsenal
02-10-2022 Man City 6 - 3 Man United
09-10-2022 Everton 1 - 2 Man United
16-10-2022 Man United 0 - 0 Newcastle
19-10-2022 Man United 2 - 0 Tottenham
22-10-2022 Chelsea 1 - 1 Man United
30-10-2022 Man United 1 - 0 West Ham
06-11-2022 Aston Villa 3 - 1 Man United
13-11-2022 Fulham 1 - 2 Man United
27-12-202

02-04-2023 West Ham 1 - 0 Southampton
05-04-2023 West Ham 1 - 5 Newcastle
08-04-2023 Fulham 0 - 1 West Ham
16-04-2023 West Ham 2 - 2 Arsenal
23-04-2023 Bournemouth 0 - 4 West Ham
26-04-2023 West Ham 1 - 2 Liverpool
29-04-2023 Crystal Palace 4 - 3 West Ham
03-05-2023 Man City 3 - 0 West Ham
07-05-2023 West Ham 1 - 0 Man United
14-05-2023 Brentford ? - ? West Ham
06-08-2022 Leeds 2 - 1 Wolves
13-08-2022 Wolves 0 - 0 Fulham
20-08-2022 Tottenham 1 - 0 Wolves
28-08-2022 Wolves 1 - 1 Newcastle
31-08-2022 Bournemouth 0 - 0 Wolves
03-09-2022 Wolves 1 - 0 Southampton
17-09-2022 Wolves 0 - 3 Man City
01-10-2022 West Ham 2 - 0 Wolves
08-10-2022 Chelsea 3 - 0 Wolves
15-10-2022 Wolves 1 - 0 Nott'm Forest
18-10-2022 Crystal Palace 2 - 1 Wolves
23-10-2022 Wolves 0 - 4 Leicester
29-10-2022 Brentford 1 - 1 Wolves
05-11-2022 Wolves 2 - 3 Brighton
12-11-2022 Wolves 0 - 2 Arsenal
26-12-2022 Everton 1 - 2 Wolves
31-12-2022 Wolves 0 - 1 Man United
04-01-2023 Aston Villa 1 - 1 Wolves
14-01-2023 Wolves 1 - 0 

In [96]:
# # Finding elements with selenium
# driver.find_element_by_id('id')

# # Find using xpath
# dirver.find_element_by_xpath('//tag[@AttributeName="Value"]')

In [104]:
# Close the webpage
driver.quit()

In [103]:
df = pd.DataFrame({'date':date,
                 'home_team': home_team,
                 'score':score,
                 'away_team':away_team})
df.head()

Unnamed: 0,date,home_team,score,away_team
0,05-08-2022,Crystal Palace,0 - 2,Arsenal
1,13-08-2022,Arsenal,4 - 2,Leicester
2,20-08-2022,Bournemouth,0 - 3,Arsenal
3,27-08-2022,Arsenal,2 - 1,Fulham
4,31-08-2022,Arsenal,2 - 1,Aston Villa


In [105]:
df.to_csv('football_data.csv', index=False)

In [107]:
# Used for dropdown
from selenium.webdriver.support.ui import Select
import time

In [110]:
# Opens up automatic web driver
website = 'https://www.adamchoi.co.uk/overs/detailed'
path = r'C:\Users\mhuh22\Documents\Python\chromedriver_win32\chromedriver' # Specify location of chromedriver
driver = webdriver.Chrome(path)
driver.get(website)

# Find the element in xpath using tag_name[@event="text"]
all_matches_button = driver.find_element_by_xpath('//label[@analytics-event="All matches"]')
all_matches_button.click() # Clicks the button

# Select country dropdown
dropdown = Select(driver.find_element_by_id('country'))
dropdown.select_by_visible_text('Spain')

# Pause code for [3] seconds
time.sleep(3)

matches = driver.find_elements_by_tag_name('tr')

date = []
home_team = []
score = []
away_team = []

for match in matches:
    date.append(match.find_element_by_xpath('./td[1]').text)
    home_team.append(match.find_element_by_xpath('./td[2]').text)
    score.append(match.find_element_by_xpath('./td[3]').text)
    away_team.append(match.find_element_by_xpath('./td[4]').text)
    print(match.text)

14-08-2022 Almeria 1 - 2 Real Madrid
22-08-2022 Elche 1 - 1 Almeria
27-08-2022 Almeria 2 - 1 Sevilla
05-09-2022 Valladolid 1 - 0 Almeria
12-09-2022 Almeria 0 - 1 Osasuna
17-09-2022 Mallorca 1 - 0 Almeria
30-09-2022 Ath Bilbao 4 - 0 Almeria
08-10-2022 Almeria 3 - 1 Vallecano
16-10-2022 Betis 3 - 1 Almeria
20-10-2022 Almeria 3 - 2 Girona
23-10-2022 Villarreal 2 - 1 Almeria
29-10-2022 Almeria 3 - 1 Celta
05-11-2022 Barcelona 2 - 0 Almeria
09-11-2022 Almeria 1 - 0 Getafe
30-12-2022 Cadiz 1 - 1 Almeria
08-01-2023 Almeria 0 - 2 Sociedad
15-01-2023 Almeria 1 - 1 Ath Madrid
23-01-2023 Valencia 2 - 2 Almeria
27-01-2023 Almeria 3 - 1 Espanol
06-02-2023 Vallecano 2 - 0 Almeria
11-02-2023 Almeria 2 - 3 Betis
17-02-2023 Girona 6 - 2 Almeria
26-02-2023 Almeria 1 - 0 Barcelona
04-03-2023 Almeria 0 - 2 Villarreal
12-03-2023 Sevilla 2 - 1 Almeria
18-03-2023 Almeria 1 - 1 Cadiz
02-04-2023 Celta 2 - 2 Almeria
09-04-2023 Almeria 2 - 1 Valencia
16-04-2023 Ath Madrid 2 - 1 Almeria
22-04-2023 Almeria 1 - 2 A

11-09-2022 Elche 1 - 4 Ath Bilbao
17-09-2022 Barcelona 3 - 0 Elche
03-10-2022 Vallecano 2 - 1 Elche
10-10-2022 Elche 1 - 1 Mallorca
15-10-2022 Valencia 2 - 2 Elche
19-10-2022 Elche 0 - 3 Real Madrid
23-10-2022 Espanol 2 - 2 Elche
31-10-2022 Elche 0 - 1 Getafe
05-11-2022 Valladolid 2 - 1 Elche
08-11-2022 Elche 1 - 2 Girona
29-12-2022 Ath Madrid 2 - 0 Elche
06-01-2023 Elche 0 - 1 Celta
16-01-2023 Cadiz 1 - 1 Elche
22-01-2023 Elche 1 - 1 Osasuna
28-01-2023 Sevilla 3 - 0 Elche
04-02-2023 Elche 3 - 1 Villarreal
15-02-2023 Real Madrid 4 - 0 Elche
19-02-2023 Elche 0 - 1 Espanol
24-02-2023 Elche 2 - 3 Betis
04-03-2023 Mallorca 0 - 1 Elche
11-03-2023 Elche 1 - 1 Valladolid
19-03-2023 Sociedad 2 - 0 Elche
01-04-2023 Elche 0 - 4 Barcelona
08-04-2023 Osasuna 2 - 1 Elche
16-04-2023 Girona 2 - 0 Elche
23-04-2023 Elche 0 - 2 Valencia
26-04-2023 Celta 1 - 0 Elche
29-04-2023 Elche 4 - 0 Vallecano
02-05-2023 Almeria 2 - 1 Elche
14-05-2023 Elche ? - ? Ath Madrid
13-08-2022 Celta 2 - 2 Espanol
19-08-2022 

15-10-2022 Mallorca 0 - 1 Sevilla
18-10-2022 Sevilla 1 - 1 Valencia
22-10-2022 Real Madrid 3 - 1 Sevilla
29-10-2022 Sevilla 0 - 1 Vallecano
06-11-2022 Betis 1 - 1 Sevilla
09-11-2022 Sevilla 1 - 2 Sociedad
30-12-2022 Celta 1 - 1 Sevilla
08-01-2023 Sevilla 2 - 1 Getafe
14-01-2023 Girona 2 - 1 Sevilla
21-01-2023 Sevilla 1 - 0 Cadiz
28-01-2023 Sevilla 3 - 0 Elche
05-02-2023 Barcelona 3 - 0 Sevilla
11-02-2023 Sevilla 2 - 0 Mallorca
19-02-2023 Vallecano 1 - 1 Sevilla
26-02-2023 Sevilla 2 - 3 Osasuna
04-03-2023 Ath Madrid 6 - 1 Sevilla
12-03-2023 Sevilla 2 - 1 Almeria
19-03-2023 Getafe 2 - 0 Sevilla
01-04-2023 Cadiz 0 - 2 Sevilla
07-04-2023 Sevilla 2 - 2 Celta
16-04-2023 Valencia 0 - 2 Sevilla
23-04-2023 Sevilla 2 - 1 Villarreal
27-04-2023 Ath Bilbao 0 - 1 Sevilla
01-05-2023 Sevilla 0 - 2 Girona
04-05-2023 Sevilla 3 - 2 Espanol
14-05-2023 Valladolid ? - ? Sevilla
14-08-2022 Cadiz 0 - 1 Sociedad
21-08-2022 Sociedad 1 - 4 Barcelona
27-08-2022 Elche 0 - 1 Sociedad
03-09-2022 Sociedad 1 - 1 Ath M