### Scraping a static wikipedia page

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_countries_by_coffee_production'
result = requests.get(url).text
soup = BeautifulSoup(result)
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-enabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-limited-width-content-enabled vector-feature-zebra-design-disabled" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of countries by coffee production - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-enabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-limited-width-content-enabled vector-feature-zebra-design-disabled";(function(){var cookie=docume

In [3]:
soup.title # Access the title tag
soup.get_text() # Extract only the text on the webpage without all the tags.
soup.find_all('a') # Shows all the hyperlinks (anchors)
all_links = soup.find_all('a')
for link in all_links:
    print(link.get("href")) # prints out all the strings in quotations after the 'href=' attributes.

#bodyContent
/wiki/Main_Page
/wiki/Wikipedia:Contents
/wiki/Portal:Current_events
/wiki/Special:Random
/wiki/Wikipedia:About
//en.wikipedia.org/wiki/Wikipedia:Contact_us
https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en
/wiki/Help:Contents
/wiki/Help:Introduction
/wiki/Wikipedia:Community_portal
/wiki/Special:RecentChanges
/wiki/Wikipedia:File_upload_wizard
/wiki/Main_Page
/wiki/Special:Search
/w/index.php?title=Special:CreateAccount&returnto=List+of+countries+by+coffee+production
/w/index.php?title=Special:UserLogin&returnto=List+of+countries+by+coffee+production
/w/index.php?title=Special:CreateAccount&returnto=List+of+countries+by+coffee+production
/w/index.php?title=Special:UserLogin&returnto=List+of+countries+by+coffee+production
/wiki/Help:Introduction
/wiki/Special:MyContributions
/wiki/Special:MyTalk
#
#Main_exporters_by_country
#See_also
#References
https://ar.wikipedia.org/wiki/%D9%

In [4]:
rows = soup.find_all('tr')
print(rows[:5]) # Print the first 5 table rows.
# Each <td> tag means a cell. I.E first cell has the value '1', next cell has value 'Brazil'(with formatting), then next has value '44,200,000' and so on.

[<tr>
<th>Rank
</th>
<th>Country
</th>
<th>60 kilogram bags
</th>
<th>Metric tons
</th>
<th>Pounds
</th></tr>, <tr>
<td>1
</td>
<td style="text-align:left"><span class="flagicon"><a href="/wiki/Brazil" title="Brazil"><img alt="Brazil" class="thumbborder" data-file-height="504" data-file-width="720" decoding="async" height="15" src="//upload.wikimedia.org/wikipedia/en/thumb/0/05/Flag_of_Brazil.svg/22px-Flag_of_Brazil.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/0/05/Flag_of_Brazil.svg/33px-Flag_of_Brazil.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/0/05/Flag_of_Brazil.svg/43px-Flag_of_Brazil.svg.png 2x" width="22"/></a></span> <a href="/wiki/Coffee_production_in_Brazil" title="Coffee production in Brazil">Brazil</a>
</td>
<td>44,200,000
</td>
<td>2,652,000
</td>
<td>5,714,381,000
</td></tr>, <tr>
<td>2
</td>
<td style="text-align:left"><span class="flagicon"><a href="/wiki/Vietnam" title="Vietnam"><img alt="Vietnam" class="thumbborder" data-file-height="600" dat

In [5]:
for row in rows:
    row_td = row.find_all('td') # In every iteration find the value of each cell (<td> tag). Result is a ResultSet object
print(row_td)
type(row_td) # Shows the result with <td> tags including the tags.

[<td class="navbox-abovebelow" colspan="2"><div><div class="hlist" style="text-align:center">
<ul><li><a href="/wiki/List_of_international_rankings" title="List of international rankings">List of international rankings</a></li>
<li><a href="/wiki/Lists_by_country" title="Lists by country">Lists by country</a></li></ul>
</div></div></td>]


bs4.element.ResultSet

In [6]:
# Removing using Regex (highly not recommended, error prone, messy)

import re

list_rows = []
for row in rows:
    cells = row.find_all('td')
    str_cells = str(cells)
    clean = re.compile('<.*?>')
    clean2 = (re.sub(clean, '',str_cells))
    list_rows.append(clean2)
print(clean2)
type(clean2)

[
List of international rankings
Lists by country
]


str

In [7]:
# Removing tags using BeautifulSoup (better alternative), this one uses the .split() method, which is not perfect because it also splits country names with spaces like 'Costa Rica'

table = soup.find('table')

data = []

for row in table.find_all('tr'):
    temp = row.text.replace('\n\n',' ').strip() # Extracts text content from the current row object, replaces consecutive newline characters (\n\n) with a single space. Finally, strip() is called to remove leading/trailing whitespace.
    temp_list = temp.split() # Splitting the string using whitespaces as the delimiter. Split method w/o argument will use whitespaces as default.
    data.append(temp_list)

print(data)

pd.DataFrame(data)

[['Rank', 'Country', '60', 'kilogram', 'bags', 'Metric', 'tons', 'Pounds'], ['1', 'Brazil', '44,200,000', '2,652,000', '5,714,381,000'], ['2', 'Vietnam', '27,500,000', '1,650,000', '3,637,627,000'], ['3', 'Colombia', '13,500,000', '810,000', '1,785,744,000'], ['4', 'Indonesia', '11,000,000', '660,000', '1,455,050,000'], ['5', 'Ethiopia', '6,400,000', '384,000', '846,575,000'], ['6', 'Honduras', '5,800,000', '348,000', '767,208,000'], ['7', 'India', '5,800,000', '348,000', '767,208,000'], ['8', 'Uganda', '4,800,000', '288,000', '634,931,000'], ['9', 'Mexico', '3,900,000', '234,000', '515,881,000'], ['10', 'Guatemala', '3,400,000', '204,000', '449,743,000'], ['11', 'Peru', '3,200,000', '192,000', '423,287,000'], ['12', 'Nicaragua', '2,200,000', '132,000', '291,010,000'], ['13', 'China(2013–14', 'est.)[7]', '1,947,000', '116,820', '257,544,000'], ['14', 'Ivory', 'Coast', '1,800,000', '108,000', '238,099,000'], ['15', 'Costa', 'Rica', '1,492,000', '89,520', '197,357,000'], ['16', 'Kenya', 

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,Rank,Country,60,kilogram,bags,Metric,tons,Pounds,
1,1,Brazil,44200000,2652000,5714381000,,,,
2,2,Vietnam,27500000,1650000,3637627000,,,,
3,3,Colombia,13500000,810000,1785744000,,,,
4,4,Indonesia,11000000,660000,1455050000,,,,
5,5,Ethiopia,6400000,384000,846575000,,,,
6,6,Honduras,5800000,348000,767208000,,,,
7,7,India,5800000,348000,767208000,,,,
8,8,Uganda,4800000,288000,634931000,,,,
9,9,Mexico,3900000,234000,515881000,,,,


In [8]:
# Breaking down the above code line by line

print(row, '\n', '---') # Shows each element in each <tr>. Each element is basically a cell, enclosed in <td>
print(row.text, '---') # Extract the text within each element (each <td>), effectively eliminating the <td> tag
print(row.text.replace('\n\n',' ').strip(), '\n', '---') # Replacing line breaks with single whitespace, then removing leading or trailing whitespaces
print(row.text.replace('\n\n',' ').strip().split()) # Splitting the string using whitespaces as the delimiter. Split method w/o argument will use whitespaces as default.

<tr>
<td>51
</td>
<td style="text-align:left"><span class="flagicon"><a href="/wiki/Zambia" title="Zambia"><img alt="Zambia" class="thumbborder" data-file-height="600" data-file-width="900" decoding="async" height="15" src="//upload.wikimedia.org/wikipedia/commons/thumb/0/06/Flag_of_Zambia.svg/23px-Flag_of_Zambia.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/0/06/Flag_of_Zambia.svg/35px-Flag_of_Zambia.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/0/06/Flag_of_Zambia.svg/45px-Flag_of_Zambia.svg.png 2x" width="23"/></a></span> <a href="/wiki/Zambia" title="Zambia">Zambia</a>
</td>
<td>2,000
</td>
<td>120
</td>
<td>264,000
</td></tr> 
 ---

51

 Zambia

2,000

120

264,000
 ---
51  Zambia 2,000 120 264,000 
 ---
['51', 'Zambia', '2,000', '120', '264,000']


In [9]:
# Alternative method to splitting with whitespaces

table = soup.find('table')

data = []

# In this updated code, instead of splitting the entire row's text using temp.split()
for row in table.find_all('tr'): # First we iterate over each row <tr>
    temp_list = [] # This variable will be populated with a list of lists after each iteration of the inner loop.
    for cell in row.find_all('td'): # Within each row, we iterate over each <td> cell in the row using row.find_all('td')
        temp = cell.text.replace('\n\n', ' ').strip() # We extract the text (with .text), remove and replace consecutive newline characters (with replace('\n\n', ' ')), and strip any leading/trailing whitespace for each cell. This will iterate until we run out of <td>.
        temp_list.append(temp) # The processed cell value is then appended to the temp_list as a list with x number of elements, depending on how many <td> there are in the row. The process for subsequent iteration is the same, each as it's own list. This way each list represents a row and does not get mixed up. 
        # And since we never use the split() method, each cell value remains intact, country names with spaces will not be split into separate columns.
    data.append(temp_list) # Finally append the list of lists to data, where we can convert to a dataframe.

pd.DataFrame(data) # Realise how index 0 is a row of none, this is because in that <tr> tag, there was no <td> tag within, only <th> tag (table header). Since we specifically coded to extract <td> so it extracts nothing.

Unnamed: 0,0,1,2,3,4
0,,,,,
1,1.0,Brazil,44200000.0,2652000.0,5714381000.0
2,2.0,Vietnam,27500000.0,1650000.0,3637627000.0
3,3.0,Colombia,13500000.0,810000.0,1785744000.0
4,4.0,Indonesia,11000000.0,660000.0,1455050000.0
5,5.0,Ethiopia,6400000.0,384000.0,846575000.0
6,6.0,Honduras,5800000.0,348000.0,767208000.0
7,7.0,India,5800000.0,348000.0,767208000.0
8,8.0,Uganda,4800000.0,288000.0,634931000.0
9,9.0,Mexico,3900000.0,234000.0,515881000.0


In [10]:
# Breakdown of code above
print(cell.text.replace('\n\n', ' ').strip(), '\n')
print(temp_list, '\n') # Each iteration has it's own list
print(data) # A list of lists from each iteration, we can convert this to a dataframe.


264,000 

['51', 'Zambia', '2,000', '120', '264,000'] 

[[], ['1', 'Brazil', '44,200,000', '2,652,000', '5,714,381,000'], ['2', 'Vietnam', '27,500,000', '1,650,000', '3,637,627,000'], ['3', 'Colombia', '13,500,000', '810,000', '1,785,744,000'], ['4', 'Indonesia', '11,000,000', '660,000', '1,455,050,000'], ['5', 'Ethiopia', '6,400,000', '384,000', '846,575,000'], ['6', 'Honduras', '5,800,000', '348,000', '767,208,000'], ['7', 'India', '5,800,000', '348,000', '767,208,000'], ['8', 'Uganda', '4,800,000', '288,000', '634,931,000'], ['9', 'Mexico', '3,900,000', '234,000', '515,881,000'], ['10', 'Guatemala', '3,400,000', '204,000', '449,743,000'], ['11', 'Peru', '3,200,000', '192,000', '423,287,000'], ['12', 'Nicaragua', '2,200,000', '132,000', '291,010,000'], ['13', 'China(2013–14 est.)[7]', '1,947,000', '116,820', '257,544,000'], ['14', 'Ivory Coast', '1,800,000', '108,000', '238,099,000'], ['15', 'Costa Rica', '1,492,000', '89,520', '197,357,000'], ['16', 'Kenya', '833,000', '49,980', '11

In [11]:
# Alternative method to splitting with whitespaces + header

table = soup.find('table')

data = []
header = []

# Add this extract header code
header_row = table.find('tr')
for cell in header_row.find_all('th'):
    header.append(cell.text.strip())

data.append(header)  # Add header to data list

# Extract data rows
for row in table.find_all('tr')[1:]:
    temp_list = []
    for cell in row.find_all('td'):
        temp = cell.text.replace('\n\n', ' ').strip()
        temp_list.append(temp)
    data.append(temp_list)

print(data)

pd.DataFrame(data)
# This code adds another list comprehension to extract the <th> tag and append that value extracted to 'data', so our variable now has the <th> values as index 0

[['Rank', 'Country', '60 kilogram bags', 'Metric tons', 'Pounds'], ['1', 'Brazil', '44,200,000', '2,652,000', '5,714,381,000'], ['2', 'Vietnam', '27,500,000', '1,650,000', '3,637,627,000'], ['3', 'Colombia', '13,500,000', '810,000', '1,785,744,000'], ['4', 'Indonesia', '11,000,000', '660,000', '1,455,050,000'], ['5', 'Ethiopia', '6,400,000', '384,000', '846,575,000'], ['6', 'Honduras', '5,800,000', '348,000', '767,208,000'], ['7', 'India', '5,800,000', '348,000', '767,208,000'], ['8', 'Uganda', '4,800,000', '288,000', '634,931,000'], ['9', 'Mexico', '3,900,000', '234,000', '515,881,000'], ['10', 'Guatemala', '3,400,000', '204,000', '449,743,000'], ['11', 'Peru', '3,200,000', '192,000', '423,287,000'], ['12', 'Nicaragua', '2,200,000', '132,000', '291,010,000'], ['13', 'China(2013–14 est.)[7]', '1,947,000', '116,820', '257,544,000'], ['14', 'Ivory Coast', '1,800,000', '108,000', '238,099,000'], ['15', 'Costa Rica', '1,492,000', '89,520', '197,357,000'], ['16', 'Kenya', '833,000', '49,980

Unnamed: 0,0,1,2,3,4
0,Rank,Country,60 kilogram bags,Metric tons,Pounds
1,1,Brazil,44200000,2652000,5714381000
2,2,Vietnam,27500000,1650000,3637627000
3,3,Colombia,13500000,810000,1785744000
4,4,Indonesia,11000000,660000,1455050000
5,5,Ethiopia,6400000,384000,846575000
6,6,Honduras,5800000,348000,767208000
7,7,India,5800000,348000,767208000
8,8,Uganda,4800000,288000,634931000
9,9,Mexico,3900000,234000,515881000


In [12]:
# Even faster and easier method using pandas .read_html() method

url = 'https://en.wikipedia.org/wiki/List_of_countries_by_coffee_production'
result = requests.get(url).text
df = pd.read_html(result) # Takes the HTML content as input and attempts to parse and extract any tables present in the HTML. It returns a list of DataFrame objects, with each DataFrame representing a table found on the page. The function returns a list

In [13]:
df[0] # indexing '0' to access the first table in the list. You can adjust the index if there are multiple tables and you want to access a different one.

Unnamed: 0,Rank,Country,60 kilogram bags,Metric tons,Pounds
0,1,Brazil,44200000,2652000,5714381000
1,2,Vietnam,27500000,1650000,3637627000
2,3,Colombia,13500000,810000,1785744000
3,4,Indonesia,11000000,660000,1455050000
4,5,Ethiopia,6400000,384000,846575000
5,6,Honduras,5800000,348000,767208000
6,7,India,5800000,348000,767208000
7,8,Uganda,4800000,288000,634931000
8,9,Mexico,3900000,234000,515881000
9,10,Guatemala,3400000,204000,449743000


In [14]:
tdtag = soup.find('td')
sibling = tdtag.next_sibling # The reason why one '.next_sibling' shows no result is because of whitespaces or linebreaks '\n'. These are considered siblings too, so the output is actually a whitespace
sibling2 = tdtag.find_next_siblings() # To skip whitespaces use 'find_next_sibling('tag')' to access the next <td>, or find_next_siblings('tag'), siblings with 's', to access all the <td> tag siblings within that tree.

print('Next sibling is:', sibling)
print('Next sibling is:', sibling2)
print(tdtag)

Next sibling is: 

Next sibling is: [<td style="text-align:left"><span class="flagicon"><a href="/wiki/Brazil" title="Brazil"><img alt="Brazil" class="thumbborder" data-file-height="504" data-file-width="720" decoding="async" height="15" src="//upload.wikimedia.org/wikipedia/en/thumb/0/05/Flag_of_Brazil.svg/22px-Flag_of_Brazil.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/0/05/Flag_of_Brazil.svg/33px-Flag_of_Brazil.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/0/05/Flag_of_Brazil.svg/43px-Flag_of_Brazil.svg.png 2x" width="22"/></a></span> <a href="/wiki/Coffee_production_in_Brazil" title="Coffee production in Brazil">Brazil</a>
</td>, <td>44,200,000
</td>, <td>2,652,000
</td>, <td>5,714,381,000
</td>]
<td>1
</td>


In [15]:
tdtag = soup.find('td')
parent = tdtag.parent
parent2 = tdtag.parents

print('Parent is:', '\n', parent, '\n')
print('Parents are:', '\n', parent2)

Parent is: 
 <tr>
<td>1
</td>
<td style="text-align:left"><span class="flagicon"><a href="/wiki/Brazil" title="Brazil"><img alt="Brazil" class="thumbborder" data-file-height="504" data-file-width="720" decoding="async" height="15" src="//upload.wikimedia.org/wikipedia/en/thumb/0/05/Flag_of_Brazil.svg/22px-Flag_of_Brazil.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/0/05/Flag_of_Brazil.svg/33px-Flag_of_Brazil.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/0/05/Flag_of_Brazil.svg/43px-Flag_of_Brazil.svg.png 2x" width="22"/></a></span> <a href="/wiki/Coffee_production_in_Brazil" title="Coffee production in Brazil">Brazil</a>
</td>
<td>44,200,000
</td>
<td>2,652,000
</td>
<td>5,714,381,000
</td></tr> 

Parents are: 
 <generator object PageElement.parents at 0x14a1cb740>


In [16]:
trtag = soup.find('tr')
child = tdtag.children
child2 = tdtag.descendants

print(tdtag)
print('Children is:', '\n', child, '\n')
print('Childrens are:', '\n', child2)

<td>1
</td>
Children is: 
 <list_iterator object at 0x14a322760> 

Childrens are: 
 <generator object Tag.descendants at 0x14a1cb890>


In [17]:
url = 'https://en.wikipedia.org/wiki/List_of_countries_by_coffee_production'
result = requests.get(url).text
soup = BeautifulSoup(result)

divtag = soup.find('div')

# Generators and iterators are closely related concepts in Python. Generators are a specific type of iterator that generate values on-the-fly as you iterate over them. Every generator is an iterator, but not the other way round.

print(type(tdtag.parents)) # TagGenerator/list iterator object. Creates a list of the immediate parent all the way up to to root level tag. like <HTML>, the 'ancestor'
print(type(trtag.children)) # list iterator. Use for loops to iterate the list, like how we usually do.
print(type(tdtag.contents)) # list. '.contents' returns a list iterator object that represents the direct children as a list, including the children tag.
print(type(trtag.descendants)) # TagGenerator/list iterator object. Descendants include the children, grandchildren, great-grandchildren and so on of the tag if we print it, so if we iterate over a 5 level nesting, at each level all the decesdants will be printed once, creating a long list.
print(type(tdtag.next_siblings)) # TagGenerator/list iterator object
print(type(tdtag.previous_siblings)) # TagGenerator/list iterator object
print(type(tdtag.find_next_sibling())) # Tag object
print(type(tdtag.find_previous_sibling())) # Tag object
print(type(tdtag.parent)) # Tag object

<class 'generator'>
<class 'list_iterator'>
<class 'list'>
<class 'generator'>
<class 'generator'>
<class 'generator'>
<class 'bs4.element.Tag'>
<class 'NoneType'>
<class 'bs4.element.Tag'>


### This section below utilises another module Selenium to perform automation such as scrolling to scrape from websites with dynamic content loading on scroll

In [18]:
import pandas as pd
import numpy as np  
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait

In [19]:
# Using selenium to scrape information from a dynamic website, which our BeautifulSoup method failed when trying to scrape jobstreet data
# We have to download geckodriver for Firefox and place the executable somewhere
# Then set the path using terminal 'export PATH=$PATH:<executable directory>'/ Replace <executable directory> with the path to the geckodriver executable

browser = webdriver.Firefox() # Initiate an instance to start our scraping

In [20]:
url = 'https://www.youtube.com/@programmingwithmosh/videos'
browser.get(url)
browser.implicitly_wait(10) # Wait 10 seconds to make sure web page is fully loaded, including Javascript elements. Ensuring we can get everything.

In [24]:
# Extracting the video title and video link

contents = browser.find_element(By.ID, "contents") # The first id='contents' includes all the videos on the webpage, hence 'find_element' works without the need to use 'find_elements'
vid_elements = contents.find_elements(By.ID, 'video-title-link') # video-title-link has all the elements we want, within the <a> tag

titles = []
links = []

for video in vid_elements:
    try:
        vid_title = video.get_attribute('title')
        titles.append(vid_title)
        vid_link = video.get_attribute('href')
        links.append(vid_link)
        print(f'Title: {vid_title} \n link: {vid_link}, \n')
        
    except StaleElementReferenceException:
        # Bypass the exception and continue the loop
        continue
    

# print(titles)
# print(links)

Title: React Tutorial for Beginners 
 link: https://www.youtube.com/watch?v=SqcY0GlETPk, 

Title: A New React Course is on the Way! 
 link: https://www.youtube.com/watch?v=hZB5bHDCmeY, 

Title: ChatGPT Tutorial for Developers - 38 Ways to 10x Your Productivity 
 link: https://www.youtube.com/watch?v=sTeoEFzVNSc, 

Title: Mosh's Xmas gift to you! 
 link: https://www.youtube.com/watch?v=uN6JO-5GW8w, 

Title: Don't write code like John Smith! 
 link: https://www.youtube.com/watch?v=FhyHvFXXkbo, 

Title: C++ Tutorial for Beginners - Learn C++ in 1 Hour 
 link: https://www.youtube.com/watch?v=ZzaPdXTrSb8, 

Title: TypeScript Tutorial for Beginners 
 link: https://www.youtube.com/watch?v=d56mG7DezGs, 

Title: Docker Compose Tutorial 
 link: https://www.youtube.com/watch?v=HG6yIjZapSA, 

Title: Java Collections Tutorial 
 link: https://www.youtube.com/watch?v=rH0winlka8A, 

Title: Java Generics Tutorial 
 link: https://www.youtube.com/watch?v=7i3Rliqzquw, 

Title: Java Interfaces Tutorial 
 l

In [22]:
# Extracting video image

images = contents.find_elements(By.TAG_NAME, 'img')

image_links = []

for i in images:
    all_image = i.get_attribute('src')
    if all_image: # Skip if src is 'None"
        print(all_image)
        image_links.append(all_image)

# print(image_links)

https://i.ytimg.com/vi/SqcY0GlETPk/hqdefault.jpg?sqp=-oaymwEcCNACELwBSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLDUgA8KDm8iR8b275G0319wL0LCCA
https://i.ytimg.com/vi/hZB5bHDCmeY/hqdefault.jpg?sqp=-oaymwEcCNACELwBSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLBZiuzQQVOrpcsS0zfzjnm9-W0Lrg
https://i.ytimg.com/vi/sTeoEFzVNSc/hqdefault.jpg?sqp=-oaymwEcCNACELwBSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLD13Y1ivr0MBj9jkSBWHgri9zKJdg
https://i.ytimg.com/vi/uN6JO-5GW8w/hqdefault.jpg?sqp=-oaymwEcCNACELwBSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLAb0JBV-IDUexFT9WFpVHg8qxBd7g
https://i.ytimg.com/vi/FhyHvFXXkbo/hqdefault.jpg?sqp=-oaymwEcCNACELwBSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLBJZwg6sJoZuWYvKkzlj6Ttb0ZLYQ
https://i.ytimg.com/vi/ZzaPdXTrSb8/hqdefault.jpg?sqp=-oaymwEcCNACELwBSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLDu5GgcYuJDF-yOUQqMCRzXKG-Lpg
https://i.ytimg.com/vi/d56mG7DezGs/hqdefault.jpg?sqp=-oaymwEcCNACELwBSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLDOAPU6yAfqk754AVFQgO2W6LCKDQ
https://i.ytimg.com/

In [25]:
metadata = contents.find_elements(By.ID, 'metadata-line')

metadata_list = []

for i in metadata:
    views = i.find_elements(By.TAG_NAME, 'span') # There are 2 span tags, first span tag is the no. of views, second span tag is the day posted.

    span_data = []
    for elements in views:
        span_data.append(elements.text)
    metadata_list.append(span_data)
    print(span_data)

# print(metadata_list)

['654K views', '3 months ago']
['42K views', '3 months ago']
['3.3M views', '5 months ago']
['43K views', '6 months ago']
['72K views', '7 months ago']
['1.7M views', '10 months ago']
['664K views', '1 year ago']
['276K views', '1 year ago']
['113K views', '1 year ago']
['45K views', '1 year ago']
['54K views', '1 year ago']
['99K views', '1 year ago']
['54K views', '1 year ago']
['80K views', '1 year ago']
['101K views', '1 year ago']
['1.6M views', '2 years ago']
['1.8M views', '2 years ago']
['45K views', '2 years ago']
['6.7M views', '2 years ago']
['1.4M views', '2 years ago']
['207K views', '2 years ago']
['83K views', '2 years ago']
['383K views', '2 years ago']
['2.1M views', '2 years ago']
['12M views', '2 years ago']
['1.8M views', '2 years ago']
['407K views', '2 years ago']
['216K views', '2 years ago']
['247K views', '3 years ago']
['140K views', '3 years ago']
['2.5M views', '3 years ago']
['971K views', '3 years ago']
['1.1M views', '3 years ago']
['1.1M views', '3 years

In [26]:
# If we want to save the data into two different list:

view_data = []
posted_data = []

for data in metadata_list:
    view_data.append(data[0])
    posted_data.append(data[1])

print(view_data)
print() # For linebreak
print(posted_data)

['654K views', '42K views', '3.3M views', '43K views', '72K views', '1.7M views', '664K views', '276K views', '113K views', '45K views', '54K views', '99K views', '54K views', '80K views', '101K views', '1.6M views', '1.8M views', '45K views', '6.7M views', '1.4M views', '207K views', '83K views', '383K views', '2.1M views', '12M views', '1.8M views', '407K views', '216K views', '247K views', '140K views', '2.5M views', '971K views', '1.1M views', '1.1M views', '449K views', '28K views', '151K views', '1.4M views', '37K views', '82K views', '9.3M views', '182K views', '331K views', '9.5M views', '34M views', '2.2M views', '68K views', '85K views', '75K views', '53K views', '54K views', '232K views', '110K views', '135K views', '166K views', '205K views', '221K views', '684K views', '387K views', '219K views']

['3 months ago', '3 months ago', '5 months ago', '6 months ago', '7 months ago', '10 months ago', '1 year ago', '1 year ago', '1 year ago', '1 year ago', '1 year ago', '1 year ag

In [27]:
df = pd.DataFrame(
    list(zip(titles, links, image_links, view_data, posted_data)),
    columns=['title', 'link', 'image_link', 'views', 'published']
)
df.head(10)

Unnamed: 0,title,link,image_link,views,published
0,React Tutorial for Beginners,https://www.youtube.com/watch?v=SqcY0GlETPk,https://i.ytimg.com/vi/SqcY0GlETPk/hqdefault.j...,654K views,3 months ago
1,A New React Course is on the Way!,https://www.youtube.com/watch?v=hZB5bHDCmeY,https://i.ytimg.com/vi/hZB5bHDCmeY/hqdefault.j...,42K views,3 months ago
2,ChatGPT Tutorial for Developers - 38 Ways to 1...,https://www.youtube.com/watch?v=sTeoEFzVNSc,https://i.ytimg.com/vi/sTeoEFzVNSc/hqdefault.j...,3.3M views,5 months ago
3,Mosh's Xmas gift to you!,https://www.youtube.com/watch?v=uN6JO-5GW8w,https://i.ytimg.com/vi/uN6JO-5GW8w/hqdefault.j...,43K views,6 months ago
4,Don't write code like John Smith!,https://www.youtube.com/watch?v=FhyHvFXXkbo,https://i.ytimg.com/vi/FhyHvFXXkbo/hqdefault.j...,72K views,7 months ago
5,C++ Tutorial for Beginners - Learn C++ in 1 Hour,https://www.youtube.com/watch?v=ZzaPdXTrSb8,https://i.ytimg.com/vi/ZzaPdXTrSb8/hqdefault.j...,1.7M views,10 months ago
6,TypeScript Tutorial for Beginners,https://www.youtube.com/watch?v=d56mG7DezGs,https://i.ytimg.com/vi/d56mG7DezGs/hqdefault.j...,664K views,1 year ago
7,Docker Compose Tutorial,https://www.youtube.com/watch?v=HG6yIjZapSA,https://i.ytimg.com/vi/HG6yIjZapSA/hqdefault.j...,276K views,1 year ago
8,Java Collections Tutorial,https://www.youtube.com/watch?v=rH0winlka8A,https://i.ytimg.com/vi/rH0winlka8A/hqdefault.j...,113K views,1 year ago
9,Java Generics Tutorial,https://www.youtube.com/watch?v=7i3Rliqzquw,https://i.ytimg.com/vi/7i3Rliqzquw/hqdefault.j...,45K views,1 year ago
