# Web Scraping in Python

In [1]:
import requests
from bs4 import BeautifulSoup
url = 'https://en.wikipedia.org/wiki/Main_Page'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
#Extract title of page
page_title = soup.title.text
print('Page title:', page_title)

Page title: Wikipedia, the free encyclopedia


# Web Scraping for Images

In [34]:
import requests
from PIL import Image
from io import BytesIO
url = "https://images.unsplash.com/photo-1503023345310-bd7c1de61c7d?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxzZWFyY2h8Mnx8aHVtYW58ZW58MHx8MHx8fDA%3D&w=1000&q=80.png"
response = requests.get(url)
img = Image.open(BytesIO(response.content))
img.save("wiki_logo.png")

# Working with CSV files


In [5]:
import csv
# Writing to a CSV file
with open('students.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["Name", "Age", "Grade"])
    writer.writerow(["Mausam", 18, "Collage"])
    writer.writerow(["Lakpa", 19, "Collage"])
# Reading from a CSV file
with open('students.csv', 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        print(row)

['Name', 'Age', 'Grade']
['Mausam', '18', 'Collage']
['Lakpa', '19', 'Collage']


# Working with PDFs


In [24]:
import PyPDF2
with open ('sample.pdf', 'rb') as file:
    reader = PyPDF2.PdfReader(file)
    print('Number of Pages:', len(reader.pages))
    page=reader.pages[0]
    print('Print Text:', page.extract_text())
    

Number of Pages: 2
Print Text:  A Simple PDF File 
 This is a small demonstration .pdf file - 
 just for use in the Virtual Mechanics tutorials. More text. And more 
 text. And more text. And more text. And more text. 
 And more text. And more text. And more text. And more text. And more 
 text. And more text. Boring, zzzzz. And more text. And more text. And 
 more text. And more text. And more text. And more text. And more text. 
 And more text. And more text. 
 And more text. And more text. And more text. And more text. And more 
 text. And more text. And more text. Even more. Continued on page 2 ...


# Working with spreadsheets

In [27]:
# Create a new workbook and add some data
from openpyxl import Workbook
wb = Workbook()
ws = wb.active

data = [
    ["Fruit", "Quantity"],
    ["Kiwi", 3],
    ["Grape", 15],
    ["Apple", 7]
]

for row in data:
    ws.append(row)

wb.save("fruits.xlsx")

In [28]:
from openpyxl import load_workbook

wb = load_workbook(filename ='fruits.xlsx')
sheet = wb.active

for row in sheet.iter_rows(values_only=True):
    print(row)

('Fruit', 'Quantity')
('Kiwi', 3)
('Grape', 15)
('Apple', 7)


# Working with Emails


In [29]:
import smtplib

In [33]:
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

# your credentials
mail_content = 'Hello, this is a simple email from Python.'
sender_address = 'mausamsubba90@gmail.com'
sender_pass = 'mimsum2003'
receiver_address = 'mausamsubba90@gmail.com'

#Setup MIME
message = MIMEMultipart()
message['From'] = sender_address
message['To'] = receiver_address
message['Subject'] = 'A test mail sent by Python. It has an attachment.'

message.attach(MIMEText(mail_content, 'plain'))

#use gmail with port
session = smtplib.SMTP('smtp.gmail.com', 587)

#start tls for security
session.starttls() 

#authentication
session.login(sender_address, sender_pass)

#send the mail
text = message.as_string()
session.sendmail(sender_address, receiver_address, text)
session.quit()

print('Mail Sent')

SMTPAuthenticationError: (535, b'5.7.8 Username and Password not accepted. Learn more at\n5.7.8  https://support.google.com/mail/?p=BadCredentials x7-20020a62fb07000000b006675c242548sm4760015pfm.182 - gsmtp')

# Advanced Web Scraping with Selenium


In [37]:
!pip install selenium

Collecting selenium
  Using cached selenium-4.10.0-py3-none-any.whl (6.7 MB)
Collecting trio~=0.17
  Using cached trio-0.22.2-py3-none-any.whl (400 kB)
Collecting trio-websocket~=0.9
  Using cached trio_websocket-0.10.3-py3-none-any.whl (17 kB)
Collecting exceptiongroup>=1.0.0rc9
  Using cached exceptiongroup-1.1.2-py3-none-any.whl (14 kB)
Collecting outcome
  Using cached outcome-1.2.0-py2.py3-none-any.whl (9.7 kB)
Collecting wsproto>=0.14
  Using cached wsproto-1.2.0-py3-none-any.whl (24 kB)
Collecting h11<1,>=0.9.0
  Using cached h11-0.14.0-py3-none-any.whl (58 kB)
Installing collected packages: outcome, h11, exceptiongroup, wsproto, trio, trio-websocket, selenium
Successfully installed exceptiongroup-1.1.2 h11-0.14.0 outcome-1.2.0 selenium-4.10.0 trio-0.22.2 trio-websocket-0.10.3 wsproto-1.2.0


In [44]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time

driver = webdriver.Chrome( 'C:\Users\Lenovo\Desktop\chromedriver_linux64\')

# Navigate to url
driver.get("https://en.wikipedia.org/wiki/Main_Page")

# Simulate button click
button = driver.find_element_by_link_text('View history')
button.click()

# Wait for page to load
time.sleep(5)

# Extract page source
page_source = driver.page_source

print('Page Source:', page_source)

# close the driver
driver.quit()

SyntaxError: unterminated string literal (detected at line 5) (3961426539.py, line 5)