## Gathering and Working Data 

In [1]:
from collections import Counter
import math, random, csv, json, re
from bs4 import BeautifulSoup
import requests


## Files

In [2]:

# Format:   FB,64.5

# 'r' means read-only
file_for_reading = open('comma_delimited_stock_prices.txt', 'r')

# don't forget to close your files when you're done
file_for_reading.close()

In [3]:

with open('comma_delimited_stock_prices.txt','r') as f:
    for line in f: # look at each line in the file
        if re.match("^A",line): # use a regex to see if it starts with 'A'
            print(line)

AAPL,90.91



In [4]:
# the file format is tab limited
# 6/20/2014	AAPL	90.91
with open('tab_delimited_stock_prices.txt', 'r') as f:
    reader = csv.reader(f, delimiter='\t')
    for row in reader:
        date = row[0]
        symbol = row[1]
        closing_price = float(row[2])
        print(date, symbol, closing_price)

6/20/2014 AAPL 90.91
6/20/2014 MSFT 41.68
6/20/2014 FB 64.5
6/19/2014 AAPL 91.86
6/19/2014 MSFT 41.51
6/19/2014 FB 64.34
2/5/2020 THYAO 18.91


In [5]:
#append new value
with open('tab_delimited_stock_prices.txt', 'a') as f:
    f.write("\n2/5/2020\tTHYAO\t18.91")

In [6]:
with open('colon_delimited_stock_prices.txt', 'r') as f:
    reader = csv.DictReader(f, delimiter=':')
    #d= list(reader)
    #print( type(d) )
    #print(d)
    for row in reader:
        date = row["date"]
        symbol = row["symbol"]
        closing_price = float(row["closing_price"])
        print(date, symbol, closing_price)

6/20/2014 AAPL 90.91
6/20/2014 MSFT 41.68
6/20/2014 FB 64.5


In [7]:
## write to file

today_prices = { 'AAPL' : 90.91, 'MSFT' : 41.68, 'FB' : 64.5 }
with open('comma_delimited_stock_prices_new.txt','w', newline='') as f:
    writer = csv.writer(f, delimiter=',')
    for stock, price in today_prices.items():
        print(stock, price)
        writer.writerow([stock, price])

AAPL 90.91
MSFT 41.68
FB 64.5


## Web Scraping

In [8]:
# Well-formed html: 
# <html><head><title>A web page</title></head><body><p></p></body></html>
# In the actual world, HTML is not generally well-formed, let alone annotated. This means we’ll need help making sense of it
# we can use the BeautifulSoup library, which builds a tree out of the various elements on a web page 
# and provides a simple interface for accessing them.
# https://www.crummy.com/software/BeautifulSoup/bs4/doc/

In [9]:
html = requests.get("https://www.atilim.edu.tr").text
soup = BeautifulSoup(html, 'html5lib') # 'html.parser'
print(soup)

<!DOCTYPE html>
<html data-lang-name="Turkish" lang="tr"><head>

<script>var isMobile=false, isTablet=false</script><meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<meta content="text/html; charset=utf-8" name="content-type"/>

<meta content="jT-3b_WCSabaHoTMz6xHuFZM0j4fP1sjme_pC-JeWMg" name="google-site-verification"/>
<meta content="k31XikwTaGBiyVX7H6CCsLcL8Kk5Yg05JXBY6UKb6uQ" name="google-site-verification"/>
<meta content="uwktodho9ecA7UD24A0aSV8o7ky8eSfBtr04u9DR" name="csrf-token"/>

<meta content="summary_large_image" name="twitter:card"/>
<meta content="@atilimuniv" name="twitter:site"/>
<meta content="@atilimuniv" name="twitter:creator"/>
<meta content="ATILIM ÜNİVERSİTESİ" name="twitter:title"/>
<meta content="Başkent Ankara'da &quot;kaliteli öğretim verilen,  bilim ve teknoloji üreten, araştırma yapan, ülkenin geleceğine ışık tutan bir üniversite olmak&quot; amacıyla kurulan

In [10]:
first_paragraph = soup.find('p')
print(first_paragraph.text)

2019 - 2020 Yılı ders programına buradan ulaşabilirsiniz.


In [11]:
all_paragraphs = soup.find_all('p')
print(all_paragraphs)

[<p>2019 - 2020 Yılı ders programına buradan ulaşabilirsiniz.</p>, <p>2019 - 2020 Yılı akademik takvim bilgilerine buradan ulaşabilirsiniz.</p>, <p>Üniversite personellerimiz ve öğrencilerimiz webmail girişlerini buradan yapabilirler.</p>, <p>2019 - 2020 Yılı ders programına buradan ulaşabilirsiniz.</p>, <p>2019 - 2020 Yılı akademik takvim bilgilerine buradan ulaşabilirsiniz.</p>, <p>Üniversite personellerimiz ve öğrencilerimiz webmail girişlerini buradan yapabilirler.</p>, <p>Üniversitemiz Enerji Sistemleri Mühendisliği Öğretim Üyesi Doç. Dr. Yılser Devrim’in, TÜBİTAK "1001-Bilimsel ve Teknolojik Araştırma Projelerini Destekleme Programı” kapsamında...</p>, <p>...</p>, <p>Beslenme ve Diyetetik Bölüm Başkanımız Dr. Öğr. Üyesi Begüm Kalyoncu, bağışıklık sisteminin güçlenmesi amacıyla......</p>, <p>Tıp Fakültesi Dekanımız Prof. Dr. Uğur Gönüllü CNN Türk ekranlarında sabah kuşağında yayınlanan Mercek Altında Programına katıldı....</p>, <p>Oyunculuk kadar yazarlık ve yönetmenlikte de yeten

In [12]:
all_paragraphs = soup.find_all('li')
print(all_paragraphs)

[<li>
                                    <a href="https://www.atilim.edu.tr/tr/home/page/1404/atilim-online">ATILIM ONLINE</a>
                                </li>, <li>
                                    <a href="https://ic.atilim.edu.tr/tr" target="_blank">Uluslararası</a>
                                </li>, <li>
                                    <a href="https://www.atilim.edu.tr/tr/argeda-teknoloji-transfer-ofisi-direktorlugu">ARGEDA - TTO</a>
                                </li>, <li>
                                    <a href="https://atusem.atilim.edu.tr/" target="_blank">Sürekli Eğitim</a>
                                </li>, <li>
                                    <a href="/ue">Uzaktan Eğitim</a>
                                </li>, <li>
                                    <a href="/library">Kütüphane</a>
                                </li>, <li>
                                    <a href="https://tourmake.net/tr/tour/1ba4cee675b876335fa20c59e4cb2a93" target=

In [13]:
with open("index.html") as file:
    soup = BeautifulSoup(file)
    print(soup.find('head') )
    #print (soup)
    


<head>
<meta charset="utf-8"/>
<title>Title of the document</title>
</head>


In [14]:
soup = BeautifulSoup("<html>data</html>")
print (soup)

<html><body><p>data</p></body></html>


In [15]:
id_soup = BeautifulSoup('<p id="my id"></p>')
id_soup.p['id']

'my id'

### Book scraping

In [16]:
main_url = "http://books.toscrape.com/index.html"
result = requests.get(main_url)
result.text[:1000]

'<!DOCTYPE html>\n<!--[if lt IE 7]>      <html lang="en-us" class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->\n<!--[if IE 7]>         <html lang="en-us" class="no-js lt-ie9 lt-ie8"> <![endif]-->\n<!--[if IE 8]>         <html lang="en-us" class="no-js lt-ie9"> <![endif]-->\n<!--[if gt IE 8]><!--> <html lang="en-us" class="no-js"> <!--<![endif]-->\n    <head>\n        <title>\n    All products | Books to Scrape - Sandbox\n</title>\n\n        <meta http-equiv="content-type" content="text/html; charset=UTF-8" />\n        <meta name="created" content="24th Jun 2016 09:29" />\n        <meta name="description" content="" />\n        <meta name="viewport" content="width=device-width" />\n        <meta name="robots" content="NOARCHIVE,NOCACHE" />\n\n        <!-- Le HTML5 shim, for IE6-8 support of HTML elements -->\n        <!--[if lt IE 9]>\n        <script src="//html5shim.googlecode.com/svn/trunk/html5.js"></script>\n        <![endif]-->\n\n        \n            <link rel="shortcut icon" href

In [17]:
soup = BeautifulSoup(result.text, 'html.parser')
print(soup.prettify()[:1000])

<!DOCTYPE html>
<!--[if lt IE 7]>      <html lang="en-us" class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
<!--[if IE 7]>         <html lang="en-us" class="no-js lt-ie9 lt-ie8"> <![endif]-->
<!--[if IE 8]>         <html lang="en-us" class="no-js lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!-->
<html class="no-js" lang="en-us">
 <!--<![endif]-->
 <head>
  <title>
   All products | Books to Scrape - Sandbox
  </title>
  <meta content="text/html; charset=utf-8" http-equiv="content-type"/>
  <meta content="24th Jun 2016 09:29" name="created"/>
  <meta content="" name="description"/>
  <meta content="width=device-width" name="viewport"/>
  <meta content="NOARCHIVE,NOCACHE" name="robots"/>
  <!-- Le HTML5 shim, for IE6-8 support of HTML elements -->
  <!--[if lt IE 9]>
        <script src="//html5shim.googlecode.com/svn/trunk/html5.js"></script>
        <![endif]-->
  <link href="static/oscar/favicon.ico" rel="shortcut icon"/>
  <link href="static/oscar/css/styles.css" rel="stylesheet" type="tex

In [18]:
# To find the URL of every book product page
# The link of the product corresponds to the ‘href’ attribute of the ‘a’ tag. 
# This one belongs to an ‘article’ tag with the a class value ‘product_pod’. 
# This seems to be a reliable source to spot product URLs.
soup.find("article", class_ = "product_pod")

<article class="product_pod">
<div class="image_container">
<a href="catalogue/a-light-in-the-attic_1000/index.html"><img alt="A Light in the Attic" class="thumbnail" src="media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"/></a>
</div>
<p class="star-rating Three">
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
</p>
<h3><a href="catalogue/a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a></h3>
<div class="product_price">
<p class="price_color">Â£51.77</p>
<p class="instock availability">
<i class="icon-ok"></i>
    
        In stock
    
</p>
<form>
<button class="btn btn-primary btn-block" data-loading-text="Adding..." type="submit">Add to basket</button>
</form>
</div>
</article>

In [19]:
# dive to the "a"
soup.find("article", class_ = "product_pod").div.a

<a href="catalogue/a-light-in-the-attic_1000/index.html"><img alt="A Light in the Attic" class="thumbnail" src="media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"/></a>

In [20]:
# Need href
soup.find("article", class_ = "product_pod").div.a.get('href')

'catalogue/a-light-in-the-attic_1000/index.html'

In [21]:
myList = []
for x in soup.findAll("article", class_ = "product_pod") :
    myList.append(x.div.a.get('href'))
print(myList)


['catalogue/a-light-in-the-attic_1000/index.html', 'catalogue/tipping-the-velvet_999/index.html', 'catalogue/soumission_998/index.html', 'catalogue/sharp-objects_997/index.html', 'catalogue/sapiens-a-brief-history-of-humankind_996/index.html', 'catalogue/the-requiem-red_995/index.html', 'catalogue/the-dirty-little-secrets-of-getting-your-dream-job_994/index.html', 'catalogue/the-coming-woman-a-novel-based-on-the-life-of-the-infamous-feminist-victoria-woodhull_993/index.html', 'catalogue/the-boys-in-the-boat-nine-americans-and-their-epic-quest-for-gold-at-the-1936-berlin-olympics_992/index.html', 'catalogue/the-black-maria_991/index.html', 'catalogue/starving-hearts-triangular-trade-trilogy-1_990/index.html', 'catalogue/shakespeares-sonnets_989/index.html', 'catalogue/set-me-free_988/index.html', 'catalogue/scott-pilgrims-precious-little-life-scott-pilgrim-1_987/index.html', 'catalogue/rip-it-up-and-start-again_986/index.html', 'catalogue/our-band-could-be-your-life-scenes-from-the-amer

In [22]:
# All products href
main_page_products_urls = [x.div.a.get('href') for x in soup.findAll("article", class_ = "product_pod")]
print(str(len(main_page_products_urls)) + " fetched products URLs")

print("One example:", main_page_products_urls[0])


20 fetched products URLs
One example: catalogue/a-light-in-the-attic_1000/index.html


In [23]:
# Find book categories URLs
categories_urls = [main_url + x.get('href') for x in soup.find_all("a", href=re.compile("catalogue/category/books"))]
categories_urls = categories_urls[1:] # we remove the first one because it corresponds to all the books

print(str(len(categories_urls)) + " fetched categories URLs")

print("Some examples:")

categories_urls[:5]

50 fetched categories URLs
Some examples:


['http://books.toscrape.com/index.htmlcatalogue/category/books/travel_2/index.html',
 'http://books.toscrape.com/index.htmlcatalogue/category/books/mystery_3/index.html',
 'http://books.toscrape.com/index.htmlcatalogue/category/books/historical-fiction_4/index.html',
 'http://books.toscrape.com/index.htmlcatalogue/category/books/sequential-art_5/index.html',
 'http://books.toscrape.com/index.htmlcatalogue/category/books/classics_6/index.html']

In [24]:
pages_urls = []

new_page = "http://books.toscrape.com/catalogue/page-1.html"

while requests.get(new_page).status_code == 200:
    pages_urls.append(new_page)
    new_page = pages_urls[-1].split("-")[0] + "-" + str(int(pages_urls[-1].split("-")[1].split(".")[0]) + 1) + ".html"

print(str(len(pages_urls)) + " fetched URLs")

print("Some examples:")

pages_urls[:5]

50 fetched URLs
Some examples:


['http://books.toscrape.com/catalogue/page-1.html',
 'http://books.toscrape.com/catalogue/page-2.html',
 'http://books.toscrape.com/catalogue/page-3.html',
 'http://books.toscrape.com/catalogue/page-4.html',
 'http://books.toscrape.com/catalogue/page-5.html']

In [25]:
def getAndParseURL(url):
    result = requests.get(url)
    soup = BeautifulSoup(result.text, 'html.parser')
    return(soup)

In [26]:
# Now retrieve book links

def getBooksURLs(url):
    soup = getAndParseURL(url)
    # remove the index.html part of the base url before returning the results
    return(["/".join(url.split("/")[:-1]) + "/" + x.div.a.get('href') for x in soup.findAll("article", class_ = "product_pod")])

In [27]:
booksURLs = []
for page in pages_urls:
    booksURLs.extend(getBooksURLs(page))

print(str(len(booksURLs)) + " fetched URLs")
print("Some examples:")

booksURLs[:5]

1000 fetched URLs
Some examples:


['http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html',
 'http://books.toscrape.com/catalogue/tipping-the-velvet_999/index.html',
 'http://books.toscrape.com/catalogue/soumission_998/index.html',
 'http://books.toscrape.com/catalogue/sharp-objects_997/index.html',
 'http://books.toscrape.com/catalogue/sapiens-a-brief-history-of-humankind_996/index.html']

In [28]:
names = []
prices = []
nb_in_stock = []
img_urls = []
categories = []
ratings = []

# scrape data for every book URL: this may take some time
for url in booksURLs:
    soup = getAndParseURL(url)
    # product name
    names.append(soup.find("div", class_ = "product_main").h1.text)
    # product price
    prices.append(soup.find("p", class_ = "price_color").text[2:]) # get rid of the pound sign
    # number of available products
    nb_in_stock.append(re.sub("[^0-9]", "", soup.find("p", class_ = "instock availability").text)) # get rid of non numerical characters
    # image url
    img_urls.append(url.replace("index.html", "") + soup.find("img").get("src"))
    # product category
    categories.append(soup.find("a", href = re.compile("../category/books/") ).get("href").split("/")[3])
    # ratings
    ratings.append(soup.find("p", class_ = re.compile("star-rating")).get("class")[1])
    
# add data into pandas df
import pandas as pd
scraped_data = pd.DataFrame({'name': names, 'price': prices, 'nb_in_stock': nb_in_stock, "url_img": img_urls, "product_category": categories, "rating": ratings})
scraped_data.head()

Unnamed: 0,name,price,nb_in_stock,url_img,product_category,rating
0,A Light in the Attic,51.77,22,http://books.toscrape.com/catalogue/a-light-in...,poetry_23,Three
1,Tipping the Velvet,53.74,20,http://books.toscrape.com/catalogue/tipping-th...,historical-fiction_4,One
2,Soumission,50.1,20,http://books.toscrape.com/catalogue/soumission...,fiction_10,One
3,Sharp Objects,47.82,20,http://books.toscrape.com/catalogue/sharp-obje...,mystery_3,Four
4,Sapiens: A Brief History of Humankind,54.23,20,http://books.toscrape.com/catalogue/sapiens-a-...,history_32,Five


## Scrapy

In [2]:
# https://docs.scrapy.org/en/latest/topics/spiders.html
import scrapy
from scrapy.crawler import CrawlerProcess

In [30]:
class HeadphonesSpider(scrapy.Spider):

    name = "headphones"

    def start_requests(self):
        urls = ['https://www.amazon.com/s/ref=nb_sb_noss_2?url=search-alias%3Daps&field-keywords=headphones&rh=i%3Aaps%2Ck%3Aheadphones&ajr=2',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        img_urls = response.css('img::attr(src)').extract()
        with open('urls.txt', 'w') as f:
            for u in img_urls:
                f.write(u + "\n")

In [31]:
process_1 = CrawlerProcess({'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47'})

process_1.crawl(HeadphonesSpider)
process_1.start()
process_1.stop()

2020-04-29 13:27:08 [scrapy.utils.log] INFO: Scrapy 1.6.0 started (bot: scrapybot)
2020-04-29 13:27:08 [scrapy.utils.log] INFO: Versions: lxml 4.5.0.0, libxml2 2.9.9, cssselect 1.1.0, parsel 1.5.2, w3lib 1.21.0, Twisted 20.3.0, Python 3.7.6 (default, Jan  8 2020, 20:23:39) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 19.1.0 (OpenSSL 1.1.1g  21 Apr 2020), cryptography 2.8, Platform Windows-10-10.0.18362-SP0
2020-04-29 13:27:08 [scrapy.crawler] INFO: Overridden settings: {'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47'}
2020-04-29 13:27:08 [scrapy.extensions.telnet] INFO: Telnet Password: 6294245e4128fdc3
2020-04-29 13:27:08 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.logstats.LogStats']
2020-04-29 13:27:09 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermi

<DeferredList at 0x2942ec2b6c8 current result: []>

In [3]:
class JsonWriterPipeline(object):

    def open_spider(self, spider):
        self.file = open('quoteresult.jl', 'w')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item

In [4]:
import logging

class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = [
        'http://quotes.toscrape.com/page/1/',
        'http://quotes.toscrape.com/page/2/',
    ]
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ITEM_PIPELINES': {'__main__.JsonWriterPipeline': 1}, # Used for pipeline 1
        'FEED_FORMAT':'json',                                 # Used for pipeline 2
        'FEED_URI': 'quoteresult.json'                        # Used for pipeline 2
    }
    
    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').extract_first(),
                'author': quote.css('span small::text').extract_first(),
                'tags': quote.css('div.tags a.tag::text').extract(),
            }

In [5]:
# You may need to restart kernel to get rid of the error
process_2 = CrawlerProcess({'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47'})
process_2.crawl(QuotesSpider)
process_2.start()
process_2.stop()

2020-04-29 13:28:49 [scrapy.utils.log] INFO: Scrapy 1.6.0 started (bot: scrapybot)
2020-04-29 13:28:49 [scrapy.utils.log] INFO: Versions: lxml 4.5.0.0, libxml2 2.9.9, cssselect 1.1.0, parsel 1.5.2, w3lib 1.21.0, Twisted 20.3.0, Python 3.7.6 (default, Jan  8 2020, 20:23:39) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 19.1.0 (OpenSSL 1.1.1g  21 Apr 2020), cryptography 2.8, Platform Windows-10-10.0.18362-SP0
2020-04-29 13:28:49 [scrapy.crawler] INFO: Overridden settings: {'FEED_FORMAT': 'json', 'FEED_URI': 'quoteresult.json', 'LOG_LEVEL': 30, 'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47'}


<DeferredList at 0x1fb84e69448 current result: []>

In [7]:
import pandas as pd
dfjson = pd.read_json('quoteresult.json')
dfjson

Unnamed: 0,text,author,tags
0,“This life is what you make it. No matter what...,Marilyn Monroe,"[friends, heartbreak, inspirational, life, lov..."
1,“It takes a great deal of bravery to stand up ...,J.K. Rowling,"[courage, friends]"
2,"“If you can't explain it to a six year old, yo...",Albert Einstein,"[simplicity, understand]"
3,"“You may not be her first, her last, or her on...",Bob Marley,[love]
4,"“I like nonsense, it wakes up the brain cells....",Dr. Seuss,[fantasy]
5,"“I may not have gone where I intended to go, b...",Douglas Adams,"[life, navigation]"
6,"“The opposite of love is not hate, it's indiff...",Elie Wiesel,"[activism, apathy, hate, indifference, inspira..."
7,"“It is not a lack of love, but a lack of frien...",Friedrich Nietzsche,"[friendship, lack-of-friendship, lack-of-love,..."
8,"“Good friends, good books, and a sleepy consci...",Mark Twain,"[books, contentment, friends, friendship, life]"
9,“Life is what happens to us while we are makin...,Allen Saunders,"[fate, life, misattributed-john-lennon, planni..."


## Selenium

In [36]:
# need to have geckodriver
# https://github.com/mozilla/geckodriver/releases

In [37]:
from selenium import webdriver

import os 
import time

url=r"C:\\Users\\by\\anaconda_dir\\se_422\\notebooks\\notebook_11\\driver\\geckodriver.exe"
#print(os.path.isfile(url) )
driver=webdriver.Firefox(executable_path=url)
driver.get("http://www.google.com")

que=driver.find_element_by_xpath("//input[@name='q']")
que.send_keys("Python book")
que.submit()


#time.sleep(2)

#driver.close()
#driver.quit()


In [10]:
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options
opts = Options()
opts.set_headless()
assert opts.headless  # Operating in headless mode
browser = Firefox(options=opts, executable_path=r"C:\\Users\\by\\anaconda_dir\\se_422\\notebooks\\notebook_11\\driver\\geckodriver.exe")
browser.get('https://duckduckgo.com')

search_form = browser.find_element_by_id('search_form_input_homepage')
search_form.send_keys('real python')
search_form.submit()


In [11]:
results = browser.find_elements_by_class_name('result')
print(results[0].text)

Python Tutorials - Real Python
https://realpython.com
In this course, you'll learn how to work with Python's set data type. You'll see how to define set objects in Python and discover the operations that they support. By the end of this course, you'll have a good feel for when a set is an appropriate choice in your own programs. Unsubscribe any time. At Real Python you can learn all things Python ...


In [40]:
## Twython

In [12]:
# To get tweets, Twython can be used
# You need to have CONSUMER_KEY, CONSUMER_SECRET
from twython import Twython
twitter = Twython(CONSUMER_KEY, CONSUMER_SECRET)
# search for tweets containing the phrase "data science"
for status in twitter.search(q='"data science"')["statuses"]:
    user = status["user"]["screen_name"].encode('utf-8')
    text = status["text"].encode('utf-8')
    print (user, ":", text)