In [1]:
import requests
import bs4

# sample_presentation_url = "https://pydata.org/warsaw2018/schedule/presentation/32/"
sample_presentation_url = "https://pydata.org/warsaw2018/schedule/presentation/23/"

# Parse sample presentation

In [2]:
sample_presentation = requests.get(sample_presentation_url )

In [3]:
presentation = bs4.BeautifulSoup(sample_presentation.content, "lxml")

In [4]:
presentation.find("div", {"class" : "description"}).find("p")

<p>Identifying the number of hidden units in a fully connected layer is considered a heuristically-guided craft. A PyTorch library, Delve, was developed that allows identifying the degree of over-parameterization of a layer, thus guiding architecture selection. The library compares the intrinsic dimensionality of the layer over training, providing the user with live feedback during training.</p>

In [5]:
tag = presentation.find("div", {"class" : "abstract"})
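# The abstract of https://pydata.org/warsaw2018/schedule/presentation/23/ contains no <p> element (it is a <ul> list), so the whole <div> is kept here instead of calling .find("p").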

In [6]:
tag.contents

[<ul>
 <li>Why is identifying the number of units in a hidden layer hard?</li>
 <li>How is layer saturation calculated?</li>
 <li>Eigendecomposition of the latent representations</li>
 <li>What is the advantage of this approach over alternatives?</li>
 <li>Live - during training</li>
 <li>Lightweight PyTorch extension</li>
 <li>Intuitive - visualized as horizontal bar graph in the terminal with tqdm</li>
 <li>How do I use <a href="https://github.com/justinshenk/delve">Delve</a>?</li>
 <li>Live demo</li>
 </ul>]

In [7]:
def fetch_div_by_class(soup, class_name):
    """Look up the first ``<div>`` carrying the given CSS class.

    Parameters
    ----------
    soup
        Object exposing a BeautifulSoup-style ``find`` method.
    class_name
        Value to match against the ``class`` attribute.

    Returns
    -------
    Whatever ``soup.find`` returns for that query, unchanged.
    """
    class_filter = {"class": class_name}
    return soup.find("div", attrs=class_filter)

In [100]:
def find_pydata_divs(html_content):
    """Parse one presentation page and collect its main fields.

    Parameters
    ----------
    html_content
        HTML of the presentation page (the notebook passes the ``.content``
        of a ``requests`` response).

    Returns
    -------
    dict
        ``"description"`` and ``"abstract"`` hold the results of the
        corresponding ``<div>`` class lookups, ``"author"`` is the first child
        of the link inside the second ``<h4>``, and ``"title"`` is the first
        child of the first ``<h2>``.
    """
    page = bs4.BeautifulSoup(html_content, "lxml")

    # The speaker link is taken from the second <h4> on the page.
    speaker_heading = page.find_all("h4")[1]
    speaker_link = speaker_heading.find("a")
    title_heading = page.find("h2")

    return {
        "description": fetch_div_by_class(page, "description"),
        "abstract": fetch_div_by_class(page, "abstract"),
        "author": speaker_link.contents[0],
        "title": title_heading.contents[0],
    }


In [101]:
# Run the combined parser on a fresh download of the sample presentation page.
sample_response = requests.get(sample_presentation_url)
res = find_pydata_divs(sample_response.content)

In [102]:
res

{'abstract': <div class="abstract"><ul>
 <li>Why is identifying the number of units in a hidden layer hard?</li>
 <li>How is layer saturation calculated?</li>
 <li>Eigendecomposition of the latent representations</li>
 <li>What is the advantage of this approach over alternatives?</li>
 <li>Live - during training</li>
 <li>Lightweight PyTorch extension</li>
 <li>Intuitive - visualized as horizontal bar graph in the terminal with tqdm</li>
 <li>How do I use <a href="https://github.com/justinshenk/delve">Delve</a>?</li>
 <li>Live demo</li>
 </ul></div>,
 'author': 'Justin Shenk',
 'description': <div class="description"><p>Identifying the number of hidden units in a fully connected layer is considered a heuristically-guided craft. A PyTorch library, Delve, was developed that allows identifying the degree of over-parameterization of a layer, thus guiding architecture selection. The library compares the intrinsic dimensionality of the layer over training, providing the user with live feedba

In [11]:
presentation.find("h2").contents[0]

'Optimizing Deep Neural Network Layer Topology with Delve'

In [12]:
presentation.find_all("h4")[1].find("a").contents[0]

'Justin Shenk'

# Parse main schedule page

In [13]:
# Site root; the schedule page links to talks with site-relative hrefs.
url_base = "https://pydata.org"
main_url = "https://pydata.org/warsaw2018/schedule"

# Download the schedule overview page and parse it with the lxml backend.
schedule_response = requests.get(main_url)
soup = bs4.BeautifulSoup(schedule_response.content, "lxml")

In [14]:
talk_slots = soup.find_all("td", {"class" : "slot-talk"})

In [15]:
len(talk_slots)

43

In [16]:
talk_slots[0]

<td class="slot slot-talk" colspan="1" rowspan="1">
<span class="title">
<a href="/warsaw2018/schedule/presentation/11/">PyTorch 1.0: now and in the future</a>
</span>
<span class="speaker">
                              Adam Paszke
                          </span>
</td>

In [17]:
talk_slots[0].find("a").attrs["href"]

'/warsaw2018/schedule/presentation/11/'

In [18]:
type(talk_slots[0].find("a"))

bs4.element.Tag

In [19]:
def extract_talk_relhref(talk_slot_td):
    """Return the ``href`` of the first ``<a>`` found inside a talk slot cell.

    ``talk_slot_td`` is a schedule table cell; the value is returned exactly
    as stored in the link's attributes (a site-relative path in this
    notebook's data).
    """
    anchor = talk_slot_td.find("a")
    return anchor.attrs["href"]

In [20]:
extract_talk_relhref(talk_slots[0])

'/warsaw2018/schedule/presentation/11/'

In [21]:
talk_urls = [url_base + extract_talk_relhref(slot) for slot in talk_slots]

In [22]:
talk_urls

['https://pydata.org/warsaw2018/schedule/presentation/11/',
 'https://pydata.org/warsaw2018/schedule/presentation/32/',
 'https://pydata.org/warsaw2018/schedule/presentation/51/',
 'https://pydata.org/warsaw2018/schedule/presentation/3/',
 'https://pydata.org/warsaw2018/schedule/presentation/52/',
 'https://pydata.org/warsaw2018/schedule/presentation/37/',
 'https://pydata.org/warsaw2018/schedule/presentation/17/',
 'https://pydata.org/warsaw2018/schedule/presentation/33/',
 'https://pydata.org/warsaw2018/schedule/presentation/38/',
 'https://pydata.org/warsaw2018/schedule/presentation/9/',
 'https://pydata.org/warsaw2018/schedule/presentation/34/',
 'https://pydata.org/warsaw2018/schedule/presentation/36/',
 'https://pydata.org/warsaw2018/schedule/presentation/16/',
 'https://pydata.org/warsaw2018/schedule/presentation/22/',
 'https://pydata.org/warsaw2018/schedule/presentation/40/',
 'https://pydata.org/warsaw2018/schedule/presentation/20/',
 'https://pydata.org/warsaw2018/schedule/p

# Combine parsing

In [103]:
# Download every talk page once, up front, so the parsing cell can be re-run
# without touching the network again.
#  - A shared Session reuses the connection across the per-talk requests.
#  - timeout: requests waits indefinitely by default, which would hang the
#    notebook on a stalled connection.
#  - raise_for_status: fail loudly on an HTTP error instead of handing an
#    error page to the parser.
fetched_pages = []
with requests.Session() as session:
    for url in talk_urls:
        response = session.get(url, timeout=30)
        response.raise_for_status()
        fetched_pages.append(response.content)

In [108]:
# Parse each downloaded page and record which URL it came from.
result = []
for _i, _page in enumerate(fetched_pages):
    _url = talk_urls[_i]
    print(_i, " ", _url)
    _talk = find_pydata_divs(_page)
    _talk["url"] = _url
    result.append(_talk)

0   https://pydata.org/warsaw2018/schedule/presentation/11/
1   https://pydata.org/warsaw2018/schedule/presentation/32/
2   https://pydata.org/warsaw2018/schedule/presentation/51/
3   https://pydata.org/warsaw2018/schedule/presentation/3/
4   https://pydata.org/warsaw2018/schedule/presentation/52/
5   https://pydata.org/warsaw2018/schedule/presentation/37/
6   https://pydata.org/warsaw2018/schedule/presentation/17/
7   https://pydata.org/warsaw2018/schedule/presentation/33/
8   https://pydata.org/warsaw2018/schedule/presentation/38/
9   https://pydata.org/warsaw2018/schedule/presentation/9/
10   https://pydata.org/warsaw2018/schedule/presentation/34/
11   https://pydata.org/warsaw2018/schedule/presentation/36/
12   https://pydata.org/warsaw2018/schedule/presentation/16/
13   https://pydata.org/warsaw2018/schedule/presentation/22/
14   https://pydata.org/warsaw2018/schedule/presentation/40/
15   https://pydata.org/warsaw2018/schedule/presentation/20/
16   https://pydata.org/warsaw2018/s

In [109]:
[el["author"] for el in result]

['Adam Paszke',
 'Marcin Mosiołek',
 'Sylwia Brodacka',
 'Mateusz Opala',
 'Steph Samson',
 'Robert Kostrzewski',
 'Tymoteusz Wołodźko',
 'Łukasz Kopeć',
 'Jakub Sanojca',
 'Agata Chęcińska',
 'Szymon Wojciechowski',
 'Marcin Kostur',
 'Dawid Rymarczyk',
 'Shahnawaz Ahmed',
 'Dominik Lewy',
 'Katarina Milosevic, Ioana Gherman',
 'Kamila Stepniowska',
 'Michał Jadczuk',
 'Adam Svystun',
 'Donald Whyte',
 'Adam Witkowski',
 'Juan De Dios Santos',
 'Mia Polovina',
 'Łukasz Słabiński',
 'Stefania Druga',
 'Gene Kogan',
 'Alex Quemy',
 'Jacek Komorowski',
 'Rafał A. Bachorz',
 'Dr Maciej Hermanowicz',
 'Tomasz Bartczak',
 'Marcin Stachowiak',
 'Adam Słucki',
 'Przemyslaw Biecek',
 'Krzysztof Kotowski',
 'Marcin Możejko',
 'Pawel Cyrta',
 'Tomasz Włodarczyk',
 'Katarzyna Kańska',
 'Roel Bertens',
 'Steven Nooijen',
 'Sylwester Brzęczkowski',
 'Justin Shenk']

# List Descriptions

In [44]:
result[0].keys()

dict_keys(['description', 'abstract', 'author', 'title'])

In [51]:
result[0]["description"]

<div class="description"><p>PyTorch is one of the main tools used for machine learning research these days. It’s been developed in beta mode for over 2 years, but this October, a release candidate for 1.0 version has been finally released! In this talk, I’ll briefly introduce the library, and then move on to showcase the cutting edge features we introduced recently.</p></div>

In [52]:
html2text(str(result[0]["description"]))

'PyTorch is one of the main tools used for machine learning research these\ndays. It’s been developed in beta mode for over 2 years, but this October, a\nrelease candidate for 1.0 version has been finally released! In this talk,\nI’ll briefly introduce the library, and then move on to showcase the cutting\nedge features we introduced recently.\n\n'

In [53]:
html2text(str(result[0]["abstract"]))

'The talk will be divided into multiple sections. First, an extremely quick\nintroduction to what PyTorch is, and what can it be used for (including use\ncases outside of machine learning!). Then, I will cover a number of topics\nthat are interesting in the current context of the library, including: \\-\nHybrid frontend (JIT compiler) \\- Path from research to production \\- C++ API\nand inference \\- Caffe2 merger \\- New distributed backend\n\n'

In [57]:
html2text(str(result[0]["abstract"])).replace("\n", " ").replace("\\", "").strip()

'The talk will be divided into multiple sections. First, an extremely quick introduction to what PyTorch is, and what can it be used for (including use cases outside of machine learning!). Then, I will cover a number of topics that are interesting in the current context of the library, including: - Hybrid frontend (JIT compiler) - Path from research to production - C++ API and inference - Caffe2 merger - New distributed backend'

In [58]:
def tag_to_text(tag):
    """Render a parsed HTML tag as a single line of plain text.

    The tag is serialised with ``str`` and converted by ``html2text``. In the
    converted text every newline becomes a space, every backslash is removed,
    and leading/trailing whitespace is stripped.

    Parameters
    ----------
    tag
        Any object whose ``str()`` is HTML markup (the notebook passes
        BeautifulSoup tags).

    Returns
    -------
    str
        The flattened text.
    """
    # Imported here because none of the notebook's visible cells brings
    # html2text into scope; on a fresh kernel the bare name raises NameError.
    from html2text import html2text

    text = html2text(str(tag))
    # html2text hard-wraps its output and backslash-escapes some characters
    # (e.g. "\-"); undo both so the text fits on one line.
    return text.replace("\n", " ").replace("\\", "").strip()

In [59]:
# Show every talk's description as flattened plain text, one talk per line.
for _talk in result:
    print(tag_to_text(_talk["description"]))

PyTorch is one of the main tools used for machine learning research these days. It’s been developed in beta mode for over 2 years, but this October, a release candidate for 1.0 version has been finally released! In this talk, I’ll briefly introduce the library, and then move on to showcase the cutting edge features we introduced recently.
The talk is about to provide a gentle introduction into the world of 3D deep learning techniques, considering basic aspects such as input representation, typical problems and most popular models. After the talk you should be able understand common challenges occurring when working with point clouds, and more importantly, you should be able to tackle them properly.
Luna is a general-purpose programming language with an intuitive visual interface that is dedicated to data science applications. In this presentation we will show how to improve interpretation of results and communication, including with non-technical business experts, by making data visual

In [62]:
descriptions = [tag_to_text(el["description"]) for el in result]

In [111]:
element = result[0]

A Markdown link is written like this: `[link text](http://www.google.com)`

In [127]:
"#### [" + element["title"] + "](" + element["url"] + ") - " + element["author"] + "\n"

'#### [PyTorch 1.0: now and in the future](https://pydata.org/warsaw2018/schedule/presentation/11/) - Adam Paszke\n'

### [PyTorch 1.0: now and in the future](https://pydata.org/warsaw2018/schedule/presentation/11/) - Adam Paszke

In [128]:
def make_heading(result_element):
    """Build the level-4 Markdown heading line for one talk.

    ``result_element`` must provide string values under ``"title"``, ``"url"``
    and ``"author"``. The result has the form ``#### [title](url) - author``
    and ends with a newline.
    """
    title = result_element["title"]
    url = result_element["url"]
    author = result_element["author"]
    return "".join(["#### [", title, "](", url, ") - ", author, "\n"])

def make_description(result_element):
    """Return the talk's Markdown heading line followed directly by its flattened description text."""
    heading = make_heading(result_element)
    body = tag_to_text(result_element["description"])
    return heading + body

In [129]:
make_description(element)

'#### [PyTorch 1.0: now and in the future](https://pydata.org/warsaw2018/schedule/presentation/11/) - Adam Paszke\nPyTorch is one of the main tools used for machine learning research these days. It’s been developed in beta mode for over 2 years, but this October, a release candidate for 1.0 version has been finally released! In this talk, I’ll briefly introduce the library, and then move on to showcase the cutting edge features we introduced recently.'

In [130]:
# Alternate each talk's Markdown block with a blank-line separator so the
# list can be joined into one document.
desc_list = []
for _el in result:
    desc_list.extend((make_description(_el), "\n\n"))

In [131]:
print("".join(desc_list))

#### [PyTorch 1.0: now and in the future](https://pydata.org/warsaw2018/schedule/presentation/11/) - Adam Paszke
PyTorch is one of the main tools used for machine learning research these days. It’s been developed in beta mode for over 2 years, but this October, a release candidate for 1.0 version has been finally released! In this talk, I’ll briefly introduce the library, and then move on to showcase the cutting edge features we introduced recently.

#### [Deep Learning for 3D World: Point Clouds](https://pydata.org/warsaw2018/schedule/presentation/32/) - Marcin Mosiołek
The talk is about to provide a gentle introduction into the world of 3D deep learning techniques, considering basic aspects such as input representation, typical problems and most popular models. After the talk you should be able understand common challenges occurring when working with point clouds, and more importantly, you should be able to tackle them properly.

#### [Where visual meets textual. Luna - overview.](ht