In [1]:

# The requests library fetches the page over HTTP
import requests
# The BeautifulSoup library parses the fetched HTML and helps you search and analyze the extracted page
import bs4

import pandas as pd

import re


# Make request to load wiki page

In [2]:

# Fetch the article; the timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get("https://en.wikipedia.org/wiki/Pikachu", timeout=30)
# Fail loudly on an HTTP error (4xx/5xx) instead of silently parsing an error page.
response.raise_for_status()

# Parse the returned HTML with Python's built-in parser so it can be searched by tag.
soup = bs4.BeautifulSoup(response.text, 'html.parser')

soup


<!DOCTYPE html>

<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>Pikachu - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"X9j2bQpAAD0AAJ7H@soAAABN","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Pikachu","wgTitle":"Pikachu","wgCurRevisionId":992540349,"wgRevisionId":992540349,"wgArticleId":269816,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles containing Japanese-language text","CS1 Japanese-language sources (ja)","CS1 uses Japanese-language script (ja)","Webarchive template wayback links","Wikipedia indefinitely semi-protected pa

# Find all links

In [3]:

# Inspect every <a> element on the page that carries an href attribute.
soup.find_all('a', attrs={'href': True})


[<a href="/wiki/Wikipedia:Good_articles" title="This is a good article. Click here for more information."><img alt="This is a good article. Click here for more information." data-file-height="185" data-file-width="180" decoding="async" height="20" src="//upload.wikimedia.org/wikipedia/en/thumb/9/94/Symbol_support_vote.svg/19px-Symbol_support_vote.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/9/94/Symbol_support_vote.svg/29px-Symbol_support_vote.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/9/94/Symbol_support_vote.svg/39px-Symbol_support_vote.svg.png 2x" width="19"/></a>,
 <a href="/wiki/Wikipedia:Protection_policy#semi" title="This article is semi-protected."><img alt="Page semi-protected" data-file-height="512" data-file-width="512" decoding="async" height="20" src="//upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/20px-Semi-protection-shackle.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/30

# Find links that are external URLs and add their descriptions

In [4]:

# Map each matching link's href to its visible text. The search is limited to
# <a> elements that have an href attribute and the class value "external text".
# A URL that appears more than once keeps the text of its last occurrence.
# NOTE(review): title=str hands the built-in `str` to BeautifulSoup as the
# title filter -- confirm what it is meant to select.
links_dict = {
    anchor["href"]: anchor.text
    for anchor in soup.find_all('a', href=True, class_="external text", title=str)
}

links_dict


{'https://movie-news.jp/meitantei-pikachu/2019/05/03/pikachu/': '"ニュース｜映画『名探偵ピカチュウ』公式サイト"',
 'https://comicbook.com/gaming/2018/06/10/pokemon-pikachu-origin-character-design/': '"\'Pokemon\' Designers Reveal The Secret Behind Pikachu\'s Creation"',
 'http://www.polygon.com/2013/5/29/4377496/natsumes-e3-2013-hometown-story-harvest-moon': '"Harvest Moon creator\'s Hometown Story leads Natsume\'s E3 slate"',
 'http://www.usgamer.net/articles/the-new-zygarde-form-is-a-reminder-of-how-hard-it-is-to-design-a-good-pokmon': '"The New Zygarde Form is a Reminder of How Hard it is to Design a Good Pokémon"',
 'https://www.nintendo.co.jp/ds/interview/irbj/vol1/index2.html': '"2. 一新されたポケモンの世界"',
 'https://web.archive.org/web/20080116030930/http://www.computerandvideogames.com/article.php?id=91965': '"Game Freak on Pokémon!"',
 'http://www.computerandvideogames.com/article.php?id=91965': 'the original',
 'https://web.archive.org/web/20100501090040/http://www.time.com/time/asia/magazine/99/1122/pokem

# Create dataframe

In [5]:

# tuts: https://datatofish.com/dictionary-to-dataframe/
# Turn the dict into a two-column frame: each (key, value) pair becomes one
# row, with the key under 'url' and the value under 'description'.
df = pd.DataFrame(
    data=[(url, description) for url, description in links_dict.items()],
    columns=['url', 'description'],
)

# Let pandas show up to 100 rows before it truncates the display.
pd.set_option('display.max_rows', 100)
df


Unnamed: 0,url,description
0,https://movie-news.jp/meitantei-pikachu/2019/0...,"""ニュース｜映画『名探偵ピカチュウ』公式サイト"""
1,https://comicbook.com/gaming/2018/06/10/pokemo...,"""'Pokemon' Designers Reveal The Secret Behind ..."
2,http://www.polygon.com/2013/5/29/4377496/natsu...,"""Harvest Moon creator's Hometown Story leads N..."
3,http://www.usgamer.net/articles/the-new-zygard...,"""The New Zygarde Form is a Reminder of How Har..."
4,https://www.nintendo.co.jp/ds/interview/irbj/v...,"""2. 一新されたポケモンの世界"""
5,https://web.archive.org/web/20080116030930/htt...,"""Game Freak on Pokémon!"""
6,http://www.computerandvideogames.com/article.p...,the original
7,https://web.archive.org/web/20100501090040/htt...,"""The Ultimate Game Freak"""
8,http://www.time.com/time/asia/magazine/99/1122...,the original
9,https://kotaku.com/pikachu-wasnt-based-on-a-mo...,"""Pikachu Wasn't Based On A Mouse, But A Squirrel"""


# Remove non-secure URLs

Sometimes the pattern contains paren ( ) groupings that you do not want to extract. In that case, **write the parens with a ?: at the start, e.g. (?: )**, and that left paren will not count as a group result.

Without (?: ), pandas emits this warning:
 - /opt/anaconda3/lib/python3.7/site-packages/pandas/core/strings.py:1952: UserWarning: This pattern has match groups. To actually get the groups, use str.extract.
 - return func(self, *args, **kwargs)


In [6]:

# Match URLs that are not served over HTTPS: plain "http:" links and
# protocol-relative links that start with "//www". The groups are
# non-capturing (?: ) so pandas does not warn about match groups.
pattern = re.compile(r'(?:^http:)|(?:^//w{3})')
# Boolean mask, True for rows whose url matches the pattern. It is named so
# that it does not shadow the built-in filter().
insecure_mask = df['url'].str.contains(pattern)

# Let pandas show up to 100 rows before it truncates the display.
pd.set_option('display.max_rows', 100)

# Keep only the rows (not columns) that do not match, building a new frame
# instead of dropping in place, then renumber the index from 0.
df = df[~insecure_mask].reset_index(drop=True)

df

Unnamed: 0,url,description
0,https://movie-news.jp/meitantei-pikachu/2019/0...,"""ニュース｜映画『名探偵ピカチュウ』公式サイト"""
1,https://comicbook.com/gaming/2018/06/10/pokemo...,"""'Pokemon' Designers Reveal The Secret Behind ..."
2,https://www.nintendo.co.jp/ds/interview/irbj/v...,"""2. 一新されたポケモンの世界"""
3,https://web.archive.org/web/20080116030930/htt...,"""Game Freak on Pokémon!"""
4,https://web.archive.org/web/20100501090040/htt...,"""The Ultimate Game Freak"""
5,https://kotaku.com/pikachu-wasnt-based-on-a-mo...,"""Pikachu Wasn't Based On A Mouse, But A Squirrel"""
6,https://web.archive.org/web/20090327224428/htt...,"""Pokemon Platinum: Developer Interview!"""
7,https://www.nintendo.co.jp/nom/0007/gfreak/pag...,『ポケットモンスター』スタッフインタビュー
8,https://web.archive.org/web/20100501090040/htt...,"""The Ultimate Game Freak"""
9,https://www.gamespot.com/articles/dont-expect-...,"""Don't Expect Pikachu's Lost Evolution, Goroch..."
