# Let us start making soup with Wikipedia!

### Import dependencies

In [20]:
import csv
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

from utils import *


### Using requests to download the page's contents

In [2]:
# Fetch the article once and hand the raw bytes to BeautifulSoup for parsing
page_link = 'https://en.wikipedia.org/wiki/March_Comes_in_Like_a_Lion'
page_response = requests.get(page_link, timeout=5)
page_content = BeautifulSoup(page_response.content, 'html.parser')

# Sanity check: show the type of each object, then the HTTP status of the request
for fetched_object in (page_response, page_content):
    print(type(fetched_object))
print('status code: ', page_response.status_code)

<class 'requests.models.Response'>
<class 'bs4.BeautifulSoup'>
status code:  200


## Extract some contents

### Tags
A tag object corresponds to an XML or HTML tag in the original document.
```
page_content.<tag_name>.<attribute>
```
### Attributes
Tags have attributes and methods.

In [3]:
# Inspect the <title> tag: the tag itself, then its name, then its text content
title_tag = page_content.title
for title_detail in (title_tag, title_tag.name, title_tag.string):
    print(title_detail)

<title>March Comes in Like a Lion - Wikipedia</title>
title
March Comes in Like a Lion - Wikipedia


In [4]:
# Collect the target of every anchor that actually carries an href attribute
all_links = [anchor['href'] for anchor in page_content.find_all('a', href=True)]

# Keep the targets that mention any of the markers below.
# Note: 'wiki' is a substring of 'wikipedia', so the second marker alone
# already decides the outcome; both are kept to mirror the stated intent.
wiki_markers = ('wikipedia', 'wiki')
wiki_links = [href for href in all_links if any(marker in href for marker in wiki_markers)]

print('All the wikipedia links in the page')
print_array(wiki_links, limit = 10)

All the wikipedia links in the page
/wiki/March_Comes_in_Like_a_Lion_(film)
/wiki/File:Sangatsu_no_Lion.jpg
/wiki/Coming-of-age_story
/wiki/Romantic_comedy
/wiki/Chica_Umino
/wiki/Hakusensha
/wiki/Seinen_manga
/wiki/Young_Animal_(magazine)
/wiki/Akiyuki_Shinbo
/wiki/Yukari_Hashimoto


In [5]:
# Pair each anchor's tooltip text (its `title` attribute) with its target.
# Only anchors that carry both an href and a title are considered.
wiki_link_contents = [(link["title"], link["href"]) for link in page_content.find_all("a", href=True, title=True)]

# Site root; still defined here because a later cell builds category URLs from it.
prefix = "https://en.wikipedia.org"

# Export the pairs as a fully quoted CSV with the header "Nome","Link".
# - csv.writer escapes double quotes inside titles, which hand-built rows did not.
# - urljoin resolves each href against the page URL: site-relative paths get the
#   site root, while absolute URLs (e.g. other-language wikis) and
#   protocol-relative "//host/..." links are no longer glued onto `prefix`.
# - utf-8 is set explicitly so non-ASCII titles are written the same way on
#   every platform; newline="" leaves line endings to the csv module.
with open("links.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f, quoting=csv.QUOTE_ALL, lineterminator="\n")
    writer.writerow(["Nome", "Link"])
    for name, link in wiki_link_contents:
        writer.writerow([name, urljoin(page_link, link)])

# Sanity check: number of pairs found, then a sample of them
print(len(wiki_link_contents))
print_array(wiki_link_contents, limit=10)

698
('March Comes in Like a Lion (film)', '/wiki/March_Comes_in_Like_a_Lion_(film)')
('Coming-of-age story', '/wiki/Coming-of-age_story')
('Romantic comedy', '/wiki/Romantic_comedy')
('Chica Umino', '/wiki/Chica_Umino')
('Hakusensha', '/wiki/Hakusensha')
('Seinen manga', '/wiki/Seinen_manga')
('Young Animal (magazine)', '/wiki/Young_Animal_(magazine)')
('Akiyuki Shinbo', '/wiki/Akiyuki_Shinbo')
('Yukari Hashimoto', '/wiki/Yukari_Hashimoto')
('Shaft (company)', '/wiki/Shaft_(company)')


In [6]:
# Remove duplicated rows from the exported links with pandas

file_name = "links.csv"
file_name_output = "no_dupes_links.csv"

# Read, de-duplicate and sort in a single chain:
# - `subset=None` means every column takes part in deciding whether two rows
#   are duplicates; pass a list of column names to narrow that down
# - the rows are then ordered by the 'Nome' column
df = (
    pd.read_csv(file_name, sep=",")
    .drop_duplicates(subset=None)
    .sort_values(by=['Nome'])
)

# Write the result to a different file, without the index column
df.to_csv(file_name_output, sep=',', index=False)

In [7]:
# Take the <body> element out of the parsed page
body_tag = page_content.body

# Show the tag's name and then its attributes (a dict), each followed by a blank line.
# A tag can also be renamed by assigning to `body_tag.name`; that is not done
# here so the parsed page stays untouched.
for body_detail in (body_tag.name, body_tag.attrs):
    print(body_detail, '\n')

body 

{'class': ['mediawiki', 'ltr', 'sitedir-ltr', 'mw-hide-empty-elt', 'ns-0', 'ns-subject', 'mw-editable', 'page-March_Comes_in_Like_a_Lion', 'rootpage-March_Comes_in_Like_a_Lion', 'skin-vector', 'action-view']} 



In [8]:
# Keep the targets of the (title, href) pairs whose title mentions "Category"
category_links = [href for title, href in wiki_link_contents if 'Category' in title]
print_array(category_links, limit=10)

/wiki/Category:Shaft_(company)
/wiki/Help:Category
/wiki/Category:Manga_series
/wiki/Category:2007_manga
/wiki/Category:2016_anime_television_series
/wiki/Category:Anime_series_based_on_manga
/wiki/Category:Aniplex
/wiki/Category:Coming-of-age_anime_and_manga
/wiki/Category:Hakusensha_franchises
/wiki/Category:Hakusensha_manga


In [18]:
# Site-relative path of the category page to explore
category_name = 'Anime_series_based_on_manga'
node = '/wiki/Category:' + category_name

# Only links containing one of these fragments are requested from the helper
contain = ['/wiki/']
# not_contain = []

# Download and parse the category page itself
category_link = prefix + node
category_response = requests.get(category_link, timeout=5)
category_content = BeautifulSoup(category_response.content, 'html.parser')

# NOTE(review): the helper receives the path `node`, not the page parsed above;
# presumably it fetches the page on its own - confirm against utils.
# This rebinds `category_links`, replacing the list built in an earlier cell.
category_links = extract_page_links(node, contain=contain)


In [19]:
# Show every link returned for the category page (no `limit` is passed here).
# NOTE(review): print_array presumably comes from `from utils import *` - confirm.
print_array(category_links)


/wiki/Akame_ga_Kill!
/wiki/Engaged_to_the_Unidentified
/wiki/GA_Geijutsuka_Art_Design_Class
/wiki/Dragon_Ball_Super
/wiki/Ajin:_Demi-Human
/wiki/Portal:Featured_content
/wiki/Ace_of_Diamond
/wiki/Deadman_Wonderland
/wiki/Claymore_(manga)
/wiki/Special:SpecialPages
/wiki/Wikipedia:General_disclaimer
/wiki/Cells_at_Work!
/wiki/Category:Television_programs_based_on_manga
/wiki/Chimpui
/wiki/Chibi_Vampire
/wiki/Danchi_Tomoo
/wiki/Haikyu!!
/wiki/Blood_Blockade_Battlefront
https://fa.wikipedia.org/wiki/%D8%B1%D8%AF%D9%87:%D8%B3%D8%B1%DB%8C_%D8%A7%D9%86%DB%8C%D9%85%D9%87%E2%80%8C%D9%87%D8%A7%DB%8C_%D8%A8%D8%B1_%D9%BE%D8%A7%DB%8C%D9%87_%D9%85%D8%A7%D9%86%DA%AF%D8%A7
/wiki/Help:Category
/wiki/Buzzer_Beater_(manga)
/wiki/Domestic_Girlfriend
/wiki/%C4%92lDLIVE
/wiki/Bow_Wow_(manga)
/wiki/Game_Center_Arashi
/wiki/Angelic_Layer
/wiki/Chibi_Maruko-chan
/wiki/Buso_Renkin
/wiki/Category:Anime_series_based_on_manga
/wiki/Fist_of_the_North_Star
/wiki/The_Comic_Artist_and_His_Assistants
/wiki/Hakkenden:_