# Sources

* HTML basics:
    * https://developer.mozilla.org/en-US/docs/Learn/Getting_started_with_the_web/HTML_basics

In [1]:
# Widen the notebook container to 95% of the browser window so that wide
# DataFrames and long HTML output are easier to read.
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

# Overview

## Introduction

### Requirements

### Motivation

### Learning Objectives

* Tools
    * Pandas
        * For Data storage
        * Also for "scraping"
    * Requests/BeautifulSoup4/LXML
        * For scraping (DUH)

# Land Acknowledgement

# Ethics Of Web-Scraping

## Am I allowed to take this data? Can I use it in my research? What can I use it for?

* Site TOS
* robots.txt

## Can the site handle my requests?

* Speed considerations
* Small site?

# Actual Tutorial

# Background: Data Types & Pandas

## Data Types

### list

* Store multiple items (elements) in a single variable. 
* Elements are separated by commas.
* Elements are accessed by their position (index) using square brackets, e.g. `ex_list1[0]`.

Note that indices start at 0 in Python!

In [59]:
# initialize a list
ex_list1 = [0, 1, 2]

# print the first element of the list (index 0)
print("The 1st element of ex_list1 is", ex_list1[0])

# print length of a list
print("List Length:", len(ex_list1))

# lists can contain elements of different types, including other lists
ex_list2 = ["Three", [4,5]]

# add two lists together (this creates a new list; ex_list1 and ex_list2 are unchanged)
ex_added_list = ex_list1 + ex_list2
print(ex_added_list)


# add an element to the end of a list (this modifies the list in place)
ex_added_list.append('new element!')
print(ex_added_list)


# LIST COMPREHENSION! very useful, allows us to easily filter and modify lists
# the structure is generally [expression for item in iterable if condition]
# this is equivalent to "for loops" that some of you are familiar with from other languages

# this just duplicates the same list
print('list comprehension duplicate output:', [elmnt for elmnt in ex_added_list])

# we can use list comprehension to only keep integers (rather than the string "Three" or the list [4,5])
# isinstance(...) is the idiomatic way to check a value's type in Python
# (note: it also treats True/False as integers, but our list has none of those)
filtered_ex_added_list = [entry for entry in ex_added_list if isinstance(entry, int)]
print('filtered_list = ', filtered_ex_added_list)

# we can also use list comprehension to easily modify lists
filtered_ex_added_list_plus_5 = [x+5 for x in filtered_ex_added_list]
print('after adding 5:', filtered_ex_added_list_plus_5)



The 1st element of ex_list1 is 0
List Length: 3
[0, 1, 2, 'Three', [4, 5]]
[0, 1, 2, 'Three', [4, 5], 'new element!']
list comprehension duplicate output: [0, 1, 2, 'Three', [4, 5], 'new element!']
filtered_list =  [0, 1, 2]
after adding 5: [5, 6, 7]



### Dict

### Set

## Pandas

### Series

### DataFrame

# Web Scraping

## Import Necessary packages

In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests

import lxml
import lxml.html
import cssselect
from datetime import datetime
from selenium import webdriver
import time

## Pandas read html

### USDA FIPS

In [None]:
# import data from USDA. Output is a LIST of tables
usda_fips_page = pd.read_html("https://www.nrcs.usda.gov/wps/portal/nrcs/detail/national/home/?cid=nrcs143_013697")

In [None]:
# print output to see what we have...
usda_fips_page

In [None]:
# Strange output!
usda_fips_page[0]

In [None]:
# looks like the first few dataframes are identical... this is strange but we only need one so it isn't a problem!
usda_fips_page[1]

In [None]:
# make variable for fips table we want
usda_fips = usda_fips_page[0]

In [None]:
# let's examine the dataframe more closely to make sure everything is correct

# change display options to show all rows
pd.set_option('display.max_rows', None)

usda_fips

In [None]:
# it looks like the last row is the only incorrect one, so let's delete it
usda_fips.tail()

In [None]:
# Let's confirm there are no other rows we need to drop. Check for nan values in any col pt. 1
usda_fips.isnull()

In [None]:
# Check for nan values in any col pt. 2
usda_fips.isnull().sum()

In [None]:
# drop the last row (index label 3232), which the cells above identified as incorrect

# DataFrame.drop returns a NEW dataframe and leaves the original untouched, so we
# assign the result back to `usda_fips` instead of using inplace=True:
#   * `usda_fips` currently refers to the same object as usda_fips_page[0], so an
#     in-place drop would silently change that list element as well
#   * errors="ignore" makes this cell safe to re-run: if row 3232 is already gone,
#     nothing happens instead of raising a KeyError
usda_fips = usda_fips.drop(index=3232, errors="ignore")

# confirm that the incorrect row is gone
usda_fips.tail()

### Wikipedia FIPS

In [None]:
# import data from wikipedia. Again, output is a LIST of tables
fips_page = pd.read_html("https://en.wikipedia.org/wiki/List_of_United_States_FIPS_codes_by_county")

In [None]:
# let's print the output and take a look...
fips_page

In [None]:
# first element?
fips_page[0]

In [None]:
fips_page[1]

In [None]:
# we want the 2nd table (recall: indices start at 0)
# .copy() gives us our own dataframe, so the cleaning below does not modify fips_page[1]
fips = fips_page[1].copy()

# remove all bracketed footnote links (these look like "... County [h]", etc.)
# regex=True is required: since pandas 2.0, str.replace treats the pattern as a literal
# string by default, so without it nothing would be removed.
# ".*?" (non-greedy) removes each [..] separately instead of everything between the
# first "[" and the last "]"
fips['County or equivalent'] = fips['County or equivalent'].str.replace(r"\[.*?\]", "", regex=True)

# convert to uppercase (.str methods work on the whole column and skip missing values)
fips['County or equivalent'] = fips['County or equivalent'].str.upper()
fips['State or equivalent'] = fips['State or equivalent'].str.upper()

# replace "St." with "Saint" (regex=False: the "." is a literal dot here, not a wildcard)
fips['County or equivalent'] = fips['County or equivalent'].str.replace('ST.', 'SAINT', regex=False)

# remove everything after a comma in a county name (e.g. "ANCHORAGE, MUNICIPALITY OF")
fips['County or equivalent'] = fips['County or equivalent'].str.split(',').str[0]

# replace DC info to correspond to GeoCov19 format
# df.loc[rows, column] = value assigns directly into `fips`; the chained form
# fips[column].loc[row] = value may assign into a temporary copy instead
is_dc = fips['County or equivalent'] == 'DISTRICT OF COLUMBIA'
fips.loc[is_dc, 'State or equivalent'] = 'WASHINGTON, D.C.'
fips.loc[is_dc, 'County or equivalent'] = 'WASHINGTON'

############################################################################################

# Save FIPS dataframe to file

# TODO: change this to wherever you want the file saved (relative to this notebook)
FIPS_SAVE_PATH = "fips_codes.csv"

fips.to_csv(FIPS_SAVE_PATH, index=False)

## Request  + BeautifulSoup

 Red-bellied Snake (Wikipedia) Text Analysis

### Red-bellied Snake (Wikipedia) Text Analysis

Hypothetical: we want to get a list of all words, and their frequency, from this wikipedia page

In [23]:
# download the page at this URL
wiki_snake_url = "https://en.wikipedia.org/wiki/Red-bellied_black_snake"
wiki_snake_response = requests.get(wiki_snake_url)

# printing "<Response [200]>" means that the page was successfully downloaded!
# (200 is the HTTP status code for "OK")
print(wiki_snake_response)

<Response [200]>


In [24]:
#let's examine the output... which turns out to be a mess (this is where BeautifulSoup comes in handy)
wiki_snake_response.text

'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>Red-bellied black snake - Wikipedia</title>\n<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"21852482-4147-410c-a3cc-0a16b202d6f6","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Red-bellied_black_snake","wgTitle":"Red-bellied black snake","wgCurRevisionId":1033489125,"wgRevisionId":1033489125,"wgArticleId":2290687,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles containing Ancient Greek (to 1453)-language text","CS1 French-language sources (fr)","CS1 German-language sources (de)","C

In [48]:
wiki_snake_soup

<!DOCTYPE html>

<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>Red-bellied black snake - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"21852482-4147-410c-a3cc-0a16b202d6f6","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Red-bellied_black_snake","wgTitle":"Red-bellied black snake","wgCurRevisionId":1033489125,"wgRevisionId":1033489125,"wgArticleId":2290687,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles containing Ancient Greek (to 1453)-language text","CS1 French-language sources (fr)","CS1 German-language sources (de)","CS1: a

In [25]:
wiki_snake_soup = BeautifulSoup(wiki_snake_response.text, 'html.parser')

# BeautifulSoup gives us a more readable format (barely - it doesn't make a huge difference in this example)
wiki_snake_soup.prettify()

'<!DOCTYPE html>\n<html class="client-nojs" dir="ltr" lang="en">\n <head>\n  <meta charset="utf-8"/>\n  <title>\n   Red-bellied black snake - Wikipedia\n  </title>\n  <script>\n   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"21852482-4147-410c-a3cc-0a16b202d6f6","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Red-bellied_black_snake","wgTitle":"Red-bellied black snake","wgCurRevisionId":1033489125,"wgRevisionId":1033489125,"wgArticleId":2290687,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles containing Ancient Greek (to 1453)-language text","CS1 French-language sources (fr)","CS1 German-lang

In [55]:
# this shows us all html with a "p" tag (paragraphs), but we just want the text
# print([paragraph for paragraph in wiki_snake_soup.find_all('p')])

# we need to display the text!
# print([paragraph.text for paragraph in wiki_snake_soup.find_all('p')])

# this shows us all html with a "p" tag (paragraphs), but we just want the text
for paragraph in wiki_snake_soup.find_all('p'):
    print(paragraph)


# we need to display the text!

for paragraph in wiki_snake_soup.find_all('p'):
    print(paragraph.text)
    

<p class="mw-empty-elt">
</p>
<p class="mw-empty-elt">
</p>
<p>The <b>red-bellied black snake</b> (<i>Pseudechis porphyriacus</i>) is a <a href="/wiki/Species" title="Species">species</a> of <a href="/wiki/Venomous_snake" title="Venomous snake">venomous snake</a> in the <a href="/wiki/Family_(biology)" title="Family (biology)">family</a> <a href="/wiki/Elapidae" title="Elapidae">Elapidae</a>, indigenous to <a href="/wiki/Australia" title="Australia">Australia</a>. Originally described by <a href="/wiki/George_Shaw" title="George Shaw">George Shaw</a> in 1794 as a species new to science, it is one of <a class="mw-redirect" href="/wiki/Eastern_Australia" title="Eastern Australia">eastern Australia</a>'s most commonly encountered snakes. Averaging around 1.25 m (4 ft 1 in) in length, it has glossy black upperparts, bright red or orange flanks, and a pink or dull red belly. It is not aggressive and generally retreats from human encounters, but can attack if provoked.  Although its <a href=

In [44]:
# we're almost there, we just need to combine this output (we'll use list comprehension)

wiki_snake_text = ' '.join([paragraph.text for paragraph in wiki_snake_soup.find_all('p')])
# [paragraph.text for paragraph in wiki_snake_soup.find_all('p')] is a list (one string per paragraph),
# and ' '.join(...) glues its elements together into one string with a space between them

# # this is equivalent to the following code which some might find more familiar:
# wiki_snake_text_list = []

# for paragraph in wiki_snake_soup.find_all('p'):
#     wiki_snake_text_list.append(paragraph.text)
    
# wiki_snake_text = ' '.join(wiki_snake_text_list)

In [45]:
wiki_snake_text

'\n \n The red-bellied black snake (Pseudechis porphyriacus) is a species of venomous snake in the family Elapidae, indigenous to Australia. Originally described by George Shaw in 1794 as a species new to science, it is one of eastern Australia\'s most commonly encountered snakes. Averaging around 1.25\xa0m (4\xa0ft 1\xa0in) in length, it has glossy black upperparts, bright red or orange flanks, and a pink or dull red belly. It is not aggressive and generally retreats from human encounters, but can attack if provoked.  Although its venom can cause significant illness, no deaths have been recorded from its bite, which is less venomous than other Australian elapid snakes. The venom contains neurotoxins, myotoxins, and coagulants and has haemolytic properties. Victims can also lose their sense of smell.\n Common in woodlands, forests and swamplands, the red-bellied black snake often ventures into nearby urban areas. It forages in bodies of shallow water, commonly with tangles of water pla

In [None]:
# wiki_snake_text
# # looks good, but we should remove '\n'

# wiki_snake_text = wiki_snake_text.replace('\n','')

In [None]:
wiki_snake_text

In [47]:
# now let's count occurrences of each word
# (note: here we pass the raw string, so FreqDist counts single characters, not words)
import nltk

nltk.FreqDist(wiki_snake_text)

FreqDist({' ': 2352, 'e': 1481, 'a': 1045, 'n': 876, 't': 845, 's': 814, 'i': 775, 'r': 738, 'o': 729, 'l': 602, ...})

In [None]:
# looks like we need to split the text into words first.
# split() with no argument splits on ANY run of whitespace (spaces, '\n', '\xa0', ...)
# and never produces empty strings, so the newlines and double spaces visible in
# wiki_snake_text do not end up attached to, or counted as, "words"
split_wiki_snake_text = wiki_snake_text.split()

In [None]:
wiki_snake_word_freqs = nltk.FreqDist(split_wiki_snake_text)

In [None]:
#what are we working with?
type(wiki_snake_word_freqs)

In [None]:
# display the word -> count table built from the split text
# (wiki_snake_freqs does not exist yet at this point; it is only created in the next cell)
wiki_snake_word_freqs

In [None]:
# unpack the frequency table into two parallel lists:
# wiki_snake_words[i] occurs wiki_snake_freqs[i] times
wiki_snake_words = [word for word in wiki_snake_word_freqs]
wiki_snake_freqs = [wiki_snake_word_freqs[word] for word in wiki_snake_words]

print(wiki_snake_words)
print(wiki_snake_freqs)

In [None]:
# finally, we convert it to a pandas dataframe so we can easily save it in .csv format
wiki_snake_output = pd.DataFrame({'Word': wiki_snake_words, 'Frequency': wiki_snake_freqs})

In [None]:
wiki_snake_output

In [None]:
wiki_snake_output.to_csv(r"Red Bellied Snake Words & Frequencies.csv")

## Request  + BeautifulSoup

### lichess.org user data

Let's get some profile information for each of the top "bullet" chess players.

In [9]:
# here are the top bullet players
players_df = pd.read_html("https://lichess.org/player/top/200/bullet")[0]

In [10]:
del players_df[0]

In [11]:
players_df.rename(columns={1:'User', 
                       2:'Rating',
                       3:'Abs. Rating Change'}, inplace=True)

In [21]:
# we need to split the "User" column, otherwise we get urls like "https://lichess.org/@/GM RebeccaHarris" instead of "https://lichess.org/@/RebeccaHarris"

# players_df['Has Title?'] = players_df['User'].str.split().map(len)==2

def get_title(title_user_str):
    """
    Return the title from a lichess "User" field, or a missing value (NaN) if there is none.

    title_user_str looks like "GM RebeccaHarris" or "Shprot86"
    """
    # split on whitespace (split() with no argument also splits on non-breaking spaces)
    parts = title_user_str.split()

    # note: the "== 2" comparison must be OUTSIDE len(...); len(parts == 2) would be
    # len(True/False), which raises a TypeError
    if len(parts) == 2:    #if the user is titled (has two words in their user field, rather than one)
        return parts[0]   #return their TITLE
    else:    # if the user is not titled (has one word in their user field)
        return np.nan    #return a missing value

def get_user(title_user_str):
    """
    Return the username from a lichess "User" field, with any title removed.

    title_user_str looks like "GM RebeccaHarris" or "Shprot86"
    """
    # split on whitespace (split() with no argument also splits on non-breaking spaces)
    parts = title_user_str.split()

    # note: the "== 2" comparison must be OUTSIDE len(...); len(parts == 2) would be
    # len(True/False), which raises a TypeError
    if len(parts) == 2:    #if the user is titled (has two words in their user field, rather than one)
        return parts[1]   #return their USERNAME
    else:    # if the user is not titled (has one word in their user field)
        return title_user_str    #return the input string, since it is already their username without a title


# def split_name_from_title(title_user_str):    
#     """
#     title_user_str looks like "GM RebeccaHarris" or "Shprot86"
#     """
#     if len(title_user_str.split()==2):    #if the user is titled (has two words in their user field, rather than one)
#         return title_user_str.split()
#     else:    # if the user is not titled (has one word in their user field)
#         return ((np.nan, title_user_str))
    
    
# split_name_from_title only exists as commented-out code above, so calling it would
# raise a NameError; get_user is the function that returns the username part
players_df['Username'] = players_df['User'].apply(get_user)

In [22]:
players_df

Unnamed: 0,User,Rating,Abs. Rating Change,Has Title?
0,GM RebeccaHarris,3123,6,True
1,GM nihalsarin2004,3095,23,True
2,GM Zhigalko_Sergei,3051,26,True
3,GM muisback,3029,3,True
4,GM Federicov93,3008,4,True
5,Shprot86,2986,22,False
6,IM MatthewG-p4p,2983,27,True
7,DrHotPotato,2978,51,False
8,GM Arka50,2976,15,True
9,FM Heisenberg01,2976,21,True


In [5]:
# split each "User" entry on the non-breaking space ('\xa0') between title and username
# titled players give two pieces (title, username); untitled players give just one (username)
user_parts = players_df['User'].str.split('\xa0')
has_title = user_parts.str.len() == 2

# title = first piece, but only where there really are two pieces (missing value otherwise);
# without this check an untitled player's username would end up in the "Title" column
players_df['Title'] = user_parts.str[0].where(has_title)

# username = LAST piece, which is correct whether or not there is a title
# (taking piece [1] would give a missing value for untitled players)
players_df['User'] = user_parts.str[-1]

In [12]:
pd.set_option('display.max_rows', 200)

players_df

Unnamed: 0,User,Rating,Abs. Rating Change
0,GM RebeccaHarris,3123,6
1,GM nihalsarin2004,3095,23
2,GM Zhigalko_Sergei,3051,26
3,GM muisback,3029,3
4,GM Federicov93,3008,4
5,Shprot86,2986,22
6,IM MatthewG-p4p,2983,27
7,DrHotPotato,2978,51
8,GM Arka50,2976,15
9,FM Heisenberg01,2976,21


In [25]:
import time

# define function
def get_user_info(lichess_user_id):
    """
    Download a lichess profile page and pull a few pieces of information out of it.

    lichess_user_id is the username WITHOUT a title, e.g. "RebeccaHarris"

    Returns a dictionary with the keys 'User', 'Join Date', 'Time Spent Online' and
    'Followers' (the values come from get_join_date, get_time_spent_online and
    get_num_followers).
    """
    # use the function's own argument here (not a variable that happens to exist
    # outside the function), so the function works no matter what the caller names things
    print(lichess_user_id)

    player_url = "https://lichess.org/@/" + lichess_user_id
    print(player_url)

    player_response = requests.get(player_url)

    player_soup = BeautifulSoup(player_response.text, 'html.parser')

    join_date = get_join_date(player_soup=player_soup)

    time_spent_online = get_time_spent_online(player_soup=player_soup)

    followers = get_num_followers(player_soup=player_soup)

    # hand the results back to the caller; otherwise they are lost when the function ends
    return {'User': lichess_user_id,
            'Join Date': join_date,
            'Time Spent Online': time_spent_online,
            'Followers': followers}

# apply function
# (the list below holds only the first player in players_df, so one profile is fetched)
for user_ID in [players_df['User'].iloc[0]]:
  
    get_user_info(user_ID)

    # pause for half a second after each request (see "Can the site handle my requests?")
    time.sleep(.5) 

RebeccaHarris
https://lichess.org/@/RebeccaHarris
 1 Sep 2016


In [19]:
# get_user_info keeps its soup in a local variable, so it is not available out here.
# build a `player_soup` for the first player so we can explore their profile page
player_url = "https://lichess.org/@/" + players_df['User'].iloc[0]
player_soup = BeautifulSoup(requests.get(player_url).text, 'html.parser')

# let's get their join date
member_since = player_soup.select('.thin:nth-child(1)')

# let's see its format
print(member_since)

# we need to get the text from here and clean it
# member_since.text

# we forgot that member_since is a list! (even though it just has one element, it's still a list).
# let's get the text from its only element:
print([x.text for x in member_since])
# great! we just need to clean this to get useful information out - we're almost there
def get_membership_date(string):
    """
    Return the piece of `string` that comes right after the word 'since'.

    e.g. 'Member since 1 Sep 2016' gives ' 1 Sep 2016' (the leading space is kept)
    """
    return string.split('since')[1]

#test our function:
print(get_membership_date([x.text for x in member_since][0]))

# ok the function looks good, let's put it into our own get_join_date function

[<p class="thin">Member since 1 Sep 2016</p>]
['Member since 1 Sep 2016']
 1 Sep 2016


In [22]:
def get_join_date(player_soup):
    """
    Return the join date shown on a profile page.

    Takes the text of every element matching '.thin:nth-child(1)' and passes the
    first one to get_membership_date.
    """
    thin_texts = [tag.text for tag in player_soup.select('.thin:nth-child(1)')]

    first_text = thin_texts[0]
    return get_membership_date(first_text)

### Get Time Spent Online

In [30]:
def get_time_spent_online(player_soup):
    """
    Return a list with the text of every element matching 'p:nth-child(4)'.

    The result is a list of strings, even when only one element matches.
    """
    return [tag.text for tag in player_soup.select('p:nth-child(4)')]

In [29]:
get_time_spent_online(player_soup)

['Time spent playing: 10 days, 7 hours and 24 minutes']

### Get # followers 

In [39]:
def get_num_followers(player_soup):
    """
    Return the follower count from a profile page as an integer.

    Takes the text of the first element matching
    '.user-show__social .nm-item:nth-child(1)', keeps what comes before the
    first 'f', removes the commas and converts the rest to an int.
    """
    matches = player_soup.select('.user-show__social .nm-item:nth-child(1)')
    item_texts = [tag.text for tag in matches]

    # e.g. a text starting with "24,957 f..." becomes "24,957 " and then "24957 "
    before_f = item_texts[0].split('f')[0]
    digits_only = before_f.replace(',','')

    # int() ignores the surrounding whitespace
    return int(digits_only)

In [40]:
get_num_followers(player_soup)

24957

### Get bio (TODO: not finished yet)

In [51]:
def get_bio(player_soup):
    """
    Print whatever player_soup.select('.bio') finds.

    Nothing is returned yet; the result is only printed so we can look at it.
    """
    print(player_soup.select('.bio'))

In [52]:
get_bio(player_soup)

[]


In [42]:
# download and parse a second profile page (user 'mgaster') in the same way as before
mik_url = "https://lichess.org/@/" + 'mgaster'
print(mik_url)

mik_response = requests.get(mik_url)

mik_soup = BeautifulSoup(mik_response.text, 'html.parser')

https://lichess.org/@/mgaster


In [46]:
player_soup.select('.patron')

[<i class="line patron" title="Lichess Patron"></i>,
 <a aria-label="Patron since 27 May 2021" class="trophy award patron icon3d" href="/patron" title="Patron since 27 May 2021"></a>,
 <i class="line patron" title="Lichess Patron"></i>,
 <i class="line patron" title="Lichess Patron"></i>,
 <i class="line patron" title="Lichess Patron"></i>,
 <i class="line patron" title="Lichess Patron"></i>]

In [44]:
mik_soup.select('.patron')

[]

**TODO:** Maybe try XPath here?