In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

# Overview

## Introduction

### Requirements

### Motivation

### Learning Objectives

* Tools
    * Pandas
        * For Data storage
        * Also for "scraping"
    * Requests/BeautifulSoup4/LXML
        * For scraping (DUH)

# Land Acknowledgement

# Ethics Of Web-Scraping

## Am I allowed to take this data? Can I use it in my research? What can I use it for?

* Site TOS
* robots.txt

## Can the site handle my requests?

* Speed considerations
* Small site?

# Actual Tutorial

# Background: Data Types & Pandas

## Data Types

### list

* Store multiple items (elements) in a single variable. 
* Elements are separated by commas.
* 

Note that indices start at 0 in Python!

In [None]:
# initialize a list
ex_list1 = [0, 1, 2]

# print the list
print("The 1st element of ex_list1 is", ex_list1[0])

# print length of a list
print("List Length:", len(ex_list1))

# lists can contain 
ex_list2 = ["Three", [4,5]]

# add two lists together 
ex_added_list = ex_list1 + ex_list2
print(ex_added_list)

### Dict

### Set

## Pandas

### Series

### DataFrame

# Web Scraping

## Import Necessary packages

In [2]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests

import lxml
import lxml.html
import cssselect
from datetime import datetime
from selenium import webdriver
import time

## Pandas read html

### USDA FIPS

In [None]:
# import data from USDA. Output is a LIST of tables
usda_fips_page = pd.read_html("https://www.nrcs.usda.gov/wps/portal/nrcs/detail/national/home/?cid=nrcs143_013697")

In [None]:
# print output to see what we have...
usda_fips_page

In [None]:
# Strange output!
usda_fips_page[0]

In [None]:
# looks like the first few dataframes are identical... this is strange but we only need one so it isn't a problem!
usda_fips_page[1]

In [None]:
# make variable for fips table we want
usda_fips = usda_fips_page[0]

In [None]:
# let's examine the dataframe more closely to make sure everything is correct

# change display options to show all rows
pd.set_option('display.max_rows', None)

usda_fips

In [None]:
# it looks like the last row is the only incorrect one, so let's delete it
usda_fips.tail()

In [None]:
# Let's confirm there are no other rows we need to drop. Check for nan values in any col pt. 1
usda_fips.isnull()

In [None]:
# Check for nan values in any col pt. 2
usda_fips.isnull().sum()

In [None]:
# drop the last row

# this is the output we want
usda_fips.drop(3232)

# now replace the existing "usda_fips" dataframe with the version missing the last row (inplace=True)
usda_fips.drop(3232, inplace=True)

### Wikipedia FIPS

In [None]:
# import data from wikipedia. Again, output is a LIST of tables
fips_page = pd.read_html("https://en.wikipedia.org/wiki/List_of_United_States_FIPS_codes_by_county")

In [None]:
# let's print the output and take a look...
fips_page

In [None]:
# first element?
fips_page[0]

In [None]:
fips_page[1]

In [None]:
# we want the 2nd table (recall: indices start at 0)
fips = fips_page[1]

# remove all hyperlinks (these look like "... County [h]", etc.)
fips['County or equivalent'] = fips['County or equivalent'].str.replace(r"\[.*\]","")

# convert to uppercase
fips['County or equivalent'] = fips['County or equivalent'].apply(lambda x: x.upper())
fips['State or equivalent'] = fips['State or equivalent'].apply(lambda x: x.upper())

# replace "St." with "Saint"
fips['County or equivalent'] = [x.replace('ST.','SAINT') for x in fips['County or equivalent']]

# remove everything after a comma in a county name (e.g. "ANCHORAGE, MUNICIPALITY OF")
fips['County or equivalent'] = [x.split(',')[0] for x in fips['County or equivalent']]

# replace DC info to correspond to GeoCov19 format
dc_loc = fips[fips['County or equivalent']=='DISTRICT OF COLUMBIA'].index.tolist()[0]
fips['State or equivalent'].loc[dc_loc] = 'WASHINGTON, D.C.'
fips['County or equivalent'].loc[dc_loc] = 'WASHINGTON'

############################################################################################

# Save FIPS dataframe to file

FIPS_SAVE_PATH = 

fips.to_csv(FIPS_SAVE_PATH, index=False)

## Request  + BeautifulSoup

# Red-bellied Snake (Wikipedia) Text Analysis

### Red-bellied Snake (Wikipedia) Text Analysis

Hypothetical: we want to get a list of all words, and their frequency, from this wikipedia page

In [None]:
wiki_snake_url = "https://en.wikipedia.org/wiki/Red-bellied_black_snake"
wiki_snake_response = requests.get(wiki_snake_url)

# printing "200" means that the page was successfully downloaded!
print(wiki_snake_response)

In [None]:
#let's examine the output... which turns out to be a mess (this is where BeautifulSoup comes in handy)
wiki_snake_response.text

In [None]:
wiki_snake_soup = BeautifulSoup(wiki_snake_response.text, 'html.parser')

# BeautifulSoup gives us a more readable format (barely)
wiki_snake_soup.prettify()

In [None]:
# let's make this even more readable, so we can identify parts of the text we want to gather

# this shows us all html with an "p" tag (paragraphs), but we just want the text
for paragraph in wiki_snake_soup.find_all('p'):
    print(paragraph.text)
    

In [None]:
[paragraph.text for paragraph in wiki_snake_soup.find_all('p')]

In [None]:
# we're almost there, we just need to combine this output

wiki_snake_text = ' '.join([paragraph.text for paragraph in wiki_snake_soup.find_all('p')])

In [None]:
wiki_snake_text
# looks good, but we should remove '\n'

wiki_snake_text = wiki_snake_text.replace('\n','')

In [None]:
wiki_snake_text

In [None]:
# now let's count occurences of each word
# import nltk

nltk.FreqDist(wiki_snake_text)

In [None]:
# looks like we need to split the text first:
split_wiki_snake_text = wiki_snake_text.split(' ')

In [None]:
wiki_snake_word_freqs = nltk.FreqDist(split_wiki_snake_text)

In [None]:
#what are we working with?
type(wiki_snake_word_freqs)

In [None]:
wiki_snake_freqs

In [None]:
wiki_snake_words = []
wiki_snake_freqs = []

for word in wiki_snake_word_freqs:
    wiki_snake_words.append(word)
    wiki_snake_freqs.append(wiki_snake_word_freqs[word])

print(wiki_snake_words)
print(wiki_snake_freqs)

In [None]:
# finally, we convert it to a pandas dataframe so we can easily save it in .csv format
wiki_snake_output = pd.DataFrame({'Word': wiki_snake_words, 'Frequency': wiki_snake_freqs})

In [None]:
wiki_snake_output

In [None]:
wiki_snake_output.to_csv(r"Red Bellied Snake Words & Frequencies.csv")

## Request  + BeautifulSoup

# lichess.org user data

Let's get the information of every top "bullet" chess player

In [3]:
# here are the top bullet players
players_df = pd.read_html("https://lichess.org/player/top/200/bullet")[0]

In [4]:
del players_df[0]

In [5]:
players_df.rename(columns={1:'User', 
                       2:'Rating',
                       3:'Abs. Rating Change'}, inplace=True)

In [6]:
# we need to split the "User" column, otherwise we get urls like "https://lichess.org/@/GM RebeccaHarris" instead of "https://lichess.org/@/RebeccaHarris"

players_df['Title'] = players_df['User'].str.split('\xa0').str[0]

players_df['User'] = players_df['User'].str.split('\xa0').str[1]

In [7]:
players_df

Unnamed: 0,User,Rating,Abs. Rating Change,Title
0,RebeccaHarris,3121,17.0,GM
1,nihalsarin2004,3060,2.0,GM
2,Heisenberg01,3021,5.0,FM
3,chessbrahs,3010,3.0,GM
4,Alexander_Zubov,3010,23.0,GM
...,...,...,...,...
195,Avenger82,2775,21.0,FM
196,,2775,9.0,badbadger95
197,,2774,18.0,Estoiko
198,,2773,,chesszxj


In [8]:
import time

def get_user_info(lichess_user_id):
    print(user_ID)
    
    player_url = "https://lichess.org/@/" + user_ID
    print(player_url)
    
    player_response = requests.get(player_url)
    
    player_soup = BeautifulSoup(player_response.text, 'html.parser')
    
    player_info = player_soup.find_all(".thin:nth-child(1)")
    
    print(player_info)


for user_ID in [players_df['User'].iloc[0]]:
  
    get_user_info(user_ID)

    time.sleep(.5) 
    
#     print(user_ID)
    
#     player_url = "https://lichess.org/@/" + user_ID
#     print(player_url)
    
#     player_response = requests.get(player_url)
    
#     player_soup = BeautifulSoup(player_response.text, 'html.parser')
    
#     player_info = player_soup.find_all(".thin:nth-child(1)")
    
#     print(player_info)

RebeccaHarris
https://lichess.org/@/RebeccaHarris
[]


In [19]:
# let's get their join date
member_since = player_soup.select('.thin:nth-child(1)')

# let's see its format
print(member_since)

# we need to get the text from here and clean it
# member_since.text

# we forgot that member_since is a list! (even though it just has one element, it's' still a list).
# let's get the text from its only element:
print([x.text for x in member_since])

# great! we just need to clean this to get useful information out - we're almost there
def get_membership_date(string):
    split_str = string.split('since')[1]
    return split_str

#test our function:
print(get_membership_date([x.text for x in member_since][0]))

# ok the function looks good, let's put it into our 

[<p class="thin">Member since 1 Sep 2016</p>]
['Member since 1 Sep 2016']
 1 Sep 2016


# MAYBE TRY XPATH HERE?