In [None]:
# TODO: create a higher level function that will iterate with parse_dictionary_main_page 
#       and scrape the WHOLE dictionary (A - Z)
# TODO: descriptions longer than 300 char cause an error

In [10]:
import pandas as pd
import requests
import pickle
import re
import random
import praw
import time
from bs4 import BeautifulSoup

In [11]:
from utils import *

In [12]:
creds = pickle.load(open('creds', 'rb'))

In [13]:
def parse_entire_dictionary():
    stem = 'https://lifeprint.com/asl101/index/'
    full_dict = {}
    for letter in 'abcdefghijklmnopqrstuvwxyz': #
        letter_dict = parse_dictionary_main_page(stem + letter + '.htm')
        full_dict.update(letter_dict)
        time.sleep(0.5)
    return full_dict

def parse_dictionary_main_page(url):
    '''
    The dictionary main page contains links to subpages with words.
    Takes a url (ex. https://lifeprint.com/asl101/index/a.htm)
    Returns a dictionary with the dictionary entry as the key,
    and the link to the dictionary content page as the value
    '''
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")
    links_html = soup.find_all("a", href = True)
    links_html = [link for link in links_html if len(link.parent) > 1]
    master_dict = {}
    for blob in links_html:
        text = blob.text
        link = blob.get("href")
        link = re.sub('\.\.', 'https://lifeprint.com/asl101/', link)
        master_dict[text] = link
    return(master_dict)

def parse_dictionary_content_page(url, name):
    '''
    The content page contains images and videos describing the word
    Takes a url (ex. https://lifeprint.com/asl101//pages-signs/a/active.htm)
    Returns a pandas dataframe as follows:
    # pandas df:
    # name
    # type ("image", "gif", or "video")
    # text
    # location (the web address where the content can be found)
    '''
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")
    videos = soup.find_all('iframe')
    df = pd.DataFrame()
    for video in videos:
        row = {}
        row['name'] = name
        row['type'] = 'youtube'
        row['text'] = get_description(video)
        row['location'] = video['src']
        df = df.append(row, ignore_index=True)
    return(df)

In [14]:
def get_description(element):
    text_array = []
    temp = element
    while True:
        temp = temp.previous
        if temp.name == 'hr' or temp.name == 'font':
            break 
        if isinstance(temp, type('bs4.element.NavigableString')):
            text_array.append(temp)
    # reverse text_array, then
    text_array.reverse()
    description = ''.join(text_array)
    # Clean for Reddit Title 
    description = re.sub('(\n|\t)+', '', description)
    description = smart_truncate(description)
    return(description)

def smart_truncate(content, length=300, suffix='...'):
    # from https://stackoverflow.com/questions/250357/truncate-a-string-without-ending-in-the-middle-of-a-word
    if len(content) <= length:
        return content
    else:
        return ' '.join(content[:length+1].split(' ')[0:-1]) + suffix

In [15]:
def post_random_content(main_page, reddit_creds, content_type = 'random'):
    if content_type == 'random':
        content_type = random.choice(['image', 'youtube', 'mp4'])
    content_df = pd.DataFrame
    i = 0
    while True:
        name = random.choice([*main_page.keys()])
        url = main_page[name]
        content_df = parse_dictionary_content_page(url, name)
        i += 1
        if i > 100 or not content_df.empty:
            break
    chosen_row = content_df.iloc[random.choice(content_df.index)]
    if content_type == 'image':
        pass
    elif content_type == 'youtube':
        post_youtube(chosen_row, reddit_creds)
    elif content_type == 'mp4':
        pass
    return chosen_row

def post_youtube(df_row, reddit_creds):
    title = df_row['name'] + ' | ' + df_row['text']
    url = df_row['location']
    reddit = praw.Reddit(
        user_agent="test",
        client_id=creds['CLIENT_ID'],
        client_secret=creds['CLIENT_SECRET'],
        username=creds['USERNAME'],
        password=creds['PASSWORD'])
    reddit.validate_on_submit = True
    reddit.subreddit("u_hands---free").submit(title=title, url=url)
    print('YouTube embedded video posted to Reddit')

In [16]:
entire_dict = parse_entire_dictionary()
entire_dict

{'@': 'https://lifeprint.com/asl101//pages-signs/a/at.htm',
 "ABC's": 'https://lifeprint.com/asl101//pages-signs/a/abcs.htm',
 'ABBREVIATE': 'https://lifeprint.com/asl101//pages-signs/a/abbreviate.htm',
 'ABORTION': 'https://lifeprint.com/asl101//pages-signs/a/abortion.htm',
 'ABOUT': 'https://lifeprint.com/asl101//pages-signs/a/about.htm',
 'ABOVE': 'https://lifeprint.com/asl101//pages-signs/a/above.htm',
 'ABUSE': 'https://lifeprint.com/asl101//pages-signs/a/abuse.htm',
 'ACADEMIC': 'https://lifeprint.com/asl101//pages-signs/a/academic.htm',
 'ACCEPT': 'https://lifeprint.com/asl101//pages-signs/a/accept.htm',
 'ACCENT': 'https://lifeprint.com/asl101//pages-signs/a/accent.htm',
 'ACCIDENT': 'https://lifeprint.com/asl101//pages-signs/a/accident.htm',
 'accompany-[see WITH]': 'https://lifeprint.com/asl101//pages-signs/w/with.htm',
 'ACCOMPLICE': 'https://lifeprint.com/asl101//pages-signs/a/accomplice.ht.htm',
 'ACCOMPLISH': 'https://lifeprint.com/asl101//pages-signs/a/accomplish.htm',
 

In [18]:
pickle.dump(entire_dict, open('cached_dict', 'wb'))

In [7]:
post_random_content(entire_dict, creds, content_type = 'youtube')

YouTube embedded video posted to Reddit


location      https://www.youtube.com/embed/I3lZvG43Z14?rel=0
name                                                      POP
text        POP / "soda pop"Form the left hand into an "O....
type                                                  youtube
Name: 0, dtype: object

# Exploration

In [127]:
url = 'https://www.lifeprint.com/asl101/pages-signs/p/pineapple.htm'

page = requests.get(url)
soup_temp = BeautifulSoup(page.content, "html.parser")

In [172]:
videos = soup_temp.find_all('iframe')
video = videos[0]
get_description(video)

'ASL: "pineapple"The sign for "pineapple"\xa0has a many versions.PINEAPPLE:\xa0 S and modified-L version:'

In [157]:
isinstance(my_temp, type('bs4.element.NavigableString'))

False

In [170]:
my_temp = video.previous.previous.previous.previous.previous.previous.previous.previous.previous.previous.previous.previous.previous.previous.name
my_temp

'font'

In [139]:
soup_temp

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">

<html><head>
<title>ASL pineapple</title>
<meta content="ASL,pineapple,American Sign Language,Sign Language,learn sign language,Deaf,signing,what is the sign for,how do you sign" name="keywords"/>
<meta content='ASL pineapple. What is the sign for "pineapple" in American Sign Language (ASL)?' name="description"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="en-us" http-equiv="Content-Language"/>
<meta content="William Vicars" name="author"/>
<meta content="index,follow" name="robots"/>
<meta content="Global" name="distribution"/>
<meta content="30 days" name="revisit-after"/>
<meta content="Copyright William Vicars / Lifeprint.com" name="copyright"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<style type="text/css">A {text-decoration: none} a:hover{color:#000033; background-color: yellow} p {margin-top:0px; margin-bott

In [63]:
page_letter_a = parse_dictionary_main_page("https://lifeprint.com/asl101/index/a.htm")
main_page

{'@': 'https://lifeprint.com/asl101//pages-signs/a/at.htm',
 "ABC's": 'https://lifeprint.com/asl101//pages-signs/a/abcs.htm',
 'ABBREVIATE': 'https://lifeprint.com/asl101//pages-signs/a/abbreviate.htm',
 'ABORTION': 'https://lifeprint.com/asl101//pages-signs/a/abortion.htm',
 'ABOUT': 'https://lifeprint.com/asl101//pages-signs/a/about.htm',
 'ABOVE': 'https://lifeprint.com/asl101//pages-signs/a/above.htm',
 'ABUSE': 'https://lifeprint.com/asl101//pages-signs/a/abuse.htm',
 'ACADEMIC': 'https://lifeprint.com/asl101//pages-signs/a/academic.htm',
 'ACCEPT': 'https://lifeprint.com/asl101//pages-signs/a/accept.htm',
 'ACCENT': 'https://lifeprint.com/asl101//pages-signs/a/accent.htm',
 'ACCIDENT': 'https://lifeprint.com/asl101//pages-signs/a/accident.htm',
 'accompany-[see WITH]': 'https://lifeprint.com/asl101//pages-signs/w/with.htm',
 'ACCOMPLICE': 'https://lifeprint.com/asl101//pages-signs/a/accomplice.ht.htm',
 'ACCOMPLISH': 'https://lifeprint.com/asl101//pages-signs/a/accomplish.htm',
 

In [None]:
# https://www.lifeprint.com/asl101/pages-signs/a/at.htm
soup2 = pickle.load(open('soup2', 'rb'))

In [7]:
videos = soup2.find_all('iframe')
videos

[<iframe allow="accelerometer; autoplay; encrypted-media; gyroscope; picture-in-picture" allowfullscreen="" frameborder="0" height="315" src="https://www.youtube.com/embed/1hDXCyZm1vA?rel=0&amp;autoplay=1" width="560"></iframe>]

In [153]:
images = soup2.find_all('img')
images

[<img border="0" src="../../signjpegs/a/at.htm5.jpg"/>,
 <img border="0" height="219" src="../../signjpegs/a/at1.jpg" width="245"/>,
 <img border="0" height="219" src="../../signjpegs/a/at2.jpg" width="245"/>]

In [9]:
video = videos[0]

In [59]:


get_description(video)

'AT: To do the sign for "at" as in the symbol @ that is commonly used in email addresses use an "A" handshape and circle it in the air as if drawing the symbol:@ '

In [35]:
for video in videos:
    name = name
    type = 'video'
    text = ''.join([res.text for res in videos[0].fetchPreviousSiblings()])
    location = video['src']
    
    

NameError: name 'name' is not defined

In [36]:
''.join([res.text for res in videos[0].fetchPreviousSiblings()])

'@'

In [59]:
soup2

<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">

<html>
<head>
<title>"at" American Sign Language (ASL)</title>
<meta content="The sign for at in American Sign Language (ASL)." name="description"/>
<meta content="at,American Sign Language,Sign Language,ASL,learn ASL,Deaf,signing" name="keywords"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<style type="text/css">A {text-decoration: none} a:hover{color:#000033; background-color: yellow}</style>
</head>
<body bgcolor="#FFFFFF" rightmargin="0" topmargin="0"><font face="Arial">
<font face="Arial">
<p align="right" style="margin-top: 0; margin-bottom: 0"> <a href="../../../index.htm" style="text-decoration: none" target="_top">ASL 
University ►</a></p>
<hr color="#000000" size="6" style="border-style: ridge; border-width: 2; padding-left: 0; padding-right: 0; padding-top: 1; padding-bottom: 1"/>
<table border="0" id="table1" width="100%">
<tr>
<td align="left" valign="top">
<font size="5">American Si