### Web Scraping Example

In [None]:
from requests import get
from contextlib import closing
from bs4 import BeautifulSoup

In [None]:
def get_a_page(url, timeout=30):
    """Download ``url`` and return the response.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout a stalled server would hang the notebook.

    Returns
    -------
    The response object returned by ``requests.get``; the page body is
    available through its ``.text`` attribute.
    """
    # Firefox-style User-agent string sent along with the request.
    header = {'User-agent' : 'Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.9.1.5) Gecko/20091102 Firefox/3.5.5'}
    # The response is returned directly rather than from inside a
    # ``with closing(...)`` block, so the caller never receives an object
    # that has already been closed. The request is not streamed, so the body
    # is fully downloaded by the time ``get`` returns.
    return get(url, headers=header, timeout=timeout)

In [None]:
# Fetch the recreational-vehicles search page that the following cells parse.
search_url = "https://desmoines.craigslist.org/d/recreational-vehicles/search/rva"
raw_html = get_a_page(search_url)

In [None]:
# Parse the downloaded markup and collect every <li class="result-row"> element.
html = BeautifulSoup(raw_html.text, 'html.parser')
postings = html.find_all('li', class_='result-row')

# Sanity check: the type returned by find_all and how many rows matched.
print(type(postings), len(postings), sep="\n")

In [None]:
# Show the markup of the first posting to see which tags hold the data.
print(postings[0])

### Get data for a single posting to ensure this works

In [None]:
first_posting = postings[0]

# Price: text of the posting's first <a> tag, surrounding whitespace removed
post_1_price = first_posting.a.text.strip()
print(post_1_price)

# Timestamp: the 'datetime' attribute of the <time class="result-date"> tag
post_1_time = first_posting.find('time', class_='result-date')
post_1_datetime = post_1_time['datetime']
print(post_1_datetime)

In [None]:
# The title anchor of the first listing carries both pieces of data:
# its href attribute is the link and its text is the title.
post_1_title = postings[0].find('a', class_='result-title hdrlnk')
post_1_link = post_1_title['href']
print(post_1_link, post_1_title.text)

In [None]:
# Neighborhood of the first listing. find() returns None when the posting has
# no 'result-hood' element, so check before reading .text.
post_1_location = postings[0].find(class_= 'result-hood')
if post_1_location is not None:
    print(post_1_location.text)
else:
    print("No neighborhood listed for this posting")

### Now gather data for all listings

In [None]:
from time import sleep
import re
from random import randint #avoid throttling by not sending too many requests one after the other
from warnings import warn
from time import time
from IPython.core.display import clear_output
import numpy as np

In [None]:
# Read the total number of results from the 'totalcount' span inside the
# 'search-legend' div of the page parsed earlier.
results_num = html.find('div', class_='search-legend')
total_span = results_num.find('span', class_='totalcount')
results_total = int(total_span.text)
print(results_total)

In [None]:
# Offsets sent as the "s" query parameter, one per results page.
# NOTE(review): the step of 120 is presumably the number of listings shown
# per page -- confirm against the site.
PAGE_OFFSET_STEP = 120

# Stop at results_total (exclusive): an offset equal to the total points past
# the last result, so including it would cost one extra, empty request
# whenever the total is an exact multiple of the step.
pages = np.arange(0, results_total, PAGE_OFFSET_STEP)
print(pages)

In [None]:
# Number of result pages processed so far
iterations = 0

# One accumulator list per scraped field; the scraping loop appends to all of
# them for each posting it keeps.
post_timing, post_hoods, post_title_texts, post_prices, post_links = [], [], [], [], []

In [None]:
# Scrape every results page and append the fields of each posting to the
# accumulator lists (post_timing, post_hoods, post_title_texts, post_prices,
# post_links).
for page in pages:

    # Request one page of results; the "s" query parameter carries the
    # offset taken from the pages array built earlier.
    response = get("https://desmoines.craigslist.org/d/recreational-vehicles/search/rva?"
                   + "s="
                   + str(page)
                   )

    # Wait a random 1-5 seconds so requests are not sent back to back.
    sleep(randint(1,5))

    iterations += 1

    # Warn about any status code other than 200 and skip the page, so a
    # failed request is never reported as scraped successfully. The page
    # counter identifies the request in the warning.
    if response.status_code != 200:
        warn('Request: {}; Status code: {}'.format(iterations, response.status_code))
        continue

    # Parse the page and collect its posting rows.
    page_html = BeautifulSoup(response.text, 'html.parser')
    posts = page_html.find_all('li', class_= 'result-row')

    for post in posts:

        # Postings without a neighborhood element are skipped entirely.
        hood_tag = post.find('span', class_ = 'result-hood')
        if hood_tag is None:
            continue

        # Extract every field first and append afterwards, so a posting that
        # fails to parse cannot leave the lists with different lengths.

        # Posting timestamp: the 'datetime' attribute of the <time> tag.
        post_datetime = post.find('time', class_= 'result-date')['datetime']

        # Neighborhood text.
        post_hood = hood_tag.text

        # Title text and link both come from the title anchor.
        post_title = post.find('a', class_='result-title hdrlnk')
        post_title_text = post_title.text
        post_link = post_title['href']

        # Price: text of the posting's first <a> tag with whitespace, the
        # currency symbol and thousands separators removed. Text that is not
        # a whole number (for example an empty string) is stored as None
        # instead of aborting the scrape.
        price_text = post.a.text.strip().replace("$", "").replace(",", "")
        try:
            post_price = int(price_text)
        except ValueError:
            post_price = None

        post_timing.append(post_datetime)
        post_hoods.append(post_hood)
        post_title_texts.append(post_title_text)
        post_links.append(post_link)
        post_prices.append(post_price)

    print("Page " + str(iterations) + " scraped successfully!")

print("\n")

print("Scrape complete!")

In [None]:
# Combine the parallel lists into one dictionary keyed by listing index.
# Each value is [timestamp, neighborhood, title, price, link].
all_listing_dict = {}
for listing, title_text in enumerate(post_title_texts):
    all_listing_dict[listing] = [
        post_timing[listing],
        post_hoods[listing],
        title_text,
        post_prices[listing],
        post_links[listing],
    ]

print(all_listing_dict)

### File Input Example

In [None]:
# This example assumes you have the Moby_Dick_Chapter_1.txt file in the same directory as your program
with open('Moby_Dick_Chapter_1.txt','r') as input_file:
    for line in input_file:
        # Each line read from the file already ends with its own newline, so
        # suppress the one print() would add; otherwise the text comes out
        # double-spaced.
        print(line, end='')


In [None]:
# This example assumes you have the Moby_Dick_Chapter_1.txt file in the same directory as your program
import re

# Matches any character that is neither a word character nor whitespace
punctuation = re.compile(r'[^\w\s]')

# Maps each word to the number of times it appears in the file
moby_dick_word_count = {}

with open('Moby_Dick_Chapter_1.txt','r') as input_file:
    for raw_line in input_file:
        # Lowercase the line, drop punctuation, then split it into words
        words = punctuation.sub('', raw_line.lower()).split()
        for word in words:
            # Start from 0 the first time a word is seen
            moby_dick_word_count[word] = moby_dick_word_count.get(word, 0) + 1
print(moby_dick_word_count)

In [None]:
# (word, count) pairs ordered from the highest count to the lowest
sorted(moby_dick_word_count.items(), key=lambda pair: pair[1], reverse=True)

In [None]:
# (word, count) pairs ordered from the highest count to the lowest
data_for_output = sorted(moby_dick_word_count.items(), key=lambda pair: pair[1], reverse=True)

# Write one "word count" pair per line
with open("moby_dick_word_counts.txt",'w') as fileoutput:
    for word, count in data_for_output:
        fileoutput.write("{} {}\n".format(word, count))

### Speed considerations

In [None]:
def divide_by_2_50_times(number):
    """Halve ``number`` fifty times in a row and return the result."""
    for _ in range(50):
        number /= 2
    return number

In [None]:
%%timeit
# Time the function on a sample input (the cell magic must stay on the first line)
divide_by_2_50_times(1000000)

### Arrays

In [3]:
import array

# Build an array with typecode 'i' from a list of integers
initial_values = [5, 4, 3, 2]
my_array = array.array('i', initial_values)
print(my_array)

array('i', [5, 4, 3, 2])


In [4]:
# Index 0 is the first element of the array
my_array[0]

5

In [9]:
# Slice out the first two elements
print(my_array[:2])

# Overwrite the element at index 1, then add a value at the end
my_array[1] = 5
my_array.append(9)
print(my_array)

# remove() deletes the first occurrence of the given value
my_array.remove(9)
print(my_array)

array('i', [5, 5])
array('i', [5, 5, 3, 2, 9, 9, 9])
array('i', [5, 5, 3, 2, 9, 9])


In [12]:
# pop() removes the last element of the array and returns it
my_array.pop()

2

### NumPy

In [17]:
import numpy as np

# A 4x4 array built from a list of four rows
my_narray = np.array([
    [1, 2, 3, 4],
    [5, 6, 7, 8],
    [9, 10, 11, 12],
    [13, 14, 15, 16],
])
print(my_narray)

[[ 1  2  3  4]
 [ 5  6  7  8]
 [ 9 10 11 12]
 [13 14 15 16]]
