# Module 3 - File I/O and NumPy

## Topic 1 - Web Scraping, File I/O

### There's a great deal of valuable data being created every second on the web. If we can gather this data and store it in a clean, structured way, it can be used in our programs.

### Web Scraping Example

In [1]:
from requests import get
from contextlib import closing
from bs4 import BeautifulSoup

In [3]:
def get_a_page(url, timeout=30):
    """Download ``url`` and return the response object.

    Parameters
    ----------
    url : str
        Address of the page to request.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout the request can block indefinitely on a stalled
        connection.

    Returns
    -------
    The response returned by ``get``. The status code is NOT checked here;
    callers read ``.text`` / ``.status_code`` themselves.
    """
    # Send a browser-like User-agent (presumably so the site does not reject the request as a bot)
    header = {'User-agent' : 'Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.9.1.5) Gecko/20091102 Firefox/3.5.5'}
    # closing() releases the connection when the block exits. The request is not
    # streamed, so the body has already been downloaded and .text stays readable.
    with closing(get(url, headers=header, timeout=timeout)) as resp:
        return resp

In [4]:
# Fetch the first page of Des Moines RV listings (live request, so results change between runs)
raw_html = get_a_page("https://desmoines.craigslist.org/d/recreational-vehicles/search/rva")

In [5]:
# Parse the downloaded markup with the 'html.parser' backend
html = BeautifulSoup(raw_html.text, 'html.parser')
# Every listing on the results page is an <li class="result-row"> element
postings = html.find_all('li', class_= 'result-row')
print(type(postings))
print(len(postings)) 

<class 'bs4.element.ResultSet'>
120


In [6]:
# Show the raw markup of one listing to see which tags hold the fields we want
print (postings[0])

<li class="result-row" data-pid="7193746279">
<a class="result-image gallery" data-ids="3:00101_6eR19dI3ahL_0gw0co,3:00H0H_gBSytQSGE1r_0gw0co,3:00n0n_5FqInWZKx2S_09i0co,3:00101_bnMdl1imoEM_0gw0co,3:00j0j_l09aiiEnsqh_09i0co,3:00i0i_ducKJbxJnsD_0gw0co,3:00Y0Y_DAaXy879p9_0gw0co,3:00o0o_7yhpnRpNEJ2_0gw0co,3:00O0O_71EsnFWNqCa_0gw0co,3:00909_5Y5b0jDJrZP_0gw0co,3:00k0k_hKFhl4qlgxW_0gw0co,3:00505_aAh5qY7s0zK_09i0co,3:00g0g_85Qu23LJooA_09i0co,3:00n0n_hHA4W2QZd9h_0gw0co,3:00i0i_fbbn3F2uzCi_09i0co,3:00w0w_9eJNXa3DBX1_0gw0co" href="https://desmoines.craigslist.org/rvd/d/clinton-2014-keystone-springdale-293rk/7193746279.html">
<span class="result-price">$14,900</span>
</a>
<p class="result-info">
<span class="icon icon-star" role="button">
<span class="screen-reader-text">favorite this post</span>
</span>
<time class="result-date" datetime="2020-09-10 13:05" title="Thu 10 Sep 01:05:08 PM">Sep 10</time>
<a class="result-title hdrlnk" data-id="7193746279" href="https://desmoines.craigslist.org/rvd/d/

### Get data for a single posting to ensure this works

In [7]:
# Get price for first listing
# .a is the first <a> tag in the row (the image link), which wraps the result-price span
post_1_price = postings[0].a.text
# the tag text is surrounded by newlines, so trim them
post_1_price = post_1_price.strip()
print(post_1_price)
# Get timestamp for first listing
post_1_time = postings[0].find('time', class_= 'result-date')
# the machine-readable timestamp is stored in the tag's "datetime" attribute
post_1_datetime = post_1_time['datetime']
print(post_1_datetime)

$14,900
2020-09-10 13:05


In [8]:
# Get text of title of first listing
# the title anchor carries both the listing URL (href attribute) and the title text
post_1_title = postings[0].find('a', class_= 'result-title hdrlnk')
post_1_link = post_1_title['href']
print( post_1_link,post_1_title.text)

https://desmoines.craigslist.org/rvd/d/clinton-2014-keystone-springdale-293rk/7193746279.html 2014 Keystone Springdale 293RK


In [9]:
# Location ("hood") of the first listing. find() returns None when a row has no
# result-hood element, in which case .text below would raise AttributeError.
post_1_location = postings[0].find(class_= 'result-hood')
print(post_1_location.text)

 (Clinton, MO)


### Now gather data for all listings

In [10]:
from time import sleep
import re # regex library
from random import randint #avoid throttling by not sending too many requests one after the other
from warnings import warn
from time import time
from IPython.core.display import clear_output
import numpy as np

In [11]:
# The search legend block contains the total number of matching listings
results_num = html.find('div', class_= 'search-legend')
# 'totalcount' holds the number as text; convert it to int for the page arithmetic
results_total = int(results_num.find('span', class_='totalcount').text)
print(results_total)

184


In [12]:
# The first results page returned 120 rows, so step through the results 120 at a time.
RESULTS_PER_PAGE = 120
# One starting value per page: 0, 120, 240, ... strictly below results_total.
# (Using results_total + 1 as the stop would add an extra, empty page whenever
# the total is an exact multiple of the page size.)
pages = np.arange(0, results_total, RESULTS_PER_PAGE)
print(pages)

[  0 120]


In [13]:
# Counter of result pages processed by the scraping loop
iterations = 0

# Parallel lists: position i in each list describes the same listing.
# Re-run this cell before re-running the scraping loop, otherwise the loop
# appends the same listings a second time.
post_timing = []
post_hoods = []
post_title_texts = []
post_prices = []
post_links = []

In [14]:
search_url = "https://desmoines.craigslist.org/d/recreational-vehicles/search/rva"

for page in pages:

    iterations += 1

    # "s" is the query parameter that selects the page; its values come from the pages array
    page_url = search_url + "?s=" + str(page)

    # go through get_a_page so every request sends the same User-agent header as the first fetch
    response = get_a_page(page_url)

    # pause 1-5 seconds so requests are not sent one right after the other
    sleep(randint(1,5))

    # warn about status codes that are not 200 and skip the page: there is
    # nothing to parse and it must not be reported as scraped successfully
    if response.status_code != 200:
        warn('Request: {}; Status code: {}'.format(iterations, response.status_code))
        continue

    # define the html text
    page_html = BeautifulSoup(response.text, 'html.parser')

    # define the posts
    posts = page_html.find_all('li', class_= 'result-row')

    # extract data item-wise
    for post in posts:

        # rows without a neighborhood are skipped entirely so the five lists stay aligned
        hood_tag = post.find('span', class_= 'result-hood')
        if hood_tag is None:
            continue

        # posting date: the "datetime" attribute holds the date and the time in one string
        post_datetime = post.find('time', class_= 'result-date')['datetime']
        post_timing.append(post_datetime)

        # neighborhoods
        post_hoods.append(hood_tag.text)

        # title text
        post_title = post.find('a', class_='result-title hdrlnk')
        post_title_texts.append(post_title.text)

        # post link
        post_links.append(post_title['href'])

        # price: trim the surrounding whitespace FIRST, then drop the currency
        # symbol and thousands separators; a listing with no price is stored as 0
        price_text = post.a.text.strip().replace("$", "").replace(",", "")
        post_price = int(price_text) if price_text else 0
        post_prices.append(post_price)

    print("Page " + str(iterations) + " scraped successfully!")

print("\n")

print("Scrape complete!")

Page 1 scraped successfully!
Page 2 scraped successfully!


Scrape complete!


In [None]:
# Combine the parallel lists into a single dictionary keyed by listing position.
# Each value is a list: [timestamp, neighborhood, title, price, link].
all_listing_dict = {
    idx: [post_timing[idx], post_hoods[idx], title, post_prices[idx], post_links[idx]]
    for idx, title in enumerate(post_title_texts)
}

print(all_listing_dict)
#print(all_listing_dict[0])
#print(all_listing_dict[1])

In [None]:
# Position 3 of every record is the price
for record in all_listing_dict.values():
    print(record[3])

### Topic 1 Continued: File Input Example

In [None]:
# This example assumes you have the Moby_Dick_Chapter_1.txt file in the same directory as your program
# The with-statement closes the file automatically, even if an error occurs while reading
with open('Moby_Dick_Chapter_1.txt','r') as input_file:
    for line in input_file:
        # each line already ends with its own newline, so tell print() not to
        # add a second one (otherwise the text comes out double-spaced)
        print(line, end='')


In [None]:
# This example assumes you have the Moby_Dick_Chapter_1.txt file in the same directory as your program
import re
moby_dick_word_count = {}

with open('Moby_Dick_Chapter_1.txt','r') as input_file:
    for raw_line in input_file:
        # Lowercase the line, then delete every character that is neither a
        # word character nor whitespace (i.e. the punctuation)
        cleaned = re.sub(r'[^\w\s]', '', raw_line.lower())
        # split() with no argument breaks the line on any run of whitespace;
        # get() supplies 0 the first time a word is seen
        for word in cleaned.split():
            moby_dick_word_count[word] = moby_dick_word_count.get(word, 0) + 1
print(moby_dick_word_count)

In [None]:
# (word, count) pairs ordered from most to least frequent; as the last
# expression in the cell, the result is displayed without print()
sorted(moby_dick_word_count.items(), key=lambda x: x[1], reverse=True)

In [None]:
# Most frequent words first
data_for_output = sorted(moby_dick_word_count.items(), key=lambda pair: pair[1], reverse=True)

# Mode 'w' creates the file, or overwrites it if it already exists
with open("moby_dick_word_counts.txt",'w') as fileoutput:
    # one "word count" pair per line
    for word, count in data_for_output:
        fileoutput.write("{} {}\n".format(word, count))

## Topic 2 - Intro to SciPy Package

### This topic contains some links to information about the power of the SciPy library

## Topic 3 - NumPy Arrays

### First, Python's built-in `array` module (not the NumPy type)

In [None]:
import array
# The first argument is the typecode ('i' = signed int); every item must be of that type
my_array = array.array('i',[5,4,3,2])
print(my_array)

In [None]:
# Items are accessed by position, just like a list
my_array[0]

In [None]:
# Slicing works like a list and gives a new array holding the first two items
print(my_array[0:2])
# Items can be overwritten in place
my_array[1] = 5
# append() adds one item to the end
my_array.append(9)
print(my_array)
# remove() deletes the first occurrence of the given value
my_array.remove(9)
print(my_array)

In [None]:
# pop() removes the last item and returns it
my_array.pop()

### NumPy

In [None]:
import numpy as np
# A list of four 4-item lists becomes a 4x4 two-dimensional array
my_narray = np.array([[1,2,3,4],[5,6,7,8],[9,10,11,12],[13,14,15,16]])
print(my_narray)

### Common NumPy Properties

In [None]:
# Output the shape of the array as a (rows, columns) tuple: (4, 4) here
print(my_narray.shape)
# Output the total number of items in the array: 16 here
print(my_narray.size)
# Output the datatype shared by all array items (an integer type here)
print(my_narray.dtype)

### Accessing data in a NumPy Array

In [None]:
# Returns the maximum value in the entire array (16)
print(my_narray.max())
# Sums every item in the array (1 + 2 + ... + 16 = 136)
print(my_narray.sum())

![axis%20order%20numpy%20resize.png](attachment:axis%20order%20numpy%20resize.png)

In [None]:
print(my_narray)
print("")
# Returns the maximum value along axis 0, i.e. one maximum per column: [13 14 15 16]
print(my_narray.max(axis=0))
# Sums (collapses) the array along axis 1, i.e. one total per row: [10 26 42 58]
print(my_narray.sum(axis=1))

In [None]:
print(my_narray)
# Element-wise test: the result is a boolean array of the same shape that is
# True wherever the item is evenly divisible by 3
my_array_divide_by_3 = my_narray%3==0
print(my_array_divide_by_3)

In [None]:
# Slice rows 1-2 and columns 1-2 (the stop index 3 is excluded): [[6 7] [10 11]]
my_narray_slice = my_narray[1:3,1:3]
print(my_narray_slice)

In [None]:
# Two 2x2 arrays to demonstrate element-wise arithmetic
array_1 = np.array([[1,2],[3,4]])
array_2 = np.array([[3,4],[5,6]])

In [None]:
# Element-wise addition: each item is added to the item in the same position
array_sum = np.add(array_1,array_2)
print(array_sum)

In [None]:
# Element-wise multiplication (NOT matrix multiplication): each item is
# multiplied by the item in the same position -> [[3 8] [15 24]].
# Stored under its own name so the sum from the previous cell is not overwritten.
array_product = np.multiply(array_1,array_2)
print(array_product)

## Topic 4 - Manipulation of NumPy Arrays

### There are a ton of different ways that NumPy arrays can be manipulated.  Let's have a look at a few of them.

### I'll start by creating a 2x6 array

In [None]:
import numpy as np
# 2 rows x 6 columns holding the numbers 1 through 12
another_narray = np.array([[1,2,3,4,5,6],[7,8,9,10,11,12]])
print(another_narray)

### Let's mess with the shape of my array

In [None]:
# Resize the array in place: the existing array object itself changes shape
# (nothing is returned and no reassignment is needed). Items keep their
# row-by-row order, so the 2x6 array becomes 6 rows of 2.
another_narray.resize(6,2)
print(another_narray)

In [None]:
# Transpose array axes (rows become columns) and overwrite existing array variable;
# transpose() returns a new array object, so the result must be assigned
another_narray = another_narray.transpose()
print(another_narray)

In [None]:
# Flatten my array: flatten() returns a one-dimensional copy, read row by row
another_narray = another_narray.flatten()
print(another_narray)

In [None]:
# Sort my array in place, in ascending order (sort() returns None)
another_narray.sort()
print(another_narray)

In [None]:
# Now break it back into a 2x6, giving me exactly what I started with...
# (the sorted values 1..12 are laid out row by row, again in place)
another_narray.resize (2,6)
print(another_narray)