# Intro

**Objective**

Scrape images from Kickstarter for a random subset of the Web Robots Kickstarter dataset, which contains mostly metadata but also includes image URL information. We will use this URL information to download the full images for the selected subset. The images will be saved to a local directory with the image id as the file name. These files will later be analyzed using custom Python functions to extract low-level structural features from the images.

**Resources**

Web_robots_data metadata for kickstarter.com campaigns - source: 

https://webrobots.io/kickstarter-datasets/


Author: Nicholas Mostovych

# Imports

In [1]:
## for data
import pandas as pd
import numpy as np
import joblib
import pickle

# for webscraping
import requests as r
import nltk
from bs4 import BeautifulSoup
import re
import lxml

# other
import time
import random

## Interactive env
import warnings
warnings.filterwarnings('ignore')
from IPython.core.display import clear_output

# Functions

In [2]:
# Turn a scraped HTTP response into a BeautifulSoup tree
def parse(scraped_html):
    """
    Parse the HTML text of a scraped project page with BeautifulSoup,
    using the lxml parser.

    Args:
        scraped_html (response object): the unparsed response collected by
        the web scraper; only its ``.text`` attribute is read

    Returns:
        a soup object containing the parsed HTML
    """
    markup = scraped_html.text
    soup = BeautifulSoup(markup, 'lxml')
    return soup

In [3]:
# Paid proxy information stored in local file, not shared on github. Retrieve this info.
def get_proxy_data(column, path='/home/mosto/Documents/insight/kickstarter-project/proxylist.csv'):
    """
    Read one column of the tab-separated proxy list. Used by get_proxy()
    to extract specific proxy data.

    Args:
        column (int): zero-based index of the column to extract:
            ip = 0
            port_http = 1
            port_socks = 2
            login = 3
            password = 4
        path (str): location of the tab-separated proxy list; defaults to
            the author's local file

    Returns:
        list of str: the requested field from every non-blank line, in file
        order (every line is returned, including the first). Values are not
        stripped, so fields taken from the last column keep their trailing
        newline.

    Raises:
        OSError: if the file cannot be opened
        IndexError: if a non-blank line has fewer than ``column + 1`` fields
    """
    # The context manager closes the file even if a malformed line raises
    with open(path, "r") as proxy_file:
        # Skip blank lines (e.g. a trailing empty line) so they neither crash
        # the split nor show up as empty entries
        return [
            line.split('\t')[column]
            for line in proxy_file
            if line.strip()
        ]

In [4]:
# Grab my proxy data
def get_proxy():
    """
    Pick a random proxy IP and assemble the proxy URL for it.

    The IP is chosen at random from all data rows of the proxy list. The
    login, password and HTTP port are always taken from the first data row
    (list index 1, right after the header line).
    NOTE(review): this presumably relies on every proxy sharing the same
    credentials and port - confirm against the proxy list.

    Args:
        none

    Returns:
        dict: a proxies mapping with the same proxy URL under the "http"
        and "https" keys

    Raises:
        IndexError: if the proxy list has no data rows
    """

    ### Retrieve proxy information
    # Drop the first entry of the IP column: it is the header line
    proxy_ips = get_proxy_data(0)[1:]
    port_http = get_proxy_data(1)
    login = get_proxy_data(3)
    password = get_proxy_data(4)

    ### Pick a random proxy to use
    ip = random.choice(proxy_ips)
    user = login[1]
    # The password is the last column, so it still carries the line ending
    passwd = password[1].rstrip()
    port = port_http[1]

    ### Assemble the url
    proxy_url = "http://{}:{}@{}:{}/".format(user, passwd, ip, port)
    return {
        "http": proxy_url,
        "https": proxy_url
    }

# Load Data and Organize Data Subset

In [5]:
# Path to the processed, web-scraped Web Robots metadata table
datafile = '/home/mosto/Documents/insight/kickstarter-project/web_robots_data_to_08-2020_processed.pkl'

In [6]:
# Load table containing Web Robots data
# NOTE: joblib.load is pickle-based and can execute arbitrary code while
# loading, so only point `datafile` at a file from a trusted source
df = joblib.load(datafile)

In [7]:
# Obtain the subset of projects whose images are scraped from the large
# Web Robots dataset.
#
# A subset of projects is randomly sampled from the large Web Robots dataset
# and their images are then downloaded from Kickstarter.com. The random
# selection is pinned by the seed value 74, and this notebook works on the
# first 10000 rows of that sample.

# Select projects in USD
df_USD = df[df['currency'] == 'USD']

# Take a random sample of the Web Robots data using a seed value to ensure
# repeatability. np.random.seed() returns None, so the seed value is kept in
# its own variable and also passed to sample() explicitly; the draw then no
# longer depends on whatever global RNG state other cells left behind.
# NOTE(review): expected to select the same rows as seeding the global RNG
# immediately before sampling - confirm against an existing download.
seed = 74
np.random.seed(seed)
df_sample = df_USD.sample(50000, random_state=seed)

# Keep the first 10000 sampled projects
df1_sample = df_sample.iloc[:10000]
df1_sample.shape

(10000, 17)

# Scrape Images

In [9]:
# Download the cover image of each selected project into the current working
# directory. Each file is named after the project's index label in the
# sampled Web Robots dataframe (no file extension is added).

# Rename dataframe for scraping
df = df1_sample

# Initialize an empty DataFrame that records which image files were saved
images_collection = pd.DataFrame(columns=['image'])

# Record the start time
start_time = time.time()

# Number of projects whose image has been downloaded so far
request_count = 0

# Select which projects to scrape via their row position. If the scraper
# fails for some reason we can pick up where we left off.
start = 0
stop = 3

# Seconds to wait for a response; without a timeout a stalled proxy would
# block the loop forever
request_timeout = 30

# Set up for loop to scrape all selected images in df
for row in range(start, stop, 1):

    # instantiate a new random proxy
    current_proxy = get_proxy()
    # index label of the project, used as file name and reference key
    index = df.index[row]

    try:
        # Scrape the project page
        scraped_html = r.get(
            df.iloc[row]['url'],
            proxies=current_proxy,
            timeout=request_timeout
        )
        # Treat HTTP error pages as failures instead of parsing them
        scraped_html.raise_for_status()
        soup = parse(scraped_html)
        # Get image urls
        img = soup.find_all('img', src=True)
        # Select first image which is the cover photo
        img_url = img[0]['src']
        # Wait to lower chances of getting blocked by website during scraping
        time.sleep(random.uniform(1, 2))

        # Download the image
        res = r.get(img_url, proxies=current_proxy, timeout=request_timeout)
        # Do not save an error response body as if it were an image
        res.raise_for_status()
        # Save image; the context manager closes the file even if the write fails
        with open(str(index), 'wb') as out:
            out.write(res.content)

        # Record the saved file name in the reference dataframe
        images_collection.loc[index, 'image'] = str(index)

    # Error handling
    except IndexError:
        # If there are no images, img[0] raises an IndexError, so we skip it
        continue
    except OSError:
        # Covers local file errors and every requests exception (proxy
        # failures, timeouts, HTTP errors): requests' RequestException
        # derives from IOError, i.e. OSError. The previous bare
        # `except ProxyError` referenced a name that was never imported, so
        # any exception that was not an IndexError turned into a NameError.
        continue

    # Wait again because we pinged the same site twice
    time.sleep(random.uniform(2, 4))

    # Monitor the requests by clearing the output and displaying current
    # progress
    elapsed_time = time.time() - start_time
    clear_output(wait=True)
    print(
        'Request: {}; Row ID: {}; Frequency: {} requests/sec'.format(
            request_count + start,
            index,
            (request_count + 1) / elapsed_time
        )
    )
    request_count += 1


# Display the overall time, average scraping speed and total number of
# downloaded images
run_time = time.time() - start_time
print()
print('Run time:', run_time)
print('Average rate:', len(images_collection) / run_time)
print('# of images downloaded:', len(images_collection))

Request: 2; Row ID: 52182; Frequency: 0.17123741158750425 requests/sec

Run time: 17.52166175842285
Average rate: 0.17121663694699893
# of images downloaded: 3


# Save Data

In [10]:
# Persist the reference dataframe so we know which images were downloaded to
# the directory. The file name records the first and last row positions
# covered by this run.
collection_path = '/home/mosto/Documents/insight/kickstarter-project/df1_dynamic_scrape_scraped_collection_{}-{}.pkl'.format(
    start,
    stop - 1
)
joblib.dump(images_collection, collection_path)

['/home/mosto/Documents/insight/kickstarter-project/df1_dynamic_scrape_scraped_collection_0-2.pkl']