# Kickstarter.com scraper
Perry

In [2]:
import requests #html
from bs4 import BeautifulSoup #scraping
import re #regex
import time
import datetime
import random
import csv
import numpy as np

# Part 1: Get & save project URLs to scrape

- Finds project URLs from the tabletop games category explore page
- Checks if kick_urls.txt exists; if so, also pulls in any saved URLs that were not found during this run  
- Saves to kick_urls.txt

### URL Scraper

In [5]:
def scrape_urls(explore_url, max_pages=200, max_empty_pages=5):
    '''
    Scrapes project page URLs from explore_url
    Requests explore_url + "1", explore_url + "2", ... and collects every
    project link found in the returned html. This function only makes web
    requests; it does not read or write any file.
    Input:
        explore_url: kickstarter explore page, must end with page variable open (i.e. "&page=")
        max_pages: last page number to request (default = 200)
        max_empty_pages: stop early once this many successfully fetched pages
            in a row added no new urls (default = 5)
    Output:
        project_urls: set of url strings found by iterating through the explore page
    '''
    #Regex to grab project links. Format: [...]kickstarter.com/projects/[creator]/[project]
    #Dots are escaped so they only match a literal "." in the host name
    project_url_regex = re.compile(r"https://www\.kickstarter\.com/projects/[0-9A-Za-z\-]*/[0-9A-Za-z\-]*")
    project_urls = set()

    print("starting to scrape:", explore_url)

    empty_count = 0

    #Grab all project urls for each explore page.
    #  Note: Kickstarter only generates up to 200 pages of history (~2400 projects)
    for page in range(1, max_pages + 1):
        response = requests.get(explore_url + str(page))
        if response.status_code == 200:
            new_urls = set(project_url_regex.findall(response.text))

            #increment count if page added nothing new, otherwise restart the streak
            if len(new_urls.difference(project_urls)) == 0:
                empty_count += 1
            else:
                empty_count = 0

            project_urls.update(new_urls)

            #progress tracker
            if page % 10 == 0:
                print("Finished page", page)
        else:
            #A failed request neither extends nor resets the empty-page streak
            print("Page",page,"fail. Status code:",response.status_code,"Reason:",response.reason)

        #If we are getting empty pages (no more projects)
        if empty_count >= max_empty_pages:
            print(str(empty_count), "consecutive pages with no new project urls. Quitting scrape")
            break

    print (str(len(project_urls)),"distinct project urls scraped")
    return project_urls


### Driver

In [6]:
# Explore listings that are walked for project URLs.
# Filters shared by every listing: Tabletop Games, US based, sorted on end date
#  1) Successful:    (>100% funded & closed) [~5,800]
#  2) Live [>100%]:  (>100% funded & live) [~64]
#  3) All [75-100%]: (75-100% live or closed) [~165 of which ~13 live]
#  4) All [<75%]:    (0-75% live or closed) [~4,400 of which ~49 live]
save_file_path = "kick_urls.txt"
explore_urls = [
    "https://www.kickstarter.com/discover/advanced?state=successful&category_id=34&woe_id=23424977&sort=end_date&seed=2528671&page=",
    "https://www.kickstarter.com/discover/advanced?state=live&category_id=34&woe_id=23424977&raised=2&sort=end_date&seed=2528671&page=",
    "https://www.kickstarter.com/discover/advanced?category_id=34&woe_id=23424977&raised=1&sort=end_date&seed=2528671&page=",
    "https://www.kickstarter.com/discover/advanced?category_id=34&woe_id=23424977&raised=0&sort=end_date&seed=2528671&page=",
]

# Walk every listing and pool the results into one set
project_urls = set()
for url in explore_urls:
    scraped = scrape_urls(url)
    project_urls.update(scraped)
    print("Combining...", len(project_urls), "distinct urls")

# Merge in URLs saved by an earlier run (if any) that this run did not find
try:
    f = open(save_file_path, "r")
except FileNotFoundError:
    print("No pre-existing file - skipping import")
else:
    with f:
        for line in f:
            project_urls.add(line.strip())
    print(len(project_urls), "distinct project urls after reading archive")

# Overwrite the save file with the combined set, one URL per line
with open(save_file_path, "w") as f:
    f.write("".join(saved_url + "\n" for saved_url in project_urls))

starting to scrape: https://www.kickstarter.com/discover/advanced?state=successful&category_id=34&woe_id=23424977&sort=end_date&seed=2528671&page=
Finished page 10
Finished page 20
Finished page 30
Finished page 40
Finished page 50
Finished page 60
Finished page 70
Finished page 80
Finished page 90
Finished page 100
Finished page 110
Finished page 120
Finished page 130
Finished page 140
Finished page 150
Finished page 160
Finished page 170
Finished page 180
Finished page 190
Finished page 200
2420 distinct project urls scraped
Combining... 2420 distinct urls
starting to scrape: https://www.kickstarter.com/discover/advanced?state=live&category_id=34&woe_id=23424977&raised=2&sort=end_date&seed=2528671&page=
Finished page 10
5 consecutive pages with no new project urls. Quitting scrape
81 distinct project urls scraped
Combining... 2486 distinct urls
starting to scrape: https://www.kickstarter.com/discover/advanced?category_id=34&woe_id=23424977&raised=1&sort=end_date&seed=2528671&page=
Fi

# Part 2: Scrape individual project pages

Things to get:
- Dependent var  
  - num backers  
  - total $ funded  
  - % of goal raised (which will always be over 100%, because Kickstarter doesn't surface "failed" projects)

- Independent vars  
  - Funding goal (dollars)
  - Rewards: # of reward levels, min reward cost, max reward cost  
  - Funding period: campaign start date, campaign end date, campaign length (days)
  - Project activity: # FAQ posts, # project update posts
  - Backers: % from US, % new backers vs. have backed before
  - Creator: # projects created, # projects backed  

Output to .csv

### Helper functions

In [None]:
def get_num(string):
    '''
    Input: string
    Output: first valid int found in string with comma formatting removed
    Raises ValueError if no int found
    '''
    #The match must start on a digit, so a stray comma before the number
    #(e.g. "pledged, $5") is no longer picked up as an empty "number"
    num_match = re.search(r"[0-9][0-9,]*",string)
    if num_match is None:
        raise ValueError("No integer found in: " + repr(string))
    num_text = num_match.group(0).replace(",","")
    return int(num_text)

def get_html(url, attempts=3, pause=2, rate_limit=0):
    '''
    Requests url, retrying on any status code other than 200.
    Input: 
        url: url address string
        attempts: # tries (default = 3), must be at least 1
        pause: # seconds pause between a failed request and the next try (default = 2)
        rate_limit: # seconds pause before every request (default = 0)
    Output: (status, html)
        status: Status code of the last request made
        html: html string of the last request made
    Raises ValueError if attempts < 1 (no request would be made, so there is nothing to return)
    '''
    if attempts < 1:
        raise ValueError("attempts must be at least 1, got " + repr(attempts))

    for i in range(attempts):
        if (rate_limit > 0):
            time.sleep(rate_limit)
        response = requests.get(url)
        if response.status_code == 200:
            return (response.status_code, response.text)
        else:
            print ("Request failed with code:", str(response.status_code), str(i+1), "attempt out of", str(attempts))
            #Only pause when another attempt follows; sleeping after the last try just wastes time
            if (i < attempts - 1) and (pause>0):
                time.sleep(pause)
    return(response.status_code, response.text)

def make_row(url, data, data_headers):
    '''
    Builds one csv row (a list) that csv.writer can output
    Inputs:
        url: url string, always placed in the first column
        data: dict {data_headers -> values}
        data_headers: list of column names, in the order the values should appear
    Output:
        list: [url, data[first header], data[second header], ...]
    '''
    return [url] + [data[header] for header in data_headers]

### Scraper function

In [None]:
#helper for the FAQ / updates / comments tab counters
def _tab_count(soup, tab_name):
    '''
    Returns the number shown in the "count" span of the project-page tab whose
    data-content attribute equals tab_name, or 0 when that tab has no count span.
    '''
    counter = soup.find("a",attrs={"data-content":tab_name}).find("span",class_="count")
    if (counter):
        return get_num(counter.text)
    return 0

#main scraping function
def scrape_project(url, html):
    '''
    Scrapes one project and returns its data fields as a dict.
    Inputs:
        url: project page url string. Sub-pages are requested through get_html as
             url + "/faqs", "/updates", "/community" and "/creator_bio"
        html: html string of the project page itself (already downloaded by the caller)
    Output:
        data: dict with the keys
            project_name, dollar_total, dollar_goal, backers_total,
            reward_count, reward_min, reward_max, reward_mean, reward_median,
            reward_mode_cost, reward_mode_backers,
            funding_start, funding_end, funding_days,
            activity_faq_total, activity_update_total, activity_comment_total,
            activity_faq_end, activity_update_end,
            backers_US, backers_new, backers_exist,
            creator_projects, creator_backed
        -1 is stored where a value could not be retrieved
    Most page elements are looked up without a None check, so a page with an
    unexpected layout raises (AttributeError / TypeError) rather than returning partial data.
    '''
    data = {}
    soup = BeautifulSoup(html,"lxml")
    
    #Project name
    data["project_name"] = soup.find("meta",attrs={"property":"og:title"})["content"].strip()
    
    #Success metrics - total $ pledged, goal $, num backers
    #Two page layouts are handled: the stats column if it exists, otherwise the data-* attributes
    result = soup.find("div",class_="col-right col-4 py3 border-left")
    if (result):
        data["dollar_total"] = get_num(result.find("h3",class_="mb0").span.text)
        data["dollar_goal"] = get_num(result.find("div",class_="type-12 medium navy-500").span.text)
        data["backers_total"] = get_num(result.find("div",class_="mb0").h3.text)
    else:
        result = soup.find("div",id="pledged")
        data["dollar_total"] = get_num(result["data-pledged"])
        data["dollar_goal"] = get_num(result["data-goal"])
        data["backers_total"] = get_num(soup.find("div",id="backers_count")["data-backers-count"])
        
    #Reward tiers - number of tiers, reward tier costs / popularity
    result = soup.find_all("div",class_="pledge__info")
    rewards = []
    reward_mode_cost = -1
    reward_mode_backers = -1
    
    for element in result:
        e2 = element.find("h2",class_="pledge__amount").span
        #On live campaigns skip the "pledge without rewards" option that breaks the find
        if (e2 is None):
            continue
        reward_cost = get_num(e2.text)
        rewards.append(reward_cost)
        
        #Track the most popular reward tier (likely the "base" tier that gets you 1 copy of the game/product)
        reward_backers = get_num(element.find("span",class_="pledge__backer-count").text)
        if (reward_backers > reward_mode_backers):
            reward_mode_cost = reward_cost
            reward_mode_backers = reward_backers
    
    #Note: reward_count counts every pledge__info block, including any skipped above
    data["reward_count"] = len(result)
    if (rewards):
        data["reward_min"] = np.min(rewards)
        data["reward_max"] = np.max(rewards)
        data["reward_mean"] = np.mean(rewards)
        data["reward_median"] = np.median(rewards)
    else:
        #No priced tier was parsed. np.min / np.max raise ValueError on an empty list,
        #so fall back to the same -1 marker used for other unavailable values
        data["reward_min"] = -1
        data["reward_max"] = -1
        data["reward_mean"] = -1
        data["reward_median"] = -1
    data["reward_mode_cost"] = reward_mode_cost
    data["reward_mode_backers"] = reward_mode_backers
    
    #Funding period - days open, start date, close date
    result = soup.find("div",class_="NS_campaigns__funding_period")
    if (result):
        element = result.time
        data["funding_start"] = element["datetime"].split("T")[0].strip()
        element = element.nextSibling.nextSibling
        data["funding_end"] = element["datetime"].split("T")[0].strip()
        element = element.nextSibling #this element is :NavigableString (not :Tag) so doesn't have .text attribute
        data["funding_days"] = get_num(element)
    else:
        #Incomplete projects don't show start date. If necessary: fill from updates then back-calc funding_days
        data["funding_start"] = "Not found"
        data["funding_end"] = soup.find("p",class_="mb3 mb0-lg type-12").time["datetime"].split("T")[0].strip()
        data["funding_days"] = -1
    
    # Project activity - count of FAQ / update / comment entries
    #Each total defaults to 0 when its tab shows no counter
    data["activity_faq_total"] = _tab_count(soup, "faqs")
    data["activity_update_total"] = _tab_count(soup, "updates")
    data["activity_comment_total"] = _tab_count(soup, "comments")
        
    #--FAQ before funding end date
    #Dates are compared as strings (the part of the datetime attribute before "T");
    #presumably YYYY-MM-DD so string order equals date order - TODO confirm
    code, text = get_html(url + "/faqs")
    if code == 200:
        faq_count = 0
        soup = BeautifulSoup(text, "lxml")
        result = soup.find_all("time")
        for element in result:
            date = element["datetime"].split("T")[0].strip()
            if date <= data["funding_end"]:
                faq_count += 1
        data["activity_faq_end"] = faq_count        
    else:
        print ("Subrequest failed - /faqs ->",str(code),url)
        data["activity_faq_end"] = -1
    
    #--Updates before funding end date
    code, text = get_html(url + "/updates")
    if code == 200:
        update_count = 0
        result = []
        soup = BeautifulSoup(text, "lxml")
        
        #The updates timeline has a "launched" marker, used to fill a start date missing from the main page
        if data["funding_start"] == "Not found":
            data["funding_start"] = soup.find("div",class_="timeline__divider--launched").time["datetime"].split("T")[0].strip()
        
        #Look for a success / failure banner in the updates timeline
        #If one exists, only count entries below the banner. Otherwise count all entries
        success_divider = soup.find(class_="timeline__divider timeline__divider--successful")
        fail_divider = soup.find(class_="timeline__divider timeline__divider--failure")
        if (success_divider):
            result = success_divider.find_all_next("p",class_="grid-post__date")
        elif (fail_divider):
            result = fail_divider.find_all_next("p",class_="grid-post__date")
        else:
            result = soup.find_all("p",class_="grid-post__date")
        
        for element in result:
            date = element.time["datetime"].split("T")[0].strip()
            if date <= data["funding_end"]:
                update_count += 1
        data["activity_update_end"] = update_count        
    else:
        print ("Subrequest failed - /updates ->",str(code),url)
        data["activity_update_end"] = -1
        
    #--Comments before funding end date
    #--Not pulling these because there are a lot (thousands) without a good option to scroll, 
    #  plus number of comments isn't really controlled by the creator
    
    #Backers - # from US, # new backers vs. have backed before
    code, text = get_html(url + "/community",attempts=1)
    if code == 200:
        soup = BeautifulSoup(text, "lxml")
        result = soup.find("div",class_="community-section__locations_countries")
        if (result):
            element = result.find("a",text="United States")
            if (element):
                element = element.findNext("div",class_="tertiary-text js-location-tertiary-text")
                data["backers_US"] = get_num(element.text)
            else:
                data["backers_US"] = -1
        else:
            data["backers_US"] = -1
        
        result = soup.find("div",class_="community-section__new_vs_existing")
        if(result):
            element = result.find("div",class_="new-backers").find("div",class_="count")
            data["backers_new"] = get_num(element.text)
            element = result.find("div",class_="existing-backers").find("div",class_="count")
            data["backers_exist"] = get_num(element.text)
        else:
            data["backers_new"] = -1
            data["backers_exist"] = -1
    else:
        print ("Subrequest failed - /community ->",str(code),url)
        data["backers_US"] = -1
        data["backers_new"] = -1
        data["backers_exist"] = -1
    
    #Creator - # projects created, # projects backed
    code, text = get_html(url + "/creator_bio")
    if code == 200:
        soup = BeautifulSoup(text, "lxml")
        result = soup.find("div",class_="created-projects py2 f5 mb3")
        
        #Creator projects
        #--Seems like this count includes projects created *after* the current one. 
        regex = re.compile(r"([\d\w]+) created")
        created = regex.search(result.text.strip()).group(1)
        if (created == "First"): #special case for first project
            data["creator_projects"] = 1
        else:
            data["creator_projects"] = get_num(created)
        
        #Creator backed
        regex = re.compile(r"([\d\w]+) backed")
        backed = regex.search(result.text.strip()).group(1)
        data["creator_backed"] = get_num(backed)
        
    else:
        print ("Subrequest failed - /creator_bio ->",str(code),url)
        data["creator_projects"] = -1
        data["creator_backed"] = -1
        
    return data

#Temporary - scrape project name to backfill (didn't pull this originally)
def scrape_name(url):
    '''
    Downloads the project page at url and returns its og:title meta content, stripped.
    Returns " " (a single space) when the request does not come back with status 200.
    '''
    status, page_html = get_html(url)
    if status != 200:
        return " "
    title_tag = BeautifulSoup(page_html,"lxml").find("meta",attrs={"property":"og:title"})
    return title_tag["content"].strip()

def scrape_fund_start(url):
    '''
    Downloads url + "/updates" and returns the date part (text before "T") of the
    datetime attribute on the "launched" timeline divider.
    Returns "Not found" when the request does not come back with status 200.
    '''
    status, page_html = get_html(url+"/updates")
    if status != 200:
        return "Not found"
    launch_divider = BeautifulSoup(page_html,"lxml").find("div",class_="timeline__divider--launched")
    launch_stamp = launch_divider.time["datetime"]
    return launch_stamp.split("T")[0].strip()
    
#debugging
def debug():
    '''
    Manual smoke test: downloads one hard-coded project page and runs scrape_project on it.
    The status code and the scraped dict are both discarded; nothing is returned.
    Swap in one of the commented urls to exercise a failed or a live campaign.
    '''
    #url = "https://www.kickstarter.com/projects/ghoulash/ghoulash-the-card-game"    #failed
    #url = "https://www.kickstarter.com/projects/1048213369/project-alpaca"          #live
    url = "https://www.kickstarter.com/projects/1208693854/the-island-of-el-dorado" #completed

    _status, page_html = get_html(url)
    scrape_project(url, page_html)


### Driver 
Scrapes URLs from text file, outputs to csv  
To do:
- Add code to grab URLs from unsuccessful projects (but avoid unfinished ones)
- Add code to read in previous .csv and avoid rescraping unless error values

In [None]:
url_save_path = "kick_urls.txt"
project_save_path = "kick_data.csv"

column_headers = ['url', 'project_name', 'activity_comment_total', 'activity_faq_end', 'activity_faq_total', 
                  'activity_update_end', 'activity_update_total', 'backers_US', 'backers_total', 
                  'backers_exist', 'backers_new', 'creator_backed', 'creator_projects', 'dollar_goal', 
                  'dollar_total','funding_days', 'funding_end', 'funding_start', 'reward_count', 'reward_max', 
                  'reward_mean', 'reward_median', 'reward_min','reward_mode_backers', 'reward_mode_cost']
    
#Read previous csv (if exists). Make dict {url -> {other columns:values}}
data_dict = {}
try:
    #newline="" is what the csv module asks for on files handed to csv.reader / csv.writer
    with open(project_save_path,"r",newline="") as f_in:
        #skip header row
        f_in.readline()
        #pull in data rows
        reader = csv.reader(f_in, delimiter=",")
        for row in reader:
            #blank lines come back as empty lists; row[0] would raise IndexError on them
            if not row:
                continue
            data_dict[row[0]]=dict(zip(column_headers[1:],row[1:]))
    print (str(len(data_dict)),"projects imported from archive")
except FileNotFoundError:
    print("No pre-existing save file - skipping import")   

#Get the urls from the first scraping step (blank lines are ignored)
urls = []
with open(url_save_path,"r") as f_in:
    urls = [line.strip() for line in f_in if line.strip()] 

#Print progress roughly every 1% of the urls. The step is at least 1 so that
#fewer than 100 urls does not turn the modulo below into a division by zero
progress_step = max(1, len(urls) // 100)
    
#Second scraping step
with open(project_save_path,"w",newline="") as f_out:

    #Output CSV
    out = csv.writer(f_out, delimiter=",")
    
    #Re-write header and previous values
    out.writerow(column_headers)
    for url in data_dict.keys():
        #back-filling project name
        if "project_name" not in data_dict[url]:
            data_dict[url]["project_name"] = scrape_name(url)
        if data_dict[url]["funding_start"] == "Not found":
            data_dict[url]["funding_start"] = scrape_fund_start(url)        
        #write out data
        out.writerow(make_row(url,data_dict[url],column_headers[1:]))
    
    #Progress tracker 
    progress = 0
    
    print("Starting scrape")
    for url in urls:
        #If we scraped this before, nothing to do (already wrote to csv). Else, scrape
        if url not in data_dict:
            #Fetch html, scrape if success, print error if fail
            code, text = get_html(url,attempts=10, pause=5)
            if code != 200:
                #Report with the names that exist in this scope (the status code and url of this iteration)
                print("HTML request failed. Code:", str(code), "URL:", url)
                continue
            #scrape
            data = scrape_project(url, text)
            #write out data
            out.writerow(make_row(url,data,column_headers[1:]))

        #Only reached for urls that are now in the csv, so a failed request never repeats a progress line
        progress += 1
        if (progress % progress_step) == 0:
            print("****",str(progress),"Urls scraped out of",str(len(urls)))

print("Done!")


In [None]:
# Echo every saved project url, one per line, with surrounding whitespace removed
with open("kick_urls.txt","r") as f_in:
    for line in f_in:
        stripped = line.strip()
        print(stripped)