In [1]:
import numpy as np
import pandas as pd

import re

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from bs4 import BeautifulSoup
import requests

import time, os

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options

from fake_useragent import UserAgent

import json

In [2]:
# Use random user agent just in case the site checks for too many requests from a specific user agent

ua = UserAgent()
user_agent = {'User-agent': ua.random}

Considered using "headless" Chrome, but I ended up not doing a lot of additional scraping.

In [3]:
# instantiate a chrome options object so you can set the size and headless preference

chrome_options = Options()
chrome_options.add_argument(f'user-agent={user_agent}')

# chrome_options.add_argument("--headless")
# chrome_options.add_argument("--window-size=1920x1080")

In [4]:
chromedriver = "C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe" # path to the chromedriver executable
os.environ["webdriver.chrome.driver"] = chromedriver

Below are a few different selections from the search feature for Kickstarter. 
Because of the way Kickstarter removes unsuccessful projects from their search, I ended up just using only projects that were already funded, i.e. met 100% of their funding goal (url_100).

In [9]:
url_75 = 'https://www.kickstarter.com/discover/advanced?category_id=34&raised=0&sort=newest&seed=2644681&page={}'
url_75_100 = 'https://www.kickstarter.com/discover/advanced?category_id=34&raised=1&sort=newest&seed=2644681&page={}'
url_100 = 'https://www.kickstarter.com/discover/advanced?category_id=34&raised=2&sort=newest&seed=2644681&page={}'

Below is the function I ended up using to scrape data. I wrote another function that I thought I might use to grab project data from individual pages (as opposed to the main search page), but I ended up not using any other data.

In [5]:
def get_data(start_url):
    
    """
    Takes a kickstarter search page URL with page={} at the end
    and scrapes basic searched project data returning it as a dataframe.
    
    ---
    Input: kickstarter search page URL with page={} at the end
    Output: dataframe of scraped searched project data
    
    """
    
    page_number = 1
    all_data = []

    while True:
        url = start_url.format(page_number)
        response  = requests.get(url, headers = user_agent)
        page = response.text
        soup = BeautifulSoup(page, "lxml")
        project_divs = soup.findAll('div', {"class":"js-react-proj-card"})

        if len(project_divs) == 0:
            break

        for div in project_divs:
            all_data.append(json.loads(div['data-project']))

        page_number +=1
        time.sleep(1)
        
        ###############
        ### For Testing
#         if page_number > 3:
#             break
        ###############
    
    return pd.DataFrame(all_data)

df_100 ended up being the dataframe I used for my project. 

In [None]:
# df_75 = get_data(url_75)
# df_75_100 = get_data(url_75_100)
df_100 = get_data(url_100)

I did a little exploration of the data to see how df_100 was formatted. 
Specifically, I spent some trying trying to figure out how to access the URLs for each project.

In [36]:
df_100.iloc[0]

Unnamed: 0                                                                  0
id                                                                  746380653
photo                       {'key': 'assets/028/357/363/2327d019e352f0dcb1...
friends                                                                    []
is_starred                                                              False
is_backing                                                              False
permissions                                                                []
urls                        {'api': {'star': 'https://api.kickstarter.com/...
name                                              Kovaud's Adventurer's Guide
blurb                       A 5th edition supplement crammed full of tons ...
goal                                                                     5000
pledged                                                                    26
state                                                           

In [190]:
df_100.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 984 entries, 0 to 983
Data columns (total 41 columns):
Unnamed: 0                  984 non-null int64
id                          984 non-null int64
photo                       984 non-null object
friends                     12 non-null object
is_starred                  12 non-null object
is_backing                  12 non-null object
permissions                 12 non-null object
urls                        984 non-null object
name                        984 non-null object
blurb                       984 non-null object
goal                        984 non-null float64
pledged                     984 non-null float64
state                       984 non-null object
slug                        984 non-null object
disable_communication       984 non-null bool
country                     984 non-null object
country_displayable_name    984 non-null object
currency                    984 non-null object
currency_symbol             984 non-n

I spent some time trying to see if there was anything useful I could get for certain columns. For the most part, I decided there wasn't much I could use.

In [212]:
df_100.iloc[1]['creator']

"{'id': 434622949, 'name': 'Nicholas Robinson', 'is_registered': None, 'chosen_currency': None, 'is_superbacker': None, 'avatar': {'thumb': 'https://ksr-ugc.imgix.net/assets/010/171/887/f995aa41058bdeb531d5e5c2bf482ab0_original.jpg?ixlib=rb-2.1.0&w=40&h=40&fit=crop&v=1461671654&auto=format&frame=1&q=92&s=dbde0db035d4504db5ad0c618a4fcce6', 'small': 'https://ksr-ugc.imgix.net/assets/010/171/887/f995aa41058bdeb531d5e5c2bf482ab0_original.jpg?ixlib=rb-2.1.0&w=160&h=160&fit=crop&v=1461671654&auto=format&frame=1&q=92&s=b7df291009755065d40a55b2026475c1', 'medium': 'https://ksr-ugc.imgix.net/assets/010/171/887/f995aa41058bdeb531d5e5c2bf482ab0_original.jpg?ixlib=rb-2.1.0&w=160&h=160&fit=crop&v=1461671654&auto=format&frame=1&q=92&s=b7df291009755065d40a55b2026475c1'}, 'urls': {'web': {'user': 'https://www.kickstarter.com/profile/434622949'}, 'api': {'user': 'https://api.kickstarter.com/v1/users/434622949?signature=1586842536.fd2344efac60f4831c05bbf1e66a0574b587d4b3'}}}"

In [214]:
 df_100.iloc[1]['category']

"{'id': 34, 'name': 'Tabletop Games', 'slug': 'games/tabletop games', 'position': 6, 'parent_id': 12, 'parent_name': 'Games', 'color': 51627, 'urls': {'web': {'discover': 'http://www.kickstarter.com/discover/categories/games/tabletop%20games'}}}"

In [215]:
df_100.iloc[1]['profile']

"{'id': 3962348, 'project_id': 3962348, 'state': 'inactive', 'state_changed_at': 1585510981, 'name': None, 'blurb': None, 'background_color': None, 'text_color': None, 'link_background_color': None, 'link_text_color': None, 'link_text': None, 'link_url': None, 'show_feature_image': False, 'background_image_opacity': 0.8, 'should_show_feature_image_section': True, 'feature_image_attributes': {'image_urls': {'default': 'https://ksr-ugc.imgix.net/assets/028/705/669/4a448eef23613e5ea03361ef9463cee5_original.jpg?ixlib=rb-2.1.0&crop=faces&w=1552&h=873&fit=crop&v=1586709264&auto=format&frame=1&q=92&s=6f0d15af26e5cb15995d0733fdcfc793', 'baseball_card': 'https://ksr-ugc.imgix.net/assets/028/705/669/4a448eef23613e5ea03361ef9463cee5_original.jpg?ixlib=rb-2.1.0&crop=faces&w=560&h=315&fit=crop&v=1586709264&auto=format&frame=1&q=92&s=e152badc4c29298cfd161157051fc28b'}}}"

Below, I was looking at the URLs that I mentioned earlier.

In [31]:
test_urls = df_100['urls'][0]

In [41]:
test_urls.split("'")[-6]

'https://www.kickstarter.com/projects/kovaud/kovauds-adventurers-guide'

In [None]:
test_df = df_100[:3]

In [52]:
url_strings = test_df['urls']

In [53]:
type(url_strings)

pandas.core.series.Series

In [54]:
url_strings[0]

"{'api': {'star': 'https://api.kickstarter.com/v1/projects/746380653/star?signature=1586842536.b729e047bd273121a62be9473a19de5ecbdf423c', 'message_creator': 'https://api.kickstarter.com/v1/projects/746380653/messages?signature=1586842536.91165ed798048713a57378af5b9d03d2084bfd4e'}, 'web': {'message_creator': 'https://www.kickstarter.com/projects/kovaud/kovauds-adventurers-guide/messages/new?message%5Bto%5D=kovaud', 'project': 'https://www.kickstarter.com/projects/kovaud/kovauds-adventurers-guide', 'rewards': 'https://www.kickstarter.com/projects/kovaud/kovauds-adventurers-guide/rewards'}}"

I didn't end up using this function. I decided it would be better to just pass the entire dataframe to the get_project_data function a few cells below.

In [57]:
def get_project_urls(my_df):
    
    """
    Retrieves URLs from dataframe created by get_data function which scraped data from Kickstarter
    
    Input: dataframe in the form generated by get_data
    Output: list of URLs as strings for individual kickstarter project pages
    """
    my_urls = []
    
    url_strings = my_df['urls']
    
    for row in url_strings:
        my_urls.append(row.split("'")[-6])

    return my_urls

In [74]:
get_project_urls(test_df)

['https://www.kickstarter.com/projects/kovaud/kovauds-adventurers-guide',
 'https://www.kickstarter.com/projects/434622949/the-twilight-dream-a-5th-edition-setting',
 'https://www.kickstarter.com/projects/badgersfrommars/regicide']

In [None]:
project_url = 'https://www.kickstarter.com/projects/frosthaven'

I went thru many iterations of the function below. Sadly, I didn't end up using it (or the next iteration). This one had some issues that I solved in the next iteration. 


In [7]:
def get_project_data(my_df):
    
    """
    Attempts to return text from each project URL that is in the input dataframe. 
    
    Input: dataframe in the form generated by get_data
    Output: List of text for each project in the dataframe
    
    """
    
    
    chromedriver = "C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe" # path to the chromedriver executable
    os.environ["webdriver.chrome.driver"] = chromedriver
    driver = webdriver.Chrome(options=chrome_options, executable_path=chromedriver)
    
    
    my_urls = []
    
    print(my_df['urls'])
    
    url_strings = my_df['urls']
    
    for row in url_strings:
#         print(row)
#         print('\n')
        regexp = re.compile("'project': (.*),")
        my_urls.append(regexp.search(row).group(1).replace("'",""))
        
#         my_urls.append(row.split("'")[-6])

    print(my_urls)
    print('\n')
        
        
    
    all_text = []
    
        
    for url in my_urls:
        print(url)
        print('\n')
        driver.get(url)
        
        print('\n')
        
#         new_soup = BeautifulSoup(driver.page_source, 'lxml')
        
        # Get Text
#         all_text_divs = new_soup.findAll('div', class_ = "rte__content")
        print(driver.find_element_by_xpath('//*[@id="react-campaign"]'))
        all_text.append(driver.find_element_by_xpath('//*[@id="react-campaign"]').text)
        print(all_text)
        print('\n')
        
#         print(all_text_divs)
#         for item in all_text_divs:
#             for paragraph in item.findAll('p'):
#                 all_text = all_text + paragraph.text
                
#         time.sleep(1)
                
    driver.close()
        
    return # all_text

In [169]:
chromedriver = "C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe" # path to the chromedriver executable
os.environ["webdriver.chrome.driver"] = chromedriver
# driver = webdriver.Chrome(options=chrome_options, executable_path=chromedriver)


In [187]:
def get_project_data_2(my_df):
    
    """
    Attempts to return text from each project URL that is in the input dataframe. 
    
    Input: dataframe in the form generated by get_data
    Output: List of text for each project in the dataframe
    
    """
    my_urls = []
    
#     print(my_df['urls'])
    
    url_strings = my_df['urls']
    
    for row in url_strings:
#         print(row)
#         print('\n')
        regexp = re.compile("'project': (.*),")
        my_urls.append(regexp.search(row).group(1).replace("'",""))
        

    print(my_urls)
    print('\n')
        
        
    
    all_text = []
    driver = webdriver.Chrome(chromedriver)
    
        
    for url in my_urls:
        print(url)
        print('\n')
        driver.get(url)
        time.sleep(2)
        
        print('\n')
        
        
        # Get Text

        print(driver.find_element_by_xpath('//*[@id="react-campaign"]'))
        print(driver.find_element_by_xpath('//*[@id="react-campaign"]').text)
#         print(all_text)
        print('\n')
        
               
    driver.close()
        
    return # all_text

In [188]:
test_text = get_project_data_2(test_df.iloc[0:3])

['https://www.kickstarter.com/projects/kovaud/kovauds-adventurers-guide', 'https://www.kickstarter.com/projects/434622949/the-twilight-dream-a-5th-edition-setting', 'https://www.kickstarter.com/projects/badgersfrommars/regicide']


https://www.kickstarter.com/projects/kovaud/kovauds-adventurers-guide




<selenium.webdriver.remote.webelement.WebElement (session="2f9262c355e2cb11002f9e4dbeddeea0", element="10fb854a-a037-4ee0-9f2c-6c308b9a4728")>
Story
For the past two years I have been consistently creating quality content for 5th edition Dungeons and Dragons on a weekly basis and decided to finally begin my first large scale project compiling all my work and content into a bright and fancy book. Full of new art, content, and writing! 
The goal for this Kickstarter is to raise funds for artwork for a high quality pdf.
Fairly simple I'd say! However their will be other smaller goals that will garner more rewards for backers! 
The stretch goal is $12,000, and if that stretch goal is met t



<selenium.webdriver.remote.webelement.WebElement (session="2f9262c355e2cb11002f9e4dbeddeea0", element="15bc7f99-9787-4342-a6ef-cc11a90bb43a")>
Story
Regicide is a card game that can be played with a standard deck of 52 cards plus 2 jokers. The game rules are available here, so feel free to try out Regicide yourself by just grabbing any old deck of cards you might have lying around! We also have a learn to play video at the bottom of this page if you want to see it in action. So why back this project if you can just play Regicide with a standard deck of cards? We think this Regicide deck adds a new level of excitement and immersion while also communicating the key game concepts. Why play a boring 2 of Spades when you could instead play a cute Gnome wielding an oversized cheese cutter?
The art is beautifully crafted by the amazing Sketchgoblin, who has extensive experience illustrating many kinds of strange and wonderful fantasy characters. Each of the 54 cards is brought to life with 

In [71]:
driver.close()

I did end up using the function below to add a few more features to my data:
    - Facebook friends of the creator
    - Number of kickstarter projects created
    - Number of kickstarter projects backed

In [138]:
def get_creator_data(my_df):
    
    """
    Returns information from each creator for each project URL that is in the input dataframe. 
    
    Input: dataframe in the form generated by get_data
    Output: Original dataframe with three additional columns associated with the creator: fb, created, and backed.
    
    """    

    my_urls = []
    
#     print(my_df['urls'])
    
    url_strings = my_df['urls']
    
    for row in url_strings:
#         print(row)
#         print('\n')
        regexp = re.compile("'project': (.*),")
        my_urls.append(regexp.search(row).group(1).replace("'",""))
        

#     print(my_urls)
#     print('\n')
        
        
    for i,url in enumerate(my_urls):
#         print(url)
        
        response  = requests.get(url + '/creator_bio', headers = user_agent)
        page = response.text
        soup = BeautifulSoup(page, "lxml")
        
        try:
#             my_df['fb'].iloc[i] = int(soup.find('a', class_='popup').text.split(' ')[0].replace(',',''))
            my_df.iloc[i, my_df.columns.get_loc('fb')] = int(soup.find('a', class_='popup').text.split(' ')[0].replace(',',''))
        except:
            pass
        
        # cb = soup.findAll('a', class_ = 'green-dark bold remote_modal_dialog')
        cb = soup.find('div', class_ = 'created-projects py2 f5 mb3')
        cb_list = re.findall('\d+', cb.text.replace(',',''))
        
        
        if len(cb_list) == 1:
#             my_df['created'].iloc[i] = 0
#             my_df['backed'].iloc[i] = int(cb_list[0])
            my_df.iloc[i, my_df.columns.get_loc('created')] = 0
            my_df.iloc[i, my_df.columns.get_loc('backed')] = int(cb_list[0])
            
        else:
#             my_df['created'].iloc[i] = int(cb_list[0])
#             my_df['backed'].iloc[i] = int(cb_list[1])
            my_df.iloc[i, my_df.columns.get_loc('created')] = int(cb_list[0])
            my_df.iloc[i, my_df.columns.get_loc('backed')] = int(cb_list[1])
                
      
        time.sleep(1)
        
        
    return my_df

In [142]:
df_creator.to_csv('df_creator.csv', index=False)

In [143]:
df_100.head()

Unnamed: 0.1,Unnamed: 0,id,photo,friends,is_starred,is_backing,permissions,urls,name,blurb,...,location,category,profile,spotlight,percent_funded,is_liked,is_disliked,fb,created,backed
0,0,746380653,{'key': 'assets/028/357/363/2327d019e352f0dcb1...,[],False,False,[],{'api': {'star': 'https://api.kickstarter.com/...,Kovaud's Adventurer's Guide,A 5th edition supplement crammed full of tons ...,...,"{'id': 2400767, 'name': 'Evansville', 'slug': ...","{'id': 34, 'name': 'Tabletop Games', 'slug': '...","{'id': 3947508, 'project_id': 3947508, 'state'...",False,0.52,False,False,0,0,0
1,1,850587382,{'key': 'assets/028/705/669/4a448eef23613e5ea0...,[],False,False,[],{'api': {'star': 'https://api.kickstarter.com/...,The Twilight Dream: A 5th Edition Setting,A 100+ page original campaign setting and adve...,...,"{'id': 2439482, 'name': 'Lincoln', 'slug': 'li...","{'id': 34, 'name': 'Tabletop Games', 'slug': '...","{'id': 3962348, 'project_id': 3962348, 'state'...",False,12.384615,False,False,0,3,8
2,2,587381352,{'key': 'assets/028/491/554/6975ec7112dd4cd45c...,[],False,False,[],{'api': {'star': 'https://api.kickstarter.com/...,Regicide,"Regicide is a cooperative, fantasy card game f...",...,"{'id': 2348079, 'name': 'Auckland', 'slug': 'a...","{'id': 34, 'name': 'Tabletop Games', 'slug': '...","{'id': 3712837, 'project_id': 3712837, 'state'...",False,32.532483,False,False,150,0,4
3,3,1340348303,{'key': 'assets/027/965/340/14139bb8c8aba2ce99...,[],False,False,[],{'api': {'star': 'https://api.kickstarter.com/...,MOMminiaturas novedades,"Miniaturas para wargames de batallas, escaramu...",...,"{'id': 769006, 'name': 'Osuna', 'slug': 'osuna...","{'id': 34, 'name': 'Tabletop Games', 'slug': '...","{'id': 3926810, 'project_id': 3926810, 'state'...",False,112.1,False,False,400,2,0
4,4,1654502771,{'key': 'assets/028/700/342/95537f32c84604f76a...,[],False,False,[],{'api': {'star': 'https://api.kickstarter.com/...,Doomsday - A survival board game for 1-4 players,Help us create our professional prototype and ...,...,"{'id': 13911, 'name': 'Brighton', 'slug': 'bri...","{'id': 34, 'name': 'Tabletop Games', 'slug': '...","{'id': 3972713, 'project_id': 3972713, 'state'...",False,2.2,False,False,0,2,0
