# Target & Feature Scrape Functions

In [6449]:
import numpy as np
from bs4 import BeautifulSoup
import requests
from sklearn.linear_model import RidgeCV
import scipy.stats as stats
import statsmodels.api as sm
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import random
import os
import pandas as pd
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('svg')
%matplotlib inline
%config InlineBackend.figure_formats = ['retina']
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, HuberRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PowerTransformer
import matplotlib.pyplot as plt
from statsmodels.graphics.gofplots import ProbPlot
import seaborn as sns
# Notebook-wide matplotlib appearance: seaborn style sheet plus font sizes
# for body text, figure titles, axis labels and axes titles.
plt.style.use('seaborn')
plt.rc('font', size=14)
plt.rc('figure', titlesize=18)
plt.rc('axes', labelsize=15)
plt.rc('axes', titlesize=18)

In [68]:
import nltk
nltk.download('vader_lexicon')
nltk.download('punkt')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/marcmuon/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /Users/marcmuon/nltk_data...
[nltk_data]   Package punkt is already up-to-date!




### Get Word Count of the Full Campaign Description

In [70]:
def campaign_text():
    """Return the number of words in the campaign's full description.

    Reads the module-level ``soup`` (the parsed campaign page) and takes the
    text of the first element with class ``full-description``.

    Returns:
        int: Number of whitespace-separated words in the description.

    Raises:
        AttributeError: If the page has no ``full-description`` element
            (``soup.find`` returns ``None``).
    """
    description = soup.find(class_='full-description').text

    # str.split() with no separator splits on any run of whitespace (spaces,
    # newlines, non-breaking spaces) and never yields empty strings, so blank
    # lines and repeated spaces are not counted as words, and words joined by
    # a non-breaking space are still counted separately.
    return len(description.split())

### Get Word Count of the Campaign Summary

In [71]:
def campaign_text_len():
    """Return the number of words in the campaign's short summary.

    Reads the module-level ``soup`` (the parsed campaign page) and takes the
    text of the first element with class ``type-18-md``.

    Returns:
        int: Number of whitespace-separated words in the summary.

    Raises:
        AttributeError: If the page has no ``type-18-md`` element
            (``soup.find`` returns ``None``).
    """
    summary = soup.find(class_='type-18-md').text

    # Split on any whitespace run so that surrounding newlines, indentation
    # and doubled spaces in the markup do not inflate the count.
    return len(summary.split())

### Get Suggested Pledge Amount

In [72]:
def pledge_amount():
    """Return the suggested no-reward pledge amount as an integer.

    Reads the module-level ``soup`` (the parsed campaign page), renders the
    first element with class ``pledge__no-reward__input`` back to markup and
    extracts the contents of its first ``value="..."`` attribute.

    Returns:
        int: The whole-number part of the attribute value.

    Raises:
        IndexError: If the markup contains no ``value="`` attribute (this
            includes the case where the element is missing altogether).
        ValueError: If the attribute value is not numeric.
    """
    markup = str(soup.find(class_='pledge__no-reward__input'))

    # Take everything up to the closing quote rather than a fixed number of
    # characters, so one-digit and three-or-more-digit amounts parse correctly.
    raw_value = markup.split('value="')[1].split('"')[0]

    # Drop thousands separators and any decimal part before converting,
    # mirroring how average_pledge_amount() normalises amounts.
    whole_part = raw_value.replace(',', '').split('.')[0]
    return int(whole_part)

### Get Number of Images in Description

In [73]:
def image_count():
    """Return how many elements with class ``fit`` the page contains.

    Reads the module-level ``soup`` (the parsed campaign page).

    Returns:
        int: Number of matching elements.
    """
    return len(soup.find_all(class_='fit'))

### Video Header or Not

In [74]:
def video_header():
    """Return 1 if the page has a ``ksr-video-player`` element, else 0.

    Reads the module-level ``soup`` (the parsed campaign page).

    Returns:
        int: ``1`` when at least one matching element exists, ``0`` otherwise.
    """
    players = soup.find_all(class_='ksr-video-player')
    # An empty result list is falsy, so this maps "none found" to 0.
    return int(bool(players))

### Get Pledged and Goal

In [75]:
def pledged_amount():
    """Return the text of the first ``ksr-green-700`` element on the page.

    Reads the module-level ``soup`` (the parsed campaign page). The text is
    returned exactly as found; no numeric conversion is done here.

    Returns:
        str: The element's text.

    Raises:
        AttributeError: If no such element exists.
    """
    return soup.find(class_='ksr-green-700').text


def goal_amount():
    """Return the text of the first ``money`` element on the page.

    Reads the module-level ``soup`` (the parsed campaign page). The text is
    returned exactly as found; no numeric conversion is done here.

    Returns:
        str: The element's text.

    Raises:
        AttributeError: If no such element exists.
    """
    return soup.find(class_='money').text

### Get Number of Pledge Gift Options

In [76]:
def gift_options():
    """Return how many ``pledge__hover-content`` elements the page contains.

    Reads the module-level ``soup`` (the parsed campaign page).

    Returns:
        int: Number of matching elements.
    """
    return len(soup.find_all(class_='pledge__hover-content'))

### Get Average Pledge Option Amount

In [77]:
def average_pledge_amount():
    """Return the mean dollar amount over the page's converted pledge prices.

    Reads the module-level ``soup`` (the parsed campaign page) and collects
    every element with class ``pledge__currency-conversion``. For each one,
    the text following the first ``$`` is reduced to its whole-dollar part
    (thousands separators and decimals are dropped).

    Returns:
        float: Arithmetic mean of the whole-dollar amounts, or ``0.0`` when
        the page has no matching elements.

    Raises:
        IndexError: If a matching element's text contains no ``$``.
        ValueError: If the text after ``$`` is not numeric.
    """
    conversions = soup.find_all(class_='pledge__currency-conversion')

    # A page without any matching element used to raise ZeroDivisionError;
    # report an average of zero instead.
    if not conversions:
        return 0.0

    amounts = []
    for option in conversions:
        dollars = option.text.split('$')[1].strip()
        whole_dollars = dollars.replace(',', '').split('.')[0]
        amounts.append(int(whole_dollars))

    return sum(amounts) / len(amounts)

### Sentiment Analysis

In [78]:
def description_sentiment():
    """Score the sentiment of the campaign's full description.

    Reads the module-level ``soup`` (the parsed campaign page), takes the
    text of the first ``full-description`` element, removes ``\\xa0``
    characters, drops empty lines and joins the remaining lines with spaces
    before passing the result to ``SentimentIntensityAnalyzer``.

    Returns:
        Whatever ``polarity_scores`` returns for the flattened text; the
        ``sentiment_parse_*`` helpers index it by ``'pos'``, ``'neg'``,
        ``'neu'`` and ``'compound'``.

    Raises:
        AttributeError: If the page has no ``full-description`` element.
    """
    analyzer = SentimentIntensityAnalyzer()
    raw_text = soup.find(class_='full-description').text
    lines = raw_text.replace('\xa0', '').split('\n')

    # Every non-empty line is prefixed with one space, so the flattened text
    # starts with a space whenever there is at least one non-empty line.
    flattened = ''.join(' ' + line for line in lines if line)
    return analyzer.polarity_scores(flattened)

def sentiment_parse_pos(scores_dict):
    """Return the ``'pos'`` entry of a sentiment score mapping.

    Raises:
        KeyError: If ``scores_dict`` has no ``'pos'`` key.
    """
    positive = scores_dict['pos']
    return positive

def sentiment_parse_neg(scores_dict):
    """Return the ``'neg'`` entry of a sentiment score mapping.

    Raises:
        KeyError: If ``scores_dict`` has no ``'neg'`` key.
    """
    negative = scores_dict['neg']
    return negative

def sentiment_parse_neu(scores_dict):
    """Return the ``'neu'`` entry of a sentiment score mapping.

    Raises:
        KeyError: If ``scores_dict`` has no ``'neu'`` key.
    """
    neutral = scores_dict['neu']
    return neutral

def sentiment_parse_compound(scores_dict):
    """Return the ``'compound'`` entry of a sentiment score mapping.

    Raises:
        KeyError: If ``scores_dict`` has no ``'compound'`` key.
    """
    compound = scores_dict['compound']
    return compound

In [79]:
def summary_sentiment():
    """Score the sentiment of the campaign's short summary.

    Reads the module-level ``soup`` (the parsed campaign page) and passes the
    unmodified text of the first ``type-18-md`` element to
    ``SentimentIntensityAnalyzer``.

    Returns:
        Whatever ``polarity_scores`` returns for the summary text.

    Raises:
        AttributeError: If the page has no ``type-18-md`` element.
    """
    summary = soup.find(class_='type-18-md').text
    analyzer = SentimentIntensityAnalyzer()
    return analyzer.polarity_scores(summary)

### Get Project Length

In [80]:
def update_page(url):
    """Fetch and parse the ``/updates`` page of a campaign.

    Args:
        url: Campaign URL. Anything from the first ``?`` onward is dropped
            before ``/updates`` is appended.

    Returns:
        BeautifulSoup: The updates page parsed with the ``lxml`` parser.

    Raises:
        requests.exceptions.RequestException: If the request times out,
            cannot connect, or the server answers with an HTTP error status.
    """
    base_url = url.split('?')[0]
    updates_url = base_url + '/updates'

    # requests waits indefinitely by default; bound the wait so one stalled
    # connection cannot hang the whole scrape.
    response = requests.get(updates_url, timeout=30)
    # Fail here with a clear HTTP error rather than parsing an error page
    # and breaking later in the caller.
    response.raise_for_status()

    return BeautifulSoup(response.text, "lxml")

In [81]:
import datetime
from dateutil import parser

def project_length(url):
    """Return the number of days between now and the last ``<time>`` entry.

    Fetches the campaign's updates page via ``update_page(url)``, parses the
    text of the final ``<time>`` element as a date and subtracts it from the
    current local time.
    NOTE(review): presumably the last ``<time>`` element is the project
    launch entry — confirm against the page layout.

    Args:
        url: Campaign URL, passed through to ``update_page``.

    Returns:
        int: Whole days elapsed (the ``days`` part of the time difference).

    Raises:
        IndexError: If the updates page has no ``<time>`` elements.
    """
    updates_soup = update_page(url)
    time_tags = updates_soup.find_all('time')
    earliest_text = time_tags[-1].text

    current = datetime.datetime.now()
    started = parser.parse(earliest_text)
    return (current - started).days

# Pipeline to Scrape All Pages

### Collect List of Pages to Scrape

In [4694]:
# Tell Selenium where the chromedriver binary lives, both through the
# environment and through the argument passed to webdriver.Chrome below.
chromedriver = "/Applications/chromedriver" # path to the chromedriver executable
os.environ["webdriver.chrome.driver"] = chromedriver


# Start a Chrome session and open the first page of the discover listing.
driver = webdriver.Chrome(chromedriver)
page = '1'
# NOTE(review): the seed in this URL (2579760) differs from the seed used by
# the link-collection loop (2579603) — confirm that is intended.
url = 'https://www.kickstarter.com/discover/advanced?woe_id=23424977&sort=end_date&seed=2579760&page=' + page
driver.get(url)

In [4695]:
# Collect the campaign links shown on discover listing pages 8 through 12.
link_list = []
for p in range(8, 13):
    page = str(p)
    url = 'https://www.kickstarter.com/discover/advanced?woe_id=23424977&sort=end_date&seed=2579603&page=' + page

    # Short randomized pause (0.1-0.7 s) before each page load.
    time.sleep(.1+.6*random.random())
    driver.get(url)

    # Every anchor with exactly this class string carries a campaign href.
    elems = driver.find_elements_by_xpath("//a[@href and @class='block img-placeholder w100p']")
    link_list.extend([elem.get_attribute("href") for elem in elems])
link_list

['https://www.kickstarter.com/projects/1819480674/makeup-and-cosmetics-self-love-brand?ref=discovery',
 'https://www.kickstarter.com/projects/amberdunnministries/amber-dunn-worship-ep-heaven-reaching-down?ref=discovery',
 'https://www.kickstarter.com/projects/206230260/derelict-ink-vol-2?ref=discovery',
 'https://www.kickstarter.com/projects/threecupsdesign/make-100-sigils?ref=discovery',
 'https://www.kickstarter.com/projects/1357652086/berashield-total-protection-system-for-smartphone?ref=discovery',
 'https://www.kickstarter.com/projects/nickjcrabb/camber-brewing-company-help-us-make-the-best-beer?ref=discovery',
 'https://www.kickstarter.com/projects/privateerpress/the-art-of-privateer-press?ref=discovery',
 'https://www.kickstarter.com/projects/1789463192/extra-virgin-olive-oil-from-catalonia-spain?ref=discovery',
 'https://www.kickstarter.com/projects/njpartistry/a-childs-artistry?ref=discovery',
 'https://www.kickstarter.com/projects/outofmycomfortzone/out-of-my-comfort-zone-fea

### Save Link List

In [4696]:
import pickle

# Persist the collected links to disk as a pickle file.
with open('links_3day_3lt.pkl', 'wb') as f:
    pickle.Pickler(f).dump(link_list)

In [760]:
# Drop the first entry of link_list.
# NOTE(review): this cell's execution counter (760) is far lower than the
# surrounding cells' — confirm it is still meant to run before scraping.
link_list = link_list[1:]

### Scrape all Pages in Pipeline

In [4697]:
# Scrape every collected campaign page into one feature row per link.
# Row layout (by position): pledged, goal, description word count, summary
# word count, suggested pledge, image count, video flag, reward count,
# average reward amount, description sentiment (pos, neg, neu, compound),
# summary sentiment (pos, neg, neu, compound), project length in days.
df_dict = dict()
row_list = []
for idx, link in enumerate(link_list):
    url = link

    # requests waits indefinitely by default; bound the wait so one stalled
    # connection cannot hang the whole run, and fail with a clear HTTP error
    # instead of feeding an error page to the scrape helpers.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    page = response.text

    # The scrape helpers read the module-level name `soup`, so it must be
    # rebound to the current page before any of them is called.
    soup = BeautifulSoup(page, "lxml")

    row_list = [
        pledged_amount(),
        goal_amount(),
        campaign_text(),
        campaign_text_len(),
        pledge_amount(),
        image_count(),
        video_header(),
        gift_options(),
        average_pledge_amount(),
    ]

    overview_scores = description_sentiment()
    row_list.append(sentiment_parse_pos(overview_scores))
    row_list.append(sentiment_parse_neg(overview_scores))
    row_list.append(sentiment_parse_neu(overview_scores))
    row_list.append(sentiment_parse_compound(overview_scores))

    summary_scores = summary_sentiment()
    row_list.append(sentiment_parse_pos(summary_scores))
    row_list.append(sentiment_parse_neg(summary_scores))
    row_list.append(sentiment_parse_neu(summary_scores))
    row_list.append(sentiment_parse_compound(summary_scores))

    row_list.append(project_length(url))
    df_dict[idx] = row_list

    # Progress indicator, then a short randomized pause between campaigns.
    print(idx)
    time.sleep(.1+.6*random.random())

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59


### Save Scraped Dict

In [4698]:
import pickle
# Persist the scraped rows (index -> feature list) to disk as a pickle file.
with open('dict_3day_3lt.pkl', 'wb') as f:
    pickle.Pickler(f).dump(df_dict)