In [2]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import re
from urllib.request import urlopen
from tqdm import tqdm
import random
import time
from selenium import webdriver
from langdetect import detect
import urllib
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [3]:
# Column names for the tables produced by the scraper. The order of each list
# must match the order in which the corresponding row values are assembled.

# One row per project page.
project_features = [
    # identity and funding state
    'pro_link', 'title', 'status',
    'percentage_fund', 'funded', 'target', 'end_date', 'backers',
    # presentation
    'subcategory', 'thumbnail_type',
    # reward summary
    'number_rewards', 'min_price', 'max_price',
    # activity counters shown in the project's navigation bar
    'num_news', 'num_comments', 'num_contributions',
    # owner and description
    'creater_link', 'language', 'short_des',
]

# One row per reward of a project.
reward_features = ['pro_link', 'price', 'num_backers', 'has_media']

# One row per project owner profile.
creater_features = [
    'creater_link', 'name',
    'num_created', 'num_backed', 'num_followed',
]

# One row per contribution of a project.
timeline_features = ['pro_link', 'date', 'amount']

# One row per comment of a project.
comment_features = ['pro_link', 'comment_txt']


In [4]:
def create_url(page):
    """Return the discover-listing URL for the given result page.

    The query string is fixed to ``project[successful]=on&filter=all``; only the
    ``page`` value varies. ``page`` is converted with ``str`` before being
    appended, so integers and numeric strings are both accepted.
    """
    listing_prefix = 'https://www.kisskissbankbank.com/en/discover?project[successful]=on&filter=all&page='
    return ''.join((listing_prefix, str(page)))

In [5]:
def _parse_rewards(bs, pro_link):
    """Collect the reward rows of a project page.

    Returns ``(reward, price)`` where ``reward`` is a list of
    ``[pro_link, price, num_backers, has_media]`` rows and ``price`` is the list
    of the parsed prices in the same order.
    """
    price = []
    reward = []
    for rj in bs.findAll('div', 'marger__StyledMarger-sc-1qqifp5-0 kXGmDY'):
        price_j = rj.findAll('h2', 'title__StyledTitle-sc-46lshq-0 edmigd k-RewardCard__title k-u-margin-bottom-double k-u-margin-top-none')
        # Scanning stops (rather than skips) at the first container without a
        # parsable price, exactly as the original loop did.
        if len(price_j) == 0:
            break
        try:
            price_j = float(price_j[0].contents[0].replace('€', '').replace(',', ''))
        except (AttributeError, IndexError, TypeError, ValueError):
            break
        price.append(price_j)

        backers_j = rj.findAll('span', 'text__StyledText-sc-1jqe2sw-0 kKHxol k-u-color-font1 k-u-size-micro k-u-weight-regular k-RewardCard__infos k-RewardCard__infos--hasBottomMargin k-RewardCard__infos--disabled')
        bj = 0
        if len(backers_j) != 0:
            # Convert the same space-stripped text that is validated; the
            # original validated the stripped text but converted the raw one,
            # which raised ValueError on values such as "1 000".
            backers_txt = backers_j[0].contents[0].replace(' ', '')
            if backers_txt.isnumeric():
                bj = int(backers_txt)

        # A reward "has media" when its card contains an image element.
        img = rj.findAll('img', 'k-RewardCard__image k-RewardCard__image--disabled')
        has_media_j = len(img) > 0

        reward.append([pro_link, price_j, bj, has_media_j])
    return reward, price


def _badge_count(nav_link):
    """Return the integer shown in a navigation link's badge, or 0 if it has no badge."""
    badges = nav_link.findAll('span', 'badge__StyledBadge-sc-7liuod-0 eucluz k-Badge k-Badge--spaced')
    if len(badges) == 0:
        return 0
    return int(badges[0].contents[0])


def _parse_project(bs, pro_link, price):
    """Build the project row from a parsed project page.

    ``price`` is the list of reward prices already extracted from the same page.
    Returns ``(project, creater_link)``; ``project`` holds the values in the
    order of the module-level ``project_features`` list.
    """
    title = bs.find('h1', 'title__StyledTitle-sc-46lshq-0 kxkLAV titles__StyledTitle-sc-1v04wsx-0 gwAQhx k-u-align-center').contents[0]
    status = bs.find('span', 'text__StyledText-sc-1jqe2sw-0 kKHxol k-u-color-font1 k-u-weight-regular project-state__StyledText-w82mew-2 dClAPF').contents[0]
    percentage_fund = bs.find('div', 'text__StyledText-sc-1jqe2sw-0 kKHxol k-u-color-font1 k-u-line-height-normal k-u-weight-regular progress__StyledText-ycznm1-1 iXebEz').contents[0]

    # The three "big text" info cells are read as backers, end date, funded
    # amount (in that order); the target comes from the third "small text" cell.
    info = bs.findAll('div', 'text__StyledText-sc-1jqe2sw-0 kKHxol k-u-color-font1 k-u-weight-bold info__StyledBigText-lcf1ol-0 clfvNf')
    funded = info[2].contents[0]
    target = bs.findAll('div', 'text__StyledText-sc-1jqe2sw-0 kKHxol k-u-color-font1 k-u-weight-light info__StyledSmallText-lcf1ol-1 gDdVEG')[2].contents[0]
    end_date = info[1].contents[0]
    backers = info[0].contents[0]

    # Each subcategory label is rebuilt from every second child node of its
    # span (children at odd positions are skipped), joined with single spaces.
    subcategories = bs.findAll('span', 'text__StyledText-sc-1jqe2sw-0 kKHxol k-u-color-font1 k-u-size-micro k-u-weight-regular')
    subcategory = ','.join(' '.join(s.contents[0::2]) for s in subcategories)

    # File extension of the project image URL.
    thumbnail_type = bs.find('img', 'project-media__StyledAvatar-bus9q7-1 oMMta').get('src').split('.')[-1]

    # Reward summary; -1 marks "no reward found".
    if len(price) == 0:
        min_price = -1
        max_price = -1
    else:
        min_price = np.min(price)
        max_price = np.max(price)
    number_rewards = len(price)

    # Navigation links at positions 2, 3 and 4 carry the news, comments and
    # contributions badges respectively.
    information = bs.findAll('a', 'k-NavBar__link')
    num_news = _badge_count(information[2])
    num_comments = _badge_count(information[3])
    num_contributions = _badge_count(information[4])

    creater_link = 'https://www.kisskissbankbank.com' + bs.find('a', 'owner-info__StyledOwnerGrid-tqxc8c-0 jnrkmW').get('href')

    # Short description and its detected language. Both are blanked together if
    # the description is missing or language detection fails.
    try:
        short_des = bs.findAll('p', {'data-test-id': 'short-description'})[0].contents[0]
        language = detect(short_des)
    except Exception:
        short_des = ''
        language = ''

    project = [
        pro_link,
        title,
        status,
        percentage_fund,
        funded,
        target,
        end_date,
        backers,
        subcategory,
        thumbnail_type,
        number_rewards,
        min_price,
        max_price,
        num_news,
        num_comments,
        num_contributions,
        creater_link,
        language,
        short_des
    ]
    return project, creater_link


def _parse_creater(bs, creater_link):
    """Build the creator row from a parsed owner profile page.

    When the name or any of the three counters cannot be read, the row falls
    back to the name 'Group creaters' and -1 for every counter.
    """
    try:
        name = bs.find('h1', 'k-Title k-Title--quinary').text
        span = bs.findAll('span', 'k-Badge k-HorizontalNav__badge')
        num_created = int(span[0].text)
        num_backed = int(span[1].text)
        num_followed = int(span[2].text)
    except (AttributeError, IndexError, TypeError, ValueError):
        name = 'Group creaters'
        num_created = -1
        num_backed = -1
        num_followed = -1
    return [creater_link, name, num_created, num_backed, num_followed]


def extract_data(url):
    """Scrape one project page and the profile page of its owner.

    A new Chrome session is started for every call and is always closed before
    the function returns or raises.

    Parameters
    ----------
    url : str
        Address of the project page; it is also stored as ``pro_link``.

    Returns
    -------
    tuple
        ``(project, reward, creater)`` where ``project`` is a list ordered like
        ``project_features``, ``reward`` is a list of rows ordered like
        ``reward_features`` and ``creater`` is a list ordered like
        ``creater_features``.

    Raises
    ------
    Exception
        Any Selenium error (page load, missing cookie button) or parsing error
        caused by a mandatory element not being found (typically
        ``AttributeError`` or ``IndexError``) is propagated to the caller.

    Notes
    -----
    Elements are located through exact CSS class strings copied from the
    rendered page, so the parser depends on the site's markup staying the same.
    Scraping of the contribution timeline and of the comments existed only as
    disabled code in the original function and is not performed.
    """
    # 1. Open the project page in a fresh browser session
    chrome_path = './chromedriver.exe'
    driver = webdriver.Chrome(chrome_path)
    try:
        driver.get(url)
        time.sleep(5)  # fixed pause before the rendered page is read

        # Accept the cookie banner
        cookie_question = driver.find_element(By.XPATH, "//*[@id=\"App-react-component\"]/div[1]/div/div/div/div/button[2]")
        cookie_question.click()

        # 2. Parse the project page (project row and reward rows)
        bs = BeautifulSoup(driver.page_source, 'html.parser')
        reward, price = _parse_rewards(bs, url)
        project, creater_link = _parse_project(bs, url, price)

        # 3. Parse the owner profile page
        driver.get(creater_link)
        creater = _parse_creater(BeautifulSoup(driver.page_source, 'html.parser'), creater_link)
    finally:
        # Always release the browser, including when parsing fails; the caller
        # skips failing projects and would otherwise leave a Chrome process
        # running for each of them.
        driver.quit()

    return project, reward, creater

In [7]:
def collect_from_google(url):
    """Walk through Google result pages and scrape every listed project.

    Starting from the search results at ``url``, the function accepts Google's
    consent dialog, reads the result links of the current page, calls
    ``extract_data`` on each link and then follows the "next" link until no
    further result page is available.

    Parameters
    ----------
    url : str
        Address of the first Google search result page.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(projects, rewards, creaters)`` with columns ``project_features``,
        ``reward_features`` and ``creater_features`` respectively.

    Notes
    -----
    A project whose extraction fails is reported and skipped; it does not abort
    the run. The browser session used for the search pages is always closed,
    including when an error is raised.
    """
    projects = []
    rewards = []
    creaters = []

    chrome_path = './chromedriver.exe'
    driver = webdriver.Chrome(chrome_path)
    try:
        driver.get(url)
        time.sleep(5)

        # Accept the consent dialog, which is served inside an iframe
        WebDriverWait(driver,10).until(EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR,"iframe[src^='https://consent.google.com']")))
        WebDriverWait(driver,10).until(EC.element_to_be_clickable((By.XPATH,"//div[@id='introAgreeButton']"))).click()

        while True:
            # Parse the current result page
            bs = BeautifulSoup(driver.page_source, 'html.parser')

            # Each result is wrapped in a div with class 'yuRUbf'
            project_links = bs.findAll('div','yuRUbf')
            print(len(project_links))
            links = [p.find('a').get('href') for p in project_links]

            # Scrape every project listed on this page
            for pro_link in tqdm(links):
                try:
                    project, reward, creater = extract_data(pro_link)
                except Exception as exc:
                    # Best effort: report and skip the project. Unlike a bare
                    # ``except``, this still lets KeyboardInterrupt stop the run.
                    tqdm.write('Skipped {}: {!r}'.format(pro_link, exc))
                    continue
                projects.append(project)
                rewards.extend(reward)
                creaters.append(creater)

            # Follow the "next" link; its absence marks the last result page
            next_anchor = bs.find('a',{'id':'pnnext'})
            next_href = next_anchor.get('href') if next_anchor is not None else None
            if not next_href:
                break
            try:
                driver.get('https://www.google.com' + next_href)
            except Exception as exc:
                # Keep what has been collected so far instead of losing it
                print('Stopped pagination: {!r}'.format(exc))
                break
            time.sleep(5)
    finally:
        driver.quit()

    projects = pd.DataFrame(projects, columns = project_features)
    rewards = pd.DataFrame(rewards, columns = reward_features)
    creaters = pd.DataFrame(creaters, columns = creater_features)
    return projects, rewards, creaters
    

In [8]:
# Google query restricted to the site's English project pages that mention
# "failed"; the literal is split per query parameter for readability only.
search = (
    'https://www.google.com/search'
    '?q=site%3Awww.kisskissbankbank.com%2Fen%2Fprojects%2F+failed'
    '&oq=site%3Awww.kisskissbankbank.com%2Fen%2Fprojects%2F+failed'
    '&aqs=chrome..69i57j69i58.725j0j7'
    '&sourceid=chrome'
    '&ie=UTF-8'
)
# Run the full crawl and keep the three resulting tables.
projects, rewards, creaters = collect_from_google(search)

  0%|          | 0/10 [00:00<?, ?it/s]

10


100%|██████████| 10/10 [03:28<00:00, 20.88s/it]
  0%|          | 0/10 [00:00<?, ?it/s]

10


100%|██████████| 10/10 [03:24<00:00, 20.46s/it]
  0%|          | 0/10 [00:00<?, ?it/s]

10


100%|██████████| 10/10 [03:32<00:00, 21.27s/it]
  0%|          | 0/10 [00:00<?, ?it/s]

10


100%|██████████| 10/10 [03:25<00:00, 20.54s/it]
  0%|          | 0/10 [00:00<?, ?it/s]

10


100%|██████████| 10/10 [03:26<00:00, 20.63s/it]
  0%|          | 0/10 [00:00<?, ?it/s]

10


100%|██████████| 10/10 [03:30<00:00, 21.06s/it]
  0%|          | 0/10 [00:00<?, ?it/s]

10


100%|██████████| 10/10 [03:28<00:00, 20.88s/it]
  0%|          | 0/10 [00:00<?, ?it/s]

10


100%|██████████| 10/10 [03:26<00:00, 20.65s/it]
  0%|          | 0/10 [00:00<?, ?it/s]

10


100%|██████████| 10/10 [03:29<00:00, 20.93s/it]
  0%|          | 0/10 [00:00<?, ?it/s]

10


100%|██████████| 10/10 [03:25<00:00, 20.59s/it]
  0%|          | 0/10 [00:00<?, ?it/s]

10


100%|██████████| 10/10 [03:28<00:00, 20.80s/it]
  0%|          | 0/10 [00:00<?, ?it/s]

10


100%|██████████| 10/10 [03:49<00:00, 22.97s/it]
  0%|          | 0/10 [00:00<?, ?it/s]

10


100%|██████████| 10/10 [03:28<00:00, 20.83s/it]
  0%|          | 0/10 [00:00<?, ?it/s]

10


100%|██████████| 10/10 [03:28<00:00, 20.81s/it]
  0%|          | 0/10 [00:00<?, ?it/s]

10


100%|██████████| 10/10 [03:33<00:00, 21.34s/it]
  0%|          | 0/10 [00:00<?, ?it/s]

10


100%|██████████| 10/10 [03:49<00:00, 22.95s/it]
  0%|          | 0/10 [00:00<?, ?it/s]

10


100%|██████████| 10/10 [03:29<00:00, 20.96s/it]
  0%|          | 0/10 [00:00<?, ?it/s]

10


100%|██████████| 10/10 [03:36<00:00, 21.62s/it]
  0%|          | 0/10 [00:00<?, ?it/s]

10


100%|██████████| 10/10 [03:45<00:00, 22.50s/it]
  0%|          | 0/10 [00:00<?, ?it/s]

10


100%|██████████| 10/10 [03:28<00:00, 20.89s/it]
  0%|          | 0/10 [00:00<?, ?it/s]

10


100%|██████████| 10/10 [03:36<00:00, 21.67s/it]
  0%|          | 0/10 [00:00<?, ?it/s]

10


100%|██████████| 10/10 [03:35<00:00, 21.54s/it]
  0%|          | 0/10 [00:00<?, ?it/s]

10


100%|██████████| 10/10 [03:30<00:00, 21.03s/it]
  0%|          | 0/8 [00:00<?, ?it/s]

8


100%|██████████| 8/8 [02:53<00:00, 21.67s/it]


In [14]:
# Show only the rows whose scraped status text is exactly 'Failed'.
projects.loc[projects['status'].eq('Failed')]

Unnamed: 0,pro_link,title,status,percentage_fund,funded,target,end_date,backers,subcategory,thumbnail_type,number_rewards,min_price,max_price,num_news,num_comments,num_contributions,creater_link,language,short_des
0,https://www.kisskissbankbank.com/en/projects/t...,Test-Failed,Failed,0,€0,Out of €100,05/30/2018,0,,png,0,-1.0,-1.0,0,0,0,https://www.kisskissbankbank.com/en/users/kiss...,fr,test-failed
1,https://www.kisskissbankbank.com/en/projects/3...,365 PLUS UN: Sustainable Fashion Accessories m...,Failed,18,€550,"Out of €3,000",12/05/2019,11,,jpg,11,5.0,3500.0,3,1,12,https://www.kisskissbankbank.com/en/users/kami...,en,Join the 365 PLUS UN adventure
2,https://www.kisskissbankbank.com/en/projects/j...,Juan Francisco Casas - (A)utopic (BOOTleg ARTt...,Failed,32,€960,"Out of €3,000",06/20/2014,26,,jpg,5,5.0,60.0,0,1,26,https://www.kisskissbankbank.com/en/users/fred...,en,"Discover exclusively the new title of the ""BOO..."
3,https://www.kisskissbankbank.com/en/projects/p...,Peace Project,Failed,0,€0,"Out of €3,000",12/14/2015,0,,png,6,5.0,100.0,0,0,0,https://www.kisskissbankbank.com/en/users/paxp...,en,Every voice needs to be heard !\r\nEvery life...
8,https://www.kisskissbankbank.com/en/projects/e...,Excroissance,Failed,33,€835,"Out of €2,500",01/28/2015,26,,jpg,7,5.0,300.0,0,10,26,https://www.kisskissbankbank.com/en/users/leo-...,en,A film about a man who is disabled and tries t...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
233,https://www.kisskissbankbank.com/en/projects/s...,Sexy Pause - Arrêt sur image,Failed,0,€0,"Out of €3,550",03/30/2014,0,,jpg,8,5.0,1000.0,3,0,0,https://www.kisskissbankbank.com/en/users/sexy...,fr,Le chic d'une galerie Porn'Art mêlant photogra...
234,https://www.kisskissbankbank.com/en/projects/t...,The BEE - universal urban tricycle,Failed,1,€130,"Out of €8,100",06/23/2015,2,,jpg,6,10.0,3800.0,0,1,2,https://www.kisskissbankbank.com/en/users/tryt...,en,Help us to create healthier alternative to cit...
235,https://www.kisskissbankbank.com/en/projects/m...,MADE IN LOVE,Failed,52,"€7,870","Out of €15,000",12/21/2015,97,,gif,13,5.0,50000.0,7,29,97,https://www.kisskissbankbank.com/en/users/made...,en,When the condom transform itself into a piece ...
236,https://www.kisskissbankbank.com/en/projects/p...,PIDAN STUDIO - Gamelle Diététique Pour Chat,Failed,32,€642,"Out of €2,000",07/30/2017,18,,jpg,7,5.0,42.0,1,1,18,https://www.kisskissbankbank.com/en/users/pida...,en,"To feed his cat a dietary way, start by choosi..."


In [10]:
# Display the reward table (one row per reward, columns from reward_features).
rewards

Unnamed: 0,pro_link,price,num_backers,has_media
0,https://www.kisskissbankbank.com/en/projects/3...,85.0,0,True
1,https://www.kisskissbankbank.com/en/projects/3...,5.0,3,True
2,https://www.kisskissbankbank.com/en/projects/3...,25.0,4,True
3,https://www.kisskissbankbank.com/en/projects/3...,35.0,3,True
4,https://www.kisskissbankbank.com/en/projects/3...,60.0,0,True
...,...,...,...,...
1670,https://www.kisskissbankbank.com/en/projects/a...,20.0,0,True
1671,https://www.kisskissbankbank.com/en/projects/a...,30.0,0,True
1672,https://www.kisskissbankbank.com/en/projects/a...,50.0,0,True
1673,https://www.kisskissbankbank.com/en/projects/a...,75.0,0,True


In [11]:
# Display the creator table (one row per scraped project, columns from creater_features).
creaters

Unnamed: 0,creater_link,name,num_created,num_backed,num_followed
0,https://www.kisskissbankbank.com/en/users/kiss...,kisskissbankbank-noël,3,0,0
1,https://www.kisskissbankbank.com/en/users/kami...,Kamilla-Sani Gabdullina,1,0,0
2,https://www.kisskissbankbank.com/en/users/fred...,Frederic Claquin,8,2,6
3,https://www.kisskissbankbank.com/en/users/paxp...,paxproject,1,0,0
4,https://www.kisskissbankbank.com/en/users/pika...,PIKA PIKA FILMS,1,2,0
...,...,...,...,...,...
233,https://www.kisskissbankbank.com/en/users/sexy...,SexyPause,1,0,0
234,https://www.kisskissbankbank.com/en/users/tryt...,Trytrike,1,0,0
235,https://www.kisskissbankbank.com/en/users/made...,MADE IN LOVE team,1,1,0
236,https://www.kisskissbankbank.com/en/users/pida...,pidan studio,1,0,0


In [12]:
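# Saving the scraped tables is currently disabled; uncomment the lines below
# to write them as CSV files under ./data/failed/.
#projects.to_csv('./data/failed/projects.csv')
#rewards.to_csv('./data/failed/rewards.csv')
#creaters.to_csv('./data/failed/creaters.csv')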