# Store information about a competition in one table: Competition

<table>
  <thead>
    <tr>
      <th>Datafield</th>
      <th>Description</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>id</td>
      <td>unique identifier for competition</td>
    </tr>
    <tr>
      <td>name</td>
      <td>competition name</td>
    </tr>
    <tr>
      <td>start_time</td>
      <td>start time of competition</td>
    </tr>
    <tr>
      <td>end_time</td>
      <td>end time of competition</td>
    </tr>
    <tr>
      <td>prize</td>
      <td>prize for winners</td>
    </tr>
    <tr>
      <td>entries</td>
      <td>[{entry_id: xx, user_id: xx, entry_url: xxx, time: xxx}, {entry_id: xx, user_id: xx, entry_url: xxx, time: xxx}, ...]</td>
    </tr>
    <tr>
      <td>winners</td>
      <td>[entry_id, entry_id]</td>
    </tr>

  </tbody>
</table>


In [1]:
import requests
from bs4 import BeautifulSoup
import re
from tqdm import tqdm
import pandas as pd

In [2]:
# Request settings shared by every page fetch.
_REQUEST_HEADERS = {'user-agent': 'Mozilla/5.0'}
_REQUEST_TIMEOUT = 30  # seconds

# A listing page that yields fewer items than this is treated as the last page.
_ENTRIES_PER_PAGE = 36

# Values of the `data-entry-status` attribute that are counted separately.
_INACTIVE_STATUSES = ('deleted', 'withdrawn', 'declined')

# CSS class strings (matched exactly) of the containers on the entries listing.
_ENTRY_CONTAINER_CLASS = 'entry-matrix__item matrix__item'
_WINNER_CONTAINER_CLASS = 'entry-matrix__item matrix__item entry-winners'
_LINKED_ENTRY_CLASS = 'entry entry--linked entry--zoom-linked'


def _fetch_soup(url):
    """Download `url` and return the page parsed with the 'html.parser' backend.

    Propagates whatever `requests` raises for connection errors, timeouts and
    non-success HTTP status codes (`raise_for_status`).
    """
    response = requests.get(url, headers=_REQUEST_HEADERS, timeout=_REQUEST_TIMEOUT)
    response.raise_for_status()
    # Decode with the encoding detected from the body instead of the header.
    response.encoding = response.apparent_encoding
    return BeautifulSoup(response.text, 'html.parser')


def _entries_page_url(competition_url, page_number, active=True):
    """Build the URL of one page of the active or non-active entries listing."""
    listing = "active" if active else "non_active"
    return competition_url + "/entries?filter=" + listing + "&page=" + str(page_number)


def _parse_brief(soup):
    """Return `(title, summary, company_name, vision)` from the competition page.

    `title` is None when the heading is absent; the other three default to "".
    """
    heading = soup.find(name='h1', class_="heading heading--h1 heading--no-margin")
    title = heading.text if heading is not None else None

    brief = {"Summary": "", "Company name": "", "What's your vision?": ""}
    section = soup.find(name='div', class_='inline-page')
    if section is not None:
        headers = section.find_all("p", class_="heading heading--size4 heading--no-margin")
        paragraphs = section.find_all("p", class_="paragraph")
        # NOTE(review): headers and paragraphs are paired purely by position,
        # as in the original code - confirm the page always alternates them.
        for header, paragraph in zip(headers, paragraphs):
            if header.text in brief:
                brief[header.text] = paragraph.text

    return title, brief["Summary"], brief["Company name"], brief["What's your vision?"]


def _parse_start_time(soup):
    """Extract the `"startDate"` value embedded in the contest header markup."""
    text = str(soup.find(name='div', class_="contest-header__price"))
    # The pattern and the slicing are deliberately unchanged: the match is
    # expected to end with `",`, which the final [:-2] strips off.
    return re.search('"startDate": .+,', text).group(0).replace('"startDate": "', '')[:-2]


def _parse_prize(soup):
    """Extract the `"prizeMoney"` value embedded in the contest header markup."""
    header = soup.find(name='div', class_='contest-header contest-header--with-breadcrumbs')
    text = str(header.contents)
    prize = re.search('"prizeMoney": ".{2,10}"', text).group(0)
    return prize.replace('"prizeMoney": "', "").replace('"', "")


def _parse_submission_times(soup):
    """Return every `"timeCreatedString"` value found in the results pane."""
    text = str(soup.find(name='div', class_='entry-pane__results'))
    # Pattern and clean-up are deliberately unchanged: the greedy match may run
    # into the following `","` separator, which the second replace removes.
    matches = re.findall('"timeCreatedString":".{20,30}"', text)
    return [m.replace('"timeCreatedString":"', '').replace('","', "") for m in matches]


def _parse_winner_entry_ids(soup):
    """Return the `id` attribute of each entry shown in a winners container."""
    winner_ids = []
    for container in soup.find_all(name='div', class_=_WINNER_CONTAINER_CLASS):
        entry = container.find(name='div', class_=_LINKED_ENTRY_CLASS)
        if entry is None:
            # Fallback kept from the original code, which noted it is needed
            # when the winning entry has been deleted.
            entry = container.find(name='div', class_="entry")
        if entry is not None:
            winner_ids.append(entry["id"])
    return winner_ids


def _parse_active_entries(soup):
    """Return `[(entry_id, user_id, image_url), ...]` for one active listing page.

    An entry is included only when all three values can be read, so the fields
    of one record can never be shifted against each other.
    """
    parsed = []
    for container in soup.find_all(name='div', class_=_ENTRY_CONTAINER_CLASS):
        entry = container.find(name='div', class_=_LINKED_ENTRY_CLASS)
        if entry is None:
            continue
        link = entry.find(name='a')
        if link is None:
            continue
        try:
            # The tuple is built completely before it is stored.
            parsed.append((entry["id"], entry["data-user-id"], link["href"]))
        except KeyError:
            continue
    return parsed


def _parse_inactive_entries(soup):
    """Return `[(entry_id, user_id, status), ...]` for one non-active listing page.

    `status` is the `data-entry-status` of the first overlay that is not marked
    `data-hidden` and carries one of `_INACTIVE_STATUSES`; otherwise it is "".
    """
    parsed = []
    for container in soup.find_all(name='div', class_=_ENTRY_CONTAINER_CLASS):
        entry = container.find(name='div', class_='entry')
        if entry is None:
            continue
        try:
            entry_id, user_id = entry["id"], entry["data-user-id"]
        except KeyError:
            continue

        status = ''
        for overlay in container.find_all('div', class_="entry-status-overlay"):
            overlay_status = overlay.attrs.get('data-entry-status')
            if overlay_status in _INACTIVE_STATUSES and 'data-hidden' not in overlay.attrs:
                status = overlay_status
                break
        parsed.append((entry_id, user_id, status))
    return parsed


def scrape(competition_url):
    """Scrape one competition and return its data as a one-row DataFrame.

    Pages requested: the competition page itself, `<url>/entries`, and every
    page of the `filter=active` and `filter=non_active` entry listings.

    Parameters
    ----------
    competition_url : str
        Base URL of the competition; path suffixes are appended to it as-is.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns `title`, `url`, `seeker`, `summary`,
        `company_name`, `vision`, `start_time`, `entry_count`,
        `deleted_entry_count`, `deleted`, `withdrawn`, `declined`, `prize`,
        `entries` (list of dicts with keys `entry_id`, `participant_id`,
        `time`, `url`), `deleted_entries` (list of dicts with keys `entry_id`,
        `participant_id`, `status`) and `winners` (list of entry ids).

    Raises
    ------
    requests.RequestException
        On connection problems, timeouts or non-success HTTP responses.
    AttributeError
        When a required element or embedded value (seeker name, entry filter,
        start date, prize) is not present in the downloaded markup.
    """
    # 1. Competition brief.
    title, summary, company_name, vision = _parse_brief(_fetch_soup(competition_url))

    # 2. Start date, read from the unfiltered entries page.
    start_time = _parse_start_time(_fetch_soup(competition_url + "/entries"))

    # 3. Active entries, page by page. Page 1 is fetched once and also supplies
    #    the seeker name, the entry counters and the prize.
    active_entries = []
    submission_times = []
    page_number = 1
    while True:
        soup = _fetch_soup(_entries_page_url(competition_url, page_number))

        if page_number == 1:
            seeker = soup.find(name='span', class_="display-name").text
            filter_text = soup.find(name='select', class_="styled-select__select").text
            filter_text = filter_text.replace('\n', "")
            entry_count = int(re.search(r'All \((\d+)\)', filter_text).group(1))
            # Left as a string (unlike entry_count) so the column keeps the
            # type the original code produced.
            deleted_entry_count = re.search(
                r'Declined and withdrawn \((\d+)\)', filter_text).group(1)
            prize = _parse_prize(soup)

        page_entries = _parse_active_entries(soup)
        page_times = _parse_submission_times(soup)
        active_entries += page_entries
        submission_times += page_times

        # Stop once "entry-1" has been seen or the page is not full.
        # NOTE(review): presumably "entry-1" is always listed last - confirm.
        reached_first_entry = any(entry_id == "entry-1" for entry_id, _, _ in page_entries)
        if reached_first_entry or len(page_times) < _ENTRIES_PER_PAGE:
            break
        page_number += 1

    # NOTE(review): timestamps come from a separate scan of the page and are
    # paired with entries by position, as in the original code; confirm both
    # sequences cover the same entries in the same order.
    entries = [
        {'entry_id': entry_id, 'participant_id': user_id, 'time': created, 'url': image_url}
        for (entry_id, user_id, image_url), created in zip(active_entries, submission_times)
    ]

    # 4. Declined / withdrawn / deleted entries, page by page.
    inactive_entries = []
    page_number = 1
    while True:
        soup = _fetch_soup(_entries_page_url(competition_url, page_number, active=False))

        if page_number == 1:
            # NOTE(review): the original code replaced the winners parsed from
            # the active listing with the ones parsed here; that outcome is
            # preserved - confirm this listing is the intended source.
            winner_entry_ids = _parse_winner_entry_ids(soup)

        page_entries = _parse_inactive_entries(soup)
        inactive_entries += page_entries

        reached_first_entry = any(entry_id == "entry-1" for entry_id, _, _ in page_entries)
        if reached_first_entry or len(page_entries) < _ENTRIES_PER_PAGE:
            break
        page_number += 1

    deleted_entries = [
        {'entry_id': entry_id, 'participant_id': user_id, 'status': status}
        for entry_id, user_id, status in inactive_entries
    ]
    total_deleted_count = sum(1 for _, _, status in inactive_entries if status == 'deleted')
    total_withdrawn_count = sum(1 for _, _, status in inactive_entries if status == 'withdrawn')
    total_declined_count = sum(1 for _, _, status in inactive_entries if status == 'declined')

    # List-wrapped values fix the frame at one row; scalars are broadcast.
    df_competition_description = pd.DataFrame({
        'title': [title], 'url': competition_url, 'seeker': seeker, 'summary': [summary],
        'company_name': [company_name],
        'vision': [vision], 'start_time': start_time,
        'entry_count': entry_count, 'deleted_entry_count': deleted_entry_count,
        'deleted': total_deleted_count, 'withdrawn': total_withdrawn_count,
        'declined': total_declined_count,
        'prize': prize, 'entries': [entries],
        'deleted_entries': [deleted_entries], 'winners': [winner_entry_ids]})

    return df_competition_description



## output to file

In [3]:
# Competition URLs to scrape, taken from the 'href' column of the input CSV.
urls = pd.read_csv('competition_data_1013/href.csv')['href']

frames = []        # one DataFrame per successfully scraped competition
failed_urls = []   # (url, error) pairs for competitions that could not be scraped
counter = 0        # number of URLs attempted so far

for counter, url in enumerate(tqdm(urls), start=1):
    try:
        frames.append(scrape(url))
    except Exception as error:
        # Best effort: one broken page must not abort the whole run, but the
        # failure is recorded instead of vanishing. Catching Exception (not a
        # bare except) lets Ctrl-C stop the loop; `frames` then still holds
        # everything scraped up to that point.
        failed_urls.append((url, repr(error)))

# Concatenate once at the end; growing a DataFrame inside the loop copies all
# previously collected rows on every iteration.
results = pd.concat(frames) if frames else pd.DataFrame({})

if failed_urls:
    print("%d of %d competitions could not be scraped; see `failed_urls`"
          % (len(failed_urls), counter))

100%|██████████| 1500/1500 [4:50:13<00:00, 11.61s/it]


In [4]:
# Write the scraped competitions to CSV; index=False leaves the DataFrame
# index out of the file.
# NOTE(review): assumes the directory 'competition_data_1016/' already exists - confirm before a long run.
results.to_csv('competition_data_1016/first1500-v2.csv', index=False)