# Store the information about a competition in one table: Competition

<table>
  <thead>
    <tr>
      <th>Datafield</th>
      <th>Description</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>id</td>
      <td>unique identifier for competition</td>
    </tr>
    <tr>
      <td>url</td>
      <td>competition webpage</td>
    </tr>
    <tr>
      <td>name</td>
      <td>competition name</td>
    </tr>
    <tr>
      <td>start_time</td>
      <td>start time of competition</td>
    </tr>
    <tr>
      <td>seeker</td>
      <td>competition host</td>
    </tr>
    <tr>
      <td>prize</td>
      <td>prize for winners</td>
    </tr>
    <tr>
      <td>entries</td>
      <td>[{entry_id: xx, participant_id: xx, time: xxx, url: xxx}, {entry_id: xx, participant_id: xx, time: xxx, url: xxx}, ...]</td>
    </tr>
    <tr>
      <td>deleted_entries</td>
      <td>[{entry_id: xx, participant_id: xx, status: xx}, {entry_id: xx, participant_id: xx, status: xx}, ...]</td>
    </tr>
    <tr>
      <td>winners</td>
      <td>[entry_id, entry_id]</td>
    </tr>

  </tbody>
</table>


In [1]:
import re
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Contest page to scrape. Only one assignment is active; the commented-out
# URLs are alternative contests kept for reference.
# competition_url = "https://99designs.hk/logo-design/contests/exotic-rides-ultimate-logo-contest-48555"
# competition_url = "https://99designs.hk/logo-design/contests/dv-live-event-management-company-logo-graphics-48557"
competition_url = "https://99designs.hk/logo-design/contests/needed-cutting-edge-contemporary-logo-real-estate-firm-48641"


## 1. get competition information

In [3]:
# Download the contest landing page, identifying as a regular browser.
kv = {'user-agent': 'Mozilla/5.0'}
r = requests.get(competition_url, timeout=30, headers=kv)
# Stop here on an HTTP error status instead of parsing an error page.
r.raise_for_status()
# Decode the body with the encoding detected from its content.
r.encoding = r.apparent_encoding

In [4]:
soup = BeautifulSoup(r.text, 'html.parser')

In [5]:
## title
# The title is read from the page's <h1>; it is None when that heading is absent.
result = soup.find(name='h1', attrs="heading heading--h1 heading--no-margin")
title = result.text if result else None

title

'Needed: cutting edge, contemporary logo for real estate firm'

In [6]:
results = soup.find(name='div', attrs='inline-page')

In [7]:
headers = results.findAll("p", "heading heading--size4 heading--no-margin")
headers

[<p class="heading heading--size4 heading--no-margin">Company name</p>,
 <p class="heading heading--size4 heading--no-margin">Overview</p>,
 <p class="heading heading--size4 heading--no-margin">About us</p>]

In [8]:
descriptions = results.findAll("p", "paragraph")
descriptions

[<p class="paragraph paragraph--no-margin">Plan B Investments</p>,
 <p class="paragraph"><span>We are a diverse team of real estate professionals whose goal is to help the average person achieve success in real estate investment. We accomplish this through a process of education, facilitation, and building relationships with developers.</span></p>,
 <p class="paragraph"><span>We want to appeal to the investor who tried to "do real estate" the traditional way with traditional properties and traditional mortgages, but ended up losing big. Our audience is people who want to invest but want a more viable option than standard US real estate.</span></p>]

In [9]:
overview = ""
company_name = ""
description_of_organization_and_audience = ""

In [10]:
# Pair each section heading with the paragraph at the same position, then pick
# out the three sections of interest. A section that is not on the page keeps
# the value its variable already holds.
section_text = {
    header.text: description.text
    for header, description in zip(headers, descriptions)
}
overview = section_text.get("Overview", overview)
company_name = section_text.get("Company name", company_name)
description_of_organization_and_audience = section_text.get(
    "About us", description_of_organization_and_audience
)


In [11]:
# Show each extracted section under its banner line.
for banner, value in (
    ("**********company name**********", company_name),
    ("**********overview*************", overview),
    ("**********description of the organization and its target audience**********",
     description_of_organization_and_audience),
):
    print(banner)
    print(value)


**********company name**********
Plan B Investments
**********overview*************
We are a diverse team of real estate professionals whose goal is to help the average person achieve success in real estate investment. We accomplish this through a process of education, facilitation, and building relationships with developers.
**********description of the organization and its target audience**********
We want to appeal to the investor who tried to "do real estate" the traditional way with traditional properties and traditional mortgages, but ended up losing big. Our audience is people who want to invest but want a more viable option than standard US real estate.


In [12]:
# The requirements text is the <span> inside the second 'inline-page' block.
# Fall back to None (as is done for the title) when the page does not have
# that structure, rather than failing with IndexError/AttributeError.
inline_pages = soup.findAll(name='div', attrs='inline-page')
requirements = (
    inline_pages[1].span.text
    if len(inline_pages) > 1 and inline_pages[1].span is not None
    else None
)

In [13]:
# Download the contest's entries listing, identifying as a regular browser.
kv = {'user-agent': 'Mozilla/5.0'}
url = competition_url + "/entries"

r = requests.get(url, timeout=30, headers=kv)
# Stop here on an HTTP error status instead of parsing an error page.
r.raise_for_status()
# Decode the body with the encoding detected from its content.
r.encoding = r.apparent_encoding

In [14]:
soup = BeautifulSoup(r.text, 'html.parser')

In [15]:
## competition start date
# The start date appears as a quoted "startDate" field in the markup of the
# contest header. Capture just the quoted value: the date itself contains a
# comma, so slicing a greedy "up to the last comma" match only works when
# nothing else follows on the same line.
text = str(soup.find(name='div', attrs="contest-header__price"))
start_match = re.search(r'"startDate": "([^"]*)"', text)
# None when the field is missing, instead of an AttributeError.
start_time = start_match.group(1) if start_match else None
print(start_time)

Mon, 12 Jul 2010 20:51:26 +0000


## 2. get participants and entry info

### winner id and entry id

In [16]:
def make_soup(competition_url, page_number, active=True):
    """Download one page of a contest's entries listing and parse it.

    Args:
        competition_url: base URL of the contest page.
        page_number: page of the listing to request.
        active: when truthy request the ``active`` filter, otherwise the
            ``non_active`` filter.

    Returns:
        A BeautifulSoup object for the downloaded page. The requested URL is
        printed, and ``raise_for_status()`` is called on the response before
        it is parsed.
    """
    suffix = "/entries?filter=active&page=" if active else "/entries?filter=non_active&page="
    url = competition_url + suffix + str(page_number)
    print(url)

    response = requests.get(url, headers={'user-agent': 'Mozilla/5.0'}, timeout=30)
    response.raise_for_status()
    # Decode the body with the encoding detected from its content.
    response.encoding = response.apparent_encoding
    return BeautifulSoup(response.text, 'html.parser')

In [17]:
def get_winner_info(soup):
    """Collect the element ids of the winning entries on an entries page.

    Args:
        soup: parsed entries page.

    Returns:
        list: the ``id`` attribute of the entry found in each
        ``entry-winners`` matrix item, in page order. Items containing no
        entry element are skipped.
    """
    winner_entry_ids = []
    winner_blocks = soup.findAll(name='div', attrs='entry-matrix__item matrix__item entry-winners')
    for winner_block in winner_blocks:
        # Prefer the linked entry markup; fall back to the plain 'entry'
        # container, which covers the case of a deleted winner entry.
        winner_entry = (
            winner_block.find(name='div', attrs="entry entry--linked entry--zoom-linked")
            or winner_block.find(name='div', attrs="entry")
        )
        if winner_entry:
            winner_entry_ids.append(winner_entry["id"])
    return winner_entry_ids


In [18]:
page_number = 1
soup = make_soup(competition_url, page_number)
winner_entry_ids = get_winner_info(soup) 
winner_entry_ids

https://99designs.hk/logo-design/contests/needed-cutting-edge-contemporary-logo-real-estate-firm-48641/entries?filter=active&page=1


['entry-34']

In [19]:
seeker = soup.find(name='span', attrs="display-name").text
seeker

'Cyara Pott'

In [20]:
entry_summary = soup.find(name='select', attrs="styled-select__select")
entry_summary = entry_summary.text.replace('\n', "")
entry_summary

'            All (44)                    Unrated (42)                    1–2 stars (1)                    3–5 stars (1)                    Declined and withdrawn (27)        '

In [21]:
# Read the counts out of the filter drop-down text, e.g. "All (44)" and
# "Declined and withdrawn (27)". Raw strings keep the escaped parentheses
# valid, and a capture group replaces the hand-counted slice offsets.
# Both counts are stored as int.
entry_count = int(re.search(r'All \((\d+)\)', entry_summary).group(1))
deleted_entry_count = int(re.search(r'Declined and withdrawn \((\d+)\)', entry_summary).group(1))
print(entry_count, deleted_entry_count)

44 27


In [22]:
def get_participant_and_entry_info(soup):
    """Record user id, entry id and entry link for each linked entry on a page.

    The values are appended to the module-level lists
    ``participants_user_ids``, ``participants_entry_ids`` and
    ``participants_entry_image_urls``; nothing is returned. A matrix item
    with no linked entry container, or whose container lacks any of the
    three values, is skipped as a whole so the three lists always stay the
    same length.

    Args:
        soup: parsed entries page.
    """
    entry_matrix = soup.findAll(name='div', attrs='entry-matrix__item matrix__item')
    for entry in entry_matrix:
        result = entry.find(name='div', attrs='entry entry--linked entry--zoom-linked')
        if result is None:
            continue
        link = result.find(name='a')
        if link is None:
            continue
        try:
            # Read all three values before appending any of them: appending
            # piecemeal would leave the lists misaligned when a later
            # attribute turns out to be missing.
            user_id = result["data-user-id"]
            entry_id = result["id"]
            entry_url = link["href"]
        except KeyError:
            continue
        participants_user_ids.append(user_id)
        participants_entry_ids.append(entry_id)
        participants_entry_image_urls.append(entry_url)

In [23]:
def get_time(soup):
    """Extract the creation timestamps of the entries listed on a page.

    The timestamps are read from ``timeCreatedString`` fields in the markup
    of the ``entry-pane__results`` container. The field may appear with
    literal double quotes or with HTML-escaped ``&quot;`` quotes; the
    literal form is tried first.

    Args:
        soup: parsed entries page.

    Returns:
        list[str]: the timestamp values in document order, or an empty list
        when none are found.
    """
    text = str(soup.find(name='div', attrs='entry-pane__results'))
    # Capture only the value. Stripping the key afterwards with str.replace
    # did not work for the literal-quote form, because the key is followed by
    # '"' and not directly by ':'.
    time = re.findall(r'"timeCreatedString":"([^"]{20,30})"', text)
    if time:
        return time
    # Non-greedy, so the value ends at the first closing &quot;.
    return re.findall(r'timeCreatedString&quot;:&quot;(.{20,30}?)&quot;', text)
    

In [24]:
def get_prize(soup):
    """Return the contest prize from the ``prizeMoney`` field in the page header.

    Args:
        soup: parsed entries page.

    Returns:
        str or None: the quoted ``prizeMoney`` value (2 to 10 characters)
        found in the contents of the contest header, or None when the header
        or the field is missing.
    """
    header = soup.find(name='div', attrs='contest-header contest-header--with-breadcrumbs')
    if header is None:
        return None
    text = str(header.contents)
    # Capture up to the closing quote only; a greedy '.{2,10}' could run past
    # it and pull the following text into the prize.
    match = re.search(r'"prizeMoney": "([^"]{2,10})"', text)
    return match.group(1) if match else None


In [25]:
# Threshold used to detect the last page: a page yielding fewer timestamps
# than this ends the loop (presumably the listing's page size - TODO confirm).
ENTRIES_PER_PAGE = 36

participants_user_ids = []
participants_entry_ids = []
participants_entry_image_urls = []
participants_submission_time = []
page_number = 1

# Walk the active-entries listing page by page.
while True:
    print(page_number)
    soup = make_soup(competition_url, page_number)
    if page_number == 1:
        # Prize and winners are read once, from the first page.
        prize = get_prize(soup)
        winner_entry_ids = get_winner_info(soup)
    get_participant_and_entry_info(soup)
    time = get_time(soup)
    participants_submission_time += time
    if len(time) < ENTRIES_PER_PAGE:
        break
    page_number += 1

1
https://99designs.hk/logo-design/contests/needed-cutting-edge-contemporary-logo-real-estate-firm-48641/entries?filter=active&page=1
2
https://99designs.hk/logo-design/contests/needed-cutting-edge-contemporary-logo-real-estate-firm-48641/entries?filter=active&page=2


In [26]:
participants_submission_time[:5]

['2010-07-16T20:06:16+00:00',
 '2010-07-13T18:25:16+00:00',
 '2010-07-19T20:53:01+00:00',
 '2010-07-19T20:51:59+00:00',
 '2010-07-19T20:46:35+00:00']

In [27]:
# The submission times are collected separately from the other three lists,
# so check that everything still lines up before pairing values by position:
# zip() would otherwise silently drop the unmatched tail.
entry_list_lengths = {
    len(participants_entry_ids),
    len(participants_user_ids),
    len(participants_submission_time),
    len(participants_entry_image_urls),
}
if len(entry_list_lengths) > 1:
    print("WARNING: entry lists differ in length; entries may be misaligned or truncated")

entries = [
    {'entry_id': entry_id, 'participant_id': participant_id, 'time': time, 'url': url}
    for entry_id, participant_id, time, url in zip(participants_entry_ids,
                                                   participants_user_ids,
                                                   participants_submission_time,
                                                   participants_entry_image_urls)
]


In [28]:
entries[:5]

[{'entry_id': 'entry-34',
  'participant_id': '407367',
  'time': '2010-07-16T20:06:16+00:00',
  'url': '/logo-design/contests/needed-cutting-edge-contemporary-logo-real-estate-firm-48641/entries/34'},
 {'entry_id': 'entry-2',
  'participant_id': '426835',
  'time': '2010-07-13T18:25:16+00:00',
  'url': '/logo-design/contests/needed-cutting-edge-contemporary-logo-real-estate-firm-48641/entries/2'},
 {'entry_id': 'entry-71',
  'participant_id': '432199',
  'time': '2010-07-19T20:53:01+00:00',
  'url': '/logo-design/contests/needed-cutting-edge-contemporary-logo-real-estate-firm-48641/entries/71'},
 {'entry_id': 'entry-70',
  'participant_id': '315319',
  'time': '2010-07-19T20:51:59+00:00',
  'url': '/logo-design/contests/needed-cutting-edge-contemporary-logo-real-estate-firm-48641/entries/70'},
 {'entry_id': 'entry-69',
  'participant_id': '370615',
  'time': '2010-07-19T20:46:35+00:00',
  'url': '/logo-design/contests/needed-cutting-edge-contemporary-logo-real-estate-firm-48641/entrie

In [29]:
entries[-5:]

[{'entry_id': 'entry-20',
  'participant_id': '278455',
  'time': '2010-07-15T20:33:31+00:00',
  'url': '/logo-design/contests/needed-cutting-edge-contemporary-logo-real-estate-firm-48641/entries/20'},
 {'entry_id': 'entry-19',
  'participant_id': '315498',
  'time': '2010-07-15T16:42:05+00:00',
  'url': '/logo-design/contests/needed-cutting-edge-contemporary-logo-real-estate-firm-48641/entries/19'},
 {'entry_id': 'entry-18',
  'participant_id': '315498',
  'time': '2010-07-15T16:40:39+00:00',
  'url': '/logo-design/contests/needed-cutting-edge-contemporary-logo-real-estate-firm-48641/entries/18'},
 {'entry_id': 'entry-17',
  'participant_id': '407367',
  'time': '2010-07-15T14:36:56+00:00',
  'url': '/logo-design/contests/needed-cutting-edge-contemporary-logo-real-estate-firm-48641/entries/17'},
 {'entry_id': 'entry-16',
  'participant_id': '407367',
  'time': '2010-07-15T14:36:21+00:00',
  'url': '/logo-design/contests/needed-cutting-edge-contemporary-logo-real-estate-firm-48641/entr

## 3. get participants and entry info for deleted entries

In [30]:
def get_participant_and_entry_info_deleted_page(soup):
    """Collect user id, entry id and status for each entry on a non-active page.

    An entry's status is taken from its first ``entry-status-overlay``
    element that has no ``data-hidden`` attribute and whose
    ``data-entry-status`` is ``deleted``, ``withdrawn`` or ``declined``;
    it is the empty string when there is no such overlay. Matrix items with
    no entry container, or whose container lacks a user id or an id, are
    skipped as a whole so the three returned lists always stay aligned.

    Args:
        soup: parsed entries page.

    Returns:
        tuple: ``(user_ids, entry_ids, status, deleted_count,
        withdrawn_count, declined_count)`` where the first three are
        parallel lists and the counts tally the statuses assigned on this
        page.
    """
    user_ids = []
    entry_ids = []
    status = []
    counts = {'deleted': 0, 'withdrawn': 0, 'declined': 0}

    entry_matrix = soup.findAll(name='div', attrs='entry-matrix__item matrix__item')

    for entry in entry_matrix:
        result = entry.find(name='div', attrs='entry')
        if result is None:
            continue
        try:
            user_id = result["data-user-id"]
            entry_id = result["id"]
        except KeyError:
            continue

        s = ''
        for tag in entry.findAll('div', "entry-status-overlay"):
            attributes = tag.attrs
            overlay_status = attributes.get('data-entry-status')
            # Only a visible overlay with one of the tracked statuses counts.
            if overlay_status in counts and 'data-hidden' not in attributes:
                s = overlay_status
                counts[overlay_status] += 1
                break

        # Append together, after everything was read successfully.
        user_ids.append(user_id)
        entry_ids.append(entry_id)
        status.append(s)

    return (user_ids, entry_ids, status,
            counts['deleted'], counts['withdrawn'], counts['declined'])

In [31]:
# Threshold used to detect the last page: a page yielding fewer entries than
# this ends the loop (presumably the listing's page size - TODO confirm).
NON_ACTIVE_ENTRIES_PER_PAGE = 36

deleted_participants_user_ids = []
deleted_participants_entry_ids = []
status_all = []
total_deleted_count = 0
total_withdrawn_count = 0
total_declined_count = 0

page_number = 1
winner_number = len(winner_entry_ids)

# Walk the non-active (declined / withdrawn / deleted) listing page by page.
while True:
    print(page_number)
    soup = make_soup(competition_url, page_number, active=False)
    if page_number == 1:
        # Refresh the winners from this listing, but keep the ones already
        # found on the active listing when this page shows none.
        winner_entry_ids = get_winner_info(soup) or winner_entry_ids
    (user_ids, entry_ids, status,
     deleted_count, withdrawn_count, declined_count) = get_participant_and_entry_info_deleted_page(soup)
    deleted_participants_user_ids += user_ids
    deleted_participants_entry_ids += entry_ids
    status_all += status
    total_deleted_count += deleted_count
    total_withdrawn_count += withdrawn_count
    total_declined_count += declined_count
    if len(user_ids) < NON_ACTIVE_ENTRIES_PER_PAGE:
        break
    page_number += 1

1
https://99designs.hk/logo-design/contests/needed-cutting-edge-contemporary-logo-real-estate-firm-48641/entries?filter=non_active&page=1


In [32]:
print(deleted_participants_user_ids)
print(deleted_participants_entry_ids)

['252538', '310467', '386412', '386412', '437268', '437268', '437268', '322472', '367167', '437268', '386412', '417051', '437268', '367167', '310467', '310467', '310467', '310467', '437268', '437268', '386412', '342701', '342701', '342701', '342701', '342701', '272379']
['entry-61', 'entry-48', 'entry-1', 'entry-68', 'entry-66', 'entry-65', 'entry-64', 'entry-63', 'entry-58', 'entry-56', 'entry-55', 'entry-52', 'entry-51', 'entry-50', 'entry-49', 'entry-47', 'entry-46', 'entry-45', 'entry-42', 'entry-41', 'entry-29', 'entry-21', 'entry-12', 'entry-11', 'entry-6', 'entry-5', 'entry-4']


In [33]:
# One record per non-active entry, pairing id, participant and status by position.
deleted_entries = [
    {'entry_id': entry_id, 'participant_id': participant_id, 'status': entry_status}
    for entry_id, participant_id, entry_status in zip(deleted_participants_entry_ids,
                                                      deleted_participants_user_ids,
                                                      status_all)
]

## output to file

In [34]:
# Everything gathered about the contest, as a single record.
competition_record = {
    'title': title,
    'url': competition_url,
    'seeker': seeker,
    'overview': overview,
    'company_name': company_name,
    'description_of_organization_and_audience': description_of_organization_and_audience,
    'requirements': requirements,
    'start_time': start_time,
    'entry_count': entry_count,
    'deleted_entry_count': deleted_entry_count,
    'deleted': total_deleted_count,
    'withdrawn': total_withdrawn_count,
    'declined': total_declined_count,
    'prize': prize,
    'entries': entries,
    'deleted_entries': deleted_entries,
    'winners': winner_entry_ids,
}
# Wrap every value in a one-element list so the frame has exactly one row and
# list-valued fields (entries, deleted_entries, winners) each occupy one cell.
df_competition_description = pd.DataFrame(
    {column: [value] for column, value in competition_record.items()}
)

In [35]:
# Name the output file after the last path segment of the contest URL,
# ignoring a trailing slash, which would otherwise give an empty name.
competition_name = competition_url.rstrip("/").split("/")[-1]
# to_csv does not create missing directories, so create the folder first.
output_dir = Path('data_20220301')
output_dir.mkdir(parents=True, exist_ok=True)
df_competition_description.to_csv(output_dir / (competition_name + '.csv'), index=False)