# Store information about each competition in one table: Competition

<table>
  <thead>
    <tr>
      <th>Datafield</th>
      <th>Description</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>id</td>
      <td>unique identifier for competition</td>
    </tr>
    <tr>
      <td>name</td>
      <td>competition name</td>
    </tr>
    <tr>
      <td>start_time</td>
      <td>start time of competition</td>
    </tr>
    <tr>
      <td>end_time</td>
      <td>end time of competition</td>
    </tr>
    <tr>
      <td>prize</td>
      <td>prize for winners</td>
    </tr>
    <tr>
      <td>entries</td>
      <td>[{entry_id: xx, participant_id: xx, time: xxx, url: xxx}, {entry_id: xx, participant_id: xx, time: xxx, url: xxx}, ...]</td>
    </tr>
    <tr>
      <td>winners</td>
      <td>[entry_id, entry_id]</td>
    </tr>

  </tbody>
</table>


In [1]:
import requests
from bs4 import BeautifulSoup
import re
from tqdm import tqdm
import pandas as pd
import warnings
from datetime import datetime
warnings.filterwarnings('ignore')
from selenium import webdriver
from selenium.webdriver.common.by import By

In [2]:
# Page size assumed by both pagination loops: a page that yields fewer items
# than this is treated as the last page.
_ENTRIES_PER_PAGE = 36

# Headers sent with every plain HTTP request.
_REQUEST_HEADERS = {'user-agent': 'Mozilla/5.0'}

# Raw string: same value as the previous literal, without invalid escapes.
_DEFAULT_CHROMEDRIVER_PATH = r"C:\Program Files (x86)\chromedriver.exe"


def _fetch_soup(url):
    """Download ``url`` and return it parsed with the 'html.parser' backend.

    Raises whatever ``requests`` raises for timeouts / connection problems and
    ``requests.HTTPError`` (via ``raise_for_status``) for 4xx/5xx responses.
    """
    response = requests.get(url, headers=_REQUEST_HEADERS, timeout=30)
    response.raise_for_status()
    response.encoding = response.apparent_encoding
    return BeautifulSoup(response.text, 'html.parser')


def _entries_soup(competition_url, page_number, active=True):
    """Return the parsed entries listing for one page of one filter.

    ``active=True`` requests ``filter=active``; otherwise ``filter=non_active``.
    """
    entry_filter = "active" if active else "non_active"
    return _fetch_soup(
        competition_url + "/entries?filter=" + entry_filter + "&page=" + str(page_number)
    )


def _scrape_brief(competition_url, chromedriver_path):
    """Open ``<competition_url>/brief`` in Chrome and read the brief fields.

    Returns a dict whose keys are the brief-related column names used by
    ``scrape``. The browser is always shut down, even when a lookup fails, so
    that a failing competition does not leave a Chrome process behind.
    """
    driver = webdriver.Chrome(chromedriver_path)
    try:
        driver.get(competition_url + '/brief')

        # NOTE: every XPath below that starts with '//' is evaluated against the
        # whole document, even when called on an element; the section lookups
        # therefore mainly act as "this section exists" checks.
        background_information = driver.find_element(
            By.XPATH, '//div[@id="section-backgroundInformation"]/div')
        name_to_incorporate = background_information.find_element(
            By.XPATH, '//div[@id="element-backgroundInformation-logoBusinessName"]/div/div/div').text
        slogan_to_incorporate = background_information.find_element(
            By.XPATH, '//div[@id="element-backgroundInformation-slogan"]/div/div/div/div').text
        description = background_information.find_element(
            By.XPATH, '//div[@id="element-backgroundInformation-targetAudience"]/div/div/div').text
        industry = background_information.find_element(
            By.XPATH, '//div[@id="element-backgroundInformation-industry"]/div/div/div').text

        visual_style = driver.find_element(By.XPATH, '//div[@id="section-visualStyle"]/div')
        # These two fields are located by absolute position in the page.
        logo_types = driver.find_element(
            By.XPATH,
            '/html/body/div[1]/div[1]/div[3]/div/div/div[2]/div/div[1]/div/div/div/div/div[2]/div/div/div[1]/div/div/div/div').text
        logo_to_be_used = driver.find_element(
            By.XPATH,
            '/html/body/div[1]/div[1]/div[3]/div/div/div[2]/div/div[1]/div/div/div/div/div[2]/div/div/div[2]/div/div/div').text
        colors_to_explore = visual_style.find_element(
            By.XPATH, '//*[@id="element-visualStyle-colorPreferences"]/div/div/div').text.split('\n')
        other_color_requirements = visual_style.find_element(
            By.XPATH, '//div[@id="element-visualStyle-otherColorRequirements"]/div/div/div').text

        # Seven style attributes are read, from XPath positions 1..7.
        style_attributes = []
        for position in range(1, 8):
            slider = visual_style.find_element(
                By.XPATH,
                '//*[@id="element-visualStyle-styleAttributes"]/div/div/div/div/div['
                + str(position) + ']/div[2]/div/div')
            style_attributes.append(slider.get_attribute("aria-valuenow"))

        # Best effort: any failure to locate the examples block counts as
        # "no inspiration images".
        try:
            design_inspiration = visual_style.find_element(
                By.XPATH, '//*[@id="element-visualStyle-designExamples"]/div/div/div/div/div')
            design_inspiration_count = len(
                design_inspiration.find_elements(By.CLASS_NAME, "matrix__item"))
        except Exception:
            design_inspiration_count = 0

        references = visual_style.find_element(By.XPATH, '//div[@id="section-references"]')
        attachments = references.find_element(
            By.XPATH, '//*[@id="element-references-attachments"]/div/div/div').text
        if attachments == 'No files':
            attachments_count = 0
        else:
            attachments_count = len(references.find_elements(By.CLASS_NAME, "matrix__item"))
        other_notes = references.find_element(
            By.XPATH, '//*[@id="element-references-notes"]/div/div/div').text
    finally:
        # quit() (rather than close()) ends the whole browser session.
        driver.quit()

    return {
        'name_to_incorporate': name_to_incorporate,
        'slogan_to_incorporate': slogan_to_incorporate,
        'description': description,
        'industry': industry,
        'logo_types': logo_types,
        'logo_to_be_used': logo_to_be_used,
        'colors_to_explore': colors_to_explore,
        'other_color_requirements': other_color_requirements,
        'style_attributes': style_attributes,
        'design_inspiration_count': design_inspiration_count,
        'attachments_count': attachments_count,
        'other_notes': other_notes,
    }


def _get_winner_info(soup):
    """Return the ``id`` attribute of every entry found in a winners matrix item."""
    winner_entry_ids = []
    for item in soup.findAll(name='div', attrs='entry-matrix__item matrix__item entry-winners'):
        winner_entry = item.find(name='div', attrs="entry entry--linked entry--zoom-linked")
        if not winner_entry:
            # case if winner entry is deleted
            winner_entry = item.find(name='div', attrs="entry")
        if winner_entry:
            winner_entry_ids.append(winner_entry["id"])
    return winner_entry_ids


def _get_participant_and_entry_info(soup):
    """Return ``(user_ids, entry_ids, image_urls)`` for the linked entries on a page.

    An entry is only recorded when all three values are present, so the three
    lists always have equal length and stay aligned.
    """
    user_ids, entry_ids, image_urls = [], [], []
    for item in soup.findAll(name='div', attrs='entry-matrix__item matrix__item'):
        entry = item.find(name='div', attrs='entry entry--linked entry--zoom-linked')
        if entry is None:
            continue
        link = entry.find(name='a')
        try:
            record = (entry["data-user-id"], entry["id"], link["href"])
        except (KeyError, TypeError):
            # Missing attribute, or no <a> inside the entry: skip it entirely.
            continue
        user_ids.append(record[0])
        entry_ids.append(record[1])
        image_urls.append(record[2])
    return user_ids, entry_ids, image_urls


def _get_time(soup):
    """Return the ``timeCreatedString`` values found in the results pane.

    Two encodings are tried: plain JSON quotes first, then HTML-escaped
    (``&quot;``) quotes. An empty list is returned when neither matches.
    """
    text = str(soup.find(name='div', attrs='entry-pane__results'))

    matches = re.findall(r'"timeCreatedString":".{20,30}"', text)
    if matches:
        return [m.replace('"timeCreatedString":"', '').replace('","', "") for m in matches]

    matches = re.findall(r'timeCreatedString\&quot\;\:&quot;.{20,30}\&quot\;', text)
    if matches:
        # Strip the 30-character prefix and the trailing '&quot;'.
        return [m[30: -6] for m in matches]

    return []


def _get_prize(soup):
    """Return the ``prizeMoney`` value embedded in the contest header."""
    text = str(soup.find(name='div', attrs='contest-header contest-header--with-breadcrumbs').contents)
    prize = re.search(r'"prizeMoney": ".{2,10}"', text).group(0)
    return prize.replace('"prizeMoney": "', "").replace('"', "")


def _get_time_and_status_for_deleted_entry(competition_url, entry_id):
    """Fetch one entry's own page and return ``(time, status)``.

    ``status`` is 'withdrawn' or 'declined' when the page's app-data script
    says so ('eliminated' is reported as 'declined'), otherwise 'deleted'.
    """
    numeric_id = entry_id.split('-')[-1]
    soup = _fetch_soup(competition_url + '/entries/' + numeric_id)

    script = str(soup.find('script', id="standalone-design-details-app-data"))
    created = re.search(r'"timeCreatedString":".{20,30}"', script).group(0)
    created = created.replace('"timeCreatedString":"', '').replace('","', "")

    if '"status":"withdrawn"' in script:
        status = 'withdrawn'
    elif '"status":"eliminated"' in script:
        status = 'declined'
    else:
        status = 'deleted'
    return created, status


def _get_deleted_page_info(soup, competition_url):
    """Collect the non-active entries listed on one page.

    Returns ``(records, listed_count)``. ``records`` holds one dict per entry
    whose detail page could be read. ``listed_count`` is the number of entries
    on the page that carried a user id; pagination relies on it so that a
    failed detail lookup does not end the crawl early.
    """
    records = []
    listed_count = 0
    for item in soup.findAll(name='div', attrs='entry-matrix__item matrix__item'):
        entry = item.find(name='div', attrs='entry')
        if entry is None:
            continue
        try:
            user_id = entry["data-user-id"]
        except KeyError:
            continue
        listed_count += 1
        try:
            entry_id = entry["id"]
            created, status = _get_time_and_status_for_deleted_entry(competition_url, entry_id)
        except Exception:
            # Best effort: skip entries whose detail page cannot be read, but
            # never store a partial record (that would misalign the columns).
            continue
        records.append({'entry_id': entry_id, 'participant_id': user_id,
                        'status': status, 'time': created})
    return records, listed_count


def scrape(competition_url, chromedriver_path=_DEFAULT_CHROMEDRIVER_PATH):
    """Scrape one competition and return it as a single-row DataFrame.

    Steps, in order: the competition landing page (title), the ``/brief`` page
    through Selenium, the ``/entries`` page (start date), every page of active
    entries, and every page of non-active entries including each such entry's
    own page.

    Parameters
    ----------
    competition_url : str
        Base URL of the competition, without a trailing slash.
    chromedriver_path : str, optional
        Location of the chromedriver executable handed to ``webdriver.Chrome``.

    Returns
    -------
    pandas.DataFrame
        One row. The ``entries``, ``deleted_entries`` and ``winners`` cells
        hold Python lists.

    Raises
    ------
    Exception
        Network errors, HTTP error statuses, Selenium lookup failures and
        missing page fragments (``AttributeError`` from a failed regex or tag
        search) all propagate to the caller.
    """
    # ## 1. landing page
    landing_soup = _fetch_soup(competition_url)
    heading = landing_soup.find(name='h1', attrs="heading heading--h1 heading--no-margin")
    title = heading.text if heading else None

    # ## 2. get brief page
    brief = _scrape_brief(competition_url, chromedriver_path)

    # ## 3. get participants and entry info
    entries_soup = _fetch_soup(competition_url + "/entries")
    header_text = str(entries_soup.find(name='div', attrs="contest-header__price"))
    start_time = re.search('"startDate": .+,', header_text).group(0).replace('"startDate": "', '')[:-2]

    # Page 1 of the active listing is fetched once and reused by the loop below.
    soup = _entries_soup(competition_url, 1)
    seeker = soup.find(name='span', attrs="display-name").text
    entry_summary = soup.find(name='select', attrs="styled-select__select").text.replace('\n', "")
    entry_count = int(re.search(r'All \((\d+)\)', entry_summary).group(1))
    # Kept as a string, exactly as before.
    deleted_entry_count = re.search(r'Declined and withdrawn \((\d+)\)', entry_summary).group(1)
    prize = _get_prize(soup)

    participants_user_ids = []
    participants_entry_ids = []
    participants_entry_image_urls = []
    participants_submission_time = []
    page_number = 1
    while True:
        user_ids, entry_ids, image_urls = _get_participant_and_entry_info(soup)
        participants_user_ids += user_ids
        participants_entry_ids += entry_ids
        participants_entry_image_urls += image_urls
        page_times = _get_time(soup)
        participants_submission_time += page_times
        if len(page_times) < _ENTRIES_PER_PAGE:
            break
        page_number += 1
        soup = _entries_soup(competition_url, page_number)

    # NOTE(review): times come from a separate regex pass over the page, so the
    # pairing below assumes they appear in the same order and number as the
    # entries; zip() silently truncates to the shortest list. Confirm.
    entries = [
        {'entry_id': entry_id, 'participant_id': participant_id, 'time': created, 'url': url}
        for entry_id, participant_id, created, url in zip(participants_entry_ids,
                                                          participants_user_ids,
                                                          participants_submission_time,
                                                          participants_entry_image_urls)
    ]

    # ## 3. get participants and entry info for deleted entries
    deleted_entries = []
    winner_entry_ids = []
    page_number = 1
    while True:
        soup = _entries_soup(competition_url, page_number, active=False)
        if page_number == 1:
            # NOTE(review): as before, the reported winners are the ones found
            # on page 1 of the non-active listing, not on the active listing.
            # Confirm this is intended.
            winner_entry_ids = _get_winner_info(soup)
        records, listed_count = _get_deleted_page_info(soup, competition_url)
        deleted_entries += records
        if listed_count < _ENTRIES_PER_PAGE:
            break
        page_number += 1

    total_deleted_count = sum(1 for record in deleted_entries if record['status'] == 'deleted')
    total_withdrawn_count = sum(1 for record in deleted_entries if record['status'] == 'withdrawn')
    total_declined_count = sum(1 for record in deleted_entries if record['status'] == 'declined')

    # ## output
    # List-valued cells are wrapped in a one-element list so that pandas stores
    # the list itself in the single row instead of expanding it into rows.
    df_competition_description = pd.DataFrame({
        'title': [title], 'url': competition_url, 'seeker': seeker, 'start_time': start_time,
        'name_to_incorporate': [brief['name_to_incorporate']],
        'slogan_to_incorporate': [brief['slogan_to_incorporate']],
        'description': [brief['description']], 'industry': [brief['industry']],
        'logo_types': [brief['logo_types']],
        'logo_to_be_used': [brief['logo_to_be_used']],
        'colors_to_explore': [brief['colors_to_explore']],
        'other_color_requirements': [brief['other_color_requirements']],
        'style_attributes': [brief['style_attributes']],
        'design_inspiration_count': [brief['design_inspiration_count']],
        'attachments_count': [brief['attachments_count']],
        'other_notes': [brief['other_notes']],
        'entry_count': entry_count, 'deleted_entry_count': deleted_entry_count,
        'deleted': total_deleted_count, 'withdrawn': total_withdrawn_count,
        'declined': total_declined_count,
        'prize': prize, 'entries': [entries],
        'deleted_entries': [deleted_entries], 'winners': [winner_entry_ids]})

    return df_competition_description



## output to file

In [3]:
# Read the list of competition URLs to scrape (column 'href').
urls = pd.read_csv('href_2022-03-05_difference.csv')
urls = urls['href'].values

counter = 0          # number of URLs attempted
frames = []          # one single-row DataFrame per successfully scraped URL
failed_urls = []     # (url, error) pairs, so failures are visible afterwards

for url in tqdm(urls):
    counter += 1
    try:
        frames.append(scrape(url))
    except Exception as exc:
        # Best effort: one broken competition must not stop the whole run, but
        # record it instead of silently dropping it. `Exception` (not a bare
        # except) keeps Ctrl-C / kernel interrupt working during long runs.
        failed_urls.append((url, repr(exc)))

# Concatenate once instead of growing the frame inside the loop.
results = pd.concat(frames) if frames else pd.DataFrame({})

if failed_urls:
    print('{} of {} competitions failed:'.format(len(failed_urls), counter))
    for failed_url, error in failed_urls:
        print('  {}: {}'.format(failed_url, error))

100%|██████████| 52/52 [5:25:35<00:00, 375.69s/it]  


In [4]:
# Write all scraped competitions to a CSV named after today's date.
output_name = 'contest_data_{}.csv'.format(datetime.now().date())
results.to_csv(output_name, index=False)