In [1]:
import glob
import json
import os
import re
from datetime import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [2]:
## load the contest/solver export that lists the user ids to scrape
df = pd.read_csv(filepath_or_buffer='solver_data_20220131.csv')

In [3]:
# get user id
def get_user_id(df):
    """Return the contents of the ``user_id`` column of ``df`` as an array.

    Args:
        df: DataFrame that has a ``user_id`` column.

    Returns:
        The column's underlying values (``Series.values``), in row order.

    Raises:
        KeyError: if ``df`` has no ``user_id`` column.
    """
    return df['user_id'].values



In [4]:
# ids of every user listed in the input file, in file order
user_ids = get_user_id(df=df)

In [5]:
def _find_required(node, tag, css_class, user_id):
    """Return the first ``tag`` with CSS class ``css_class`` under ``node``.

    Raises AttributeError (the same type the unguarded lookup used to raise)
    with a message naming the missing element and the profile.
    """
    found = node.find(name=tag, class_=css_class) if node is not None else None
    if found is None:
        raise AttributeError(
            "profile %s: <%s class=%r> not found" % (user_id, tag, css_class))
    return found


def _split_pills(text):
    """Split the text of a pill group into its individual labels.

    Labels are separated by layout whitespace: any run containing a newline,
    or any run of two or more whitespace characters. Single spaces inside a
    label (e.g. "Logo design") are kept.
    """
    return re.split(r'\s*\n\s*|\s{2,}', text.strip())


def multiple_scraping(user_id, timeout=30):
    """Download one 99designs profile "about" page and extract its fields.

    Args:
        user_id: profile id; it is interpolated into the profile URL.
        timeout: seconds passed to ``requests.get`` (default 30).

    Returns:
        dict with the keys ``user_id``, ``user_name``, ``about``, ``stars``,
        ``review_count``, ``member_since``, ``contests_won``, ``runner_up``,
        ``1_to_1_projects``, ``tags``, ``repeat_clients``, ``reviews``,
        ``certification`` and ``timestamp`` (today's date as a string).
        ``about`` is None when the page has no paragraph in its body;
        ``stars`` and ``review_count`` are 0 when they cannot be parsed.

    Raises:
        requests.RequestException: network failure or non-2xx response.
        AttributeError: a required element or the "Member since" text is
            missing from the page.
        IndexError: fewer than four experience counters, or fewer than two
            ``profile__tag-section`` blocks, were found.
    """
    headers = {'user-agent': 'Mozilla/5.0'}
    url = "https://99designs.hk/profiles/" + str(user_id) + "/about"

    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()
    # Let requests guess the charset from the body instead of trusting headers.
    r.encoding = r.apparent_encoding

    soup = BeautifulSoup(r.text, 'html.parser')

    ## user name
    user_name = _find_required(soup, 'h1', 'user-details__name', user_id).text.strip()

    ## about (optional: first paragraph of the page body, if any)
    body = soup.find(name='div', class_='two-column-layout__body')
    first_paragraph = body.find(name='p') if body is not None else None
    about = first_paragraph.text.strip() if first_paragraph is not None else None

    ## member since
    section = _find_required(body, 'div', 'section', user_id)
    section_text = section.text.replace('\n', '').strip()
    since_match = re.search(r'Member since:.+', section_text)
    if since_match is None:
        raise AttributeError(
            "profile %s: 'Member since:' text not found" % (user_id,))
    # Keep only what follows the last colon.
    member_since = since_match.group(0).split(':')[-1].strip()

    ## stars and reviews (optional; each is parsed independently)
    stars = 0
    review_count = 0
    rating = soup.find(name='div', class_='aggregate-star-rating__description')
    if rating is not None:
        rating_text = rating.text
        stars_match = re.search(r'(\d+(?:\.\d+)?)\s*stars?', rating_text)
        if stars_match is not None:
            stars = float(stars_match.group(1))
        # Capture the whole number (any length, optional thousands commas);
        # the previous 1-3 digit pattern truncated counts of 1000 or more.
        reviews_match = re.search(r'(\d[\d,]*)\s*reviews?', rating_text)
        if reviews_match is not None:
            review_count = int(reviews_match.group(1).replace(',', ''))

    ## Experience: Contests won, Runner up, 1-to-1 Projects, Repeat clients
    tags_panel = _find_required(soup, 'div', 'profile__tags', user_id)
    stats_panel = _find_required(tags_panel, 'div', 'stats-panel', user_id)
    numbers = re.findall(r' \d+ ', stats_panel.text.replace('\n', ''))
    if len(numbers) < 4:
        raise IndexError(
            "profile %s: expected 4 experience counters, found %d"
            % (user_id, len(numbers)))
    # NOTE(review): these stay the raw regex matches (space-padded strings),
    # exactly as before, so existing output is unchanged; convert with int()
    # downstream if numeric values are needed.
    contests_won, runner_up, one_to_one_projects, repeat_clients = numbers[:4]

    ## tags
    tags = _split_pills(
        _find_required(tags_panel, 'div', 'pill-group', user_id).text)

    ## certification (pill group of the second tag section on the page)
    tag_sections = soup.find_all(name='div', class_='profile__tag-section')
    if len(tag_sections) < 2:
        raise IndexError(
            "profile %s: expected at least 2 'profile__tag-section' blocks, found %d"
            % (user_id, len(tag_sections)))
    certification = _split_pills(
        _find_required(tag_sections[1], 'div', 'pill-group', user_id).text)

    ## reviews: text of every <em> element, without newlines or double quotes
    reviews = [
        em.text.replace('\n', '').strip().replace('"', '')
        for em in soup.find_all(name='em')
    ]

    data = {'user_id': user_id, 'user_name': user_name, 'about': about,
            'stars': stars, 'review_count': review_count,
            'member_since': member_since, 'contests_won': contests_won,
            'runner_up': runner_up, '1_to_1_projects': one_to_one_projects,
            'tags': tags,
            'repeat_clients': repeat_clients, 'reviews': reviews,
            'certification': certification,
            'timestamp': str(datetime.now().date())}

    return data


In [6]:
# Scrape every profile. Successful rows go to `result`; failures are recorded
# in `scrape_failures` (user id -> repr of the exception) instead of being
# silently discarded, so they can be inspected or retried afterwards.
result = []
scrape_failures = {}

for user_id in tqdm(user_ids):
    try:
        data = multiple_scraping(user_id)
    except Exception as exc:
        # Catch Exception rather than using a bare `except`, so that
        # KeyboardInterrupt can still stop this multi-hour run.
        scrape_failures[user_id] = repr(exc)
        continue
    if data:
        result.append(data)


100%|██████████| 6479/6479 [9:05:30<00:00,  5.05s/it]   


In [7]:
# one row per successfully scraped profile; columns are the dict keys
user_profile_df = pd.DataFrame(data=result)

In [8]:
# persist the scraped profiles, without the DataFrame index column
user_profile_df.to_csv(path_or_buf='user_profile_all_2022-02-28.csv', index=False)

In [9]:
# requested ids that have no row in the scraped profile table
set(user_ids) - set(user_profile_df['user_id'])

{150588,
 281850,
 342436,
 344786,
 379651,
 383249,
 504191,
 574511,
 619858,
 679036,
 862446,
 902268,
 1029336,
 1046006,
 1078889,
 1103272,
 1175632,
 1185072,
 1185158,
 1285980,
 1398592,
 1496752,
 1644099,
 1665952,
 1679771,
 1716970,
 1769081,
 1819019,
 1892275,
 1993975,
 2060673,
 2067107,
 2093069,
 2122815,
 2130195,
 2225343,
 2252092,
 2284860,
 2310922,
 2320240,
 2332737,
 2354135,
 2362154,
 2507879,
 2700388,
 2732458,
 2732505,
 2749539,
 2788484,
 2798627,
 2805066,
 2825713,
 2881670,
 2891015,
 2970800,
 2973353,
 2977280,
 2979778,
 3113719,
 3228302,
 3238934,
 3250438,
 3300789,
 3322350,
 3389272,
 3402107,
 3439801,
 3440794,
 3446750,
 3501518,
 3515010,
 3527794,
 3533826,
 3549370,
 3553534,
 3565729,
 3591228,
 3598880,
 3600003,
 3610593,
 3627220,
 3636331,
 3729888,
 3752277,
 3766150,
 3800798,
 3814134,
 3823152,
 3825766,
 3831361,
 3851374,
 3852622,
 3874471,
 3882263,
 3899833,
 3918146,
 3931624,
 3932969,
 3958897,
 4089853,
 4199599,
 4