## IDEAL SCRAPE

image : author : panel-number : pre-caption : post-caption : smile : love : haha : wow : duck : game_url : player_num : game_duration: game_date : game_tags

In [1]:
# Import libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json
import time
import os
from IPython.display import clear_output

In [2]:
base_url = 'https://drawception.com'
res = requests.get(base_url)
res.status_code

200

In [None]:
# More URLs to test with
url = 'https://drawception.com/game/DWP1BQgR2b/data-science/'
url24 = 'https://drawception.com/game/DY3RD2GtGQ/the-great-dolphin-war-of-76/'
urltopgray = 'https://drawception.com/game/6EwXD7nBqD/camping-in-the-woods/'

### Scrape Function

This function takes the URL path of a Drawception game and returns the game as a DataFrame with one row per drawing panel.

In [3]:
def scrape_game(game_url):
    """Scrape one Drawception game page into a DataFrame of its drawing panels.

    Parameters
    ----------
    game_url : str
        Site-relative game path such as '/game/<id>/<slug>/'. An absolute URL
        that starts with the site root is also accepted; it is reduced to the
        relative path so the stored ``game_url`` value always has one form.

    Returns
    -------
    pandas.DataFrame or None
        One row per panel that contains an ``<img>``, with the columns
        pre_caption, post_caption, image_url, author, panel_number, LIKE, HAHA,
        WOW, LOVE, DUCK (plus any other reaction id found in the page data),
        game_url, player_num, game_date and game_tags.
        ``None`` is returned when the response status is not 200, when the page
        has no panels (NSFW games do this), or when the header line with the
        player count and date is missing.
    """
    base_url = 'https://drawception.com'

    # Accept absolute URLs too, but always work with (and store) the relative path
    if game_url.startswith(base_url):
        game_url = game_url[len(base_url):]

    res = requests.get(base_url + game_url)

    # Sleep here, no matter how the function is called the sleep is included.
    # One pause per HTTP request: it runs before any early return and is not
    # repeated for every panel of the game.
    time.sleep(1)

    # Escape if page doesn't load
    if res.status_code != 200:
        return None

    soup = BeautifulSoup(res.content, 'lxml')

    # Get panels variable and check that the game page has panels
    # If no panels then something is up with the game so skip it. NSFW games do this.
    panels = soup.find_all(attrs={'class': 'col-sm-12 col-md-4'})
    if len(panels) == 0:
        return None

    ### Header stuff that applies to all panels ###
    #  game_url : player_num : game_date : game_tags

    # html with tags like 'top game' color palette and other misc tags
    top_padding = soup.find(attrs={'class': 'text-center add-padding-bottom2x add-padding-top'})
    game_tags = []
    if top_padding is not None:
        # sorted() makes the tag order repeatable from run to run
        game_tags = sorted(set(s.text.strip() for s in top_padding.find_all(name='span')))

    # String with player count, duration and game date
    header = soup.find(name='p', attrs={'class': 'lead text-muted add-margin-top2x'})
    header_str = header.text.strip().split() if header is not None else []
    if not header_str:
        # Without the header the page cannot be described; skip it like other odd pages
        return None

    player_num = header_str[0]
    game_date = ' '.join(header_str[6:9])
    # The duration sits at header_str[11:13]; it is not stored, which keeps the
    # columns identical to data that was already saved.

    # Reactions could exist or not exist in the page data, so every row starts at 0
    reaction_types = ['LIKE', 'HAHA', 'WOW', 'LOVE', 'DUCK']
    last_index = len(panels) - 1

    data_rows = []
    for i, panel in enumerate(panels):
        img = panel.find('img')
        if img is None:
            # Panels without an image are captions; they are read through their neighbours
            continue

        this_row = {}
        # Things to collect for this row:
        #  panel-number : pre-caption : post-caption : image : author : smile : love : haha : wow : duck

        # The first panel has no caption before it
        if i == 0:
            this_row['pre_caption'] = 'draw_first'
        else:
            this_row['pre_caption'] = img.attrs.get('title')

        # The last panel has no caption after it
        if i == last_index:
            this_row['post_caption'] = 'draw_last'
        else:
            next_caption = panels[i + 1].find(name='p')
            # The following panel may have no caption paragraph
            this_row['post_caption'] = next_caption.text.strip() if next_caption is not None else None

        # Image URL
        this_row['image_url'] = img.attrs['src']

        # Author name (deleted or banned accounts appear as OooOOOoOo)
        user_block = panel.find(attrs={'class': 'panel-user'})
        author_link = user_block.find('a')
        if author_link is not None:
            this_row['author'] = author_link.text
        else:
            # Ghost name
            this_row['author'] = user_block.find('span').text

        # Panel number is simple to collect
        this_row['panel_number'] = i + 1

        # Start all reactions at 0, then overwrite the ones present in the page data
        react_data = json.loads(panel.find('reactions').attrs['reactions_data'])
        for react in reaction_types:
            this_row[react] = 0
        for react_dict in react_data:
            this_row[react_dict['id']] = react_dict['num']

        # Add in the universal game stuff here
        this_row['game_url'] = game_url
        this_row['player_num'] = player_num
        this_row['game_date'] = game_date
        this_row['game_tags'] = game_tags

        data_rows.append(this_row)

    return pd.DataFrame(data_rows)

### Index Page Scrape to find the Games

This function scrapes the browse-games pages for URLs to games. It takes the number of pages to scrape (an integer) and returns a list of game URLs. It scrapes from the oldest page to the newest; there are a maximum of 100 pages of history and 21 games per page.

In [30]:
# Function to scrape all pages in the browse games part of drawception up to specified number.
# Thanks to: https://towardsdatascience.com/the-simplest-cleanest-method-for-tracking-a-for-loops-progress-and-expected-run-time-in-python-972675392b3
def scrape_recent(stop_page):
    """Collect game links from the 'recent games' browse pages.

    Parameters
    ----------
    stop_page : int
        Highest page number to read. Values above 100 or not above 0 are
        replaced by 100.

    Returns
    -------
    list
        The ``href`` of every element with class ``thumbpanel``, gathered from
        page ``stop_page`` down to page 1. Pages that do not answer with status
        200 are reported and skipped.
    """
    browse_games_url = 'https://drawception.com/browse/recent-games/'
    game_list = []

    # Make sure I don't access a page that doesn't exist
    if stop_page > 100 or stop_page <= 0:
        stop_page = 100

    # look at all pages in reverse order, by going backwards if (when!) the list shifts in the middle of scraping
    # we miss a game instead of getting a duplicate.
    for i in range(stop_page, 0, -1):

        clear_output(wait=True)

        res = requests.get(f'{browse_games_url}{i}/')
        # One pause per request, also when the page is skipped below
        time.sleep(1)

        if res.status_code != 200:
            # Do not parse an error page as if it were a (game-less) listing
            print(f'Skipped page {i} (HTTP {res.status_code})')
            continue

        soup = BeautifulSoup(res.content, 'lxml')
        thumbs = soup.find_all(attrs={'class': 'thumbpanel'})
        # Ignore thumb panels that carry no link
        game_list.extend(game.attrs['href'] for game in thumbs if 'href' in game.attrs)

        print(f'Scraped page {i}')

    return game_list

## Collect the Game List
The game completion rate is so high that the entire set of 2100 browsable games is refreshed in less than 5 days. This means that even though 2100 games aren't enough, I can scrape every few days and it won't take long for me to accumulate as much data as I need. It also means that my data will be chronologically very close together (rather than spread over the 5+ years the site has been running).

The cells below are for collecting game URLs. The game data itself can be collected at any point afterward.

In [None]:
# Read in the saved games list and convert it from a DataFrame back to a list.
# Re-running this cell rebinds `games` to the file contents, replacing whatever list is in memory.
# The column is named '0' — presumably the default header written when a plain list was saved through a DataFrame.
games_df = pd.read_csv('games_dec28.csv')
games = games_df['0'].tolist()

This chain of cells collects new data, adds the `new_scrape` games to the `games` list, and then removes the duplicates. The cells after that write the scraped list to a file.

In [31]:
new_scrape = scrape_recent(100)
len(new_scrape)

Scraped page 1


2100

In [None]:
# Adds the lists together then gets rid of duplicates.
# dict.fromkeys keeps the first occurrence of every URL in its original position,
# so the resulting order is the same on every run (a set gives no such guarantee).
games = list(dict.fromkeys(games + new_scrape))

In [None]:
# Check to see how much data I have
print(f'Number of games: {len(games)}')
print(f'Minimum drawings in those games: {len(games)*6}')

In [32]:
# Save the list to a file, ### CHANGE THE FILENAME ###
# Only the URLs from the latest scrape (new_scrape) are written here.
# The list becomes a single unnamed column, so the CSV header is '0' — the cells that read these files back use that name.
game_df = pd.DataFrame(new_scrape)
game_df.to_csv('games_jan17.csv', index=False)

### Scrape Data from Games on the List
Check if I have the data; if not, scrape it. Move to the next item on the list. Repeat for a fixed number of list items. This way I'm never scraping for too long, and if I run into a game that doesn't scrape right I don't lose everything. I can rerun the code over and over until every item on the list shows up in the final data set.

In [33]:
# Use older games list first because the games will have had time to accumulate likes
# Read in the saved games list and convert it from a DataFrame back to a list.
# Note that this rebinds `games_df` to a different file than the earlier load cell.
games_df = pd.read_csv('games_dec31.csv')
oldest_games = games_df['0'].tolist()

In [5]:
# Run first scrape to initialize the columns of the master DataFrame
# This is commented out so that I don't accidentally rerun the cell and overwrite the DF

#drawception = scrape_game(oldest_games[0])
# Normal path: continue from the master file written by the save cell further down
drawception = pd.read_csv('drawception_master.csv')

In [34]:
drawception.shape

(34249, 14)

In [7]:
# scrape list
def batch_scrape(game_list, dataframe, batch_size):
    """Scrape up to ``batch_size`` games that are not yet present in ``dataframe``.

    Parameters
    ----------
    game_list : list
        Game URL paths to consider, in the order they should be scraped.
    dataframe : pandas.DataFrame
        Data collected so far. Games whose URL already appears in its
        ``game_url`` column are skipped.
    batch_size : int
        Number of scrape attempts after which the function stops early.

    Returns
    -------
    pandas.DataFrame
        ``dataframe`` followed by the rows of every newly scraped game. When
        nothing new was scraped the input frame is returned as it is.
    """
    # Membership is checked against the frame that was passed in (not a notebook
    # global); the set is built once instead of once per list item.
    if 'game_url' in dataframe.columns:
        seen = set(dataframe['game_url'])
    else:
        seen = set()

    new_frames = []
    # Count up to batch size like a while loop. Use a for loop to handle the edge case of finishing the list
    count = 0
    finished_list = True

    for game_url in game_list:
        if game_url in seen:
            continue

        clear_output(wait=True)
        count += 1
        print(f'Scraped page {count}/{batch_size} - {game_url}')

        # Exactly one request per game
        scraped = scrape_game(game_url)
        # scrape_game returns None for pages it cannot use
        if scraped is not None:
            new_frames.append(scraped)
        # Do not request the same URL again within this batch
        seen.add(game_url)

        if count >= batch_size:
            finished_list = False
            break

    if finished_list:
        print('End of Game List')

    if not new_frames:
        return dataframe
    # Single concatenation at the end instead of growing the frame inside the loop
    return pd.concat([dataframe] + new_frames)

In [36]:
drawception = batch_scrape(oldest_games, drawception, 500)

Scraped page 110/500 - /game/hTmaaZz1MT/author/
End of Game List


In [39]:
drawception.shape

(38058, 14)

In [37]:
# Save the data to a file, ### CHANGE THE FILENAME ###
drawception.to_csv('drawception_master.csv', index=False)

In [16]:
drawception.duplicated(subset='image_url').sum()

0

In [19]:
drawception.shape[0] - len(oldest_games)*6

1188

In [None]:
drawception[drawception['author']=='GugloPWN'].head()

### Save images locally from the DataFrame

This function will go through the dataframe and check if the image exists locally. If the image is here, it moves on, if it's not it gets the image. The file path is similar to how it is saved on the website.

In [23]:
# Thanks to AlexG on stack exchange
# https://stackoverflow.com/questions/8286352/how-to-save-an-image-locally-using-python-whose-url-address-i-already-know
def image_scrape(dataframe, batch_size):
    """Download the images listed in ``dataframe`` that are not on disk yet.

    The local path mirrors the URL: path segments 3 to 5 of ``image_url``
    (split on '/') are joined under the current directory.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must have an ``image_url`` column.
    batch_size : int
        Number of download attempts after which the function returns early.

    Returns
    -------
    None
    """
    # Count up to batch size like a while loop. Use a for loop to handle the edge case of finishing the list
    count = 0

    for _, row in dataframe.iterrows():

        # Get the file path from the url. My local data will mirror the url.
        path_list = row['image_url'].split('/')
        my_file = './' + '/'.join(path_list[3:6])

        # Check if the image is already downloaded
        if os.path.exists(my_file):
            continue

        # This code runs if the image needs scraping
        clear_output(wait=True)
        count += 1
        print(f'Scraped image {count}/{batch_size} - {my_file}')

        # Get the image from the web
        page = requests.get(row['image_url'])
        time.sleep(1)

        if page.status_code == 200:
            # Create directory if it doesn't exist
            my_path = './' + '/'.join(path_list[3:5])
            os.makedirs(my_path, exist_ok=True)

            # Write image file
            with open(my_file, 'wb') as f:
                f.write(page.content)
        else:
            # Saving an error body under the image name would make the existence
            # check above skip this image forever, so nothing is written.
            print(f'Skipped {my_file} (HTTP {page.status_code})')

        # Escape early so I can control how long this code runs
        if count >= batch_size:
            return

    print('End of DataFrame')
    return

In [None]:
drawception.head(5)

In [38]:
image_scrape(drawception, 10000)

Scraped image 3809/10000 - ./drawings/750124/3VMbZOwjsr.png
End of DataFrame


### TESTING Here I scrape a few games to get a testing set to build the image scrape function

In [None]:
games_sample = games[500:510]
games_sample

In [None]:
games_sample[1:]

In [None]:
# Decent start at game scrape function.

# Scrape the first game, then all the rest, keeping one DataFrame per game
print(f'Scraping first game   -   {games_sample[0]}')
sample_frames = [scrape_game(games_sample[0])]

# Loop through all the rest of the games
for i, game in enumerate(games_sample[1:]):
    clear_output(wait=True)
    print(f'Scraping {i+1} of {len(games_sample)-1}   -   {game}')
    sample_frames.append(scrape_game(game))

# scrape_game returns None for pages it cannot use; leave those out.
# One concat at the end replaces DataFrame.append, which pandas 2.0 removed.
sample_frames = [frame for frame in sample_frames if frame is not None]
testing_df = pd.concat(sample_frames) if sample_frames else pd.DataFrame()

testing_df

In [None]:
testing_df['image_url'].head(1)

In [None]:
testing_df['image_url'].head(1)[0:10]

In [None]:
# NOTE(review): scratch cell that depends on names set elsewhere in the notebook.
# `page` is only assigned at notebook level by the image loop in the next cell, and `url` from the
# test-URL cell near the top is a game page, not an image — so os.path.splitext gives it no extension.
f_ext = os.path.splitext(url)[-1]
f_name = 'img{}'.format(f_ext)
# Writes the body of the last response held in `page` to the current directory
with open(f_name, 'wb') as f:
    f.write(page.content)

In [None]:
# Thanks to AlexG on stack exchange
# https://stackoverflow.com/questions/8286352/how-to-save-an-image-locally-using-python-whose-url-address-i-already-know

# Early version of the image download loop: it downloads every row of testing_df,
# with no check for files that already exist and no batch limit.
# base_image_url is not used by the loop below.
base_image_url = 'https://cdn.drawception.com'


for index, row in testing_df.iterrows():
    
    # Get the file path from the url. My local data will mirror the url.
    path_list = row['image_url'].split('/')
    my_path = f'./{"/".join(path_list[3:5])}'
    
    # Create directory if it doesn't exist
    if not os.path.exists(my_path):
        os.makedirs(my_path)
        
    # Get the image from the web
    page = requests.get(row['image_url'])
    # Pause one second after every request
    time.sleep(1)
        
    # Write image file
    my_file = f'./{"/".join(path_list[3:6])}'
    with open(my_file, 'wb') as f:
        f.write(page.content)
    print(my_file)

In [None]:
os.rmdir('test_dir')

### Code for Exploring specific games HTML if I encounter something weird

In [None]:
# NOTE(review): panel_removed_url is assigned in the next cell, so that cell has to be run before this one.
scrape_game(panel_removed_url)

In [None]:
# Site root (same value as the one assigned near the top of the notebook)
base_url = 'https://drawception.com'

# Two games kept as examples of pages that need a closer look
nsfw_url = 'https://drawception.com/game/7OCwsPr6Cg/darude-sandstorm/'
panel_removed_url = 'https://drawception.com/game/q3XGN9TkMO/a-lonely-wizard-tower/'
# Only the NSFW example is fetched and parsed here
res = requests.get(nsfw_url)
soup = BeautifulSoup(res.content, 'lxml')

# Same class selector that scrape_game uses to find the panels
panels = soup.find_all(attrs = {'class':'col-sm-12 col-md-4'})

In [None]:
soup

In [None]:
panels[3]

In [None]:
panels[3].find(attrs = {'class':'panel-user'}).find('a')

In [None]:
if panels[3].find(attrs = {'class':'panel-user'}).find('a') != None:
    print('name here')
else:
    print('no name')

In [None]:
drawception['DUCK'].value_counts()

In [None]:
drawception.dtypes

In [None]:
# Cast every reaction count column to int, one column at a time
reaction_columns = ['LIKE', 'HAHA', 'WOW', 'LOVE', 'DUCK']
for column in reaction_columns:
    drawception[column] = drawception[column].astype(int)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Overlaid histograms of the five reaction counts, one series per reaction type.
# The bin edges run from 1 to 10, so drawings with 0 of a reaction are not drawn.
reaction_columns = ['LIKE', 'HAHA', 'WOW', 'LOVE', 'DUCK']
bins = np.linspace(1, 10, 10)

fig, ax = plt.subplots(figsize=(15, 10))

ax.hist(
    [drawception[name] for name in reaction_columns],
    bins,
    alpha=0.5,
    label=reaction_columns,
)

ax.set_title('Reaction Counts by Drawings', size=28)
ax.set_xlabel('Number of Reactions', size=20)
ax.set_ylabel('Number of Drawing', size=20)
ax.legend(loc='upper right', fontsize=20)
plt.show()

In [None]:
drawception[(drawception['LIKE']==0) & (drawception['HAHA']==0) & (drawception['WOW']==0) & (drawception['LOVE']==0) & (drawception['DUCK']==0)].shape

In [None]:
drawception.shape

In [None]:
drawception.groupby(by='author').count()['DUCK'].mean()

In [40]:
drawception[drawception['author']=='GugloPWN']

Unnamed: 0,pre_caption,post_caption,image_url,author,panel_number,LIKE,HAHA,WOW,LOVE,DUCK,game_url,player_num,game_date,game_tags
126,A Bunny peeking out behind blueberry bushes,draw_last,https://cdn.drawception.com/drawings/491204/8X...,GugloPWN,12,0,0,0,0,0,/game/5jwLfeXmqO/flowers/,12,"December 18th, 2020",[]
1702,crow eating corn,Crow pecking at corn on a cob,https://cdn.drawception.com/drawings/491204/Vs...,GugloPWN,2,1,0,1,2,0,/game/Dc4WwMMjSc/crow-eating-corn/,12,"December 18th, 2020",[]
6654,Elf archer,Robin Hood,https://cdn.drawception.com/drawings/491204/Wz...,GugloPWN,12,1,0,0,0,0,/game/9CwW4HRczH/grandmother/,15,"December 20th, 2020",[]
9180,best gum brand ever,Nicotine gum,https://cdn.drawception.com/drawings/491204/Xh...,GugloPWN,10,0,0,0,1,0,/game/ay7KrSKXBn/prehistoric-beer/,12,"December 21st, 2020",[]
9192,Avocado waves on a sunny day,he's got no idea what's coming,https://cdn.drawception.com/drawings/491204/Mw...,GugloPWN,10,1,0,0,0,0,/game/mbLHmdRn5n/punny-avocado/,12,"December 21st, 2020",[]
9315,"bee wins 1st place, ladybird 2nd",Bee is happy for getting first place,https://cdn.drawception.com/drawings/491204/63...,GugloPWN,10,0,2,0,2,0,/game/Pzt8xMbZ42/mosquito-blacksmith/,12,"December 21st, 2020",[]
9836,Saving a Minecraft dog with a defibrillator,Person tries to revive animal,https://cdn.drawception.com/drawings/491204/wN...,GugloPWN,8,1,0,0,0,0,/game/77N66CWPm3/goat/,12,"December 21st, 2020",[]
11876,evil tooth burns a tiny man in a jar,draw_last,https://cdn.drawception.com/drawings/491204/nX...,GugloPWN,12,0,0,0,0,0,/game/En8rQX9tb1/pepper-rocket-scientist/,12,"December 22nd, 2020",[]
12778,Lion doing workout,Lion working out,https://cdn.drawception.com/drawings/491204/g8...,GugloPWN,6,0,0,0,0,0,/game/bhsCCLT1xA/tiger-jogging-with-raspberries/,12,"December 22nd, 2020",[]
13084,draw_first,Guy stuck in gigantic hourglass in a desert,https://cdn.drawception.com/drawings/491204/s3...,GugloPWN,1,2,0,0,0,0,/game/rY1qxdtF5Q/guy-stuck-in-gigantic-hourgla...,12,"December 22nd, 2020",[]
