# COGS 108 - Data Checkpoint

# Names

- Mariam Bachar (A16217374)
- Alexandra Hernandez (A16730685)
- Brian Kwon (A16306826)
- Andrew Uhm (A16729684)
- Ethan Wang (A17229824)

<a id='research_question'></a>
# Research Question

*Do certain keywords as identified by CLIP correlate with the popularity (as measured by the equivalent of “likes”) that artwork receives on social media?*

# Dataset(s)

*Fill in your dataset information here*

(Copy this information for each dataset)
- Dataset Name:
- Link to the dataset:
- Number of observations:

1-2 sentences describing each dataset. 

If you plan to use multiple datasets, add 1-2 sentences about how you plan to combine these datasets.

# Setup

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import requests
import urllib
from bs4 import BeautifulSoup
import deviantart

import time
from datetime import datetime
from pathlib import Path

# DeviantArt API: https://www.deviantart.com/developers/http/v1/20210526
# Open-Source Python wrapper for DA API: https://github.com/neighbordog/deviantart

In [None]:
# Load the rows gathered by earlier scraping runs so this run appends to them.
# Falls back to a blank DataFrame when there is nothing usable on disk yet.
csv_file = 'deviation_info.csv'
try:
    deviation_df = pd.read_csv(csv_file)
except (FileNotFoundError, pd.errors.EmptyDataError):
    # FileNotFoundError: first run, no CSV has been written yet.
    # EmptyDataError: the file exists but is zero bytes (e.g. an interrupted
    # write); treat it like a missing file instead of crashing the notebook.
    deviation_df = pd.DataFrame()

In [None]:
import os

# Separate API keys in case of requesting issues.
#
# NOTE(review): the client IDs/secrets below are committed in plain text.
# Each pair can be overridden through environment variables so that the
# hard-coded defaults can be deleted (and the keys rotated) without touching
# the rest of the notebook. With no variables set, behavior is unchanged.
andrew_DA_API = deviantart.Api(
    os.environ.get("DA_ANDREW_CLIENT_ID", "25542"),
    os.environ.get("DA_ANDREW_CLIENT_SECRET", "61a232f232df245f2560a3cb72ecc535"),
)
ethan_DA_API = deviantart.Api(
    os.environ.get("DA_ETHAN_CLIENT_ID", "25492"),
    os.environ.get("DA_ETHAN_CLIENT_SECRET", "06217cf59e73b401dc0a14d00857a793"),
)

# the access token of a client is available as <client>.access_token,
# e.g. andrew_DA_API.access_token

In [None]:
# README: use your own token
# cur_access is the API client every request below goes through (the browse
# call and the access_token placed in the raw HTTP requests); point it at
# your own client before running the scraping cell.
cur_access = andrew_DA_API

In [None]:
# number of batches to request; the scraping loop asks for 10 deviations per
# batch (limit=10, offset=i*10), so it covers n * 10 posts in total
n = 120

In [None]:
# Scrape n batches of 10 "popular" deviations. For every deviation with image
# content: save the image under images/ (for local CLIP analysis) and, unless
# it is already recorded, append one row of post/author statistics to
# deviation_df.

# urlretrieve does not create missing directories, so make sure the target exists.
image_dir = Path('images')
image_dir.mkdir(parents=True, exist_ok=True)

# IDs that already have a row (loaded from the CSV or added during this run).
# A repeated ID would be discarded by drop_duplicates anyway (the first row is
# kept), so skipping it up front saves two rate-limited API calls per repeat.
if 'Deviation ID' in deviation_df.columns:
    known_ids = set(deviation_df['Deviation ID'])
else:
    known_ids = set()

api_root = "https://www.deviantart.com/api/v1/oauth2"

for i in range(n):
    print('on iteration', i, '* 10')
    # grab 10 images at a time. DeviantArt calls their posts "deviations".
    # TODO: consider timerange 'onemonth'
    deviations = cur_access.browse(endpoint='popular', timerange='alltime', offset=i*10, limit=10)['results']

    new_rows = []
    for deviation in deviations:
        # posts without image content cannot be analysed, skip them
        if deviation.content is None:
            print('null deviation on iteration', i)
            continue

        url = deviation.content['src']
        deviation_id = deviation.deviationid

        # saves image to file by deviation id using url for local CLIP analysis.
        # Download to a temporary name and rename on success, so an interrupted
        # download never leaves behind a file that looks complete and blocks
        # the retry on the next run.
        image_path = image_dir / f"{deviation_id}.png"
        if not image_path.is_file():
            partial_path = image_dir / f"{deviation_id}.png.part"
            urllib.request.urlretrieve(url, str(partial_path))
            partial_path.replace(image_path)

        if deviation_id in known_ids:
            continue

        # these serve as examples of how to make a request when the python wrapper doesn't work
        username = deviation.author.username
        response = requests.get(
            f"{api_root}/user/profile/{username}",
            params={'access_token': cur_access.access_token, 'expand': 'user.stats'},
            timeout=30,
        )
        # fail with the HTTP error itself instead of a KeyError further down
        response.raise_for_status()
        author_data = response.json()
        author_watchers = author_data['user']['stats']['watchers']
        author_page_views = author_data['stats']['profile_pageviews']  # deemed unnecessary?
        author_deviations = author_data['stats']['user_deviations']

        # pass the ID explicitly rather than relying on how the wrapper's
        # deviation object happens to render inside an f-string
        response = requests.get(
            f"{api_root}/deviation/metadata",
            params={
                'access_token': cur_access.access_token,
                'deviationids': deviation_id,
                'ext_stats': 'True',
            },
            timeout=30,
        )
        response.raise_for_status()
        views = response.json()['metadata'][0]['stats']['views']

        # gathering relevant data, turning it into a new observation
        new_rows.append({
            'Deviation ID': deviation_id,
            'Title': deviation.title,
            # store the plain username, not the wrapper's author object
            'Author': username,
            'Views': views,
            'Favorites': deviation.stats['favourites'],
            'Comments': deviation.stats['comments'],
            'URL Link': deviation.url,
            # naive datetime in the local timezone of the machine running this
            'Date Posted': datetime.fromtimestamp(int(deviation.published_time)),
            'Height': deviation.content['height'],
            'Width': deviation.content['width'],
            'File Size': deviation.content['filesize'],
            'Author Watchers': author_watchers,
            'Author Page Views': author_page_views,
            'Author Deviations': author_deviations,
        })
        known_ids.add(deviation_id)

    # one concat per batch instead of one per row
    if new_rows:
        deviation_df = pd.concat([deviation_df, pd.DataFrame(new_rows)], ignore_index=True)

    # when running on the most popular posts, we will likely get duplicates. remove them.
    # (guarded: a still-blank DataFrame has no 'Deviation ID' column to dedupe on)
    if 'Deviation ID' in deviation_df.columns:
        deviation_df = deviation_df.drop_duplicates(subset='Deviation ID')

    # grab every 15 seconds in order to adhere to DeviantArt fetch rate.
    # no need to wait once the final batch has been fetched.
    if i < n - 1:
        time.sleep(15)

In [None]:
# put our df into a csv file so scraping can be collaborative
# to_csv overwrites but should be ok since we are reading from the csv to populate the df anyways
# write to the same csv_file path the data was loaded from, so the read and
# write locations cannot drift apart
deviation_df.to_csv(csv_file, index=False)

In [None]:
# bare expression as the last line of the cell: shows the collected rows
deviation_df

# Data Cleaning

Describe your data cleaning steps here.

In [None]:
# code (can have multiple)