Price history

In [94]:
import os
import time
import itertools

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from PIL import Image
import requests
import datetime
import re

import urllib
from urllib.request import Request, urlopen

import pandas as pd
import numpy as np

In [110]:
url = "https://www.larvalabs.com/cryptopunks/details/"

def load_punk_info (url):
    web_r = requests.get(url)
    websoup = BeautifulSoup(web_r.text, "html.parser")
    
    price_info = pd.DataFrame()
    for titlesoup in websoup.findAll("div",{"class":"table-responsive"}):
        for element in titlesoup.findAll("tr", {"class":re.compile(r"\bpunk-history-row\b")}):
            price_info= price_info.append([element.get_text().replace("\n", "")])
    return price_info
    
def get_transaction_date (price_info):
    index_=3
    
    if price_info.shape[1]>4 ==True:
        index_=4
        
    price_info["Year"] = price_info[index_].astype(str).apply(lambda x: re.findall(r"^[12][0-9]{3}$", x))
    price_info = price_info.explode("Year")
    
    for index in list(range(2,4)):
        price_info["Year"] = np.where(price_info.Year.isnull(), price_info[index].astype(str).apply(lambda x: re.findall(r"^[12][0-9]{3}$", x)), price_info["Year"])
        price_info = price_info.explode("Year")
    
    price_info["Month"] = price_info[1].astype(str).apply(lambda x: re.findall(r"\b[a-zA-Z]{3}\b", x))
    price_info = price_info.explode("Month")
    
    price_info["Month"] = np.where(price_info.Month.isnull(), price_info[2].astype(str).apply(lambda x: re.findall(r"\b[a-zA-Z]{3}\b", x)), price_info["Month"])
    price_info = price_info.explode("Month")
    return price_info    

def get_transaction_price (price_info):
    price_info["price_in_dollars"] = price_info[1].apply(lambda x: re.findall(r"\(([A-Za-z0-9_$,.<]+)\)", x))
    price_info["price_in_dollars"] = np.where(price_info.Withdrawn == 1, price_info[2].apply(lambda x: re.findall(r"\(([A-Za-z0-9_$,.<]+)\)", x)), price_info["price_in_dollars"])
    price_info = price_info.explode("price_in_dollars")
    
    price_info = price_info[~price_info.price_in_dollars.isnull()]
    for element in [["$", ""], [",", "."], ["<",""], ["Y", ""]]:
        price_info.price_in_dollars = price_info.price_in_dollars.apply(lambda x: str(x).replace(element[0], element[1]))
    
    price_info.price_in_dollars = np.where(price_info.price_in_dollars.str.contains("M"), price_info.price_in_dollars.apply(lambda x: str(x).replace("M", "")).astype(float) * 1000000, price_info.price_in_dollars)
    price_info.price_in_dollars = price_info.price_in_dollars.astype(float, errors="ignore")
    return price_info
    
def transform_punk_info (price_info):   
    price_info = price_info[0].str.split(expand=True)
    for status in ["Bid", "Offered", "Transfer", "Claimed", "Withdrawn", "Offer"]:
        price_info[status] = np.where(price_info[0].str.contains(status), 1, 0)
    
    price_info = get_transaction_price(price_info)
    price_info = get_transaction_date(price_info)
    
    price_info = price_info.iloc[:,6:]
    return price_info

Test the function

In [111]:
punk_id="0014" # input punk id as a string
price_info = load_punk_info(url+punk_id)
price_info = transform_punk_info(price_info)

Iterative loop

In [97]:
path = r"C:\Users\Acer\Git\deda_punks\Crypto_punks\for sale"
os.chdir(path)

punks = []

with os.scandir(path) as files:
    for file in files:
        if file.name.endswith('.png'):
            punks.append(file.name)

new_punks = [re.findall("(\d+)", element) for element in punks]
new_punks = list(itertools.chain(*new_punks))
final_df = pd.DataFrame()

In [135]:
import tqdm

for i in tqdm.tqdm(new_punks[0:1400]): # adjust indices here to run the loop in chunks
    try:
        url = "https://www.larvalabs.com/cryptopunks/details/" + i
        price_info = load_punk_info(url)
        price_info = transform_punk_info(price_info)
        price_info["Punk_ID"] = i
        final_df = final_df.append(price_info)
        time.sleep(5) # prevents lavralabs from blocking you because of too many requests
    except:
        print("Oh no! The post-processing is not adjusted for some of the object data", i)

 11%|████████▋                                                                        | 43/400 [04:03<25:53,  4.35s/it]

Oh no! 7957


 70%|████████████████████████████████████████████████████████▍                       | 282/400 [26:46<08:07,  4.13s/it]

Oh no! 9289


100%|████████████████████████████████████████████████████████████████████████████████| 400/400 [37:57<00:00,  5.69s/it]


We scraped all offered punks according to IDs, which were available to us for clustering

In [137]:
final_df.Punk_ID.nunique()

1388

In [141]:
final_df = final_df.drop_duplicates()

The structure of the final data

In [146]:
print(final_df.shape)
final_df.head()

(23685, 9)


Unnamed: 0,Offered,Transfer,Claimed,Withdrawn,Offer,price_in_dollars,Year,Month,Punk_ID
0,0.0,0,0,0,0,64.905,2021,Feb,14
0,0.0,0,0,0,0,66.189,2021,Feb,14
0,0.0,0,0,0,0,68.246,2021,Feb,14
0,1.0,0,0,0,1,986.357,2021,Feb,14
0,1.0,0,0,0,1,736.92,2021,Feb,14


In [143]:
final_df.to_csv("all_offered_punks_.csv", index=None)