In [31]:
%load_ext autoreload
%autoreload 2
import feature_extract
import sys
sys.path.append("..")
import torch
import pandas as pd
import psql_methods as psql
import pickle
import alchemy_methods as alc
from tqdm import tqdm
import numpy as np
import pickle
import image_utils as imgs
from Openseas_Methods import pull_nft_images,pull_nft_contracts
import multiprocessing

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Step 1: read the pickled list of distinct NFT slugs that are present in the graph dataset.
# NOTE: pickle can execute arbitrary code while loading, so only point this at a trusted file.
label_list_path = '../Graph_predictions/dataset_stor/graph_dataset_4/label_list.pkl'
with open(label_list_path, 'rb') as label_file:
    label_list = pickle.load(label_file)

In [None]:
# Step 2: find which collections already have entries in the nfttoimage table.
commands = ["SELECT distinct slug from nfttoimage"]
data = psql.execute_commands(commands)
# Rows are indexed positionally; element 0 is the single `slug` column selected above.
data = [x[0] for x in data]
# Use a set for the membership test so each label is checked in O(1)
# instead of scanning the whole list of known slugs once per label.
slugs_with_urls = set(data)
# Labels that still need URLs, kept in label_list order.
nfts_to_process = [x for x in label_list if x not in slugs_with_urls]
len(nfts_to_process)

In [None]:
# Also load the collectiontoaddress table, in case the Alchemy API has to be used later.
contract_check = tuple(nfts_to_process)
commands = ["SELECT * from collectiontoaddress"]
data = psql.execute_commands(commands)
# Map column 0 of every row to column 1
# (presumably slug -> contract address; confirm against the table schema).
data_dict = dict((row[0], row[1]) for row in data)

In [None]:
# Pair every unprocessed slug with its data_dict entry (None when the lookup has no entry).
nft_w_contract = [(slug, data_dict.get(slug)) for slug in nfts_to_process]
# Slugs whose lookup came back empty.
slugs_no_contract = [slug for slug, contract in nft_w_contract if contract is None]
# Optional step, left disabled: fetch the missing contracts.
# for slug in tqdm(slugs_no_contract):
#     pull_nft_contracts(slug)
len(slugs_no_contract)

In [None]:
# Pull NFT image URLs from the OpenSea API for every slug that has no data yet
# (limit_toks=500 per slug). Progress is written to slug_url_logs.txt.
import logging
logging.basicConfig(filename='slug_url_logs.txt', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Collect the return values in their own list. The original appended them to `data`,
# which at this point holds the collectiontoaddress rows, so unrelated values were mixed
# together and re-running the cell kept growing that list.
pulled_image_results = []
for slug,_ in tqdm(nft_w_contract):
    logging.info(f"Beginning slug {slug}")
    pulled_image_results.append(pull_nft_images(slug,limit_toks=500))
    logging.info(f"Finished slug {slug}")

In [3]:
#Check the old dataset to see which NFTs already have images
# NOTE(review): get_immediate_subdirectories is not imported or defined in any cell visible
# here, so this call depends on hidden kernel state — import it explicitly
# (TODO confirm which module provides it).
file_path = 'images/val'
# complete_nfts is only used for membership tests against collection slugs further down.
complete_nfts = get_immediate_subdirectories(file_path)

In [4]:
# Collections from label_list that are not yet listed in complete_nfts.
images_to_pull = tuple(slug for slug in label_list if slug not in complete_nfts)
# Randomly keep at most 50 rows per slug from nfttoimage; row_num comes back as the last column.
# NOTE: the query covers every slug in the table — narrowing to images_to_pull is done
# afterwards in Python, not in SQL.
command = ["""WITH numbered_rows AS (
  SELECT *,
         ROW_NUMBER() OVER (PARTITION BY slug ORDER BY RANDOM()) AS row_num
  FROM nfttoimage
)
SELECT *
FROM numbered_rows
WHERE row_num <= 50;"""]
rows  = psql.execute_commands(command)

In [8]:
# Keep only the first three columns (dropping the trailing row_num field) for rows whose
# slug still needs images.
# images_to_pull is a tuple, so testing membership against it directly scans it once per
# row; with hundreds of thousands of rows that is quadratic. A set makes each lookup O(1).
slugs_needing_images = set(images_to_pull)
rows_to_pull = [(x[0],x[1],x[2]) for x in rows if x[0] in slugs_needing_images]
len(rows_to_pull)

417326

In [25]:
#If you need to restart halfway, run this cell so that finished collections are not redone
# NOTE(review): get_immediate_subdirectories is not imported in the cells visible here —
# confirm where it comes from.
file_path = 'graph_images/val'
complete_nfts = get_immediate_subdirectories(file_path)
print(len(rows_to_pull))
# Use a set for the lookup: rows_to_pull has hundreds of thousands of entries, and checking
# each one against a list of directory names is needlessly quadratic.
finished_slugs = set(complete_nfts)
rows_to_pull = [x for x in rows_to_pull if x[0] not in finished_slugs]
print(len(rows_to_pull))

411294
28412


In [26]:
#Restructure data into the format needed for multiprocessing:
# one (slug, [(token_id, url), ...]) tuple per collection.
# Column names for the DataFrame
columns = ['slug', 'token_id', 'url']

# Create a DataFrame from the list of tuples
df = pd.DataFrame(rows_to_pull, columns=columns)
# Iterate over the groups directly instead of groupby(...).apply(...). The apply version
# read the grouping column inside the lambda, which recent pandas deprecates, and for an
# empty frame it does not return a Series of tuples, so list(...) produced the wrong
# argument list. Iteration yields (key, sub-frame) pairs in sorted key order and gives an
# empty list when nothing is left to pull.
args = [
    (slug, list(zip(group['token_id'], group['url'])))
    for slug, group in df.groupby('slug')
]


In [29]:
# Spawn the worker processes that retrieve the images, one worker per CPU core.
# Every tuple in args is unpacked into the positional arguments of pull_image_from_url.
# (The log files mentioned for tracking are presumably written by that function — confirm.)
# NOTE(review): pull_image_from_url is not imported in the cells visible here.
records = []
num_processes = multiprocessing.cpu_count()
with multiprocessing.Pool(processes=num_processes) as pool:
    # starmap blocks until every job has finished and returns the results in order.
    records.extend(pool.starmap(pull_image_from_url, args))



In [32]:
# Delete the empty directories left behind by collections whose image retrieval failed.
# Set the target explicitly: file_path is otherwise whatever an earlier cell last assigned
# ('images/val' unless the optional restart cell was run), so the cleanup could silently
# run against the old dataset instead of the newly downloaded one.
file_path = 'graph_images/val'
imgs.delete_empty_directories(file_path)

Next step: run the feature extraction notebook on the new dataset before continuing; the cells below assume its output already exists.

In [35]:
# The feature tensor is expected to exist already (written by the feature_extract_notebook).
model_string = 'dinov2_vits14'
data_path = '/global/scratch/tlundy/NFT_Research/nft_research/Dino/graph_images'
out_path = f'/global/scratch/tlundy/NFT_Research/nft_research/Dino/graph_images_features/{model_string}'
feature_path = f'{out_path}/testfeat.pth'

# Load the saved features, then the matching labels and file names for the same image root.
features = torch.load(feature_path)
labels = feature_extract.get_labels(data_path)
file_names = feature_extract.get_filenames(data_path)
features_list = features.tolist()

# Each file_names entry is indexed positionally: [0] fills Collection, [1] fills NFT_num
# (presumably collection slug and token number — confirm in feature_extract).
collection_names = [name_parts[0] for name_parts in file_names]
nft_numbers = [name_parts[1] for name_parts in file_names]

# Assemble one row per image.
data = {
    'Label': labels.tolist(),
    'Features': features_list,
    'Collection': collection_names,
    'NFT_num': nft_numbers,
}
df = pd.DataFrame(data)
df

Dataset ReturnIndexDataset
    Number of datapoints: 456144
    Root location: /global/scratch/tlundy/NFT_Research/nft_research/Dino/graph_images/val


Unnamed: 0,Label,Features,Collection,NFT_num
0,0,"[0.006129133980721235, 0.08498869091272354, -0...",-glowa-,1
1,0,"[0.04774976521730423, 0.05362828075885773, 0.0...",-glowa-,10017
2,0,"[0.10529650747776031, 0.07033471018075943, 0.0...",-glowa-,10018
3,0,"[-0.02233046293258667, 0.01190363708883524, -0...",-glowa-,10019
4,0,"[0.04859286919236183, 0.054554957896471024, 0....",-glowa-,10024
...,...,...,...,...
456139,10976,"[-0.033141832798719406, -0.0350814163684845, -...",zzz-zzz-by-hashlips,75
456140,10976,"[-0.033141832798719406, -0.0350814163684845, -...",zzz-zzz-by-hashlips,77
456141,10976,"[-0.033141832798719406, -0.0350814163684845, -...",zzz-zzz-by-hashlips,78
456142,10976,"[-0.033141832798719406, -0.0350814163684845, -...",zzz-zzz-by-hashlips,79


In [37]:
import numpy as np
from scipy.stats import pearsonr

# One group of rows per 'Label' value.
grouped = df.groupby('Label')

# For every label, average its feature vectors element-wise (axis=0 averages across rows).
average_feature_vectors = [
    (label, np.mean(group['Features'].tolist(), axis=0))
    for label, group in tqdm(grouped)
]
average_features_df = pd.DataFrame(average_feature_vectors, columns=['Label', 'AverageFeatureVector'])

# Attach the per-label average to every row and save the result to disk.
merged_df2 = pd.merge(df, average_features_df, on='Label')
merged_df2.to_pickle('graph_images_dataframe.pkl')

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 10977/10977 [00:06<00:00, 1793.75it/s]
