In [1]:
%load_ext autoreload
%autoreload 2
import feature_extract
import sys
sys.path.append("..")
import torch
import psql_methods as psql
import pickle
import alchemy_methods as alc
from tqdm import tqdm
import numpy as np
import image_utils as imgs
import opensea_methods as opse
import multiprocessing
import pandas as pd
import feature_utils as feat
import matplotlib.pyplot as plt
import counterfeit_utils as cfu

In [2]:
# Load the pairwise distance table through the project helper. The displayed
# frame has the columns Top_100, Alt, Euc_Distance and Dot_Distance.
pw_dists = cfu.get_dists()
# Show the 171 rows with the smallest Euc_Distance for boredapeyachtclub.
# NOTE(review): 171 is a hand-picked cutoff; its rationale is not shown here.
pw_dists.query("Top_100=='boredapeyachtclub'").sort_values('Euc_Distance').head(171)
# pw_dists.sort_values('Euc_Distance').tail(200)

Unnamed: 0,Top_100,Alt,Euc_Distance,Dot_Distance
175652,boredapeyachtclub,0xapes-trilogy,0.132621,0.991206
182671,boredapeyachtclub,orangapes,0.146840,0.989219
180508,boredapeyachtclub,japanesebornapesociety,0.198228,0.980353
185556,boredapeyachtclub,tiredapeyachtclub,0.209443,0.978067
177097,boredapeyachtclub,brokeapeclub-v2-1,0.212487,0.977425
...,...,...,...,...
178262,boredapeyachtclub,degentoonz-collection,0.538694,0.854904
185195,boredapeyachtclub,the-wabbits,0.538922,0.854782
183459,boredapeyachtclub,queeny00ts,0.539117,0.854676
176988,boredapeyachtclub,bored-mummy-waking-up,0.539298,0.854579


In [None]:
# Lift pandas' display truncation so the per-collection counts print in full.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Keep only the pairs whose 'Euc_Distance' is strictly below 0.4.
filtered_df = pw_dists.loc[pw_dists['Euc_Distance'].lt(0.4)]

# Number of surviving rows for each value of the 'Top_100' column.
count_by_top_100 = filtered_df.groupby('Top_100').size()
count_by_top_100

In [90]:
# Inspect the full pairwise distance table (one row per Top_100/Alt pair).
pw_dists

Unnamed: 0,Top_100,Alt,Euc_Distance,Dot_Distance
10977,0n1-force,-glowa-,1.313444,0.137433
10978,0n1-force,-jubilees-multiverse,0.906586,0.589051
10979,0n1-force,-phunks,1.074891,0.422305
10980,0n1-force,-subtraction-,1.301293,0.153318
10981,0n1-force,0-project,1.302781,0.151380
...,...,...,...,...
1470913,zombieclub-token,zunkpets,0.972472,0.527149
1470914,zombieclub-token,zunkz,0.942404,0.555937
1470915,zombieclub-token,zuphioh-editions,0.856914,0.632849
1470916,zombieclub-token,zuttomamoru,1.246206,0.223485


In [None]:
# For every Top_100 collection, plot how much the Euclidean distance grows from
# one nearest-neighbour rank to the next, i.e. the first difference of the
# sorted distances.
grouped = pw_dists.groupby('Top_100')

for key, grp in grouped:
    grp_sorted = grp.sort_values('Euc_Distance')
    # Rank 1 is the closest Alt collection within this Top_100 group.
    grp_sorted['local_rank'] = range(1, len(grp_sorted) + 1)
    # Gap to the previous (closer) neighbour; the first entry is NaN because it
    # has no predecessor.
    differences = grp_sorted['Euc_Distance'].diff()

    # One explicit figure per collection. The figure used to be created once,
    # outside the loop, and was never referenced, so every plot after the first
    # relied on pyplot creating a new figure implicitly after plt.show().
    fig, ax = plt.subplots()
    ax.plot(grp_sorted['local_rank'], differences, label=f'{key} - Differences')
    # The labels describe what is actually drawn (differences, not raw distances).
    ax.set_xlabel('Local rank (1 = nearest)')
    ax.set_ylabel('Euclidean Distance difference from previous rank')
    # Only the 200 nearest neighbours are of interest.
    ax.set_xlim(0, 200)
    ax.set_title(f'Consecutive Euclidean Distance differences vs. rank for {key}')
    plt.show()
    # Release the figure so that looping over every collection does not
    # accumulate open figures in memory.
    plt.close(fig)

In [None]:
# Assemble a DataFrame holding the embeddings of all images from the top 100
# NFT collections. The features are assumed to have been computed already by
# the feature_extract notebook.
model_string = 'dinov2_vits14'
data_path = '/global/scratch/tlundy/NFT_Research/nft_research/Dino/counterfeit_images'
out_path = f'/global/scratch/tlundy/NFT_Research/nft_research/Dino/counterfeit_features/{model_string}'
feature_path = out_path+'/testfeat.pth'

# NOTE(review): torch.load may unpickle the file's contents; only point it at
# feature files produced by our own pipeline.
features = torch.load(feature_path)
labels = feature_extract.get_labels(data_path)
file_names = feature_extract.get_filenames(data_path)
features_list = features.tolist()

# Each entry of file_names is indexed as (collection, NFT number).
data = {
    'Label': labels.tolist(),
    'Features': features_list,
    'Collection': [entry[0] for entry in file_names],
    'NFT_num': [entry[1] for entry in file_names],
}

# One row per image; compute_average_vector is keyed on 'Label' (the frame
# displayed further down carries an 'AverageFeatureVector' column).
df = feat.compute_average_vector(pd.DataFrame(data), column='Label')
df

In [87]:
# Cap displayed rows at 50 but show every column. These are global pandas
# options, so they affect every later cell until changed again.
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', None)

In [None]:
# Row by row, measure the distance between an image's 'Features' and its
# 'AverageFeatureVector' with the project helper (called with normed=True) and
# store the result in a new 'dists' column. This mutates df in place.
df['dists'] = df.apply(lambda row: feat.euclidean_distance(row['Features'],row['AverageFeatureVector'],normed=True), axis=1)

In [None]:
# Mean of 'dists' per Collection; the result is a Series indexed by Collection.
average_by_group = df.groupby('Collection')['dists'].mean()

In [None]:
# Left-join the per-collection mean 'dists' onto the pairwise table, matching
# pw_dists' 'Top_100' against the Series' 'Collection' index. Rows whose Top_100
# has no match keep NaN in 'dists' (how='left').
merged_df = pd.merge(pw_dists, average_by_group, left_on='Top_100',right_on='Collection', how='left')

In [89]:
# Inspect the per-image frame, including the 'dists' column.
df

Unnamed: 0,Label,Features,Collection,NFT_num,AverageFeatureVector,dists
0,0,"[0.06834631413221359, 0.013368001207709312, 0....",0n1-force,1010,"[0.05036659871786995, -0.013276008738578319, 0...",0.459327
1,0,"[0.08093642443418503, -0.03803253173828125, 0....",0n1-force,1012,"[0.05036659871786995, -0.013276008738578319, 0...",0.588219
2,0,"[0.028013398870825768, -0.02013389579951763, 0...",0n1-force,1025,"[0.05036659871786995, -0.013276008738578319, 0...",0.494515
3,0,"[-0.03390040993690491, 0.004265990108251572, 0...",0n1-force,1034,"[0.05036659871786995, -0.013276008738578319, 0...",0.622250
4,0,"[0.026579292491078377, -0.014232189394533634, ...",0n1-force,1044,"[0.05036659871786995, -0.013276008738578319, 0...",0.630601
...,...,...,...,...,...,...
60377,132,"[-0.06007155403494835, 0.01303652860224247, 0....",zombieclub-token,952,"[-0.007146694796058, 0.0027466976506839275, 0....",0.659244
60378,132,"[0.008216793648898602, 0.007785656955093145, -...",zombieclub-token,956,"[-0.007146694796058, 0.0027466976506839275, 0....",0.619830
60379,132,"[-0.03259636461734772, 0.01624414138495922, 0....",zombieclub-token,964,"[-0.007146694796058, 0.0027466976506839275, 0....",0.643341
60380,132,"[0.05409884452819824, -0.014129256829619408, -...",zombieclub-token,989,"[-0.007146694796058, 0.0027466976506839275, 0....",0.734854


In [None]:
# Show every row and column so the full list of counts is printed.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Keep the pairs whose distance is smaller than the Top_100 collection's own
# mean 'dists'.
filtered_df = merged_df.loc[lambda frame: frame['Euc_Distance'] < frame['dists']]

# Number of such pairs for each 'Top_100' collection.
count_by_top_100 = filtered_df.groupby('Top_100').size()
count_by_top_100

In [126]:
# Load the per-image frame from a pickle in the working directory.
# NOTE(review): read_pickle executes whatever the pickle contains, so only load
# a file this project produced itself.
df2 = pd.read_pickle('graph_images_dataframe.pkl')
# Distance of each image from its 'AverageFeatureVector', this time with
# normed=False, stored in a new 'dists_2' column (mutates df2 in place).
df2['dists_2'] = df2.apply(lambda row: feat.euclidean_distance(row['Features'],row['AverageFeatureVector'],normed=False), axis=1)
# Mean 'dists_2' per Collection.
# NOTE: this rebinds average_by_group, which an earlier cell defined as the mean
# of df['dists']; re-running that earlier merge after this cell would pick up
# these values instead.
average_by_group = df2.groupby('Collection')['dists_2'].mean()

In [137]:
# Truncate long frames to 50 displayed rows, but keep every column visible.
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', None)

# Attach the mean 'dists_2' of each Alt collection to its pairwise rows.
merged_df2 = merged_df.merge(average_by_group, left_on='Alt', right_on='Collection', how='left')
df_sorted = merged_df2.sort_values('Euc_Distance')

# For every Top_100 collection take its 25 nearest rows (smallest Euc_Distance)
# and compute the median 'dists_2' over them.
top_100_groups = df_sorted.groupby('Top_100').head(25)
median_dists_2 = top_100_groups.groupby('Top_100')['dists_2'].median()

# Bring that per-collection median back onto every pairwise row; the suffix
# turns the clashing column name into 'dists_2_median'.
df_with_median = merged_df2.merge(median_dists_2, on='Top_100', how='left', suffixes=('', '_median'))
df_with_median

Unnamed: 0,Top_100,Alt,Euc_Distance,Dot_Distance,dists,dists_2,dists_2_median
0,0n1-force,-glowa-,1.313444,0.137433,0.582049,0.762015,0.565188
1,0n1-force,-jubilees-multiverse,0.906586,0.589051,0.582049,0.562235,0.565188
2,0n1-force,-phunks,1.074891,0.422305,0.582049,0.319652,0.565188
3,0n1-force,-subtraction-,1.301293,0.153318,0.582049,0.734882,0.565188
4,0n1-force,0-project,1.302781,0.151380,0.582049,0.477592,0.565188
...,...,...,...,...,...,...,...
1459812,zombieclub-token,zunkpets,0.972472,0.527149,0.674177,0.441637,0.589574
1459813,zombieclub-token,zunkz,0.942404,0.555937,0.674177,0.382909,0.589574
1459814,zombieclub-token,zuphioh-editions,0.856914,0.632849,0.674177,0.734466,0.589574
1459815,zombieclub-token,zuttomamoru,1.246206,0.223485,0.674177,0.426912,0.589574


In [118]:
# for label,grp in df_sorted.groupby('Top_100'):
#     print(grp.head(5))
# Spot-check the rows of a single collection in df2.
df2.query("Collection=='wow-pixies-v2'")

Unnamed: 0,Label,Features,Collection,NFT_num,AverageFeatureVector,dists_2
445045,10705,"[-0.052821699529886246, -0.010643573477864265,...",wow-pixies-v2,4989,"[-0.03719336843849825, -0.014740405677002855, ...",
445046,10705,"[-0.023714806884527206, -0.026752343401312828,...",wow-pixies-v2,5030,"[-0.03719336843849825, -0.014740405677002855, ...",
445047,10705,"[-0.04107312485575676, 0.006837933789938688, -...",wow-pixies-v2,5069,"[-0.03719336843849825, -0.014740405677002855, ...",
445048,10705,"[-0.05387219041585922, -0.02004130370914936, 0...",wow-pixies-v2,5097,"[-0.03719336843849825, -0.014740405677002855, ...",
445049,10705,"[-0.0358734130859375, -0.031248390674591064, 0...",wow-pixies-v2,5102,"[-0.03719336843849825, -0.014740405677002855, ...",
445050,10705,"[-0.045619115233421326, -0.008675847202539444,...",wow-pixies-v2,5109,"[-0.03719336843849825, -0.014740405677002855, ...",
445051,10705,"[-0.030264224857091904, -0.02018706686794758, ...",wow-pixies-v2,5150,"[-0.03719336843849825, -0.014740405677002855, ...",
445052,10705,"[-0.020418835803866386, -0.018695861101150513,...",wow-pixies-v2,5175,"[-0.03719336843849825, -0.014740405677002855, ...",
445053,10705,"[-0.027363894507288933, -0.020645933225750923,...",wow-pixies-v2,5177,"[-0.03719336843849825, -0.014740405677002855, ...",
445054,10705,"[-0.013448367826640606, -0.03537609055638313, ...",wow-pixies-v2,5188,"[-0.03719336843849825, -0.014740405677002855, ...",


In [117]:
# Inspect the per-image frame loaded from the pickle.
df2

Unnamed: 0,Label,Features,Collection,NFT_num,AverageFeatureVector,dists_2
0,0,"[0.006129133980721235, 0.08498869091272354, -0...",-glowa-,1,"[-0.007021066558081657, 0.034550966462120414, ...",0.459327
1,0,"[0.04774976521730423, 0.05362828075885773, 0.0...",-glowa-,10017,"[-0.007021066558081657, 0.034550966462120414, ...",0.588219
2,0,"[0.10529650747776031, 0.07033471018075943, 0.0...",-glowa-,10018,"[-0.007021066558081657, 0.034550966462120414, ...",0.494515
3,0,"[-0.02233046293258667, 0.01190363708883524, -0...",-glowa-,10019,"[-0.007021066558081657, 0.034550966462120414, ...",0.622250
4,0,"[0.04859286919236183, 0.054554957896471024, 0....",-glowa-,10024,"[-0.007021066558081657, 0.034550966462120414, ...",0.630601
...,...,...,...,...,...,...
456139,10976,"[-0.033141832798719406, -0.0350814163684845, -...",zzz-zzz-by-hashlips,75,"[-0.033141832798719406, -0.0350814163684845, -...",
456140,10976,"[-0.033141832798719406, -0.0350814163684845, -...",zzz-zzz-by-hashlips,77,"[-0.033141832798719406, -0.0350814163684845, -...",
456141,10976,"[-0.033141832798719406, -0.0350814163684845, -...",zzz-zzz-by-hashlips,78,"[-0.033141832798719406, -0.0350814163684845, -...",
456142,10976,"[-0.033141832798719406, -0.0350814163684845, -...",zzz-zzz-by-hashlips,79,"[-0.033141832798719406, -0.0350814163684845, -...",


In [138]:
# Print the complete result: no row or column truncation.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Select the pairs lying closer than the 'dists_2_median' of their Top_100
# collection.
closer_than_median = df_with_median['Euc_Distance'].lt(df_with_median['dists_2_median'])
filtered_df = df_with_median.loc[closer_than_median]

# Tally the selected pairs per 'Top_100' collection.
count_by_top_100 = filtered_df.groupby('Top_100').size()
count_by_top_100

Top_100
0n1-force                        14
3landers                         58
alienfrensnft                    37
artgobblers                      84
azuki                            99
azukielementals                 104
beanzofficial                    14
bears-deluxe                      5
bored-ape-chemistry-club          1
bored-ape-kennel-club             1
boredapeyachtclub                92
capsulehouse                      4
chain-runners-nft                10
chimpersnft                      50
chromie-squiggle-by-snowfro       1
clonex                           21
collectvoxtownstar                2
cool-cats-nft                    67
coolmans-universe                21
coolpetsnft                       3
creatureworld                     2
cryptoadz-by-gremplin            14
cryptobatz-by-ozzy-osbourne       1
cryptodickbutts-s3               32
cryptopunks                      72
cryptoskulls                      1
curiocardswrapper               129
cyberkongz          

In [140]:
# Turn the per-collection counts into a two-column frame: the index becomes
# 'Label' and the values become 'Count'.
labels_counts_df = count_by_top_100.rename_axis('Label').reset_index(name='Count')

# Plain (count, label) tuples, with the count first.
labels_counts_list = list(
    labels_counts_df[['Count', 'Label']].itertuples(index=False, name=None)
)

print(labels_counts_list)

[(14, '0n1-force'), (58, '3landers'), (37, 'alienfrensnft'), (84, 'artgobblers'), (99, 'azuki'), (104, 'azukielementals'), (14, 'beanzofficial'), (5, 'bears-deluxe'), (1, 'bored-ape-chemistry-club'), (1, 'bored-ape-kennel-club'), (92, 'boredapeyachtclub'), (4, 'capsulehouse'), (10, 'chain-runners-nft'), (50, 'chimpersnft'), (1, 'chromie-squiggle-by-snowfro'), (21, 'clonex'), (2, 'collectvoxtownstar'), (67, 'cool-cats-nft'), (21, 'coolmans-universe'), (3, 'coolpetsnft'), (2, 'creatureworld'), (14, 'cryptoadz-by-gremplin'), (1, 'cryptobatz-by-ozzy-osbourne'), (32, 'cryptodickbutts-s3'), (72, 'cryptopunks'), (1, 'cryptoskulls'), (129, 'curiocardswrapper'), (1, 'cyberkongz'), (4, 'cyberkongz-babies'), (74, 'deadfellaz'), (9, 'degentoonz-collection'), (63, 'degods'), (2, 'digidaigaku'), (89, 'doodles-official'), (3, 'fidenza-by-tyler-hobbs'), (5, 'forgottenruneswizardscult'), (2, 'galacticapes'), (9, 'genuine-undead'), (15, 'goblintownwtf'), (5, 'guttercatgang'), (1, 'hapeprime'), (132, 'ha

In [143]:
# Parameterised UPDATE: the two %s placeholders are num and slug, in that
# order, which matches the (count, label) layout of labels_counts_list.
command = 'Update objective_cf_num set num=%s where slug=%s'
# NOTE(review): presumably runs the statement once per tuple and writes to the
# database; confirm in psql_methods whether it commits and whether re-running
# this cell is safe.
psql.batch_insert(command,labels_counts_list)