In [1]:
!pip install --quiet img2vec_pytorch
print('pip install img2vec complete.')

pip install img2vec complete.


In [2]:
import base64
import pandas as pd
from arrow import now
from glob import glob
from img2vec_pytorch import Img2Vec
from os.path import basename
from PIL import Image
from PIL import UnidentifiedImageError

# let's load up a sample
GLOB = '/kaggle/input/photgraphs-of-1000-u-s-cities-ai-generated/*.png'
SIZE = 512
STOP = 3000

def embed(model, filename: str):
    with Image.open(fp=filename, mode='r') as image:
        return model.get_vec(image, tensor=True).numpy().reshape(SIZE,)


def get_picture_from_glob(arg: str, stop: int, ) -> list:
    time_get = now()
    result = []
    count = 0
    failures = 0
    for index, input_file in enumerate(glob(pathname=arg)):
        if count < stop:
            try:
                name = basename(input_file).replace('.jpg', '')
                value = embed(model=model, filename=input_file)
                result.append(pd.Series(data={'name': name, 'value': value,}))
                count += 1
            # almost all of our images will embed successfully so we will flag and discard cases that don't
            # if we report every failure the results get kind of leggy for large samples so let's just count failures 
            # and report the count
            except RuntimeError:
#                 print('embedding failed for {}'.format(input_file))
                failures += 1
                pass 
            except UnidentifiedImageError:
#                 print('embedding failed for {}'.format(input_file))
                failures += 1
                pass 
            
    print('embedded {} rows with {} failures in {}'.format(len(result), failures, now() - time_get))
    return result

time_start = now()
model = Img2Vec(cuda=False, model='resnet-18')
df = pd.DataFrame(data=get_picture_from_glob(arg=GLOB, stop=STOP, ))
print('done in {}'.format(now() - time_start))

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 109MB/s]


embedded 1000 rows with 0 failures in 0:02:13.126908
done in 0:02:14.108475


In [3]:
df.head()

Unnamed: 0,name,value
0,highland_california.png,"[0.9073799, 2.4232817, 2.6540484, 0.70218486, ..."
1,new_britain_connecticut.png,"[0.7772473, 0.6293807, 1.8701727, 1.1827742, 0..."
2,pembroke_pines_florida.png,"[1.815567, 2.8266356, 2.9306302, 0.26130608, 0..."
3,lacey_washington.png,"[1.4230752, 3.5252979, 3.401282, 0.60222894, 0..."
4,danville_california.png,"[0.4289289, 0.0, 0.23080131, 0.024211202, 0.05..."


In [4]:
def get_state(arg:str) -> str:
    result = arg.replace('.png', '').split('_')[-1]
    prefix = arg.replace('.png', '').split('_')[-2]
    if result in {'york', 'mexico', 'hampshire', 'carolina', 'dakota', 'island'}:
        return ' '.join([prefix, result])
        
    return result

df['state'] = df['name'].apply(func=get_state)

In [5]:
# now we have some embeddings; let's see how they cluster
from plotly import express
from arrow import now
from umap import UMAP

time_start = now()
umap = UMAP(random_state=2024, verbose=True, n_jobs=1, low_memory=False, n_epochs=1000)
df[['x', 'y']] = umap.fit_transform(X=df['value'].apply(pd.Series))
print('done with UMAP in {}'.format(now() - time_start))
express.scatter(data_frame=df, x='x', y='y', hover_name='name', color='state', height=900).show()

2024-03-11 17:27:25.018843: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-11 17:27:25.019078: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-11 17:27:25.194831: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


UMAP(low_memory=False, n_epochs=1000, n_jobs=1, random_state=2024, verbose=True)
Mon Mar 11 17:27:38 2024 Construct fuzzy simplicial set
Mon Mar 11 17:27:39 2024 Finding Nearest Neighbors
Mon Mar 11 17:27:44 2024 Finished Nearest Neighbor Search
Mon Mar 11 17:27:47 2024 Construct embedding


Epochs completed:   0%|            0/1000 [00:00]

	completed  0  /  1000 epochs
	completed  100  /  1000 epochs
	completed  200  /  1000 epochs
	completed  300  /  1000 epochs
	completed  400  /  1000 epochs
	completed  500  /  1000 epochs
	completed  600  /  1000 epochs
	completed  700  /  1000 epochs
	completed  800  /  1000 epochs
	completed  900  /  1000 epochs
Mon Mar 11 17:27:53 2024 Finished embedding
done with UMAP in 0:00:15.179017


  sf: grouped.get_group(s if len(s) > 1 else s[0])


In [6]:
!pip install geocoder


os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock.



Collecting geocoder
  Downloading geocoder-1.38.1-py2.py3-none-any.whl.metadata (14 kB)
Collecting ratelim (from geocoder)
  Downloading ratelim-0.1.6-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading geocoder-1.38.1-py2.py3-none-any.whl (98 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.6/98.6 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ratelim-0.1.6-py2.py3-none-any.whl (4.0 kB)
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.38.1 ratelim-0.1.6


In [7]:

from geocoder import arcgis

for index, row in df.head(n=5).iterrows():
    name = ' '.join(row['name'].replace('.png', '').split('_'))
    print('{} {}'.format(name, arcgis(name).latlng))

highland california [34.12126000000006, -117.21713999999997]
new britain connecticut [41.66772000000003, -72.78214999999994]
pembroke pines florida [26.008110000000045, -80.28010999999998]
lacey washington [47.04437000000007, -122.82323999999994]
danville california [37.83202000000006, -122.00539999999995]
