In [1]:
# Install the two third-party packages imported by the next cell
# (geocoder and img2vec_pytorch); --quiet suppresses pip's own output.
# NOTE(review): no versions are pinned, so a re-run may pick up newer releases.
!pip install --quiet geocoder
!pip install --quiet img2vec_pytorch
print('pip install geocoder & img2vec complete.')

pip install geocoder & img2vec complete.


In [2]:
import base64
import pandas as pd
from arrow import now
from glob import glob
from img2vec_pytorch import Img2Vec
from os.path import basename
from PIL import Image
from geocoder import arcgis

# Glob pattern selecting the input PNG files that get embedded.
GLOB = '/kaggle/input/photgraphs-of-1000-u-s-cities-ai-generated/*.png'
# Length each embedding vector is reshaped to in embed(); the notebook
# builds its embeddings with the 'resnet-18' model.
SIZE = 512
# Upper bound on the number of images get_picture_from_glob() processes.
STOP = 3000

def embed(model, filename: str):
    """Return the embedding of one image file as a flat array of length SIZE.

    Args:
        model: object exposing ``get_vec(image, tensor=True)``; the notebook
            passes an ``Img2Vec`` instance.
        filename: path of the image file to open.

    Returns:
        The model's output for the image, converted with ``.numpy()`` and
        reshaped to ``(SIZE,)``.
    """
    # The context manager closes the file handle once the vector is computed.
    with Image.open(fp=filename, mode='r') as picture:
        vector = model.get_vec(picture, tensor=True)
        flat = vector.numpy().reshape(SIZE,)
        return flat


def get_picture_from_glob(arg: str, stop: int, ) -> list:
    """Embed and geocode the image files matched by a glob pattern.

    For each matched file the place name is derived from the file name
    (the '.png' suffix is dropped and underscores become spaces), the image is
    embedded with the module-level ``model`` via ``embed()``, and the name is
    geocoded with ``arcgis``.

    Args:
        arg: glob pattern selecting the image files.
        stop: maximum number of files to attempt; values <= 0 process nothing.

    Returns:
        A list of ``pd.Series`` with keys 'name', 'value' (the embedding) and
        'location' (the geocoder's ``latlng``), one per file that was
        processed successfully.
    """
    time_get = now()
    result = []
    failures = 0
    # glob() returns a list, so slicing it ends the loop after `stop` files
    # instead of walking every remaining match while doing nothing.
    for input_file in glob(pathname=arg)[:max(stop, 0)]:
        name = ' '.join(basename(input_file).replace('.png', '').split('_'))
        try:
            value = embed(model=model, filename=input_file)
            # NOTE(review): latlng is presumably None when the geocoder finds
            # no match; such rows are still kept, as before — confirm.
            latlng = arcgis(location=name).latlng
        except (OSError, ValueError, RuntimeError) as error:
            # An unreadable image or a failed request should not abort a run
            # that takes minutes; count it so the summary below is truthful.
            failures += 1
            print('failed on {}: {}'.format(input_file, error))
            continue
        result.append(pd.Series(data={'name': name, 'value': value, 'location': latlng}))

    print('embedded {} rows with {} failures in {}'.format(len(result), failures, now() - time_get))
    return result

# Time the whole model-load + embedding run.
time_start = now()
# CPU-only resnet-18 feature extractor; get_picture_from_glob() reads this
# module-level `model`, so it must be created before the call below.
model = Img2Vec(cuda=False, model='resnet-18')
# One DataFrame row per processed image (columns: name, value, location).
df = pd.DataFrame(data=get_picture_from_glob(arg=GLOB, stop=STOP, ))
print('done in {}'.format(now() - time_start))

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 106MB/s]


embedded 1000 rows with 0 failures in 0:10:02.434533
done in 0:10:03.501064


In [3]:
# Preview the first rows: name, embedding vector and geocoded location.
df.head()

Unnamed: 0,name,value,location
0,highland california,"[0.9073799, 2.4232817, 2.6540484, 0.70218486, ...","[34.12126000000006, -117.21713999999997]"
1,new britain connecticut,"[0.7772473, 0.6293807, 1.8701727, 1.1827742, 0...","[41.66772000000003, -72.78214999999994]"
2,pembroke pines florida,"[1.815567, 2.8266356, 2.9306302, 0.26130608, 0...","[26.008110000000045, -80.28010999999998]"
3,lacey washington,"[1.4230752, 3.5252979, 3.401282, 0.60222894, 0...","[47.04437000000007, -122.82323999999994]"
4,danville california,"[0.4289289, 0.0, 0.23080131, 0.024211202, 0.05...","[37.83202000000006, -122.00539999999995]"


In [4]:
def get_state(arg:str) -> str:
    """Extract the state name from a lowercase 'city ... state' string.

    The state is taken to be the last word, or the last two words when the
    last word can only be the tail of a two-word state name ('new york',
    'new jersey', 'north carolina', 'rhode island', ...).

    Args:
        arg: whitespace-separated place name ending in the state.

    Returns:
        The state name; the single word itself for one-word input, and ''
        for empty or whitespace-only input.
    """
    tokens = arg.split()
    if not tokens:
        return ''
    result = tokens[-1]
    # A lone word has no prefix to join with (previously an IndexError).
    if len(tokens) < 2:
        return result
    prefix = tokens[-2]
    # These words never stand alone as a state, so the preceding word always
    # belongs to the state name.
    if result in {'york', 'mexico', 'hampshire', 'jersey', 'carolina', 'dakota', 'island'}:
        return ' '.join([prefix, result])
    # 'virginia' is a state by itself, so only 'west' is pulled in as a prefix.
    if result == 'virginia' and prefix == 'west':
        return ' '.join([prefix, result])

    return result

# Derive a `state` column from each row's `name` using get_state().
df['state'] = df['name'].apply(func=get_state)

In [5]:
# Preview again to check the newly added `state` column.
df.head()

Unnamed: 0,name,value,location,state
0,highland california,"[0.9073799, 2.4232817, 2.6540484, 0.70218486, ...","[34.12126000000006, -117.21713999999997]",california
1,new britain connecticut,"[0.7772473, 0.6293807, 1.8701727, 1.1827742, 0...","[41.66772000000003, -72.78214999999994]",connecticut
2,pembroke pines florida,"[1.815567, 2.8266356, 2.9306302, 0.26130608, 0...","[26.008110000000045, -80.28010999999998]",florida
3,lacey washington,"[1.4230752, 3.5252979, 3.401282, 0.60222894, 0...","[47.04437000000007, -122.82323999999994]",washington
4,danville california,"[0.4289289, 0.0, 0.23080131, 0.024211202, 0.05...","[37.83202000000006, -122.00539999999995]",california


In [6]:
# now we have some embeddings; let's see how they cluster
from plotly import express
from arrow import now
from umap import UMAP

# Time the UMAP fit separately from the embedding run above.
time_start = now()
# Fixed random_state for a repeatable layout; single job, 1000 epochs,
# progress logged because verbose=True.
umap = UMAP(random_state=2024, verbose=True, n_jobs=1, low_memory=False, n_epochs=1000)
# .apply(pd.Series) expands each embedding vector into one column per
# component; the fitted result is stored as the two plot coordinates x and y.
df[['x', 'y']] = umap.fit_transform(X=df['value'].apply(pd.Series))
print('done with UMAP in {}'.format(now() - time_start))
# Interactive scatter of the projection: hover shows the place name, colour
# encodes the state.
express.scatter(data_frame=df, x='x', y='y', hover_name='name', color='state', height=900).show()

2024-03-13 18:03:46.776436: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-13 18:03:46.776593: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-13 18:03:46.954455: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


UMAP(low_memory=False, n_epochs=1000, n_jobs=1, random_state=2024, verbose=True)
Wed Mar 13 18:04:01 2024 Construct fuzzy simplicial set
Wed Mar 13 18:04:03 2024 Finding Nearest Neighbors
Wed Mar 13 18:04:07 2024 Finished Nearest Neighbor Search
Wed Mar 13 18:04:12 2024 Construct embedding


Epochs completed:   0%|            0/1000 [00:00]

	completed  0  /  1000 epochs
	completed  100  /  1000 epochs
	completed  200  /  1000 epochs
	completed  300  /  1000 epochs
	completed  400  /  1000 epochs
	completed  500  /  1000 epochs
	completed  600  /  1000 epochs
	completed  700  /  1000 epochs
	completed  800  /  1000 epochs
	completed  900  /  1000 epochs
Wed Mar 13 18:04:17 2024 Finished embedding
done with UMAP in 0:00:16.230813


  sf: grouped.get_group(s if len(s) > 1 else s[0])
