In [2]:
import os
import warnings

import pandas as pd
from tqdm import tqdm

import config
import context_manager
import data_sources.pushshift as ps
from image_utils import download_media_and_return_dhash, read_image

from PIL import Image
from tqdm import tqdm
from sklearn.neighbors import NearestNeighbors
import joblib
import umap


import torch
from torch import nn
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
from torchvision import datasets, models, transforms

from config import cols_conv_feats, skip_hash
from image_utils import read_image, read_and_transform_image

warnings.filterwarnings('ignore')

In [3]:
from sqlalchemy import create_engine
from rasterfairy import transformPointCloud2D

In [4]:
# Are we using a GPU? If not, the device will be the CPU.
# The second GPU (index 1) is preferred when the host has more than one;
# single-GPU hosts fall back to index 0 instead of requesting a device that
# does not exist.
if torch.cuda.is_available():
    device = torch.device('cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0')
else:
    device = torch.device('cpu')
device

device(type='cpu')

In [5]:
from config import cols_conv_feats, skip_hash


In [6]:
import json

In [7]:
def get_subreddit_context(subreddit):
    '''
    Resolve (and create) the on-disk locations used for one subreddit.

    Every directory listed in the returned mapping is created if it does not
    exist yet; the two ``file_*`` entries are only paths and are not created.

    subreddit: name of the subreddit, used as a folder under config.data_dir.
    Returns a dict mapping location names to paths.
    '''
    def under_subreddit(*parts):
        # Path inside this subreddit's own folder.
        return os.path.join(config.data_dir, subreddit, *parts)

    sub_dir = under_subreddit()

    context = {
        'data_dir' : config.data_dir,
        'sub_dir' : sub_dir,
        "sub_config_dir": under_subreddit("config"),
        # Media is stored once for all subreddits, directly under data_dir.
        'media_dir' : os.path.join(config.data_dir, 'media'),
        'file_subreddit' : os.path.join(sub_dir, 'posts.csv.gz'),
        'file_subreddit_media' : os.path.join(sub_dir, 'media.csv.gz'),
        "image_metas": under_subreddit("image_metas"),
        "logits_dir": under_subreddit("image_features"),
        "full_metadata_dir": under_subreddit("full_metadata"),
    }

    directory_keys = ('data_dir', 'sub_dir', 'media_dir', 'sub_config_dir',
                      'image_metas', 'logits_dir', 'full_metadata_dir')
    for key in directory_keys:
        os.makedirs(context[key], exist_ok=True)

    return context

In [8]:
import time

# Target subreddit and the single calendar day (YYYYMMDD) to collect.
subreddit = "dankmemes"
input_date = '20210102'

# Day boundaries: `date` is the requested day, `next_date` the day after.
date = pd.to_datetime(input_date).date()
next_date = date + pd.to_timedelta("1D")

# Both boundaries as integer Unix timestamps. time.mktime interprets the
# struct_time in the machine's local timezone.
start_ux, end_ux = (int(time.mktime(day.timetuple())) for day in (date, next_date))

In [9]:
context = get_subreddit_context(subreddit)

In [10]:
# Display the resolved paths returned by get_subreddit_context.
context

{'data_dir': 'data/platforms/reddit',
 'sub_dir': 'data/platforms/reddit\\dankmemes',
 'sub_config_dir': 'data/platforms/reddit\\dankmemes\\config',
 'media_dir': 'data/platforms/reddit\\media',
 'file_subreddit': 'data/platforms/reddit\\dankmemes\\posts.csv.gz',
 'file_subreddit_media': 'data/platforms/reddit\\dankmemes\\media.csv.gz',
 'image_metas': 'data/platforms/reddit\\dankmemes\\image_metas',
 'logits_dir': 'data/platforms/reddit\\dankmemes\\image_features',
 'full_metadata_dir': 'data/platforms/reddit\\dankmemes\\full_metadata'}

In [11]:
# Path of the per-day gzip CSV holding the downloaded posts (a file path,
# despite the variable name).
new_dir = f"{context['sub_config_dir']}/{input_date}.csv.gz"

In [12]:
# Load the day's posts from the local cache when present; otherwise download
# them and write the cache.
if os.path.exists(new_dir):
    df = pd.read_csv(new_dir)
else:
    records = ps.download_subreddit_posts(subreddit, start_ux, end_ux, verbose=False)
    df = pd.DataFrame(records)
    # An empty download, or a day where no post carries a preview, yields a
    # frame without a 'preview' column; create it so the encoding below and
    # the cells that read the column do not fail with a KeyError.
    if 'preview' not in df.columns:
        df['preview'] = None
    # Store the nested preview structure as a JSON string so it survives the
    # CSV round trip.
    df['preview'] = df['preview'].apply(json.dumps)
    df.to_csv(new_dir, index=False, compression='gzip')

In [13]:
# Decode the JSON-encoded `preview` column. Missing values become "{}" first,
# and values that are no longer strings are left untouched so the cell can be
# re-run without json.loads failing on already-decoded values.
df['preview'] = df['preview'].fillna("{}").apply(
    lambda p: json.loads(p) if isinstance(p, str) else p
)

# Keep rows whose decoded preview is not null, as a list of per-post dicts.
df_media = df[~df.preview.isnull()]
df_media = df_media.to_dict(orient='records')

# Per-day cache file of the image metadata built below.
image_meta_dir = context['image_metas']+"/"+input_date+".csv.gz"

if os.path.exists(image_meta_dir):
    _df_img_meta = pd.read_csv(image_meta_dir)
else:
    img_meta = []

    # Iterating through tqdm advances the bar for every post, including the
    # ones skipped for having no preview images.
    for row in tqdm(df_media):
        preview = row.get('preview')
        if not preview:
            continue
        for img in preview.get('images') or []:
            r = row.copy()
            img_url, f_img = context_manager.get_media_context(img, context)
            if not img_url:
                continue
            try:
                d_hash, img_size = download_media_and_return_dhash(img_url, f_img)
            except Exception as exc:
                # Best effort: report which image failed and keep going.
                print(f"ERROR! {img_url}: {exc}")
                continue

            # A reported size of 0 is recorded as a deleted image.
            r['deleted'] = not (img_size != 0)
            r['d_hash'] = d_hash
            r['f_img'] = f_img
            r['img_size'] = img_size
            img_meta.append(r)
    _df_img_meta = pd.DataFrame(img_meta)
    _df_img_meta.to_csv(image_meta_dir, index=False, compression='gzip')

In [14]:
# Display the per-image metadata (one row per downloaded preview image).
_df_img_meta

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_patreon_flair,author_premium,...,removed_by_category,author_cakeday,media,media_embed,secure_media,secure_media_embed,deleted,d_hash,f_img,img_size
0,[],False,harrycrunk,,[],,text,t2_1qfqw61e,False,False,...,,,,,,,False,131729094545070f,data/platforms/reddit\media\q_\er\Q_erM3a_lYvf...,26550
1,[],False,Thelazytimetraveller,,"[{'e': 'text', 't': 'mod collector '}, {'a': '...",mod collector :beeg_yoshi::sooz_bow::noodles::...,richtext,t2_4woom0lt,False,False,...,,,,,,,False,491b2e1721393167,data/platforms/reddit\media\rw\1c\rw1CuUoizYJG...,56118
2,[],False,PowerfulOperation8,pulse,"[{'a': ':maymay:', 'e': 'emoji', 'u': 'https:/...",:maymay: Maymaymaker :maymay:,richtext,t2_38jvij40,False,False,...,,,,,,,True,NOHASH,data/platforms/reddit\media\2x\ew\2XewVSYnZPxv...,0
3,[],False,calizoomer,,"[{'e': 'text', 't': '☣️'}]",☣️,richtext,t2_69d7qby2,False,False,...,,,,,,,False,53910663d1d0ccde,data/platforms/reddit\media\ro\h-\rOh-pn26Lwif...,104050
4,[],False,theredditor13,,[],,text,t2_4m6vn0cv,False,False,...,,,,,,,False,7c1a094b70e02879,data/platforms/reddit\media\-f\ta\-FtAzj7sYjDd...,108619
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2310,[],False,Inkling4,,"[{'e': 'text', 't': '☝ FOREVER NUMBER ONE ☝'}]",☝ FOREVER NUMBER ONE ☝,richtext,t2_yq2n8,False,False,...,,,,,,,False,2bc0d194e6eea4c8,data/platforms/reddit\media\vm\pd\vmPDZQhLyqas...,46502
2311,[],False,HRUDUS,,[],,text,t2_8zqqcbc8,False,False,...,moderator,,,,,,False,c099a6723371d872,data/platforms/reddit\media\ul\gz\ULgZ97Q-VzAK...,48617
2312,[],False,Sike1dj,,[],,text,t2_a8zez,False,False,...,,,,,,,False,6333212701a62404,data/platforms/reddit\media\6u\ud\6uUdCuHm2y30...,143750
2313,[],False,XxF1RExX,pulse,"[{'e': 'text', 't': 'bitly.com/1H9DQSz'}]",bitly.com/1H9DQSz,richtext,t2_38hew926,False,True,...,,,,,,,False,a09c9d9c9e9e46a7,data/platforms/reddit\media\kz\0z\kz0zjYeCnTSa...,175871


In [15]:
# Drop images whose d_hash appears in the configured skip list.
_df_img_meta = _df_img_meta.loc[~_df_img_meta['d_hash'].isin(skip_hash)]

# Each image is resized to 224x224, converted to a tensor and normalized
# per channel before it is fed to the PyTorch model.
channel_mean = [0.485, 0.456, 0.406]
channel_std = [0.229, 0.224, 0.225]

scaler = transforms.Resize((224, 224))
to_tensor = transforms.ToTensor()
normalizer = transforms.Normalize(mean=channel_mean, std=channel_std)

# Applied in this order to every image.
transformations = transforms.Compose([scaler, to_tensor, normalizer])

class Feature_Extraction_Dataset(Dataset):
    """Dataset wrapping images and file names.

    Rows of ``df`` that share a ``d_hash`` are collapsed to their first
    occurrence, so each distinct hash is served once.

    img_col is the column holding the path of the image to be read.
    index_col is a unique value used to index the extracted features.

    Each item is a tuple ``(img, img_file, img_idx)``: the transformed image,
    its file path and its ``index_col`` value.
    """
    def __init__(self, df, img_col, index_col):
        # One row per distinct d_hash. The index is reset so the positional
        # lookups in __getitem__ line up with 0..len-1.
        self.X_train = df.drop_duplicates(subset='d_hash').reset_index(drop=True)
        self.files = self.X_train[img_col]
        self.idx = self.X_train[index_col]

    def __getitem__(self, index):
        img_idx = self.idx[index]
        img_file = self.files[index]
        try:
            img = read_and_transform_image(img_file, transformations)
        except Exception as exc:
            # Returning None here (the previous behaviour) only surfaced later
            # as an opaque collate error; fail with the offending file instead.
            raise RuntimeError(
                f"Could not read or transform image {img_file!r} "
                f"(index {img_idx!r}): {exc}"
            ) from exc
        return img, img_file, img_idx

    def __len__(self):
        return len(self.X_train.index)

# Serve each image path from `f_img`, keyed by its `d_hash`.
dataset = Feature_Extraction_Dataset(df=_df_img_meta, img_col='f_img', index_col='d_hash')

# Batches are read in dataset order in the main process (no worker processes).
loader_options = dict(batch_size=config.batch_size, shuffle=False, num_workers=0)
data_loader = DataLoader(dataset, **loader_options)

def load_resnet_for_feature_extraction():
    """Build a frozen ResNet-50 without its final layer, for feature extraction.

    The model is moved to the module-level ``device``, all of its parameters
    have gradients disabled, and it is returned in eval mode.
    """
    # Pre-trained network; every child module except the last is kept, which
    # leaves the convolutional features as the output.
    backbone = models.resnet50(pretrained=True)
    feature_layers = list(backbone.children())[:-1]
    res50_conv = nn.Sequential(*feature_layers).to(device)

    # No backprop: the model is only used for predictions.
    for weight in res50_conv.parameters():
        weight.requires_grad_(False)

    return res50_conv.eval()

res50_conv = load_resnet_for_feature_extraction()

# Per-day cache file of convolutional features (a file path, despite the name).
logits_dir = context['logits_dir']+"/"+input_date+".csv.gz"

# Previously extracted features, if any; its index holds the image hashes.
conv = None
if os.path.exists(logits_dir):
    conv = pd.read_csv(logits_dir, index_col=0).drop_duplicates()

df_convs = []
# Inference only: no_grad makes sure no autograd state is recorded.
with torch.no_grad():
    for (X, img_file, idx) in tqdm(data_loader):
        # Hashes in this batch that are not in the cache yet.
        filt = [i for i in idx if conv is None or i not in conv.index]
        if not filt:
            continue

        logits = res50_conv(X.to(device))
        # [batch_size, 2048, 1, 1] -> [batch_size, 2048]
        logits = logits.squeeze(2).squeeze(2)

        # {hash: np.array([x0, x1, ... x2047])}
        logits_dict = dict(zip(idx, logits.detach().cpu().numpy()))

        df_conv = pd.DataFrame.from_dict(logits_dict,
                                         columns=cols_conv_feats,
                                         orient='index')
        # add a column for the filename of images...
        df_conv['f_img'] = img_file
        # ...and keep only the rows that were missing from the cache.
        df_convs.append(df_conv.loc[filt])

# Combine cached and new features. pd.concat raises an obscure error when it
# is given nothing but None, so report the empty case explicitly.
available = [frame for frame in (conv, *df_convs) if frame is not None]
if not available:
    raise RuntimeError(
        f"No image features available for {input_date}: "
        "the dataset is empty and no cached features exist."
    )
conv = pd.concat(available).drop_duplicates()

conv.to_csv(logits_dir, compression='gzip')

# UMAP Params
n_neighbors = 25
metric = 'euclidean'
min_dist = 0.5
training_set_size = config.umap_training_set_size
overwrite_model = False # set to True to re-train the model.
os.makedirs(f'{ config.working_dir }/encoders',exist_ok=True)
# NOTE(review): this directory is created but the training set below is
# written directly under working_dir — confirm which location is intended.
os.makedirs(f'{ config.working_dir }/umap_training_data',exist_ok=True)
# Model files
file_encoder = (f'{ config.working_dir }/encoders/{ str(min_dist).replace(".", "-") }_'
                f'dist_{ metric }_sample_{ training_set_size }_{input_date}.pkl')
file_training_set = f'{ config.working_dir }/{ training_set_size }_{input_date}.csv'

if not os.path.exists(file_encoder) or overwrite_model:
    # Create the training set (note: UMAP can be either supervised or unsupervised.)
    if not os.path.exists(file_training_set):
        # Sampling without replacement fails when more rows are requested than
        # exist, so cap the request at the number of available feature rows.
        # The file names above still carry the configured size.
        n_train = min(training_set_size, len(conv))
        training_set = conv[config.cols_conv_feats].sample(n_train,
                                                           random_state=303)
    else:
        training_set = pd.read_csv(file_training_set,
                                   index_col=0)

    # fit the model scikit-learn style
    encoder = umap.UMAP(n_neighbors=n_neighbors,
                        min_dist=min_dist,
                        metric=metric,
                        random_state=303,
                        verbose=1).fit(training_set.values)

    # save the model for later! Save the training data, too.
    joblib.dump(encoder, file_encoder)
    training_set.to_csv(file_training_set)
else:
    # joblib.load unpickles the file: only load encoders written by this
    # notebook, never files from an untrusted source.
    encoder = joblib.load(file_encoder)

# Per-day cache of the merged metadata + features. Note that this rebinds
# `logits_dir`, which until here pointed at the feature cache file.
logits_dir = context['full_metadata_dir']+"/"+input_date+".csv.gz"

# Join the image metadata with convolutional features
if not os.path.exists(logits_dir):
    # `f_img` is taken from the feature table, so it is left out of the
    # metadata side to avoid a duplicated column.
    merge_cols = [c for c in _df_img_meta.columns if c != 'f_img']
    # The feature table is indexed by image hash; reset_index exposes it as
    # the 'index' column used for the join.
    df_merged = pd.merge(left=_df_img_meta[merge_cols],
                         right=conv.reset_index(),
                         how='left',
                         left_on='d_hash',
                         right_on='index')
    df_merged.to_csv(logits_dir,
                     compression='gzip')
else:
    df_merged = pd.read_csv(logits_dir,
                            index_col=0,
                            compression='gzip')

# Pixel dimensions of each image tile.
tile_width = config.tile_width
tile_height = config.tile_height

# Grid size: `nx` tiles across (from config) and as many complete rows as the
# merged table can fill.
nx = config.mosiac_width
ny = len(df_merged) // nx
sample_size = nx * ny
aspect_ratio = float(tile_width) / tile_height




100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 185/185 [00:13<00:00, 14.05it/s]


NameError: name 'df_sample' is not defined

In [18]:

# Reproducible sample with exactly one row per mosaic tile.
df_sample = df_merged.sample(n=sample_size, random_state=303)

In [27]:
from image_utils import resize_image

In [29]:
from PIL import Image, ImageFont, ImageDraw 

In [28]:
full_width = tile_width * nx
# Two extra tile rows are added below the image grid.
full_height = tile_height * (ny + 2)
aspect_ratio = float(tile_width) / tile_height

# create an empty image for the mosaic
mosaic = Image.new('RGB', (full_width, full_height))

# iterate through each image and where it is supposed to live.
# `total` gives tqdm a real progress bar; zip alone has no length.
n_tiles = min(len(images), len(grid_assignment[0]))
for f_img, (idx_x, idx_y) in tqdm(zip(images, grid_assignment[0]),
                                  total=n_tiles,
                                  disable = False):
    # Find exactly where the image will be
    x, y = tile_width * idx_x, tile_height * idx_y

    # read the image, resize it to the tile and add it to the mosaic
    try:
        # The context manager closes the source file as soon as the
        # converted copy has been made.
        with Image.open(f_img) as source_img:
            img = source_img.convert('RGBA')
        tile = resize_image(img, tile_width, tile_height, aspect_ratio)
        mosaic.paste(tile, (int(x), int(y)))
    except Exception as e:
        # Best effort: a tile that cannot be read is reported and left empty.
        print(f"Failed to add image {f_img} see error:\n{e}")

# write an annotation
#fnt = ImageFont.truetype('Pillow/Tests/fonts/FreeMono.ttf', int(tile_height * 1.2) )


1500it [00:15, 95.33it/s] 


NameError: name 'ImageDraw' is not defined

In [None]:
# NOTE(review): not referenced by any other visible cell — confirm whether
# this value is still needed.
mosaic_height = 1000

In [32]:
# Write the annotation just below the last row of tiles.
# NOTE(review): `title_rbg` is not defined in any visible cell — confirm
# where the text colour comes from.
draw = ImageDraw.Draw(mosaic)
title_origin = (4, (tile_height * (ny)) + 10)
draw.text(title_origin, "title", title_rbg)

ValueError: unknown color specifier: ''

In [19]:
# Sample one row per tile and keep the image paths in sample order.
df_sample = df_merged.sample(n=sample_size, random_state=303)
images = df_sample['f_img']

# Run the fitted encoder over the sampled convolutional features.
feature_matrix = df_sample[config.cols_conv_feats].values
embeddings = encoder.transform(feature_matrix)

# NOTE(review): this binds the name `mosaic` to a module, while another cell
# assigns the PIL canvas to the same name — confirm the import is needed.
import mosaic

# Assign the embedded points to cells of an nx-by-ny grid.
grid_assignment = transformPointCloud2D(embeddings, target=(nx, ny))

grid_assignment

# Store each image's integer grid cell next to its metadata.
df_sample[['x', 'y']] = grid_assignment[0].astype(int)

	completed  0  /  100 epochs
	completed  10  /  100 epochs
	completed  20  /  100 epochs
	completed  30  /  100 epochs
	completed  40  /  100 epochs
	completed  50  /  100 epochs
	completed  60  /  100 epochs
	completed  70  /  100 epochs
	completed  80  /  100 epochs
	completed  90  /  100 epochs


NameError: name 'output_df' is not defined

In [20]:
# Display the sampled image paths (one per mosaic tile).
images

1466    data/platforms/reddit\media\ty\3f\ty3fOqjq7uP3...
1341    data/platforms/reddit\media\wb\sg\WBsGJM2EeEHh...
1178    data/platforms/reddit\media\dk\jy\dKJYtpeu8jHR...
1334    data/platforms/reddit\media\8t\j2\8tj2bkzxb08x...
78      data/platforms/reddit\media\n6\rc\N6RcVjoW9DUl...
                              ...                        
975     data/platforms/reddit\media\ft\qg\ftqgGJZYCCmG...
180     data/platforms/reddit\media\49\nf\49nF0M1E-1m3...
369     data/platforms/reddit\media\0u\o_\0uO_iHiMeAcF...
370     data/platforms/reddit\media\be\yu\Beyuxf2o4S37...
200     data/platforms/reddit\media\ye\n4\YEn4gpv7CBGs...
Name: f_img, Length: 1500, dtype: object

In [None]:
# NOTE(review): unexecuted duplicate of the previous display cell — candidate
# for removal.
images

In [None]:
def get_sql_connection():
    """Create the SQLAlchemy engine used to publish mosaic coordinates.

    The full connection URL, credentials included, is read from the
    ``MOSAIC_DB_URL`` environment variable so that no secret is stored in the
    notebook, e.g. ``mysql://USER:PASSWORD@HOST:3306/db?charset=utf8``.

    Returns:
        The engine returned by ``create_engine``.

    Raises:
        RuntimeError: if ``MOSAIC_DB_URL`` is unset or empty.
    """
    # NOTE(review): the password that used to be hard-coded here has been
    # exposed in the notebook history and should be rotated.
    db_url = os.environ.get("MOSAIC_DB_URL")
    if not db_url:
        raise RuntimeError(
            "Set the MOSAIC_DB_URL environment variable to the database "
            "connection URL before exporting the mosaic."
        )
    # The `encoding` keyword is no longer passed: it is deprecated in newer
    # SQLAlchemy releases, and the charset is carried by the URL query string.
    db = create_engine(db_url)
    return db

db = get_sql_connection()

# Columns published for each tile: the post links and its grid cell.
wanted_cols = ["url", "full_link", "x", "y"]
output_df = df_sample[wanted_cols]

# Append to the `mosaics` table, without writing the DataFrame index.
output_df.to_sql(name="mosaics", con=db, index=False, if_exists="append")