# Mosaic Analysis
In this notebook we will used convolutional features and image metadata to generate visual mosaics over time.

To do so, we must transform our dimension-rich data into 2-dimensions. There are a trove of techniques to do dimensionality reduction, but in this case we'll be using an algorithm called UMap. UMap is unique because it allows us to persist the model and reuse it. This allows us to project our data into the same 2-dimensional latent space with new data.

In [1]:
%matplotlib inline
import os
import random
import numpy as np
import datetime
from dateutil import relativedelta
import json
import matplotlib.pyplot
import pickle
from matplotlib.pyplot import imshow
from PIL import Image, ImageFont, ImageDraw 
import pandas as pd
from sklearn.externals import joblib
from tqdm import tqdm
import umap.umap_ as umap
import rasterfairy



In [16]:
# change this!
subreddit = 'dankmemes'
working_dir = f'/beegfs/work/smapp/reddit_/{subreddit}/'

image_lookup_file = os.path.join(working_dir, 'media.json.gz')

# these files don't exist yet
logits_file = os.path.join(working_dir, 'image_features_copy.csv.gz')
knn_file = os.path.join(working_dir, 'knn.pkl')

# This is where local images are stored
media_dir = os.path.join(working_dir, 'media')

for _dir in [working_dir, media_dir]:
    os.makedirs(_dir, exist_ok=True)

In [17]:
df_conv = pd.read_csv(logits_file, 
                      index_col=0, 
                      compression='gzip')

In [86]:
df_conv.head(1).to_csv('conv.csv')

In [None]:
len(df)

## Dimensionality Reduction

In [12]:
# UMAP Params
n_neighbors = 25
metric = 'minkowski'
min_dist = 0.1

# Model files
encoder_file = f'{working_dir}/encoder_{str(min_dist).replace(".", "-")}_dist_{metric}.pkl'

In [7]:
encoder = umap.UMAP(n_neighbors=n_neighbors,
                    min_dist=min_dist,
                    metric=metric,
                    random_state=303,
                    verbose=1).fit(df_conv.sample(2000).values)

UMAP(a=None, angular_rp_forest=False, b=None, init='spectral',
   learning_rate=1.0, local_connectivity=1.0, metric='minkowski',
   metric_kwds=None, min_dist=0.1, n_components=2, n_epochs=None,
   n_neighbors=25, negative_sample_rate=5, random_state=303,
   repulsion_strength=1.0, set_op_mix_ratio=1.0, spread=1.0,
   target_metric='categorical', target_metric_kwds=None,
   target_n_neighbors=-1, target_weight=0.5, transform_queue_size=4.0,
   transform_seed=42, verbose=1)
Construct fuzzy simplicial set
Construct embedding


  n_components


	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs


In [13]:
joblib.dump(encoder, encoder_file)

['/beegfs/work/smapp/reddit_/dankmemes//encoder_0-1_dist_minkowski.pkl']

This creates a scatterplot, we use Mario Klingmann's RasterFairy software to convert this pointcloud into neat rows and columns.

## Generate Mosaic

In [8]:
df_media = pd.read_json(image_lookup_file, lines=True, orient='records', compression='gzip')

In [74]:
idx = df_conv.index

In [21]:
df_media_ = df_media[df_media['d_hash'].isin(idx)]

In [25]:
encoder = joblib.load(encoder_file)
encoder

UMAP(a=None, angular_rp_forest=False, b=None, init='spectral',
   learning_rate=1.0, local_connectivity=1.0, metric='minkowski',
   metric_kwds=None, min_dist=0.1, n_components=2, n_epochs=None,
   n_neighbors=25, negative_sample_rate=5, random_state=303,
   repulsion_strength=1.0, set_op_mix_ratio=1.0, spread=1.0,
   target_metric='categorical', target_metric_kwds=None,
   target_n_neighbors=-1, target_weight=0.5, transform_queue_size=4.0,
   transform_seed=42, verbose=1)

In [26]:
embeddings = encoder.transform(df_conv.values)

	completed  0  /  100 epochs
	completed  10  /  100 epochs
	completed  20  /  100 epochs
	completed  30  /  100 epochs
	completed  40  /  100 epochs
	completed  50  /  100 epochs
	completed  60  /  100 epochs
	completed  70  /  100 epochs
	completed  80  /  100 epochs
	completed  90  /  100 epochs


In [75]:
images = df_media_[
    ~df_media_['d_hash'].isin(['NOHASH', '0000000000000000', 'nan'])
].set_index('d_hash').loc[idx]

ValueError: cannot index with vector containing NA / NaN values

In [78]:
[_ for _ in idx if _ == 'nan']

[]

In [84]:
df_conv.index[64]

nan

In [80]:
df_conv[64]

KeyError: 64

In [72]:
df_conv[df_conv.index == 'nan']

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047


In [73]:
images.loc[df_conv.index[:64]]

Unnamed: 0_level_0,approved_at_utc,archived,author,author_cakeday,author_created_utc,author_flair_background_color,author_flair_css_class,author_flair_richtext,author_flair_template_id,author_flair_text,...,subreddit_type,suggested_sort,thumbnail,thumbnail_height,thumbnail_width,title,url,view_count,whitelist_status,wls
d_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3e0f3b3a73e9ae7f,,,mysticbagel14,,,,,[],,,...,public,top,https://b.thumbs.redditmedia.com/Q5FOgxZX7jWIa...,123.0,140.0,Geddit right,https://i.redd.it/f2qeqwlnatl21.jpg,,all_ads,6.0
edfa342363e3064a,,,Nathaniel__Bacon,,,,red,[],,NORMIE,...,public,top,https://b.thumbs.redditmedia.com/6udU2FFGkr3yb...,140.0,140.0,Look how they massacred my boy,https://i.redd.it/514gf53katl21.jpg,,all_ads,6.0
37e4ca82b29003b2,,,shawster,,,,,[],,,...,public,top,https://b.thumbs.redditmedia.com/ncf8wuhQsVmbk...,140.0,140.0,Look at that fucking power pose. She’s fucking...,https://i.imgur.com/Y0rroZU.jpg,,all_ads,6.0
8aae9aca8e96ecce,,,_P_O_O_D_L_E_,,,,,[],,,...,public,top,https://b.thumbs.redditmedia.com/ksHda0L9kIrQd...,140.0,140.0,Give. Me. The. Plant.,https://i.redd.it/3ki4t937atl21.jpg,,all_ads,6.0
ca614d06a45d498d,,,Acalde02,,,,text,[],e7de16a4-ac3f-11e6-b248-0e02d7c351aa,OC Memer,...,public,top,https://a.thumbs.redditmedia.com/2r89iB20J1-DN...,120.0,140.0,And you have to check whether its real or not,https://i.redd.it/frbhcbh4atl21.jpg,,all_ads,6.0
3e9b6d5f18646612,,,christopherl572,,,,text,[],d7b65196-8fd0-11e6-a52a-0e363e8280fe,I have crippling depression,...,public,top,https://b.thumbs.redditmedia.com/46wQufXMtgTNH...,139.0,140.0,The guy looks like thanos tho,https://i.redd.it/bbvffyvw9tl21.jpg,,all_ads,6.0
ac82966464606044,,,Pillowmastr,,,,,[],,,...,public,top,https://b.thumbs.redditmedia.com/RgBfhA06yRQRw...,83.0,140.0,I see your pengu meme and raise you my own,https://i.redd.it/iz9hnwxr9tl21.jpg,,all_ads,6.0
1b4f4e4a9826c6db,,,FreezingLlamaReddit,,,,,[],,,...,public,top,https://b.thumbs.redditmedia.com/o9DoC66TYYLkC...,138.0,140.0,Justice,https://i.redd.it/6fmwndeq9tl21.jpg,,all_ads,6.0
1b4f4e4a9826c6db,,,Turtle_Mcturtle,,,,yellow,[],,EX-NORMIE,...,public,top,https://b.thumbs.redditmedia.com/z90drnwRGyT4d...,138.0,140.0,Cheaters are the WORST!,https://i.redd.it/m4t9uk7pd6j21.jpg,,all_ads,6.0
c585c8c849cd88ca,,,Max_morty,,,,yellow,[],,EX-NORMIE,...,public,top,https://a.thumbs.redditmedia.com/C7XKBfzOPOk6o...,133.0,140.0,Nobody: Jews:,https://i.redd.it/0t8rp08f9tl21.jpg,,all_ads,6.0


In [None]:
nx = 50
ny = 40

# assign to grid
grid_assignment = rasterfairy.transformPointCloud2D(embeddings, target=(nx, ny))

tile_width = 72
tile_height = 56

full_width = tile_width * nx
full_height = tile_height * (ny +1)
aspect_ratio = float(tile_width) / tile_height

grid_image = Image.new('RGB', (full_width, full_height))

for img, grid_pos in zip(images, grid_assignment[0]):
    idx_x, idx_y = grid_pos
    x, y = tile_width * idx_x, tile_height * idx_y
    try:
        tile = Image.open(img)
        tile_ar = float(tile.width) / tile.height  # center-crop the tile to match aspect_ratio
        if (tile_ar > aspect_ratio):
            margin = 0.5 * (tile.width - aspect_ratio * tile.height)
            tile = tile.crop((margin, 0, margin + aspect_ratio * tile.height, tile.height))
        else:
            margin = 0.5 * (tile.height - float(tile.width) / aspect_ratio)
            tile = tile.crop((0, margin, tile.width, margin + float(tile.width) / aspect_ratio))
        tile = tile.resize((tile_width, tile_height), Image.ANTIALIAS)
        grid_image.paste(tile, (int(x), int(y)))
    except:
        pass
    
# get a font
fnt = ImageFont.truetype('Pillow/Tests/fonts/FreeMono.ttf', tile_height - 6)
# get a drawing context
draw = ImageDraw.Draw(grid_image)
draw.text((4, tile_height * (ny)), f"Gab Mosaic on {date.year}/{date.month}/{date.day} data via PushShift.io @LeonYin", 
          (128, 255, 0), font=fnt)
grid_image

## Animations
We can create animations with these GIFs by sorting our initial dataset by time

In [None]:
import glob
import imageio

pattern = '/beegfs/ly501/tiles/pol/charlottesville_100/pol_2000_[0-9][0-9][0-9][0-9][0-9][0-9].jpg'
out = '/beegfs/ly501/tiles/pol/gif/charlotte_100_out_30fps_august.gif'

def make_gif(pattern, dest, duration= .25):
    '''
    Saves a png for each congress into the figs subdirectory.
    Uses ImageIO to combine images into a gif.
    Deletes all png files in directory.
    '''

    filenames = glob.glob(pattern)
    filenames.sort()
    images = []

    for filename in filenames:
        images.append(imageio.imread(filename))

    kwargs = { 'duration': duration }
    imageio.mimsave(dest, images,  **kwargs)

def make_mp4(pattern, dest, duration=30):
    '''
    Saves a png for each congress into the figs subdirectory.
    Uses ImageIO to combine images into a gif.
    Deletes all png files in directory.
    '''
    filenames = glob.glob(pattern)
    filenames.sort()
    images = []

    writer = imageio.get_writer(dest, fps=duration)
    for filename in filenames:
        writer.append_data(imageio.imread(filename))
    writer.close()

make_mp4(pattern, out, duration = 23)