# Mosaic Analysis
See this on [Github](https://github.com/yinleon/doppler_tutorials/blob/master/2-mosaic-analysis.ipynb), [NbViewer](https://nbviewer.jupyter.org/github/yinleon/doppler_tutorials/blob/master/2-mosaic-analysis.ipynb)<br>
By Jansen Derr 2021-02-22<br>

In this notebook we will use convolutional features and image metadata to generate visual mosaics over time.

To do so, we must transform our dimension-rich data into 2 dimensions. There is a trove of techniques for dimensionality reduction, but in this case we'll be using an algorithm called UMAP. UMAP is well suited here because it allows us to persist the fitted model and reuse it. This lets us project new data into the same 2-dimensional latent space.

In [None]:
# Enable the autoreload extension in mode 2 so that edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
import os
import sys
import gc
import random
import datetime
import json

from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot
from matplotlib.pyplot import imshow
import imageio
from PIL import Image, ImageFont, ImageDraw 
import joblib
import umap
import numba
from IPython.display import IFrame
from rasterfairy import transformPointCloud2D
from datetime import datetime

import config
from image_utils import resize_image, read_image
import mosaic

In [None]:
# Display the installed numba and umap versions.
numba.__version__, umap.__version__

## Dimensionality Reduction
Fitting UMAP.

In [None]:
# --- UMAP hyper-parameters ---
n_neighbors = 25
metric = 'euclidean'
min_dist = 0.5
training_set_size = config.umap_training_set_size
overwrite_model = False  # flip to True to force the model to be re-trained

# --- Paths of the persisted artifacts ---
# The encoder filename embeds min_dist (with "." swapped for "-"), the metric
# and the training sample size.
file_encoder = '{}/encoder_{}_dist_{}_sample_{}.pkl'.format(
    config.working_dir,
    str(min_dist).replace(".", "-"),
    metric,
    training_set_size,
)
file_training_set = '{}/umap_training_data_{}.csv'.format(
    config.working_dir,
    training_set_size,
)

In [None]:
# Fit a UMAP encoder, or load the previously saved one.
# A new model is trained when no saved encoder exists or `overwrite_model` is set.
if overwrite_model or not os.path.exists(file_encoder):
    # Create the training set (note: UMAP can be either supervised or unsupervised.)
    if os.path.exists(file_training_set):
        # Reuse the persisted sample so a re-train sees exactly the same rows.
        training_set = pd.read_csv(file_training_set,
                                   index_col=0)
    else:
        df_conv = pd.read_csv(config.logits_file,
                              index_col=0,
                              compression='gzip')
        training_set = df_conv[config.cols_conv_feats].sample(training_set_size,
                                                              random_state=303)
        # Save the freshly drawn sample. A sample that was just loaded from
        # this same file is not written back.
        training_set.to_csv(file_training_set)

    # fit the model scikit-learn style
    encoder = umap.UMAP(n_neighbors=n_neighbors,
                        min_dist=min_dist,
                        metric=metric,
                        random_state=303,
                        verbose=1).fit(training_set.values)

    # save the model for later!
    joblib.dump(encoder, file_encoder)
else:
    encoder = joblib.load(file_encoder)

This produces a 2-dimensional point cloud (a scatterplot). We use Mario Klingemann's RasterFairy software to convert this point cloud into neat rows and columns.

## Dimensionality Reduction and Visualization
Here we will reduce the convolutional features from 2048 dimensions to 2 dimensions so they are easy to visualize. We'll need to join each image's metadata (including the path of each file) to the convolutional features, creating `df_merged`. We'll take a sample of that data and visualize it as a scatterplot of images.

In [None]:
# Build (or load from cache) the table that joins image metadata with the
# convolutional features.
if os.path.exists(config.full_metadata_file):
    # A cached join exists: load it and parse the timestamps.
    df_merged = pd.read_csv(config.full_metadata_file,
                            index_col=0,
                            compression='gzip')
    df_merged['created_at'] = pd.to_datetime(df_merged['created_at'],
                                             format='%Y-%m-%d %H:%M:%S')
else:
    # Image metadata, excluding every hash listed in config.skip_hash
    df_media = pd.read_csv(config.image_lookup_file, compression='gzip')
    df_media = df_media[~df_media['d_hash'].isin(config.skip_hash)]
    print(len(df_media))

    # Convolutional features (the first CSV column becomes the index)
    df_conv = pd.read_csv(config.logits_file, index_col=0, compression='gzip')
    print(len(df_conv))

    # Left-join metadata (minus its 'f_img' column) to the features, matching
    # the metadata 'd_hash' against the features' former index.
    merge_cols = [c for c in df_media.columns if c != 'f_img']
    df_merged = df_media[merge_cols].merge(df_conv.reset_index(),
                                           how='left',
                                           left_on='d_hash',
                                           right_on='index')
    df_merged = df_merged.sort_values(by='created_at', ascending=True)
    df_merged['created_at'] = pd.to_datetime(df_merged['created_at'])

    # Cache the joined table for the next run
    df_merged.to_csv(config.full_metadata_file, compression='gzip')

In [None]:
# Preview the first two rows of the merged table.
df_merged.head(2)

In [None]:
# variables for the mosaic
tile_width, tile_height = config.tile_width, config.tile_height # pixel dimenstions per image
nx, ny = config.mosiac_width, config.mosiac_height                  # number of images in the x and y axis
sample_size = nx * ny
aspect_ratio = float(tile_width) / tile_height

In [None]:
# sample the dataset
df_sample = df_merged.sample(sample_size, random_state=303)
min_date = df_sample.created_at.min()
max_date = df_sample.created_at.max()
images = df_sample.f_img
embeddings = encoder.transform(df_sample[config.cols_conv_feats].values)

In [None]:
# Draw the sampled images as a scatterplot using their 2-D embeddings
# (rendering is delegated to the project's `mosaic` module).
mosaic.scatterplot_images(embeddings, images)

## Mosaics
We can further alter these dimensions by reducing the scatterplot into a grid of images using Mario Klingemann's `rasterfairy` Python package. We'll use two utility functions: one to crop and center each image (`preprocess_image_for_mosaics`), and one which converts the scatterplot of 2-dimensional image features into a nice gridded mosaic (`generate_mosaic`).

In [None]:
# Title text: subreddit, date span of the sample, and author
date_from = min_date.strftime('%Y-%m-%d')
date_to = max_date.strftime('%Y-%m-%d')
mosaic_title = (f"Mosaic of r/{config.subreddit} "
                f"from {date_from} "
                f"to {date_to} "
                f"author: {config.author}")

# Arrange the sample into an nx-by-ny grid. With save_as_file=False and
# return_image=True the result is presumably returned for inline display
# rather than written to disk -- confirm against mosaic.generate_mosaic.
mosaic.generate_mosaic(
    embeddings,
    images,
    mosaic_width=nx,
    mosaic_height=ny,
    tile_width=tile_width,
    tile_height=tile_height,
    save_as_file=False,
    verbose=True,
    return_image=True,
    title=mosaic_title,
)

## Generating an Animation
We can create animations with mosaics by sorting our initial dataset by time and traversing through the dataset `offset` images at a time. We'll call the `generate_mosaic` function once for each of the `n_steps` mosaics. We'll keep the filename of each mosaic and use `ImageIO` to combine them into an mp4 file.

In [None]:
# dimensionality reduction on all of them
if not os.path.exists(config.two_dim_embeddings_file):
    encoder = joblib.load(file_encoder)
    embeddings = encoder.transform(df_merged[config.cols_conv_feats].values)
    images = df_merged.f_img
    two_dim_embeddings = pd.DataFrame({'x' : embeddings[:,0], 
                                       'y' : embeddings[:,1], 
                                       'f_img' : images, 
                                       'created_at' : df_merged.created_at}
                                     )
    two_dim_embeddings.sort_values(by='created_at', inplace=True, ascending=True)
    two_dim_embeddings.to_csv(config.two_dim_embeddings_file, index=False)
else:
    two_dim_embeddings = pd.read_csv(config.two_dim_embeddings_file)
    two_dim_embeddings.created_at = pd.to_datetime(two_dim_embeddings.created_at)
    two_dim_embeddings.sort_values(by='created_at', inplace=True, ascending=True)

In [None]:
# Render one mosaic per sliding window over the time-sorted embeddings.
# Window i covers rows [i * offset, i * offset + sample_size).
offset = config.offset

# Count every window that fits entirely inside the data. The "+ 1" accounts
# for the window starting at row 0: without it the last full window is never
# rendered, and a dataset of exactly `sample_size` rows yields no frames.
n_steps = max(0, (len(two_dim_embeddings) - sample_size) // offset + 1)

# Loop-invariant filename prefix: the encoder's file name without its
# directory and without ".pkl".
mosaic_prefix = os.path.basename(file_encoder.replace('.pkl', ''))

mosaic_files = []
try:
    for i in tqdm(range(n_steps)):
        # where will the image live?
        file_mosaic = os.path.join(
            config.mosaic_dir,
            mosaic_prefix
            + f'__mosaic__offset_{offset}_sample_{sample_size}_step_{i}.png')

        # Frames already on disk are reused, which makes the cell resumable.
        if not os.path.exists(file_mosaic):
            # Slice this frame's window out of the time-sorted data
            start = i * offset
            df_sample = two_dim_embeddings[start : start + sample_size]
            min_date = df_sample.created_at.min()
            max_date = df_sample.created_at.max()

            emb = df_sample[['x', 'y']].values
            img = df_sample.f_img
            # create and save the mosaic
            mosaic.generate_mosaic(emb, img,
                                   mosaic_width=nx, mosaic_height=ny,
                                   tile_width=tile_width, tile_height=tile_height,
                                   save_as_file=file_mosaic, return_image=False,
                                   title=f"Mosaic of r/{config.subreddit} "
                                         f"from {min_date.strftime('%Y-%m-%d')} "
                                         f"to {max_date.strftime('%Y-%m-%d')} "
                                         f"author {config.author} - Frame {i}")
        mosaic_files.append(file_mosaic)

except KeyboardInterrupt:
    # Allow stopping early; frames collected so far stay in mosaic_files.
    print("Cancelled early")

In [None]:
# Number of mosaic frames collected for the animation.
len(mosaic_files)

In [None]:
def make_mp4(files, dest, fps=30):
    '''
    Combine a sequence of image files into a video using ImageIO.

    files: iterable of image file paths; each image is appended as one frame,
           in iteration order.
    dest:  output path handed to ``imageio.get_writer``.
    fps:   frames per second passed to the writer (default 30).
    '''
    # The context managers make sure the writer is finalised and every image
    # file handle is released, even if a frame fails to load or encode.
    with imageio.get_writer(dest, fps=fps) as writer:
        for f_img in files:
            with Image.open(f_img) as img:
                writer.append_data(np.array(img))

In [None]:
# imageio.plugins.ffmpeg.download()
# NOTE(review): the line above looks like a one-off ffmpeg download helper for
# older imageio releases -- confirm whether it is still needed.

# Name the video after the current timestamp and the subreddit.
# `datetime` here is the class brought in by `from datetime import datetime`.
# ISO timestamps contain ":", which is not allowed in Windows file names, so
# it is swapped for "-" to keep the output path portable.
timestamp = datetime.now().isoformat().replace(':', '-')
filename = timestamp + '-' + config.subreddit + '.mp4'
filepath = os.path.join(config.output_dir, filename)
make_mp4(mosaic_files, filepath, fps=config.fps)

Knobs to turn:<br>
- Experiment with UMAP parameters.<br>
- Change the pretrained neural network for the feature extraction step.