# Part 1: Create a labeled image dataset with two classes

1. Indoor photographs (e.g. Bedrooms, Bathrooms, Classrooms, Offices) 
2. Outdoor photographs (e.g. Landscapes, Skyscrapers, Mountains, Beaches)

## Setup

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import os
import math
import sklearn
from glob import glob
import tensorflow as tf
from IPython.display import YouTubeVideo

  _nan_object_mask = _nan_object_array != _nan_object_array


In [2]:
# show the notebook's current working directory
%pwd

'/Users/administrator/Documents/pex_challenge/analysis_notebooks'

In [3]:
# root folder of the project
project_dir = Path('/Users/administrator/Documents/pex_challenge/')
# folder below the project root that holds the frame-level data
data_dir = project_dir / 'data/yt8m/frame'

## Step 1
Download a subset of examples from the YouTube-8M labeled video dataset: https://research.google.com/youtube8m/explore.html

In [18]:
# change the working directory to data_dir, the directory the data
# should be downloaded into
%cd {data_dir}

/Users/administrator/Documents/pex_challenge/data/yt8m/frame


In [None]:
# download 1/100th of the frame-level training data (shard=1,100)
!curl data.yt8m.org/download.py | shard=1,100 partition=2/frame/train mirror=us python

In [None]:
# download the 1/100th of the validate frame level data
%%capture # stops from displaying the output to manage file size
curl data.yt8m.org/download.py | shard=1,20 partition=2/frame/validate mirror=us python

In [None]:
# download the 1/100th of the test frame level data
%%capture # stops from displaying the output to manage file size
curl data.yt8m.org/download.py | shard=1,20 partition=2/frame/test mirror=us python

## Step 2

Extract relevant frames from the videos to build a balanced dataset of indoor and outdoor images. The dataset should contain a few thousand images in total. This task can be performed with tools like OpenCV or FFmpeg.

In [10]:
# vocabulary csv with information about the labels of the videos
label_file = project_dir.joinpath('data/vocabulary.csv')
df_labels = pd.read_csv(label_file.as_posix())

# second vocabulary csv; judging by its name it additionally carries weights
label_weight_file = project_dir.joinpath('data/vocabulary_with_weights.csv')
df_labels_weights = pd.read_csv(label_weight_file.as_posix())

In [34]:
def extract_data(file, file_name = 'data', max_images = 3000):
    '''
    Read the frame level data of one tfrecord file.

    Every frame of every video becomes one row of the returned dataframe:
    'id' holds the bytes of the video id as a float32 array, 'rgb' holds the
    frame's rgb bytes as a float32 array and 'labels' holds the list of
    numerical labels of the video the frame belongs to.

    After each video the rows collected so far are written to
    project_dir/data/<file_name>.csv, so partial results survive an
    interrupted run.

    file: the path to a tfrecord file
    file_name: name (without extension) of the csv file that is written
    max_images: stop reading further videos once more than this many frames
        have been collected (the video in progress is always finished)

    returns: a dataframe with the columns 'id', 'rgb' and 'labels'
    '''

    columns = ['id', 'rgb', 'labels']
    csv_path = project_dir.joinpath('data/' + file_name + '.csv').as_posix()

    # rows are gathered in a plain list; appending to a dataframe copies the
    # whole frame every time and DataFrame.append no longer exists in pandas 2
    rows = []
    df = pd.DataFrame(columns = columns)

    for num_video, e in enumerate(tf.python_io.tf_record_iterator(file), start = 1):
        print(num_video, len(rows))

        tf_seq_example = tf.train.SequenceExample.FromString(e)
        context = tf_seq_example.context.feature

        # id and labels belong to the whole video, so decode them once per
        # video; all rows of one video share these two objects
        # np.frombuffer(..., uint8).astype(float32) yields the same values as
        # tf.cast(tf.decode_raw(..., tf.uint8), tf.float32).eval(), without
        # opening a session or adding new ops to the graph for every frame
        video_id = np.frombuffer(
            context['id'].bytes_list.value[0], dtype = np.uint8
        ).astype(np.float32)
        # copy the labels into a list so the row does not keep the protobuf
        # message alive
        arr_labels = list(context['labels'].int64_list.value)

        # iterate over the rgb features themselves instead of indexing them
        # with the number of 'audio' features
        for frame in tf_seq_example.feature_lists.feature_list['rgb'].feature:
            arr_rgb = np.frombuffer(
                frame.bytes_list.value[0], dtype = np.uint8
            ).astype(np.float32)
            rows.append({
                'id': video_id,
                'rgb': arr_rgb,
                'labels': arr_labels
            })

        # save the dataframe after every video analyzed
        df = pd.DataFrame(rows, columns = columns)
        df.to_csv(csv_path)

        # stop once we hold information on more than max_images images
        if len(rows) > max_images:
            break

    return df

In [24]:
def classify_indoor_outdoor(df, vocabulary):
    '''
    Classify every image of df as indoor or outdoor from its labels.

    The weights of all labels of an image are looked up in vocabulary and
    averaged. An average of at least 0.5 marks the image as indoor (1),
    anything lower as outdoor (0). An image without any labels has no
    average and is marked 0.

    The column 'indoor' is added to df in place and df is returned as well.

    df: the dataframe with image information; needs a 'labels' column holding
        a sequence of numerical labels per row
    vocabulary: a dataframe with the columns 'Index' (numerical label) and
        'weights' (indoor weight of that label)

    raises IndexError if a label does not occur in vocabulary
    '''

    # build the label -> weight lookup once instead of filtering the whole
    # vocabulary for every single label; if an Index occurs more than once
    # the first row wins, as it did with the filter-based lookup
    unique_vocabulary = vocabulary.drop_duplicates(subset='Index')
    weight_by_label = dict(zip(unique_vocabulary['Index'],
                               unique_vocabulary['weights']))

    indoor = []

    # iterate through the labels of each image
    for labels in df['labels']:
        # len() instead of truthiness: labels may be a numpy array
        if len(labels) == 0:
            # no labels -> no average to compute (would divide by zero)
            indoor.append(0)
            continue

        try:
            weight_sum = sum(weight_by_label[label] for label in labels)
        except KeyError as err:
            raise IndexError(
                'label {} is missing from the vocabulary'.format(err)
            ) from None

        # take the average of the weights and convert it to 1 or 0
        weight = weight_sum / len(labels)
        indoor.append(1 if weight >= 0.5 else 0)

    df['indoor'] = indoor
    return df

In [7]:
# get all the tfrecord files that we are going to read
# glob yields the paths in an arbitrary, filesystem-dependent order; sort them
# so that positional access such as tf_files[1:2] is reproducible
tf_files = sorted(data_dir.glob('*.tfrecord'))

In [None]:
# walk over the selected tfrecord files (each one holds thousands of videos)
# and extract the information about every frame of their videos
for tfrecord_path in tf_files[1:2]:
    df = extract_data(tfrecord_path.as_posix(), file_name='data1')

1 0
2 229
3 529
4 749
5 899
6 1199


In [None]:
# add the 'indoor' column (1 = indoor, 0 = outdoor) derived from the label weights
df = classify_indoor_outdoor(df, df_labels_weights)

## Step 3

Create a train/test split of the data

In [48]:
def reformat_data(df):
    '''
    Convert df into a new dataframe where each value of the rgb array has
    its own column.

    df: a dataframe with the columns 'id', 'rgb' (one sequence of values per
        row) and 'indoor'

    returns: a new dataframe with the columns 'id', 'indoor' and
        'rgb0', 'rgb1', ... The columns are sorted by name, which is the
        order the previous append-based implementation produced: 'id' and
        'indoor' come first, followed by the rgb columns in lexicographic
        order (rgb0, rgb1, rgb10, ...). Rows with fewer rgb values than the
        longest row get NaN in the missing columns. An empty df gives an
        empty dataframe without columns.
    '''

    # collect plain dictionaries and build the dataframe once at the end;
    # appending row by row copies the whole frame for every row and
    # DataFrame.append no longer exists in pandas 2
    records = []
    # the row with the most rgb values defines the full set of columns,
    # because every shorter row uses a subset of its keys
    widest_row = {}

    for video_id, arr_rgb, indoor in zip(df['id'], df['rgb'], df['indoor']):
        new_row = {'id': video_id, 'indoor': indoor}
        for position, val in enumerate(arr_rgb):
            new_row['rgb' + str(position)] = val

        if len(new_row) > len(widest_row):
            widest_row = new_row
        records.append(new_row)

    return pd.DataFrame(records, columns=sorted(widest_row))

In [None]:
# give every value of the rgb arrays its own column
df_reformat = reformat_data(df)

In [58]:
# create a train test split of the data
from sklearn.model_selection import train_test_split

# select the feature columns by name instead of by position: with iloc[:, 2:]
# a different column order would silently drop an rgb column and leak the
# 'indoor' target (or the 'id') into the features
X_train, X_test, y_train, y_test = train_test_split(
    df_reformat.drop(['id', 'indoor'], axis=1), df['indoor'],
    test_size=0.10)

In [67]:
# write the four parts of the split to csv files below project_dir;
# the targets are wrapped in a dataframe before they are written
split_outputs = [
    (X_train, 'data/Xtrain.csv'),
    (X_test, 'data/Xtest.csv'),
    (pd.DataFrame(y_train), 'data/ytrain.csv'),
    (pd.DataFrame(y_test), 'data/ytest.csv'),
]
for frame, relative_path in split_outputs:
    frame.to_csv(project_dir.joinpath(relative_path))