## Setup

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
import os
import math
import sklearn
from glob import glob
import tensorflow as tf
from IPython.display import YouTubeVideo

In [None]:
# Show the current working directory (IPython magic) as a sanity check.
%pwd

In [None]:
# Root folder of the project; the data folder used below is the
# 'data/yt8m/frame' sub-folder underneath it.
project_dir = Path('/Users/administrator/Documents/pex_challenge/')
data_dir = project_dir / 'data/yt8m/frame'

### Load Vocabulary CSV

In [None]:
# Path of the vocabulary CSV, which holds the information about the video labels.
label_file = project_dir / 'data/vocabulary.csv'
# Load that comma-separated file into a dataframe.
df_labels = pd.read_csv(label_file.as_posix(), sep=',')

### Generate Weights for each Label

In [None]:
def generate_weights(df):
    '''
    Assign each vocabulary label a weight expressing how likely it is to be indoor.

    Every row's ``Name`` and ``WikiDescription`` are converted to lowercase
    strings and searched (plain substring search) for marker words:

    * 0.8 -- at least one indoor marker occurs in the name or description
    * 0.2 -- no indoor marker occurs, but at least one outdoor marker does
    * 0.5 -- neither kind of marker occurs

    Indoor markers take precedence over outdoor markers regardless of whether
    the outdoor marker is found in the name or in the description.

    The weights are stored in a new ``weights`` column on ``df`` itself
    (the dataframe is modified in place) and the same dataframe is returned.
    '''

    indoor_markers = ['room','office','indoor', 'class', 'house', 'home', 'gym', 'facility', 'building', 'kitchen',
                     'tv', 'computer', 'library', 'cafe', 'restaurant', 'concert', 'guitar', 'pet', 'game',
                     'theatre', 'performance art', 'movie', 'film', 'lunch', 'cook', 'recipe', 'instrument', 'store', 
                     'shop', 'food', 'sing', 'art', 'hair', 'draw', 'anime', 'beauty', 'makeup', 'roof', 'floor',
                     'wall', 'corner', 'party', 'school', 'bake', 'dress']
    outdoor_markers = ['cycling', 'cycle', 'fish', 'fishing', 'transport', 'outdoor', 'outside', 'sun',
                      'sport', 'soccer', 'skate', 'football', 'snow', 'rain', 'wind', 'storm', 'amusement park',
                      'bike', 'weather', 'surf', 'ocean', 'tree', 'sky', 'run', 'farm', 'car', 'nature', 'resort',
                      'pool', 'street', 'ave', 'walk']

    def _mentions(markers, name, description):
        # True when any marker is a substring of the name or of the description.
        return any(marker in name or marker in description for marker in markers)

    weights = []
    for raw_name, raw_description in zip(df['Name'], df['WikiDescription']):
        # str() keeps missing values (NaN/None) from raising; lowercase once per row.
        name = str(raw_name).lower()
        description = str(raw_description).lower()

        if _mentions(indoor_markers, name, description):
            weight = 0.8
        elif _mentions(outdoor_markers, name, description):
            # Only reached when no indoor marker matched. The previous condition
            # `a or b and weight == 0.5` bound as `a or (b and ...)`, so an
            # outdoor marker in the name wrongly overrode an indoor match.
            weight = 0.2
        else:
            weight = 0.5
        weights.append(weight)

    df['weights'] = weights
    return df

In [None]:
# Add the 'weights' column to df_labels: the function assigns the column on the
# dataframe it receives and returns that same dataframe (displayed as cell output).
generate_weights(df_labels)

In [None]:
# Write the labels dataframe to a new CSV in the project's data folder.
# NOTE(review): no index= argument is passed, so pandas' default applies and the
# row index is written as the first column -- confirm that is wanted.
df_labels.to_csv((project_dir / 'data/vocabulary_with_weights.csv').as_posix())