In [2]:
import shutil
import subprocess
import glob
from tqdm import tqdm
import numpy as np
import os
import argparse

import torch
from torch import nn
import torch.nn.functional as F
import pretrainedmodels
from pretrainedmodels import utils

C, H, W = 3, 224, 224


def extract_frames(video, dst):
    with open(os.devnull, "w") as ffmpeg_log:
        if os.path.exists(dst):
            print(" cleanup: " + dst + "/")
            shutil.rmtree(dst)
        os.makedirs(dst)
        video_to_frames_command = ["ffmpeg",
                                   # (optional) overwrite output file if it exists
                                   '-y',
                                   '-i', video,  # input file
                                   '-vf', "scale=400:300",  # input file
                                   '-qscale:v', "2",  # quality for JPEG
                                   '{0}/%06d.jpg'.format(dst)]
        subprocess.call(video_to_frames_command,
                        stdout=ffmpeg_log, stderr=ffmpeg_log)

        
#Sample some frames from video and then convert to rgb and then pass through model to get features 
#and the save them.

def extract_feats(params, model, load_image_fn):
    global C, H, W
    model.eval()

    dir_fc = params['output_dir']
    if not os.path.isdir(dir_fc):
        os.mkdir(dir_fc)
    print("save video feats to %s" % (dir_fc))
    video_list = glob.glob(os.path.join(params['video_path'], '*.mp4'))
    caption_path = 'data/captions/train.json'
    videos_json = json.load(open(path, 'r'))
    for video in tqdm(video_list):
        print("name: ",video)
        video_id = video.split("/")[-1].split(".")[0]
        print("id: ",video_id)
        captions = []
        if video_id in videos_json:
            captions = videos_json[video_id]['sentences']
        else:
            print("Caption not found for ",video_id)
        
        dst = params['model'] + '_' + video_id
        extract_frames(video, dst)

        image_list = sorted(glob.glob(os.path.join(dst, '*.jpg')))
        samples = np.round(np.linspace(
            0, len(image_list) - 1, params['n_frame_steps']))
        image_list = [image_list[int(sample)] for sample in samples]
        images = torch.zeros((len(image_list), C, H, W))
        for iImg in range(len(image_list)):
            img = load_image_fn(image_list[iImg])
            images[iImg] = img
        with torch.no_grad():
            fc_feats = model(images).squeeze()
            #fc_feats = model(images.cuda()).squeeze()
        img_feats = fc_feats.cpu().numpy()
        # Save the inception features
        outfile = os.path.join(dir_fc, video_id + '.npy')
        np.save(outfile, img_feats)
        # cleanup
        shutil.rmtree(dst)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--gpu", dest='gpu', type=str, default='0',
                        help='Set CUDA_VISIBLE_DEVICES environment variable, optional')
    parser.add_argument("--output_dir", dest='output_dir', type=str,
                        default='data/feats/resnet152', help='directory to store features')
    parser.add_argument("--n_frame_steps", dest='n_frame_steps', type=int, default=40,
                        help='how many frames to sampler per video')

    parser.add_argument("--video_path", dest='video_path', type=str,
                        default='data/train-video', help='path to video dataset')
    parser.add_argument("--model", dest="model", type=str, default='resnet152',
                        help='the CNN model you want to use to extract_feats')
    
    import sys
    sys.argv=['']
    del sys
    args = parser.parse_args()
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    params = vars(args)
    if params['model'] == 'inception_v3':
        C, H, W = 3, 299, 299
        model = pretrainedmodels.inceptionv3(pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)

    elif params['model'] == 'resnet152':
        C, H, W = 3, 224, 224
        model = pretrainedmodels.resnet152(pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)

    elif params['model'] == 'inception_v4':
        C, H, W = 3, 299, 299
        model = pretrainedmodels.inceptionv4(
            num_classes=1000, pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)

    else:
        print("doesn't support %s" % (params['model']))

    model.last_linear = utils.Identity()
    #model = nn.DataParallel(model)
    
    #model = model.cuda()
    extract_feats(params, model, load_image_fn)

  0%|          | 0/1 [00:00<?, ?it/s]

save video feats to data/feats/resnet152
name:  data/train-video/v_-AjZCBMb4qU.mp4
id:  v_-AjZCBMb4qU
 cleanup: resnet152_v_-AjZCBMb4qU/


100%|██████████| 1/1 [00:21<00:00, 21.52s/it]


In [3]:
a=np.load('data/feats/resnet152/v_-AjZCBMb4qU.npy')


In [7]:
import re
import json
import argparse
import numpy as np


def build_vocab(vids, params):
    count_thr = params['word_count_threshold']
    # count up the number of words
    counts = {}
    for vid, caps in vids.items():
        for cap in caps['captions']:
            ws = re.sub(r'[.!,;?]', ' ', cap).split()
            for w in ws:
                counts[w] = counts.get(w, 0) + 1
    # cw = sorted([(count, w) for w, count in counts.items()], reverse=True)
    total_words = sum(counts.values())
    bad_words = [w for w, n in counts.items() if n <= count_thr]
    vocab = [w for w, n in counts.items() if n > count_thr]
    bad_count = sum(counts[w] for w in bad_words)
    print('number of bad words: %d/%d = %.2f%%' %
          (len(bad_words), len(counts), len(bad_words) * 100.0 / len(counts)))
    print('number of words in vocab would be %d' % (len(vocab), ))
    print('number of UNKs: %d/%d = %.2f%%' %
          (bad_count, total_words, bad_count * 100.0 / total_words))
    # lets now produce the final annotations
    if bad_count > 0:
        # additional special UNK token we will use below to map infrequent words to
        print('inserting the special UNK token')
        vocab.append('<UNK>')
    for vid, caps in vids.items():
        caps = caps['captions']
        vids[vid]['final_captions'] = []
        for cap in caps:
            ws = re.sub(r'[.!,;?]', ' ', cap).split()
            caption = [
                '<sos>'] + [w if counts.get(w, 0) > count_thr else '<UNK>' for w in ws] + ['<eos>']
            vids[vid]['final_captions'].append(caption)
    return vocab


def main(params):
    videos = json.load(open(params['input_json'], 'r'))['sentences']
    video_caption = {}
    for i in videos:
        if i['video_id'] not in video_caption.keys():
            video_caption[i['video_id']] = {'captions': []}
        video_caption[i['video_id']]['captions'].append(i['caption'])
    # create the vocab
    vocab = build_vocab(video_caption, params)
    itow = {i + 2: w for i, w in enumerate(vocab)}
    wtoi = {w: i + 2 for i, w in enumerate(vocab)}  # inverse table
    wtoi['<eos>'] = 0
    itow[0] = '<eos>'
    wtoi['<sos>'] = 1
    itow[1] = '<sos>'

    out = {}
    out['ix_to_word'] = itow
    out['word_to_ix'] = wtoi
    out['videos'] = {'train': [], 'val': [], 'test': []}
    videos = json.load(open(params['input_json'], 'r'))['videos']
    for i in videos:
        out['videos'][i['split']].append(int(i['id']))
    json.dump(out, open(params['info_json'], 'w'))
    json.dump(video_caption, open(params['caption_json'], 'w'))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # input json
    parser.add_argument('--input_json', type=str, default='data/videodatainfo_2017.json',
                        help='msr_vtt videoinfo json')
    parser.add_argument('--info_json', default='data/info.json',
                        help='info about iw2word and word2ix')
    parser.add_argument('--caption_json', default='data/caption.json', help='caption json file')


    parser.add_argument('--word_count_threshold', default=1, type=int,
                        help='only words that occur more than this number of times will be put in vocab')

    args = parser.parse_args()
    params = vars(args)  # convert to ordinary dict
    main(params)

(40, 2048)

In [None]:
# import re
# import json
# import argparse
# import numpy as np


# def build_vocab(vids, params):
#     count_thr = params['word_count_threshold']
#     # count up the number of words
#     counts = {}
#     for vid, caps in vids.items():
#         for cap in caps['captions']:
#             ws = re.sub(r'[.!,;?]', ' ', cap).split()
#             for w in ws:
#                 counts[w] = counts.get(w, 0) + 1
#     # cw = sorted([(count, w) for w, count in counts.items()], reverse=True)
#     total_words = sum(counts.values())
#     bad_words = [w for w, n in counts.items() if n <= count_thr]
#     vocab = [w for w, n in counts.items() if n > count_thr]
#     bad_count = sum(counts[w] for w in bad_words)
#     print('number of bad words: %d/%d = %.2f%%' %
#           (len(bad_words), len(counts), len(bad_words) * 100.0 / len(counts)))
#     print('number of words in vocab would be %d' % (len(vocab), ))
#     print('number of UNKs: %d/%d = %.2f%%' %
#           (bad_count, total_words, bad_count * 100.0 / total_words))
#     # lets now produce the final annotations
#     if bad_count > 0:
#         # additional special UNK token we will use below to map infrequent words to
#         print('inserting the special UNK token')
#         vocab.append('<UNK>')
#     for vid, caps in vids.items():
#         caps = caps['captions']
#         vids[vid]['final_captions'] = []
#         for cap in caps:
#             ws = re.sub(r'[.!,;?]', ' ', cap).split()
#             caption = [
#                 '<sos>'] + [w if counts.get(w, 0) > count_thr else '<UNK>' for w in ws] + ['<eos>']
#             vids[vid]['final_captions'].append(caption)
#     return vocab


# def main(params):
#     videos = json.load(open(params['input_json'], 'r'))['sentences']
#     video_caption = {}
#     for i in videos:
#         if i['video_id'] not in video_caption.keys():
#             video_caption[i['video_id']] = {'captions': []}
#         video_caption[i['video_id']]['captions'].append(i['caption'])
#     # create the vocab
#     vocab = build_vocab(video_caption, params)
#     itow = {i + 2: w for i, w in enumerate(vocab)}
#     wtoi = {w: i + 2 for i, w in enumerate(vocab)}  # inverse table
#     wtoi['<eos>'] = 0
#     itow[0] = '<eos>'
#     wtoi['<sos>'] = 1
#     itow[1] = '<sos>'

#     out = {}
#     out['ix_to_word'] = itow
#     out['word_to_ix'] = wtoi
#     out['videos'] = {'train': [], 'val': [], 'test': []}
#     videos = json.load(open(params['input_json'], 'r'))['videos']
#     for i in videos:
#         out['videos'][i['split']].append(int(i['id']))
#     json.dump(out, open(params['info_json'], 'w'))
#     json.dump(video_caption, open(params['caption_json'], 'w'))


# if __name__ == "__main__":
#     parser = argparse.ArgumentParser()

#     # input json
#     parser.add_argument('--input_json', type=str, default='data/activity_net.v1-2.min.json',
#                         help='msr_vtt videoinfo json')
#     parser.add_argument('--info_json', default='data/info.json',
#                         help='info about iw2word and word2ix')
#     parser.add_argument('--caption_json', default='data/captions/train.json', help='caption json file')


#     parser.add_argument('--word_count_threshold', default=1, type=int,
#                         help='only words that occur more than this number of times will be put in vocab')

#     args = parser.parse_args()
#     params = vars(args)  # convert to ordinary dict
#     main(params)

In [2]:
import re
import json
import argparse
import numpy as np


def build_vocab(vids, params):
    count_thr = params['word_count_threshold']
    # count up the number of words
    counts = {}
    for vid, caps in vids.items():
        for cap in caps['captions']:
            print("caption: ",cap)
            ws = re.sub(r'[.!,;?]', ' ', cap).split()
            for w in ws:
                counts[w] = counts.get(w, 0) + 1
    # cw = sorted([(count, w) for w, count in counts.items()], reverse=True)
    total_words = sum(counts.values())
    bad_words = [w for w, n in counts.items() if n <= count_thr]
    vocab = [w for w, n in counts.items() if n > count_thr]
    bad_count = sum(counts[w] for w in bad_words)
    print('number of bad words: %d/%d = %.2f%%' %
          (len(bad_words), len(counts), len(bad_words) * 100.0 / len(counts)))
    print('number of words in vocab would be %d' % (len(vocab), ))
    print('number of UNKs: %d/%d = %.2f%%' %
          (bad_count, total_words, bad_count * 100.0 / total_words))
    # lets now produce the final annotations
    if bad_count > 0:
        # additional special UNK token we will use below to map infrequent words to
        print('inserting the special UNK token')
        vocab.append('<UNK>')
    for vid, caps in vids.items():
        caps = caps['captions']
        vids[vid]['final_captions'] = []
        for cap in caps:
            ws = re.sub(r'[.!,;?]', ' ', cap).split()
            caption = [
                '<sos>'] + [w if counts.get(w, 0) > count_thr else '<UNK>' for w in ws] + ['<eos>']
            vids[vid]['final_captions'].append(caption)
    return vocab


def main(params):
    videos = json.load(open(params['input_json'], 'r'))['database']
    video_caption = {}
    captions = json.load(open('data/captions/train.json', 'r'))
    for i in captions:
        if i not in video_caption.keys():
            video_caption[i] = {'captions': []}    
        video_caption[i]['captions']= captions[i]['sentences']
    # create the vocab
    vocab = build_vocab(video_caption, params)
    itow = {i + 2: w for i, w in enumerate(vocab)}
    wtoi = {w: i + 2 for i, w in enumerate(vocab)}  # inverse table
    wtoi['<eos>'] = 0
    itow[0] = '<eos>'
    wtoi['<sos>'] = 1
    itow[1] = '<sos>'

    out = {}
    out['ix_to_word'] = itow
    out['word_to_ix'] = wtoi
    out['videos'] = {'training': [], 'validation': [], 'testing': []}
    for i in videos:
        out['videos'][videos[i]['subset']].append(i)
    json.dump(out, open(params['info_json'], 'w'))
    json.dump(video_caption, open(params['caption_json'], 'w'))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # input json
    parser.add_argument('--input_json', type=str, default='data/activity_net.v1-2.min.json',
                        help='msr_vtt videoinfo json')
    parser.add_argument('--info_json', default='data/info.json',
                        help='info about iw2word and word2ix')
    parser.add_argument('--caption_json', default='data/captions/out_captions.json', help='caption json file')


    parser.add_argument('--word_count_threshold', default=1, type=int,
                        help='only words that occur more than this number of times will be put in vocab')
    
    import sys
    sys.argv=['']
    del sys
    args = parser.parse_args()
    params = vars(args)  # convert to ordinary dict
    main(params)

caption:  A young woman is seen standing in a room and leads into her dancing.
caption:   The girl dances around the room while the camera captures her movements.
caption:   She continues dancing around the room and ends by laying on the floor.
caption:  The video starts with a title logo sequence.
caption:   A man and woman are in a living room demonstrating exercises.
caption:   The woman lays on the ground.
caption:   The man starts pointing to different areas of the woman's body as she does an exercise.
caption:   The woman begins to do small sit ups.
caption:   The woman ends with a final title logo sequence.
caption:  Two people are seen moving around a kitchen quickly performing various tasks and sitting down.
caption:   They then wax down a ski in the kitchen while continuing to move around.
caption:  We see a hallway with a wooden floor.
caption:   A dog in socks walks slowly out onto the floor as a lady films him.
caption:   The dog turns around and goes back to the other roo

caption:   There are several images of her from behind with very long hair.
caption:   We then see numerous images of her with her hair cut off to shoulder length.
caption:  A man in a red coat is outside on a snowboard.
caption:   He starts going down a course and doing some really cool tricks while moving.
caption:   Some men are rubbing down their boards and another gets on the field and starts doing some flips and stuff in mid air.
caption:   These are all pretty experienced people on the boards, they are even boarding through water and having a good time.
caption:  woman is standing talking to the camera holding a microphone and interviewing voleyball players.
caption:   men are standing in a sandy field being interviewed.
caption:   men are in sandy field showing he girl how to play beach soccer.
caption:   woman is sin front of the net talking to the camera.
caption:  A man gets pumped up to do a long jump then does it.
caption:    Others go next and try to get the crowd pumped 

caption:    One of the boys puts on a football helmet and stands on the side lines catching the ball.
caption:  On a track, a runner gets ready to perform.
caption:   He runs and skips along the way, jumping high at the end.
caption:   He walks along the track after being finished.
caption:   The man runs and jumps again.
caption:   The runner talks to someone.
caption:  Three men wearing snowboards get out a ski lift, and a man tie his shoe.
caption:   Then, the men ski down a hill covered with snow.
caption:   A man bend his legs on the snow, then continue skiing with other men.
caption:  A man plays the bagpipes on stage while wearing a Scottish skirt.
caption:   Then, the man ends playing and holds the bagpipes on his left arm.
caption:  A woman is on her hands and knees cutting grass with some scissors.
caption:   The camera pans out to give a full view of her cutting the grass.
caption:   She makes a joke about the grass having split ends and bad roots.
caption:   The camera goes

caption:   Two guys sit on their dirt bike and discuss.
caption:   One guy uses his rear tire to cause dirt to spray.
caption:   The credits of the video are shown.
caption:  A black book called Windmills is on the screen.
caption:    The camera zooms into a dancer.
caption:    VincaniTV comes up on the screen.
caption:    the instructor is talking.
caption:    The instructor is sitting on the floor introducing the move.
caption:    A shot of the move is shown, then the instructor shows how to do it.
caption:    He pushes  himself up on his hand and does another move.
caption:    He is on one knee to show another move.
caption:   The screen goes to another shot of the same move but faster this time.
caption:    The instructor is standing and talking.
caption:    The screen ends with a VincaniTV promo.
caption:  A woman is standing in a camp site.
caption:   She lights matches and throws them into a pile of wood.
caption:  A large group of people are seen playing a game of volley ball w

caption:  Two girls practice fencing in a school gym with protective gear on and a coach against the wall watching and coaching them as they fence.
caption:    The two girls talk to the coach and then begin to fence with each other as the coach motions to where they should stand and navigate the space during the spar.
caption:    Both girls end the spar and remove their head gear as they talk to the coach.
caption:  A woman in a green sweater puts on a face mask.
caption:   She paints a wooden fence.
caption:   She puts news paper onto the fence.
caption:   She pours a bucket of paint into a sprayer.
caption:   She starts spraying the fence with a hose.
caption:  Two men are talking in front of a helicopter.
caption:   A man on a wake board is being pulled by a helicopter in and out of the water.
caption:   He drops from the rope and pulls a parachute and lands on the ground.
caption:  A man is seated on a bench in a park.
caption:   He is playing an old accordian.
caption:   He pushes

caption:  A girl with two braids going down her back is standing outside.
caption:   She turns toward the camera, and her hair is now down.
caption:   She lays in a stylist chair, showing how the style was done.
caption:  The title appears and fades out.
caption:   We see a man pulling weight close to his head.
caption:   Three men pass by in the mirror.
caption:   The camera zooms in on the main subject.
caption:   A man in gray walks past in the mirror.
caption:   The screen goes black and we see the end credits.
caption:  A bottle of deep cleansing gel is shown.
caption:   A woman holds up the bottle, Then is shown using several different solutions that she lines up together on the counter.
caption:   She uses each on her face, then rinses them off.
caption:  We see a blue title screen with white letters.
caption:   We see two girls standing in a room, then perform a flip.
caption:   We see 2 different girls do different flips.
caption:   One girl does a back flip.
caption:   We the

caption:   The man scrubs the shoes down with a rag while continuing to speak to the camera.
caption:   In the end he holds up the shoes while still speaking and giving the camera a thumbs up.
caption:  A couple of teams are on an open field.
caption:   They are playing lacrosse together.
caption:   They run around, trying to hit the ball.
caption:  A small group of children are seen running around an indoor field kicking a soccer ball.
caption:   The kids move up and down in between poles while still kicking the ball.
caption:   The kids are then shown kicking balls into a goal one after the other.
caption:  A girl is standing in the back of a room that has paper air ballon, and she begins to dance moving her legs back and forth and slightly moving her arms.
caption:   The girl then picks it up and begins to move more of her arms along with her legs and dances in a small area in the room while facing the camera.
caption:  The girl then turns to her side and we see her dancing from the

caption:   He shows how to deal each hand, and how to dispense the chips to the winners.
caption:  A male gymnast hops up onto bars and begins performing a gymnastics routine.
caption:   He swings around the bars over and over again while the judges watch and critique.
caption:   The man eventually jumps off and the video fades away into an effect.
caption:  A man wearing boxing gloves is seen hitting a bag in an empty room.
caption:   He continuously hits the bag over and over again while pausing in between punches to catch his breath.
caption:  Two bikers are riding their motor bikes through rough terrain.
caption:   They are dressed in biking gear with protective helmets and clothing.
caption:  They are going on hilly slopes and uneven terrain on their motorbikes.
caption:   They go through extremely steep and dangerous dirt roads at high speed.
caption:   They also go over ramps created in the dirt road.
caption:   They continue biking through the dirt at high speed as they create 

caption:   The sponsor of the clips is introduced and displayed on the screen.
caption:  There's a man wearing a white graphic shirt, sitting with bongo drums drumming a beat on the drums.
caption:   He is wearing headphones as he drums a beat on the drums.
caption:   He continues playing the drums.
caption:   After a while he changes the beat after pausing for a little time.
caption:  Then continues playing another beat on the drums.
caption:   Then he stops for a little bit and rearranges the drums to play again.
caption:   Another person is drumming beats on an electrical drum set that has several drums.
caption:  A man is seen walking over to a young girl who stands on a stage and begins performing martial arts.
caption:   She performs several flips and tricks and bows to a man in the end while everyone claps for her.
caption:  The man in black shirt and blue jeans is rolling an old red carpet and vacuum the floor.
caption:   He then unroll the new gray carpet, install it by cuttin

caption:  We see the title screens for the video.
caption:   We see a person skiing.
caption:   We switch to a workshop and a list of tools.
caption:   We see a man adds a band on a ski and the ski sits on a contraption.
caption:   A man brushes and rubs the ski.
caption:   We see colorful wax and a man turning a dial.
caption:   The man pours wax and we get instructions before he irons the wax on the ski.
caption:   The man scrapes the wax and brushes the ski.
caption:   We get a ski fact and the video end card shows.
caption:  A person is seen throwing a frisbee off into the distance and a dog chasing after it.
caption:   Several shots are shown of dogs running after frisbees after their owner throws them.
caption:   More dogs are seen chasing after the frisbees while the owner watches from behind.
caption:  A man jumps onto bars and starts swinging on them.
caption:   He does a flip off them and lands on a mat.
caption:  A medical processional puts a device and needle in someones fa

caption:  A man stands outside holding a bottle of sun screen and talking.
caption:   He sprays the sun screen on his arm and rubs it in.
caption:  A man wearing a blue shirt throws a bowling ball down a bowling alley.
caption:    The ball strikes both pins and bounces into the adjoining alley striking the pin down there too.
caption:    A seated man in a red shirt shakes his head.
caption:    The man in the blue shirt walks back.
caption:  An introduction comes onto the screen for a video about the winter sport curling.
caption:   Curlers ar shown as they take different shot and sweep down the ice.
caption:   The video ends with the closing credits and graphics.
caption:  A man scrapes snow off of the side of a car.
caption:   He then scrapes snow off the hood.
caption:   He then moves to the other side to remove more snow.
caption:   Finally he finishes clearing the car.
caption:  A little baby boy is sitting in his high chair dipping a fork into a glass of chocolate milk being held 

caption:  A teen girl sits on the floor holding a shoe.
caption:   She is then shown wiping the shoe down with a cleaning solution.
caption:   She shows the shoe close up after she is done.
caption:  A man is standing outside of a large window and begins to clean it with a long stick.
caption:  One the window is wet,he takes the razor like stick and drags it up and down getting the water off.
caption:  After the two top windows are complete,he leaves the other two untouched and walks away.
caption:  A man bends at the knees in front of a barbell.
caption:   He lifts it slowly, bringing it to his chest.
caption:   He pauses, then lifts it over his head before dropping it back to the ground.
caption:  A man is standing on a ledge of a tower.
caption:   He jumps off and bungee jumps.
caption:   It is shown in slow motion.
caption:  The camera pans around several men drinking  a large container of alcohol and the camera panning back to the beginning.
caption:   The camera continues capturi

caption:   The man does a 90 degree move around as he holds the ball against his neck.
caption:   The man holds the ball against his neck as he does a full 180 degree rotation with his body.
caption:   He pretends to hold the ball as he rotates his body into a 270 degree movement.
caption:  A man is seen speaking to the camera and leads into him holding several objects out on a table.
caption:   The man then sharpens an object on a board and finishes by brushing off the board.
caption:  A man is standing up playing a saxophone.
caption:   A band is playing behind him.
caption:   Blue and white words come onto the screen.
caption:  We see the opening screen on gray.
caption:   We then see a man on discs smoothing a concrete floor.
caption:   The man removes concrete from his trowel.
caption:   The man uses his hand to remove concrete.
caption:   We then see the ending credits.
caption:  The boy is opening the box.
caption:   He then removes the bike.
caption:   He is removing the packag

caption:   The man shaves off part of his leg and holds up an object while walking away.
caption:  A black screen with a white banner at the bottom has various information that include a website, state, phone number and logo.
caption:  A white screen follows and it includes a logo, name of the products and indicates that this is an "INSTRUCTIONAL VIDEO".
caption:  Another white screen appears and a content list appears and there 1-5.
caption:  On the next white screen a cardboard box is shown and then it begins to show the many different contents that come in the box, along with the names of them next to it.
caption:  A woman is now standing in a hallway as she puts together her cleaning tool by screwing the top portion onto her pole and applying a cloth to the end, then she straps her spray bottle onto her hip.
caption:   The woman then begins to spray the cloth on the cleaning tool and starts cleaning various different areas in the office including windows, elevator doors, TVs and ch

caption:   We see shots of men riding outdoors and another gate opens.
caption:   We see a bike laying on it's side.
caption:  A intro begins and shows a man holding a shovel and talking to the camera.
caption:   He gradually rakes leaves and talks to the camera and shows several people also helping picking up leaves.
caption:   Different people are interviewed on camera while several others are shown raking up the leaves.
caption:   A man is seen sitting in his car and another puts his gloves on.
caption:   The camera pans over the raked up leaves while several others discuss their hard work.
caption:  We see two girls one sitting the other bending over.
caption:   The bending girl stands up.
caption:   We see a person painting the sitting girls toe nails.
caption:  little kid is in a parking playing hopscotch next to cars.
caption:   man wearing shorts is standing next to the hopscotch game.
caption:   litle blond kid is standing in street playing in font of a man.
caption:  A large 

caption:   The child and a different man sit in a canoe while the man paddles.
caption:   The camera pans to show the first man in the foreground.
caption:   The child and the first man are in a canoe while the first man paddles.
caption:   A series of still images of the various people shown during the video are shown.
caption:  Two people are seen hosting a news segment that leads into clips of people scraping off their cars.
caption:   A woman is seen speaking to the camera while scraping off her car and holding up products to help.
caption:   More people are seen speaking to the camera and showing off their methods of melting ice and leading back to the two reporters.
caption:  A young female gymnast wearing red prepares for her balance beam routine in the Olympics.
caption:   The female gymnast runs to the springboard and jumps from it to the balance beam.
caption:   The female gymnast then does her tumbling routine on the balance beam while commentators explain what she is doing 

caption:   The man places the yellow pot and the saucepan on the stove.
caption:   The man grabs the pot's lid and covers the pot.
caption:   Back at the table, the man removes the lid from the pot and removes the leaves.
caption:   The man mashes the potatoes.
caption:   The man throws in a pinch of salt onto the potatoes.
caption:   The man resumes mashing the potatoes.
caption:   The man pours in the milk from the saucepan into the pot.
caption:   The man mashes the potatoes some more.
caption:   The man grabs some pepper and throws it onto the potatoes.
caption:   The man adds some more milk.
caption:   The man returns to mashing the potatoes and stirring them as they have thoroughly become soft.
caption:   The man pauses for a second to throw in some water and resumes to stirring the potatoes.
caption:   The finished mashed potatoes dish is left on a counter.
caption:  A close up of a board with shown with several pucks sitting around the side.
caption:   Suddenly another puck is 

caption:  A large group of people are seen standing at the bottom of a snowy hill speaking to one another.
caption:   The people are then seen riding in tubes and going down a snowy trail.
caption:   Several shots are shown of people riding down the mountain while looking and smiling to the camera.
caption:  A seated man is using a sharpener to sharpen a large knife.
caption:   He scrapes it along the surface and underbelly of the knife, sharpening it.
caption:  The person is riding a blue single raft.
caption:   The man is paddling through the strong current of water.
caption:   The man fell in the small falls.
caption:  A video of back country skiing is shown.
caption:    The group prepares its equipment and then looks at its yurt.
caption:    They then show skiing videos.
caption:  Several performers walk out of a box and it ends up being one male and one female.
caption:  The lady then begins moving in strange ways and continuing telling a story in ballerina form.
caption:  After a

caption:   He pulls the handlebars back and fourth as the seat he's moving on continuously moves.
caption:   He performs this set several times and eventually pauses to let the instruction end.
caption:  A bald man in gray shirt is standing at the playground, he walked towards the monkey bars and started to pull himself up and down, he walked towards a lower pole and started to pull himself up while his legs are stretched down.
caption:   He walked back to the monkey bar and pull himself up and down, his legs are moves from side to side then pull him up and down.
caption:  kids are playing hurling in a large green field.
caption:   kids are sanding al in front of the goal to make a free shot.
caption:   kids are posing for a picture in the green field.
caption:  A woman is talking while holding a saxophone.
caption:   She begins to play the saxophone.
caption:  The credits of the clip are shown.
caption:   A guy raises a metal ball above his head, spins with a metal ball, and releases 

caption:   A group of band students practice inside a classroom and gym.
caption:   The school band  plays during outdoor events and sporting games.
caption:  We see a newscaster in a studio.
caption:   We see a man operating 2 manual vacuum cleaners.
caption:   We see some old style vacuums.
caption:   We see a black and white TV clip.
caption:   We see the man vacuum and still from the museum.
caption:   We see a lady in black and white and in color.
caption:   We then see a factory and a man talking.
caption:   We see the man talking and see him near a vacuum.
caption:   We see an old print ad.
caption:   We return to the newscaster in the studio.
caption:  A woman is seen speaking to the camera and leads into clips of her and another riding horses in a forest and beach.
caption:   The women then run the horses along the beach while the camera captures them from several angles.
caption:  Two men are seen doing a sumo move in a large circle surrounded by others.
caption:   One man pr

caption:   The man continues pushing the debris off the roof while digging deeper and deeper into the mess.
caption:  A person brush the teeth of a dog while holding the back neck.
caption:   After, she taps the chest of the dog and put toothpaste on the brush and continue brushing the dog's teeth.
caption:   After, the person tap again the the chest, put toothpaste on the brush and brush the dog's teeth.
caption:   When the person finish to brush, she kiss the dog.
caption:  A woman is talking to the camera surrounded by alcohol bottles.
caption:   The woman begins pouring different alcohols and lemon juice into a glass of ice.
caption:   The woman shakes the drink mixture and strains it into another glass.
caption:   The woman puts a small straw into the drink and holds it up and smiles.
caption:  A curler prepares to make a curling shot while being watch be other people in the arena.
caption:   She makes the shot and lets go of the puck as the other team members chase it and sweep t

caption:  A man walks into a gym and bends in front of  a weight.
caption:   the man tries to lift the weight two times and drops it.
caption:   The man lifts the weight three times and walks away.
caption:   The man lifts the weight two times.
caption:   the man lifts the weight three times and walks away.
caption:   The man lifts the weight two tines and removes his weight lifting belt.
caption:  A woman is seen speaking to the camera and leads into her sanding down a board in several locations.
caption:   She then sands more objects and shows off a large selection of tools she has.
caption:   She hammers down the sides of the board and ends with her painting the board.
caption:  We see rolling credits rise up the screen.
caption:   We see three people riding stationary bikes in a gym.
caption:   The camera moves closer to the bikers.
caption:   The lady stop and adjusts something on her bike.
caption:   We wee the fourth rider in the scene.
caption:   The lady is riding her bike slo

caption:   The girl starts throwing darts, she is only able to hit one balloon, she asks for help but does not get it.
caption:  A person is rowing at the edge of a boat.
caption:   The person takes out a fishing hook.
caption:   The person starts fishing at his edge.
caption:  A woman is seen performing a belly dancing routine on a large stage with other people watching on the sides.
caption:   The woman continues dancing and spinning around the stage and ends with her posing and walking off.
caption:  grandma is talking with a little blond child sitting in a coach while is knitting in a red thread in a liing room.
caption:   child holds the thread and run in the room pulling it and the woman is talking to him, the child grabs the thread from the floor and run in the living room.
caption:  A man stands on a tennis court preparing to serve the ball.
caption:   The man hits the tennis ball across the court.
caption:   The man turns and walks back to his basket and grabs a ball from his 

caption:  A man stands in a bathroom in front of a mirror, holding a razor.
caption:   The man begins speaking to the camera while showing the razor and its various parts.
caption:   Soon he begins shaving his face while looking in the mirror.
caption:   Afterward, he resumes speaking to the camera and showing how the razor performed.
caption:   The man places the razor on a charging stand which shows a light illuminating, and then picks it back up.
caption:   The video ends with the man touching his face to demonstrate the smooth job.
caption:  Two men demonstrate jump rope stunts, tricks and techniques in an auditorium in front of a group of students.
caption:    A man in a grey track suit performs jump rope tricks in front of a group of students.
caption:    The man in the grey track suit finishes and the children clap.
caption:     A second man walks beside the man and the two men begin to perform more tricks and stunts including backflips and dance moves during the demonstration.


caption:   The kids kick their feet around and show one instructor helping a child.
caption:   The man continues to swim around the pool with children and close ups of children.
caption:  A little boy is pushing a mop.
caption:   He closes an oven door that is open.
caption:   He continues to mop the floor.
caption:   He picks the mop up and puts it in the air.
caption:   He sets it down and continues mopping.
caption:  Someone is shown driving down a snowy street in the Ukraine.
caption:   Several skiers are skiing down the slopes on the side of a mountain.
caption:   A close up is seen of some of the skiers as they go.
caption:  A person is seen riding around on a dirt bike that leads into him speaking to the camera.
caption:   He shows off his bike as well as camera and leads into several clips of him riding around.
caption:   He continues to ride around on the track while also showing close ups of himself as well as the camera.
caption:  A young man is seen playing a single drum st

caption:   We we see the vehicle back and and go back and forth blowing the leaves.
caption:   The cameraman stops the vehicle and gets out.
caption:   We see the full vehicle from the front and see the field.
caption:  A marching band plays in a parade while people watch stand on the sidewalk.
caption:   The marching band pass on front a carnival and a green field.
caption:   Then, the marching band arrives to a city.
caption:   An old man holding a trophy walks on front the marching band.
caption:   After, the marching band arrives to a bus station.
caption:   A bond lady applaud, hug people and talks.
caption:  An intro is shown with a drink being poured into a glass and a caption about drink mixing.
caption:   A girl begins to make a drink called a Mai Tai.
caption:   She takes out all the things she will need to make the drink.
caption:   She gets some ice into a glass and into the shaker.
caption:   She adds in the rum, triple sec, syrup, sour mix, and puts the lid on the shaker.

caption:   The woman cuts the wallpaper with a box cutter.
caption:   The woman paints the wallpaper with paint.
caption:  A title screen appears and then several paragraphs begin to roll over the screen.
caption:  Next,a man with average sized curly black hair dressed in a suit begins to talk.
caption:  When he is finished,a man is shown playing the saxophone at a night time television show and then the man comes back to finish talking.
caption:  There's a man wearing s white shirt playing the drum set in a room.
caption:   He begins by playing the snare drum and the bass drum.
caption:   Then he continues playing by hitting the cymbals and the hi-hat.
caption:   He stops for a bit and then resumes playing the drum set.
caption:   He plays rhythmically as he picks up speed and plays the drum set.
caption:   Then he stops and gets up to turn off the camera.
caption:  We see an opening title screen.
caption:   A shirtless man talks in a gym.
caption:   A lady runs past both ways then ju

caption:   Close ups of her feet are shown as well as several other angles of her running.
caption:  A woman with very long hair is standing on a balcony showing off the length of her hair.
caption:   She turns around and faces the camera continuing to brush her hair and display its length.
caption:   She begins to twirl and sway her hair with her back to the camera and that is when the video ends.
caption:  A person is seen unscrewing a bottle of mouth wash and pouring it into a cup.
caption:   The girl then drinks from it and swishes it around her mouth.
caption:   She spits out the mixture and shows off pictures in the end.
caption:  A compilation of shot put is put together, you see many different men getting prepared.
caption:   They turn their bodies very quickly building up as much speed as they can before taking their shot.
caption:   They practice and they do it professionally, it a serious sport that people really enjoy yo be a part of.
caption:   They tend to get very excite

caption:   Finally the driveway is cleared of the leaves.
caption:  We see an opening title screen.
caption:   We see a man in an orange shirt twirl and spin to throw the hammer over 80km .
caption:   Next we see a man in a blue shirt throw 73km.
caption:   We see a man in a white shirt throw 75km.
caption:   The man in orange goes again and throws 81km.
caption:  A man sits on top of a camel.
caption:   The camel stands up and a man in front leads it.
caption:   A red barn is seen behind them.
caption:   The camel sits back down and the man gets off.
caption:  A woman is seen standing in front of buckets and speaking to the camera.
caption:   She pours water into a bucket while dragging it over as well as mixing cleaning into it and beginning to dump clothes.
caption:   The woman dips the clothes into the bucket and shows where she hangs them and waving to the camera.
caption:  A man carefully washes the side of a car with a sponge.
caption:   The man stands up and washes the windshie

caption:   The man reaches the house and turns to finish the other side of the walkway.
caption:   We see a person spraying an item on their sidewalk.
caption:   We then see two closing screens.
caption:  Two men are in an inside court.
caption:   They play a game of racquetball together.
caption:   They take turns hitting the ball back and forth against the wall.
caption:  A woman is seen putting her hair up in a pony tail with a man watching her on the side.
caption:   She then plays the piano while smiling  and the man continuing to watch her.
caption:  People are paddling in kayaks in a body of water.
caption:   A man takes off in a green kayak.
caption:   A man in a red jacket is talking to the camera.
caption:  A woman and a dog are playing on the grass with a frisbee.
caption:   The dog has the frisbee in its mouth.
caption:   The woman and the dog are playing with a rope.
caption:  At the beach, a group of soccer balls lie next to a man who is kicking them at the goal, protecte

caption:   The first woman pulls items out of a cabinet and places them on the floor.
caption:   A fourth individual is briefly shown in the doorway.
caption:  Two people are on a white slab and they begin fencing one another.
caption:  There are constant stops and the boy closest to the camera keeps winning.
caption:  Both of them keep being reset but the outcome is no different and the lady on the side keeps walking back and forth clicking a black button.
caption:  A man is shown speaking to the camera and leads into a woman trimming the fur on a dog.
caption:   The speaks more to the camera while other dogs are shown groomed and one woman walks away holding two excited dogs.
caption:   More shots of dogs held are shown while the man speaks and the camera pans around the building.
caption:  A logo shows that this video is presented by Expert Village.
caption:   A man in a small blue boat discusses the topic of hand surfing in a hole and gives hints on how to perform this activity.
ca

caption:   He is holding a large steel ball.
caption:   He spins and throws the ball, then walks around the track waiting for his next turn.
caption:  Several outside views of a casino are shown.
caption:   Card players and the dealer at a table are shown engaging in a game inside the casino.
caption:   The dealer deals cards to each player.
caption:   The dealer reveals the dealer's hand.
caption:   One man talks to the camera while another man stands nearby.
caption:  There's a little boy sitting on a bed in the bedroom.
caption:   There is a laundry basket and a lot of clothes on the bed.
caption:   The mother of the boy who is filming the video shows a vacuum cleaner.
caption:   Then she turns the vacuum cleaner on and begins vacuuming the bedroom floor.
caption:   The boy watches his mom vacuum as he jumps on the bed.
caption:   the boy then lays down on the bed as he sees his mother finish the job.
caption:   Then the boy gets off the bed and jumps down to the ground to look unde

In [34]:
import re
import json
import argparse
import numpy as np
path = 'data/captions/train.json'
videos_json = json.load(open(path, 'r'))
#videos_json.keys()
# type(videos_json['v_-AjZCBMb4qU']['sentences'])
for i,j in videos_json.items():
    print(i, j)
    #print(videos_json['v_-AjZCBMb4qU'])


# videos_json['v_-AjZCBMb4qU']['timestamps'][0]

v_QOlSCBRmfWY {'duration': 82.73, 'timestamps': [[0.83, 19.86], [17.37, 60.81], [56.26, 79.42]], 'sentences': ['A young woman is seen standing in a room and leads into her dancing.', ' The girl dances around the room while the camera captures her movements.', ' She continues dancing around the room and ends by laying on the floor.']}
v_ehGHCYKzyZ8 {'duration': 61.72, 'timestamps': [[0, 2.78], [3.09, 61.72], [15.43, 55.24], [17.59, 54], [39.81, 54.62], [56.47, 61.72]], 'sentences': ['The video starts with a title logo sequence.', ' A man and woman are in a living room demonstrating exercises.', ' The woman lays on the ground.', " The man starts pointing to different areas of the woman's body as she does an exercise.", ' The woman begins to do small sit ups.', ' The woman ends with a final title logo sequence.']}
v_nwznKOuZM7w {'duration': 31.65, 'timestamps': [[0, 15.51], [11.39, 31.65]], 'sentences': ['Two people are seen moving around a kitchen quickly performing various tasks and sit

v_xKDHIyd_tWA {'duration': 111.02000000000001, 'timestamps': [[0, 5], [5, 86.04], [5, 85.48], [60.5, 64.94], [86.59, 111.02]], 'sentences': ['The video begins with a title slide.', ' A woman and child are in a park with a jump rope.', ' The woman begins to demonstrate how to use the jump rope while the child attempts to copy her.', ' At one point, another woman in the background runs across the screen while jump roping.', 'The video ends with both women speaking to the camera and a final title sequence.']}
v_-v9YLmGCYO0 {'duration': 72.26, 'timestamps': [[0, 72.26], [0, 45.89], [45.16, 72.26]], 'sentences': ['A baby is sitting in a swing.', ' The baby is laughing and shaking its feet.', ' The baby turns to the side of the swing.']}
v_5kBKAfEX7XA {'duration': 45.65, 'timestamps': [[0, 25.79], [13.47, 43.6]], 'sentences': ['An athlete is seen walking up to a set of uneven bars and begins performing a gymnastics routine on the bars.', ' He continues moving around on the bars and ends by j

v_81F42Yyw_iY {'duration': 66.67, 'timestamps': [[11, 55], [51.33, 58.67], [59, 66.67]], 'sentences': ['A woman pole vaults over a tall bar.', ' She lands on a yellow mat under her.', ' Words come onto the screen at the end.']}
v_LZleSe6Kovg {'duration': 27.75, 'timestamps': [[0, 27.75], [3.47, 24.42], [24.7, 27.75]], 'sentences': ['A man is wearing a white robe with a black belt.', ' He starts doing karate moves in a room.', ' He stands up right at the end.']}
v_3cjtV-ldvto {'duration': 212.14, 'timestamps': [[0, 16.97], [16.97, 43.49], [43.49, 59.4], [59.4, 149.56], [149.56, 169.71], [170.77, 212.14]], 'sentences': ['woman put an egg on a bowl and mix it with vanilla.', ' quick oats and baking powder, cocoa powder and salt are mixed in a bowl.', ' a chocolate bar are cut in pieces with nuts and put in a bowl.', ' sugar and butter are mixed in a bowl till cream and put the vanilla and the powder ingredients with the chocolate and he nuts.', ' themix is wrap and put in a refrigerator.'

v_XqxJsWQqKRk {'duration': 173.29, 'timestamps': [[0, 14.73], [14.73, 23.39], [25.13, 33.79], [33.79, 55.45], [52.85, 57.19], [57.19, 64.98], [64.98, 71.92], [71.92, 122.17], [122.17, 123.04], [123.04, 135.17], [135.17, 173.29]], 'sentences': ['A white screen with blue and white shapes and black words flash across the screen.', " A smiling black man is in a residential kitchen and he's wearing a santa hat and pouring ingredients into a large bowl.", ' A hand pushes a clear bowl filled with dark figs and a cutting board appears immediately afterwards and the instructions on the screen say to cut the stems off of 1lb of figs and quarter them.', ' While the list of ingredients are shown that include: 1lb Figs, 1lb Raisins, 1lb Cherries, 1lb Currants, 1lb Prunes, Port Wine and Angostura Bitters; they are all poured into a white bucket.', ' Stirring begins immediately and the instructions on the screen say to let it soak for at least two weeks to up to 30 days.', ' 30 days later the content

v_zp86ztwZEKk {'duration': 88.42, 'timestamps': [[0, 13.26], [13.26, 19.45], [13.26, 88.42]], 'sentences': ['A talking man dressed in athletic gear is standing on a very green field and the words on the bottom left say his name is Neil Macmillan and his title is Head Coach.', 'A large group of girls dressed in athletic gear are shown playing land hockey, and the people not playing are dressed for cold weather.', ' Clips of the man continue to play and rotate from showing the man and the girls playing land hockey until they are done playing.']}
v_cNCkHqOnJV0 {'duration': 234.26, 'timestamps': [[0, 78.48], [49.19, 179.21], [122.98, 223.71]], 'sentences': ['The sky is shown that leads into a man speaking to the camera with his friend.', ' Several people are then seen sitting in tubes riding down a river past one another.', ' The people continuing riding down the river through tunnels and paths while the man still speaks to the camera.']}
v_FwV1XbjLJHY {'duration': 113.36, 'timestamps': [[

v_DDZFQKi1v2U {'duration': 112.34, 'timestamps': [[0, 28.08], [30.89, 82.57], [79.2, 108.97]], 'sentences': ['A woman is seen laying on the couch with another girl sitting above her.', ' The girl is then shown piercing the nipple of the girl laying down.', ' She finishes the piercing and ends by smiling to the camera.']}
v_FXN6qiDsClw {'duration': 87.38, 'timestamps': [[0, 20.1], [20.1, 31.02], [31.46, 44.13], [44.13, 74.27], [74.27, 80.39], [79.95, 87.38]], 'sentences': ['An animation demonstrating the arc length of a welding type tool emitting heat.', ' A demonstration of the tool emitting heat in real time.', ' A graph illustrating the Long Arc of the tool documenting Voltage and Amperage.', ' The tool is shown in real time emitting heat with a photo of a welded material.', ' A graph illustrates Short Arc with the Voltage and Amperage documented.', ' A man in a blue coat and orange gloves uses the arc tool.']}
v_hS4L3PMfYqA {'duration': 111.32, 'timestamps': [[11.13, 17.81], [17.81,

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


In [40]:
path = 'data/activity_net.v1-2.min.json'
videos = json.load(open(path, 'r'))
videos['database']

{'---9CpRcKoU': {'annotations': [{'label': 'Drinking beer',
    'segment': [0.01, 12.64441]}],
  'duration': 14.07,
  'resolution': '320x240',
  'subset': 'training',
  'url': 'https://www.youtube.com/watch?v=---9CpRcKoU'},
 '--0edUL8zmA': {'annotations': [{'label': 'Dodgeball',
    'segment': [5.46484, 86.71838]}],
  'duration': 92.18,
  'resolution': '640x480',
  'subset': 'training',
  'url': 'https://www.youtube.com/watch?v=--0edUL8zmA'},
 '--1DO2V4K74': {'annotations': [{'label': 'Rock climbing',
    'segment': [30.02588, 205.23186]}],
  'duration': 211.53,
  'resolution': '640x480',
  'subset': 'validation',
  'url': 'https://www.youtube.com/watch?v=--1DO2V4K74'},
 '--6bJUbfpnQ': {'annotations': [{'label': 'Drinking beer',
    'segment': [2.57876, 24.9141]}],
  'duration': 26.75,
  'resolution': '640x480',
  'subset': 'validation',
  'url': 'https://www.youtube.com/watch?v=--6bJUbfpnQ'},
 '-0r0HEwAYiQ': {'annotations': [{'label': 'Vacuuming floor',
    'segment': [27.52112, 39.94