<a href="https://colab.research.google.com/github/ngolla/video-captioning/blob/master/example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import files
from google.colab import drive
from pathlib import Path
import io
import pandas as pd
import numpy as np
import string
import time
from tqdm import tqdm
import pickle

In [2]:
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
def load_csv(file_name, drive_path):
  corpus_file=drive_path.joinpath(file_name)
  if(corpus_file.is_file()):
    corpus_df=pd.read_csv(str(corpus_file))
  else:
    uploaded=files.upload()
    corpus_df=pd.read_csv(io.BytesIO(uploaded[file_name]))
    corpus_df.to_csv(corpus_file)
  return corpus_df

In [4]:
drive_path=Path("/content/drive/My Drive")
annotations=load_csv('video_corpus.csv', drive_path)

In [5]:
def extractBy(title, value, dataset):
  return dataset[dataset[title]==value]

In [6]:
annotations=extractBy('Language','English', annotations)

In [7]:
def merge_rows(cols,df):
  df=df.assign(AnnotationID=df[cols].apply(lambda row: '_'.join(row.values.astype(str)), axis=1))
  return df

In [8]:
cols = ['VideoID', 'Start', 'End']
annotations=merge_rows(cols,annotations)

In [9]:
annotations=annotations[['AnnotationID','Description']]

In [10]:
annotations.shape

(85511, 2)

In [11]:
def drop_nullvalues(df):
  df=df.dropna()
  return df

In [12]:
#Drop null values
annotations=drop_nullvalues(annotations)

In [13]:
def convert_to_lower(data, title):
  return [x.lower() for x in data[title]]

In [14]:
annotations['Description'] = convert_to_lower(annotations, 'Description')

In [15]:
# To remove punctuations
def remove_punctuation(text_original):
    text_no_punctuation = text_original.translate(string.punctuation)
    return(text_no_punctuation)


# To remove single characters
def remove_single_character(text):
    text_len_more_than1 = ""
    for word in text.split():
        if len(word) > 1:
            text_len_more_than1 += " " + word
    return(text_len_more_than1)

# To remove numeric values
def remove_numeric(text,printTF=False):
    text_no_numeric = ""
    for word in text.split():
        isalpha = word.isalpha()
        if printTF:
            print("    {:10} : {:}".format(word,isalpha))
        if isalpha:
            text_no_numeric += " " + word
    return(text_no_numeric)
def text_clean(text_original):
    text = remove_punctuation(text_original)
    text = remove_single_character(text)
    text = remove_numeric(text)
    return(text)

In [16]:
for i, caption in tqdm(enumerate(annotations.Description.values),total=annotations.shape[0]):
    newcaption = text_clean(caption)
    annotations['Description'].iloc[i] = newcaption

100%|██████████| 85510/85510 [00:54<00:00, 1569.17it/s]


In [17]:
 def addPath(text_data,video_path):
   text_data['VideoPath'] = text_data.apply(lambda row: row['AnnotationID']+'.npy', axis=1)
   text_data['VideoPath'] = text_data['VideoPath'].map(lambda x: video_path.joinpath(x))
   text_data = text_data[text_data['VideoPath'].map(lambda x: x.is_file())]
   text_data = text_data[text_data['Description'].map(lambda x: isinstance(x, str))]
   unique_filenames = sorted(text_data['VideoPath'].unique())
   data = text_data[text_data['VideoPath'].map(lambda x: x in unique_filenames)]
   return data

In [18]:
video_path=drive_path.joinpath('VideoArrays')
annotations=addPath(annotations, video_path)

In [19]:
print(len(list(np.unique(annotations['VideoPath']))))
print(len(list(np.unique(annotations['AnnotationID']))))

1969
1969


In [20]:
annotations.shape

(80799, 3)

In [None]:
annotations[:10]

Unnamed: 0,AnnotationID,Description,VideoPath
18,mv89psg6zh4_33_46,bird in sink keeps getting under the running ...,/content/drive/My Drive/VideoArrays/mv89psg6zh...
19,mv89psg6zh4_33_46,bird is bathing in,/content/drive/My Drive/VideoArrays/mv89psg6zh...
20,mv89psg6zh4_33_46,bird is splashing around under running,/content/drive/My Drive/VideoArrays/mv89psg6zh...
21,mv89psg6zh4_33_46,bird is bathing in,/content/drive/My Drive/VideoArrays/mv89psg6zh...
22,mv89psg6zh4_33_46,bird is standing in sink drinking water that ...,/content/drive/My Drive/VideoArrays/mv89psg6zh...
23,mv89psg6zh4_33_46,faucet is running while bird stands in the sink,/content/drive/My Drive/VideoArrays/mv89psg6zh...
24,mv89psg6zh4_33_46,bird is playing in sink with running,/content/drive/My Drive/VideoArrays/mv89psg6zh...
25,mv89psg6zh4_33_46,bird is playing in tap,/content/drive/My Drive/VideoArrays/mv89psg6zh...
26,mv89psg6zh4_33_46,bird is bathing in the,/content/drive/My Drive/VideoArrays/mv89psg6zh...
27,mv89psg6zh4_33_46,bird is taking,/content/drive/My Drive/VideoArrays/mv89psg6zh...


In [22]:
annotations.to_csv(drive_path.joinpath('processed.csv'),index=False)

In [None]:
annotations_0=annotations[:20000]

In [None]:
annotations_0.shape

(20000, 3)

In [None]:
video_paths=np.unique(annotations_0.VideoPath)
len(video_paths)

489

In [None]:
def load_videos_as_numpy(video_paths):
  video_np={}
  for eachVideo in tqdm(np.unique(video_paths)):
    video_np[eachVideo.stem]=np.load(eachVideo)
  return video_np
video_np=load_videos_as_numpy(video_paths)
print(len(video_np))

100%|██████████| 489/489 [04:21<00:00,  1.87it/s]

489





In [None]:
def preprocess_captions(data):
  total_captions=[]
  for caption in tqdm(data['Description'].astype(str)):
    caption='<start> '+caption+' <end>'
    total_captions.append(caption)
  return total_captions
total_captions=preprocess_captions(annotations_0)

100%|██████████| 20000/20000 [00:00<00:00, 1215669.82it/s]


In [None]:
annotations_0['captions']=total_captions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
annotations_0.shape

(20000, 4)

In [None]:
keepindex = []
videos_dt=[]
for i, fnm in tqdm(enumerate(annotations_0.AnnotationID), total=annotations_0.shape[0]):
    if fnm in video_np.keys():
      videos_dt.append(video_np[fnm])
      keepindex.append(i)

total_captions = annotations_0["captions"].iloc[keepindex].values

100%|██████████| 20000/20000 [00:00<00:00, 663314.59it/s]


In [None]:
from keras.preprocessing.text import Tokenizer
# prepare tokenizer
#tokenizer = Tokenizer(
    #num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True,
    #split=' ', char_level=False, oov_token=None, document_count=0)
#tokenizer = Tokenizer(num_words=top_k, oov_token="<unk>", filters='!"#$%&()*+.,-/:;=?@[\]^_`{|}~ ')
tokenizer = Tokenizer(num_words=None, oov_token="<unk>", filters='!"#$%&()*+.,-/:;=?@[\]^_`{|}~ ')
tokenizer.fit_on_texts(total_captions)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

Vocabulary Size: 5001


In [None]:
# integer encode descriptions
sequences = tokenizer.texts_to_sequences(total_captions)

In [None]:
from keras.preprocessing.sequence import pad_sequences
# Pad each vector to the max_length of the captions
# If you do not provide a max_length value, pad_sequences calculates it automatically
cap_vector = pad_sequences(sequences, padding='post')
cap_vector[0].shape

(39,)

In [None]:
max_len=len(cap_vector[0])
max_len

39

In [None]:
print('Number of captions: ', str(len(total_captions)))
print('length of sequences : ',str(len(sequences)))
print('first sequence: ', sequences[0])
print('first sequence length : ', len(sequences[0]))
print('first caption: ', total_captions[0])
print('padding vector ', cap_vector.shape)
print('first caption padding vector ', cap_vector[0].shape)
print('Max len of single padding vector or number of rows ', max_len)

Number of captions:  20000
length of sequences :  20000
first sequence:  [2, 151, 9, 527, 2073, 146, 133, 5, 43, 45, 32, 3]
first sequence length :  12
first caption:  <start>  bird in sink keeps getting under the running water from <end>
padding vector  (20000, 39)
first caption padding vector  (39,)
Max len of single padding vector or number of rows  39


In [None]:
len(videos_dt), videos_dt[0].shape, videos_dt[0].ndim

(20000, (80, 4096), 2)

In [None]:
len(cap_vector), cap_vector[0].shape, cap_vector[0].ndim

(20000, (39,), 1)

In [None]:
# Scikit-learn includes many helpful utilities
from sklearn.model_selection import train_test_split
# Create training and validation sets using an 80-20 split
vid_train, vid_val, cap_train, cap_val = train_test_split(videos_dt,cap_vector,test_size=0.2,random_state=0)
len(vid_train), len(cap_train), len(vid_val), len(cap_val)
# Feel free to change these parameters according to your system's configuration

(16000, 16000, 4000, 4000)

In [None]:
outfile = open(drive_path.joinpath('train.dat'),'wb')
pickle.dump((vid_train, cap_train),outfile)
outfile.close()

In [None]:
outfile = open(drive_path.joinpath('test.dat'),'wb')
pickle.dump((vid_val, cap_val),outfile)
outfile.close()

In [None]:
outfile = open(drive_path.joinpath('tokens_word_index.txt'),'wb')
pickle.dump(tokenizer.word_index,outfile)
outfile.close()

In [None]:
outfile = open(drive_path.joinpath('tokens_index_word.txt'),'wb')
pickle.dump(tokenizer.index_word,outfile)
outfile.close()