In [1]:
!pip  install sk-video



distributed 1.21.8 requires msgpack, which is not installed.
You are using pip version 10.0.1, however version 18.1 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [37]:
import keras.backend as K
from keras.models import Sequential, Model
from keras.utils.data_utils import get_file
from keras.layers import Input, Conv3D, MaxPooling3D, Dense, Flatten, ZeroPadding3D, Dropout, Subtract, BatchNormalization
import skvideo.io
import numpy as np

# Base Model

Our base model is a C3D network: 3D convolutional layers from Conv1 to Pool5 followed by three fully connected layers (FC6, FC7, FC8), pre-trained on Sports-1M. We remove FC8 from the model after loading the weights.


In [3]:
# URL of the Sports-1M C3D weight file (HDF5) that base_model() downloads and loads.
WEIGHTS_PATH = 'https://github.com/adamcasson/c3d/releases/download/v0.1/sports1M_weights_tf.h5'
# URL of the C3D mean file (c3d_mean.npy). Not referenced by the cells visible here;
# presumably intended for input normalization — TODO confirm.
C3D_MEAN_PATH = 'https://github.com/adamcasson/c3d/releases/download/v0.1/c3d_mean.npy'

In [4]:
def base_model():
    """Build the pre-trained C3D network and return it without its ``fc8`` layer.

    The complete architecture (conv1 ... pool5, flatten, fc6, fc7, fc8) is
    assembled first so that the downloaded weight file can be loaded into it;
    the final 487-way softmax layer is then popped, which leaves the dropout
    that follows ``fc7`` as the last layer.

    Returns:
        The weight-loaded ``Sequential`` model with ``fc8`` removed.
    """
    # A clip is 16 frames of 112x112 RGB; the axis order follows the backend setting.
    channels_last = K.image_data_format() == 'channels_last'
    shape0 = (16, 112, 112, 3) if channels_last else (3, 16, 112, 112)

    def conv(filters, name, **extra):
        # Kernel-size-3 ReLU convolution with 'same' padding, used for every conv layer.
        return Conv3D(filters, 3, activation='relu', padding='same', name=name, **extra)

    def halving_pool(name):
        # 2x2x2 max pooling with stride 2 and 'valid' padding (pool2 .. pool5).
        return MaxPooling3D(pool_size=(2, 2, 2), strides=(2, 2, 2), padding='valid', name=name)

    # Layers are instantiated in network order so that the names Keras generates
    # automatically (zero padding, flatten, dropout) come out in the same sequence.
    stack = [
        conv(64, 'conv1', input_shape=shape0),
        # pool1 only pools spatially (temporal size and stride are 1).
        MaxPooling3D(pool_size=(1, 2, 2), strides=(1, 2, 2), padding='same', name='pool1'),

        conv(128, 'conv2'),
        halving_pool('pool2'),

        conv(256, 'conv3a'),
        conv(256, 'conv3b'),
        halving_pool('pool3'),

        conv(512, 'conv4a'),
        conv(512, 'conv4b'),
        halving_pool('pool4'),

        conv(512, 'conv5a'),
        conv(512, 'conv5b'),
        ZeroPadding3D(padding=(0, 1, 1)),
        halving_pool('pool5'),

        Flatten(),

        # The input_shape on fc6 is kept exactly as in the original definition.
        # NOTE(review): it looks like this is what allows fc6 to be re-used as the
        # first layer of another Sequential model later on — confirm before removing.
        Dense(4096, activation='relu', name='fc6', input_shape=(None, 8192)),
        Dropout(0.5),
        Dense(4096, activation='relu', name='fc7'),
        Dropout(0.5),
        Dense(487, activation='softmax', name='fc8'),
    ]

    model_base = Sequential()
    for layer in stack:
        model_base.add(layer)

    # Fetch the weight file (cached under the 'models' sub-directory, MD5-checked) and load it.
    weights_path = get_file('sports1M_weights_tf.h5',
                            WEIGHTS_PATH,
                            cache_subdir='models',
                            md5_hash='b7a93b2f9156ccbebe3ca24b41fc5402')
    model_base.load_weights(weights_path)

    # Drop the Sports-1M classifier (fc8).
    model_base.pop()

    return model_base




# Model the Temporal Consistency

We add a classification layer FC8 to our model, which predicts the class of the current window (background or action). We also add a second loss that measures the difference between the FC7 features of the current window and those of the next window.

In [5]:
# Two-input model: a classification head on the first window plus the difference
# between the backbone features of the first window and of the follow-up window.
model_base = base_model()

# Clip shape (16 frames of 112x112 RGB), ordered according to the backend setting.
if K.image_data_format() == 'channels_last':    
    shape0 = (16,112,112,3)    
else:   
    shape0 = (3,16,112,112)

start_window = Input(shape=shape0, dtype='float32', name='start_window')
followup_window = Input(shape=shape0, dtype='float32', name='followup_window')

# Backbone features of the first window. The same model_base instance is applied
# to both inputs, so its weights are shared between the two branches.
fc7 =model_base(start_window)

# Classification head: dropout followed by a 2-unit sigmoid layer named 'fc8'.
drop2 = Dropout(0.5)(fc7)
fc8 = Dense(2, activation='sigmoid', name='fc8')(drop2)


out1 = fc7
out2 = model_base(followup_window)

# Element-wise difference out1 - out2; the layer name 'out' is the key used when
# losses are attached per output at compile time.
out = Subtract(name='out')([out1, out2])

model_1 = Model([start_window,followup_window],[fc8,out])

model_1.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
start_window (InputLayer)       (None, 16, 112, 112, 0                                            
__________________________________________________________________________________________________
sequential_1 (Sequential)       (None, 4096)         77995776    start_window[0][0]               
                                                                 followup_window[0][0]            
__________________________________________________________________________________________________
followup_window (InputLayer)    (None, 16, 112, 112, 0                                            
__________________________________________________________________________________________________
dropout_3 (Dropout)             (None, 4096)         0           sequential_1[1][0]               
__________

In [6]:
def loss_classification(y_true, y_pred):
    """Negative mean log of the product ``y_pred . y_true^T``.

    Args:
        y_true: target tensor for the classification output.
        y_pred: predicted tensor for the classification output.

    Returns:
        Tensor holding ``-mean(log(dot(y_pred, transpose(y_true))))`` taken over
        the last axis.
    """
    # NOTE(review): the matrix product needs y_pred and y_true to have the same
    # last dimension, and it pairs every prediction row with every label row
    # (assuming both are (batch, classes)) — confirm this is the intended loss.
    scores = K.dot(y_pred, K.transpose(y_true))
    log_scores = K.log(scores)
    return -K.mean(log_scores, axis=-1)
    
def loss_temporal_consistency(y_true, y_pred):
    """Mean squared value of the product ``y_pred . y_true^T``.

    Args:
        y_true: target tensor for the feature-difference output.
        y_pred: predicted tensor for the feature-difference output.

    Returns:
        Tensor holding ``mean(square(dot(y_pred, transpose(y_true))))`` taken
        over the last axis.
    """
    # Use the y_pred argument supplied by Keras instead of the notebook-level
    # `out` tensor: the loss then no longer depends on a global being defined
    # and stays correct if it is attached to a different model or output.
    # NOTE(review): the matrix product pairs every prediction row with every
    # label row (assuming both are (batch, features)) — confirm this is intended.
    projected = K.dot(y_pred, K.transpose(y_true))
    return K.mean(K.square(projected), axis=-1)

In [7]:
# Attach one loss per named output ('fc8' = classification head, 'out' = feature
# difference); both terms are weighted equally in the total loss.
model_1.compile(
    optimizer='adam',
    loss=dict(fc8=loss_classification, out=loss_temporal_consistency),
    loss_weights=dict(fc8=1., out=1),
)

# Training phase 1 
We train our model_1 by minimizing $\mathcal{L}_{classification} + \lambda\mathcal{L}_{similarity}$

In [None]:
import random 
import matplotlib.pyplot as plt 


# NOTE(review): absolute, machine-specific paths — consider deriving them from a
# configurable data directory.
path1 = 'D:/workspace/MVA/ORCV/Final_Project/Data_Sets/UCF-Anomaly-Detection-Dataset/UCF_Crimes/Videos/Normal/Training_Normal_Videos_Anomaly/Normal_Videos001_x264.mp4'
path2 = 'D:/workspace/MVA/ORCV/Final_Project/Data_Sets/UCF-Anomaly-Detection-Dataset/UCF_Crimes/Videos/Abnormal/Abuse/Abuse001_x264.mp4'

# Read both videos with the same ffmpeg output options (bilinear scaling flag,
# output size 112x112). A fresh option dict is built for every call.
Normal_Video, Abnormal_Video = [
    skvideo.io.vread(video_path,
                     outputdict={
                       "-sws_flags": "bilinear",
                       "-s": "112x112"
                     })
    for video_path in (path1, path2)
]

Videos = [Normal_Video, Abnormal_Video]

# One entry per video, matched to `Videos` by position; -1 is used as the
# "no action" marker.
# NOTE(review): entry 0 ([230, 365]) is therefore paired with Normal_Video and the
# [-1, -1] entry with Abnormal_Video — confirm this ordering is intended.
action_intervals = [[230, 365], [-1,-1]]


# Let's train the model on two videos (extending this to more videos will be easy).
# We run just one training iteration, for which we build a single batch containing Nb_training_examples = Nb_abnormal*15

print("Constructing Postive Exapmles")

# positive_indexes[v] lists the first-frame index of every positive window taken
# from video v. A positive sample is a pair of consecutive 16-frame windows:
# `actuals` covers frames [first, first+16) and `nexts` covers [first+16, first+32),
# for the 15 values first = start-15 .. start-1 around each action start.
positive_indexes = []
positive_actuals = []
positive_nexts = []

for action, intervals in enumerate(action_intervals):
    window_starts = []
    # Intervals are stored flat as [start, end, start, end, ...]; only the start
    # frames (even positions) are used here.
    for start in intervals[0::2]:
        if start == -1:
            # -1 marks "no (further) action". The sentinel is tested on the value
            # about to be used, so a trailing -1 entry is never turned into windows.
            break
        video = Videos[action]
        for i in range(15):
            first = start - 15 + i
            # Skip windows that would fall outside the video: a negative index
            # would wrap around and a truncated slice would break stacking.
            if first < 0 or first + 32 > len(video):
                continue
            window_starts.append(first)
            positive_actuals.append(video[first:first + 16])
            positive_nexts.append(video[first + 16:first + 32])
    positive_indexes.append(window_starts)

if not positive_actuals:
    raise ValueError("No positive window could be built from action_intervals")

# Stack once at the end instead of growing the arrays with np.vstack per window.
actuals = np.stack(positive_actuals)
nexts = np.stack(positive_nexts)

nbr_positive = actuals.shape[0]
print("Constructing Negativve Exapmles")

# Draw as many negative window pairs as there are positive ones. Each negative is
# a random pair of consecutive 16-frame windows whose first frame is not the first
# frame of a positive window of the same video.
negative_actuals = []
negative_nexts = []

nbr_neg = 0 # number of negative examples selected
while nbr_neg < nbr_positive:
  video_indx = random.randint(0, len(Videos) - 1) # pick a video at random
  video = Videos[video_indx]
  # Largest admissible first frame: leaves room for the two 16-frame windows
  # (start_frame + 32 never exceeds the last frame index).
  last_start = video.shape[0] - 33

  start_frame = random.randint(0, last_start)
  while start_frame in positive_indexes[video_indx]:
    # Re-draw until the start frame is not one used by a positive window.
    start_frame = random.randint(0, last_start)
  nbr_neg += 1

  negative_actuals.append(video[start_frame:start_frame + 16])
  negative_nexts.append(video[start_frame + 16:start_frame + 32])

# Append all negatives in one concatenation instead of one np.vstack per sample.
if negative_actuals:
  actuals = np.concatenate([actuals, np.stack(negative_actuals)], axis=0)
  nexts = np.concatenate([nexts, np.stack(negative_nexts)], axis=0)
  
# Positive samples occupy the first nbr_positive rows of the batch, negatives follow.
n_samples = nbr_positive + nbr_neg

# Target for the classification output: 1 for positive rows, 0 for negative rows.
# NOTE(review): this has a single column — verify it against the shape of the
# model's classification output.
labels_1 = np.zeros((n_samples, 1))
labels_1[:nbr_positive, :] = 1

# Target for the feature-difference output: a 4096-wide row of ones for positive
# samples and of zeros for negative samples.
labels_2 = np.zeros((n_samples, 4096))
labels_2[:nbr_positive, :] = 1

inputs = [actuals, nexts]
labels = [labels_1, labels_2]

print("Start Training")
# Single gradient step on the whole batch; returns the loss value(s).
loss = model_1.train_on_batch(inputs, labels)

In [None]:
# Display the value(s) returned by model_1.train_on_batch.
loss

# Generate Hard Negative Samples via GAN



## Generator 

The generator takes a 100-dimensional noise vector as input. It is composed of two fully connected layers of 8192 units, each followed by batch normalization.

In [38]:
# Generator: 100-d noise -> two ReLU dense layers of 8192 units, each followed by
# batch normalization.
generator = Sequential([
    Dense(8192, activation='relu', name='fc1', input_shape=(100,)),
    BatchNormalization(),
    Dense(8192, activation='relu', name='fc2'),
    BatchNormalization(),
])

generator.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
fc1 (Dense)                  (None, 8192)              827392    
_________________________________________________________________
batch_normalization_1 (Batch (None, 8192)              32768     
_________________________________________________________________
fc2 (Dense)                  (None, 8192)              67117056  
_________________________________________________________________
batch_normalization_2 (Batch (None, 8192)              32768     
Total params: 68,009,984
Trainable params: 67,977,216
Non-trainable params: 32,768
_________________________________________________________________


## Convolutional Block and FC6+FC7 block

We create one block model containing all the convolutional layers of the base model up to pool5 (including the flatten layer), and a second model composed of the layers from fc6 to fc7.

In [11]:
# Split the shared backbone (layer 1 of model_1) into two Sequential models that
# re-use the same layer instances: everything up to and including the Flatten
# layer goes to conv_model, everything after it to fc6_fc7_model.
conv_model = Sequential()
fc6_fc7_model = Sequential()

# The split point is detected by layer type rather than by the auto-generated
# name 'flatten_1', which changes whenever the backbone is rebuilt in the same
# session.
past_flatten = False
for layer in model_1.get_layer(index=1).layers:
    if past_flatten:
        fc6_fc7_model.add(layer)
    else:
        conv_model.add(layer)
        past_flatten = isinstance(layer, Flatten)

# summary() prints by itself and returns None, so it is not wrapped in print().
conv_model.summary()

fc6_fc7_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1 (Conv3D)               (None, 16, 112, 112, 64)  5248      
_________________________________________________________________
pool1 (MaxPooling3D)         (None, 16, 56, 56, 64)    0         
_________________________________________________________________
conv2 (Conv3D)               (None, 16, 56, 56, 128)   221312    
_________________________________________________________________
pool2 (MaxPooling3D)         (None, 8, 28, 28, 128)    0         
_________________________________________________________________
conv3a (Conv3D)              (None, 8, 28, 28, 256)    884992    
_________________________________________________________________
conv3b (Conv3D)              (None, 8, 28, 28, 256)    1769728   
_________________________________________________________________
pool3 (MaxPooling3D)         (None, 4, 14, 14, 256)    0         
__________

## Discriminator

We use the blocks created above to build the discriminator.

In [40]:
# Clip shape (16 frames of 112x112 RGB), ordered according to the backend setting.
if K.image_data_format() == 'channels_last':    
    shape0 = (16,112,112,3)    
else:   
    shape0 = (3,16,112,112)

# Symbolic inputs: two video windows and the generator noise.
actual_window = Input(shape=shape0, dtype='float32', name='actual_window')
next_window = Input(shape=shape0, dtype='float32', name='next_window')
# NOTE(review): shape=(None, 100) adds an extra unknown-length axis, so the generator
# output is rank 3 — (None, None, 8192) — while the generator itself was declared
# with input_shape=(100,). Confirm the extra axis is intended.
noise = Input(shape=(None,100), dtype='float32', name='noise')

# Generator applied to the noise input.
generated_sample = generator(noise)


TensorShape([Dimension(None), Dimension(None), Dimension(8192)])