
# **Proyecto Integrador - Avance 2. Ingeniería de características**
## **Tecnológico de Monterrey**
------------------------------------------------------------------
### Profa. Dra. Grettel Barceló Alonso

### Prof. Dr. Luis Eduardo Falcón Morales

### Profa. Verónica Sandra Guzmán de Valle
------------------------------------------------------------------
### Marcela Alejandra Rosales Jiménez - A01032022
### José Antonio Mendoza Castro - A01794067



### Instalación de librerías

In [None]:
pip install ipython

Collecting jedi>=0.16 (from ipython)
  Using cached jedi-0.19.1-py2.py3-none-any.whl.metadata (22 kB)
Using cached jedi-0.19.1-py2.py3-none-any.whl (1.6 MB)
Installing collected packages: jedi
Successfully installed jedi-0.19.1


In [1]:
!pip install -U get-video-properties

Collecting get-video-properties
  Downloading get_video_properties-0.1.1-py3-none-any.whl.metadata (1.6 kB)
Downloading get_video_properties-0.1.1-py3-none-any.whl (45.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.1/45.1 MB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: get-video-properties
Successfully installed get-video-properties-0.1.1


### Carpeta actual

In [2]:
import os

In [3]:
os.getcwd()

'/content'

### Librerias

In [4]:
from google.cloud import storage
from google.colab.patches import cv2_imshow

In [5]:
import pandas as pd
import cv2
from IPython.display import HTML
import matplotlib.pyplot as plt
import datetime
import random
import re
import json
import io
import pathlib
from base64 import b64encode
from videoprops import get_video_properties

In [241]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

### **Recoleccion de datos**

### Cuenta de servicio

In [6]:
credentials = 'project-team-13-8b8c41c85749.json'

In [7]:
client = storage.Client.from_service_account_json(credentials)

### Bucket

In [8]:
bucket_name = 'bucket-video-wts'

In [9]:
bucket = client.get_bucket(bucket_name)

### Lectura de datos

In [10]:
def make_folder(folder_name, base_path='/content/'):
  """
  Create a folder (and any missing parent folders) under a base directory.

  Args:
    folder_name: Relative path of the folder to create, e.g. 'dataset/train/'.
    base_path: Directory under which the folder is created. Defaults to
      '/content/', the location this notebook has always written to.

  Returns:
    None. Calling it again for an existing folder is a no-op.
  """
  folder_path = os.path.join(base_path, folder_name)
  os.makedirs(folder_path, exist_ok=True)

In [11]:
folder_ds = 'dataset'

In [12]:
make_folder(folder_ds)

In [13]:
list_folder = ['dataset/train/', 'dataset/val/', 'dataset/annotations/']

In [14]:
for folder in list_folder:
  make_folder(folder)

In [15]:
list_path = ['train/', 'val/', 'annotations/']

In [16]:
for path in list_path:
  # List every object stored under this prefix of the bucket.
  blobs = [blob.name for blob in bucket.list_blobs(prefix=path)]

  # Small inspection table of the listed object names.
  df_blobs = pd.DataFrame(blobs, columns=['file'])
  df_blobs['check'] = df_blobs['file'].apply(lambda name: int('content' in name))
  df_blobs['lenght'] = df_blobs['file'].apply(len)

  for blob in blobs:
    # Object names shorter than 24 characters are not downloaded
    # (presumably folder placeholders rather than real files - TODO confirm).
    if len(blob) >= 24:
      blob_tmp = bucket.blob(blob)
      # Files are stored flat: only the last path segment is kept as file name.
      blob_tmp.download_to_filename('/content/dataset/' + path + blob.split('/')[-1])

### **Funciones**

In [171]:
def get_all_files(files_path):
  """
  Recursively list every file found under a directory.

  Args:
    files_path: Root directory to walk (string or path-like).

  Returns:
    List of pathlib.Path objects, one per entry that is not a directory.
  """
  root = pathlib.Path(files_path)
  return [entry for entry in root.rglob("*") if not entry.is_dir()]

In [172]:
def get_listed_files_with_types(listed_files):
  """
  Group file paths by file extension.

  Args:
    listed_files: Iterable of pathlib path objects.

  Returns:
    Dict mapping each suffix (e.g. '.mp4'; '' for files without extension)
    to the list of matching paths converted to strings, in input order.
  """
  grouped = {}
  for path in listed_files:
    grouped.setdefault(path.suffix, []).append(str(path))
  return grouped

In [173]:
def get_cap_dataframe(overhead_view_list):
  """
  Build a DataFrame with basic OpenCV properties for each video.

  Args:
    overhead_view_list: Iterable of video file paths (strings).

  Returns:
    pandas.DataFrame with one row per distinct path and the columns
    'video_path', 'num_frames', 'frame_width', 'frame_height' and 'fps'.
    Every property is truncated with int(), so a fractional frame rate
    (e.g. 29.97) is stored as its integer part (29).
  """
  overhead_view_num_frames = {}

  for ff in overhead_view_list:
    cap = cv2.VideoCapture(ff)
    try:
      overhead_view_num_frames[ff] = [
          int(cap.get(cv2.CAP_PROP_FRAME_COUNT)),
          int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
          int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)),
          int(cap.get(cv2.CAP_PROP_FPS)),
      ]
    finally:
      # Release the capture explicitly instead of relying on garbage collection.
      cap.release()

  df_tmp = pd.DataFrame.from_dict(
      overhead_view_num_frames,
      orient='index',
      columns=['num_frames', 'frame_width', 'frame_height', 'fps'],
  )

  # The dict keys (video paths) become the index; expose them as a regular column.
  return df_tmp.reset_index().rename(columns={'index': 'video_path'})

In [174]:
def get_radom_file(file_list):
  """
  Pick one element at random from a list of files.

  Args:
    file_list: Sequence of candidate files.

  Returns:
    A randomly chosen element, or None when the sequence is empty or None.
  """
  return random.choice(file_list) if file_list else None

In [175]:
def play_video(video_path):

  """
  Build an inline HTML5 player for a video file.

  The whole file is read into memory and embedded in the returned HTML as a
  base64 data URL, so the notebook output grows with the video size.

  Args:
    video_path: Path of the video file to embed.

  Returns:
    IPython.display.HTML object containing a <video> element.
  """

  # Use a context manager so the file handle is closed right after reading.
  with open(video_path, 'rb') as video_file:
    video = video_file.read()
  video_data_url = 'data:video/mp4;base64,' + b64encode(video).decode()

  print('Video Name: ', video_path)

  return HTML(f"""
  <video width="640" height="480" controls>
    <source src="{video_data_url}" type="video/mp4">
  </video>
  """)

In [176]:
def get_annotation_path(video_path):
  """
  Derive the caption annotation path that corresponds to a video path.

  The path is cut right after its last 'T<digit>' token that is followed by
  an underscore, the 'train' or 'val' folder is swapped for 'annotations',
  and '_caption.json' is appended.

  Args:
    video_path: Path of a video (or of an annotation file itself).

  Returns:
    Path string of the matching '*_caption.json' file.

  Raises:
    AttributeError: If the path contains no 'T<digit>_' token, because the
      regular expression search then returns None.
  """
  prefix = re.search('.*T[0-9](?=_)', video_path).group(0)
  prefix = prefix.replace('train', 'annotations')
  # Validation videos share the same annotations folder; only a whole 'val'
  # path component is replaced so file names containing 'val' stay untouched.
  prefix = re.sub(r'(^|/)val(?=/)', r'\1annotations', prefix)
  annotation_path = prefix + '_caption.json'
  return annotation_path

In [177]:
def get_annotation(video_path):
  """
  Load the caption annotation that belongs to a video.

  Args:
    video_path: Path of the video; it is resolved to its annotation file
      with get_annotation_path.

  Returns:
    The parsed JSON content of the annotation file.
  """
  annotation_file = get_annotation_path(video_path)
  with open(annotation_file, 'r') as handle:
    return json.load(handle)

In [178]:
def get_video_duration_in_seconds(video_path):
  """
  Report the duration of a video in whole seconds.

  The duration is the frame count floor-divided by the frame rate, both as
  reported by OpenCV, so any fractional second is dropped.

  Args:
    video_path: Path of the video file.

  Returns:
    String of the form 'Duration in seconds: <value>'.

  Raises:
    ZeroDivisionError: If OpenCV reports a frame rate of 0.
  """

  cap = cv2.VideoCapture(video_path)
  try:
    fps = cap.get(cv2.CAP_PROP_FPS)
    totalNoFrames = cap.get(cv2.CAP_PROP_FRAME_COUNT)
  finally:
    # Release the capture explicitly instead of relying on garbage collection.
    cap.release()

  durationInSeconds = totalNoFrames // fps

  return 'Duration in seconds: ' +  str(durationInSeconds)

In [179]:
def get_video_characteristics(df_tmp):
  """
  Compute the average properties of a set of videos.

  Args:
    df_tmp: DataFrame with the columns 'frame_width', 'frame_height',
      'num_frames' and 'fps' (one row per video).

  Returns:
    Dict with the keys 'width_mean', 'height_mean', 'frames_mean',
    'fps_mean' and 'seconds_mean'; the duration of each video is taken as
    num_frames / fps.

  Raises:
    ZeroDivisionError: If the DataFrame has no rows.
  """

  def _mean(values):
    # Plain arithmetic mean of a non-empty list.
    return sum(values) / len(values)

  rows = [row for _, row in df_tmp.iterrows()]

  return {
      'width_mean': _mean([row['frame_width'] for row in rows]),
      'height_mean': _mean([row['frame_height'] for row in rows]),
      'frames_mean': _mean([row['num_frames'] for row in rows]),
      'fps_mean': _mean([row['fps'] for row in rows]),
      'seconds_mean': _mean([row['num_frames'] / row['fps'] for row in rows]),
  }

### **Analisis de datos**

In [180]:
dataset_path = f"/content/dataset"

### Cantidad y tipo de archivos a analizar

In [181]:
# Collect every file of the dataset and report how many there are in total.
listed_files = get_all_files(dataset_path)
print(f"Cantidad de archivos en total = {len(listed_files)}")

# Break the total down by file extension.
files_by_ext = get_listed_files_with_types(listed_files)

for extension, paths in files_by_ext.items():
    print(f" - Extension {extension} tiene {len(paths)}")

Cantidad de archivos en total = 1753
 - Extension .json tiene 944
 - Extension .mp4 tiene 809


### Cantidad de videos de entrenamiento y validación

In [182]:
files_overhead_view = []
files_overhead_view_by_type = {
    'train': [],
    'val': [],
}

for mp4 in files_by_ext['.mp4']:
    # Keep only overhead-view videos: skip the external folder and vehicle-view recordings.
    if f"{dataset_path}/external" in mp4 or "vehicle_view" in mp4:
        continue

    files_overhead_view.append(mp4)

    # The split is decided by a substring check on the whole path.
    if "train" in mp4:
        files_overhead_view_by_type['train'].append(mp4)
    elif "val" in mp4:
        files_overhead_view_by_type['val'].append(mp4)
    else:
        print(f"W: No train ni val. Que es? {mp4}")


print(f"Cantidad de archivos a utilizar = {len(files_overhead_view)}")
for k, v in files_overhead_view_by_type.items():
    print(f" - {k} tiene {len(v)}")

Cantidad de archivos a utilizar = 600
 - train tiene 403
 - val tiene 197


In [183]:
train_files_overhead_view = [f for f in  files_overhead_view if 'train' in f]

In [184]:
val_files_overhead_view = [f for f in  files_overhead_view if 'val' in f]

### Generacion de un pandas dataframe con las caracteristicas generales de los videos (frames number, width, height, fps)

In [185]:
df_train = get_cap_dataframe(train_files_overhead_view)

In [186]:
df_train.head()

Unnamed: 0,video_path,num_frames,frame_width,frame_height,fps
0,/content/dataset/train/20231013_114328_normal_...,1937,1920,1080,30
1,/content/dataset/train/20230728_4_CN32_T1_Came...,2296,1920,1080,30
2,/content/dataset/train/20231006_27_SN19_T3_192...,2211,1920,1080,30
3,/content/dataset/train/20231013_114328_normal_...,1898,1920,1080,30
4,/content/dataset/train/20231013_101813_normal_...,2250,1920,1080,29


In [187]:
df_val = get_cap_dataframe(val_files_overhead_view)

In [188]:
df_val.head()

Unnamed: 0,video_path,num_frames,frame_width,frame_height,fps
0,/content/dataset/val/20230929_13_SN25_T1_192.1...,2139,1920,1080,30
1,/content/dataset/val/20230929_65_SN4_T1_192.16...,2139,1920,1080,30
2,/content/dataset/val/20231013_114328_normal_19...,1888,1920,1080,30
3,/content/dataset/val/20230728_37_CN38_T1_Camer...,2231,1920,1080,30
4,/content/dataset/val/20231013_101845_normal_19...,2029,1920,1080,30


### Anotaciones de los videos de entrenamiento y validacion

In [189]:
# Every JSON file found in the dataset is treated as an annotation file.
annotations = list(files_by_ext['.json'])

print(f"Cantidad de archivos a utilizar = {len(annotations)}")

Cantidad de archivos a utilizar = 944


### Generacion de un pandas dataframe para la ruta de las anotaciones

In [190]:
df_annot = pd.DataFrame(annotations, columns=['annotation_path'])

In [191]:
df_annot.head()

Unnamed: 0,annotation_path
0,/content/dataset/annotations/20230929_54_SN38_...
1,/content/dataset/annotations/20230929_59_SN41_...
2,/content/dataset/annotations/20230728_37_CN38_...
3,/content/dataset/annotations/20230922_34_CN11_...
4,/content/dataset/annotations/20230929_20_CY20_...


### Ejecucion de un video aleatorio

In [192]:
video_path = get_radom_file(train_files_overhead_view)

In [None]:
play_video(video_path)

### Obtencion de la anotacion del video

In [193]:
get_annotation(video_path)

{'vehicle_view': '20230929_61_CN35_T1_vehicle_view.mp4',
 'event_phase': [{'labels': ['4'],
   'caption_pedestrian': "The pedestrian, a young man in his twenties, stands diagonally to the right of the vehicle, completely unaware of its presence. He is positioned directly in front of the car, close in distance. His body is motionless and his line of sight is immediately above. The pedestrian's attire consists of a black T-shirt and black slacks, matching the dark and cloudy weather. The road conditions are favorable, with the dry asphalt providing a level surface. The residential road intersects with a signal, indicating that both the vehicle and the pedestrian should exercise caution. Although the traffic volume is light, there are two-way lanes available. On this particular street, there are no sidewalks or roadside strips on both sides, but there are street lights illuminating the path. This simple event captures a snapshot of the pedestrian's surroundings and his lack of awareness t

In [194]:
get_video_duration_in_seconds(video_path)

'Duration in seconds: 73.0'

In [195]:
props = get_video_properties(video_path)

In [196]:
props

{'index': 0,
 'codec_name': 'h264',
 'codec_long_name': 'H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10',
 'profile': 'High',
 'codec_type': 'video',
 'codec_tag_string': 'avc1',
 'codec_tag': '0x31637661',
 'width': 1920,
 'height': 1080,
 'coded_width': 1920,
 'coded_height': 1080,
 'closed_captions': 0,
 'has_b_frames': 2,
 'sample_aspect_ratio': '1:1',
 'display_aspect_ratio': '16:9',
 'pix_fmt': 'yuvj420p',
 'level': 40,
 'color_range': 'pc',
 'chroma_location': 'left',
 'refs': 1,
 'is_avc': 'true',
 'nal_length_size': '4',
 'r_frame_rate': '30/1',
 'avg_frame_rate': '30/1',
 'time_base': '1/15360',
 'start_pts': 0,
 'start_time': '0.000000',
 'duration_ts': 1131008,
 'duration': '73.633333',
 'bit_rate': '9664631',
 'bits_per_raw_sample': '8',
 'nb_frames': '2209',
 'disposition': {'default': 1,
  'dub': 0,
  'original': 0,
  'comment': 0,
  'lyrics': 0,
  'karaoke': 0,
  'forced': 0,
  'hearing_impaired': 0,
  'visual_impaired': 0,
  'clean_effects': 0,
  'attached_pic': 0,
  'timed

### Estadisticas para los datos de entrenamiento

In [197]:
df_train.head(3)

Unnamed: 0,video_path,num_frames,frame_width,frame_height,fps
0,/content/dataset/train/20231013_114328_normal_...,1937,1920,1080,30
1,/content/dataset/train/20230728_4_CN32_T1_Came...,2296,1920,1080,30
2,/content/dataset/train/20231006_27_SN19_T3_192...,2211,1920,1080,30


In [198]:
get_video_characteristics(df_train)

{'width_mean': 1920.0,
 'height_mean': 1080.0,
 'frames_mean': 2239.2952853598017,
 'fps_mean': 29.98759305210918,
 'seconds_mean': 74.67660363366703}

### Estadisticas para los datos de validacion

In [199]:
df_val.head(3)

Unnamed: 0,video_path,num_frames,frame_width,frame_height,fps
0,/content/dataset/val/20230929_13_SN25_T1_192.1...,2139,1920,1080,30
1,/content/dataset/val/20230929_65_SN4_T1_192.16...,2139,1920,1080,30
2,/content/dataset/val/20231013_114328_normal_19...,1888,1920,1080,30


In [200]:
get_video_characteristics(df_val)

{'width_mean': 1923.4111675126903,
 'height_mean': 1084.3857868020305,
 'frames_mean': 2165.1522842639592,
 'fps_mean': 30.0,
 'seconds_mean': 72.17174280879857}

### **Ingenieria de caracteristicas**

In [None]:
def get_preprocessing_of_annotation(text):
  """
  Normalize an annotation caption before vectorization.

  The text is lowercased, every character that is neither a word character
  nor whitespace is removed, English stop words are dropped and each
  remaining token is lemmatized with WordNet.

  Args:
    text: Raw caption string.

  Returns:
    The cleaned tokens joined by single spaces.
  """
  # Lowercase first, then strip punctuation (note: hyphens vanish, so
  # 'two-way' becomes 'twoway').
  cleaned = re.sub(r'[^\w\s]', '', text.lower())

  stop_words = set(stopwords.words('english'))
  lemmatizer = WordNetLemmatizer()

  # Filter stop words and lemmatize the surviving tokens in a single pass.
  tokens = (
      lemmatizer.lemmatize(token)
      for token in cleaned.split()
      if token not in stop_words
  )

  return ' '.join(tokens)

In [None]:
def write_annotation_preprocess(df_tmp):
  """
  Write a preprocessed copy of every caption annotation file.

  For each path in the 'annotation_path' column that contains 'caption',
  the JSON file is loaded, the 'caption_pedestrian' and 'caption_vehicle'
  texts of every entry in 'event_phase' are cleaned with
  get_preprocessing_of_annotation, and the result is saved to the same path
  with 'annotations' replaced by 'annotations_preprocess'. The output file
  is written once per annotation, after all its phases were processed. The
  destination folder must already exist.

  Args:
    df_tmp: DataFrame with an 'annotation_path' column of file paths.

  Returns:
    None.
  """
  for annotation_path in df_tmp['annotation_path']:

    # Only caption files carry the text fields handled here.
    if 'caption' not in annotation_path:
      continue

    with open(annotation_path, 'r') as input_file:
      data_tmp = json.load(input_file)

    for phase in data_tmp['event_phase']:
      phase['caption_pedestrian'] = get_preprocessing_of_annotation(phase['caption_pedestrian'])
      phase['caption_vehicle'] = get_preprocessing_of_annotation(phase['caption_vehicle'])

    output_path = annotation_path.replace('annotations', 'annotations_preprocess')
    with open(output_path, 'w') as output_file:
      json.dump(data_tmp, output_file)

In [None]:
def feature_engineering(annotation_tmp, max_features=5000):
  """
  Turn a collection of texts into a TF-IDF feature matrix.

  Args:
    annotation_tmp: Iterable of preprocessed caption strings.
    max_features: Upper bound on the vocabulary size passed to
      TfidfVectorizer. Defaults to 5000, the value previously hard-coded.

  Returns:
    The TF-IDF matrix produced by the vectorizer, one row per input text.
  """
  vectorizer = TfidfVectorizer(max_features=max_features)

  # fit_transform learns the vocabulary and vectorizes in one step, which is
  # equivalent to calling fit followed by transform on the same data.
  feature_matrix = vectorizer.fit_transform(annotation_tmp)

  return feature_matrix

In [None]:
def clustering(feature_matrix, num_clusters=5):
  """
  Group the rows of a feature matrix with K-Means.

  Args:
    feature_matrix: Feature matrix with one row per sample.
    num_clusters: Number of clusters to form. Defaults to 5.

  Returns:
    The cluster label assigned to each row (the fitted model's labels_).
  """
  # random_state is fixed so repeated runs give the same assignment.
  model = KMeans(n_clusters=num_clusters, random_state=0)
  return model.fit(feature_matrix).labels_

In [201]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Se crea un nuevo folder para los datos pre-procesados

In [202]:
make_folder('dataset/annotations_preprocess')

### Se aplica el preprocesamiento y se escriben los archivos en el nuevo folder

In [223]:
write_annotation_preprocess(df_annot)

### Archivo sin procesamiento

In [224]:
get_annotation('dataset/annotations/20230707_12_SN17_T1_caption.json')

{'vehicle_view': '20230707_12_SN17_T1_vehicle_view.mp4',
 'event_phase': [{'labels': ['4'],
   'caption_pedestrian': 'The pedestrian, a male in his 30s approximately 170 cm tall, was wearing a black T-shirt and black slacks. It was a clear and bright day with dry road conditions on a residential road with two-way traffic. There were no sidewalks on both sides, and street lights were present. The pedestrian was positioned directly in front of a vehicle, facing the opposite direction. The pedestrian noticed the vehicle and was slowly moving in front of it. Suddenly, a collision occurred.',
   'caption_vehicle': 'The vehicle is positioned in front of a pedestrian, close in proximity. The vehicle has a clear field of view, as the pedestrian is visible. The vehicle is currently stopped and its speed is 0 km/h. The gender of the pedestrian is male, in his 30s with a height of 170 cm. He is wearing a black T-shirt on the upper body and black slacks on the lower body. The weather is clear and 

### Archivo con procesamiento

In [225]:
get_annotation('dataset/annotations_preprocess/20230707_12_SN17_T1_caption.json')

{'vehicle_view': '20230707_12_SN17_T1_vehicle_view.mp4',
 'event_phase': [{'labels': ['4'],
   'caption_pedestrian': 'pedestrian male 30 approximately 170 cm tall wearing black tshirt black slack clear bright day dry road condition residential road twoway traffic sidewalk side street light present pedestrian positioned directly front vehicle facing opposite direction pedestrian noticed vehicle slowly moving front suddenly collision occurred',
   'caption_vehicle': 'vehicle positioned front pedestrian close proximity vehicle clear field view pedestrian visible vehicle currently stopped speed 0 kmh gender pedestrian male 30 height 170 cm wearing black tshirt upper body black slack lower body weather clear brightness bright road surface condition dry road level asphalt traffic volume usual twoway residential road sidewalk side roadside strip street light present',
   'start_time': '9.476',
   'end_time': '14.017'},
  {'labels': ['3'],
   'caption_pedestrian': 'pedestrian male 30 stood per

In [230]:
folder_preprocess_path = 'dataset/annotations_preprocess/'

In [232]:
preprocessed_data = []

In [239]:
# Rebuild the list from scratch so re-running this cell does not append duplicates.
preprocessed_data = []

# os.listdir returns entries in arbitrary order; sorting keeps the corpus
# (and therefore the row order of the feature matrix) reproducible.
for filename in sorted(os.listdir(folder_preprocess_path)):
  if not filename.endswith('.json'):
    continue
  filepath = os.path.join(folder_preprocess_path, filename)
  with open(filepath, 'r') as f:
    data = json.load(f)
  # One corpus entry per event phase: its preprocessed pedestrian caption.
  for item in data['event_phase']:
    preprocessed_data.append(item['caption_pedestrian'])

In [242]:
feature_matrix = feature_engineering(preprocessed_data)

In [243]:
cluster_assigments = clustering(feature_matrix)

In [244]:
print(cluster_assigments)

[0 0 0 ... 3 3 3]
