# Exploratory Data Analysis

In [1]:
import os
import glob
import pandas as pd
import cv2 as cv

In [2]:
DATA_DIR = "../../../../datasets/public/BIOMETRICS"
PORN_DIR = f"{DATA_DIR}/pornography-database/data"
PORN_2K_DIR = f"{DATA_DIR}/pornography-2k-db/data/original"

In [31]:
def get_video_specs(video_path):
  video = cv.VideoCapture(video_path)

  _, fname = os.path.split(video_path)
  fname = fname.split(".")[0]
  label = 'non-porn' if 'NonPorn' in fname else 'porn'

  frame_width = int(video.get(cv.CAP_PROP_FRAME_WIDTH))    
  frame_height = int(video.get(cv.CAP_PROP_FRAME_HEIGHT))

  fps = video.get(cv.CAP_PROP_FPS)
  frame_count = int(video.get(cv.CAP_PROP_FRAME_COUNT))
  duration = frame_count/fps

  video.release()

  return { 
    "fname": fname, 
    "label": label, 
    "frame_count": frame_count, 
    "frame_width": frame_width, 
    "frame_height": frame_height,
    "duration": duration 
  }

## Pornography-800 Dataset

In [15]:
p800_videos_dir = f"{PORN_DIR}/Database"
p800_frames_dir = f"{PORN_DIR}/Frames"
p800_segments_dir = f"{PORN_DIR}/Segments"

dirs = ["vNonPornDifficulty", "vNonPornEasy", "vPorn"]

In [35]:
p800_videos_data = {
  "fname": [],
  "label": [],
  "frame_count": [],
  "frame_width": [],
  "frame_height": [],
  "duration_s": []
}

p800_frames_data = {
  "id": [],
  "video": []
}

p800_segments_data = {
  "id": [],
  "video": [],
  "duration_s": []
}

for dir in dirs:
  for video in os.listdir(os.path.join(p800_videos_dir, dir)):
    if video.startswith("."): continue

    specs = get_video_specs(os.path.join(p800_videos_dir, dir, video))
    
    p800_videos_data["fname"].append(specs["fname"])
    p800_videos_data["label"].append(specs["label"])
    p800_videos_data["frame_count"].append(specs["frame_count"])
    p800_videos_data["frame_width"].append(specs["frame_width"])
    p800_videos_data["frame_height"].append(specs["frame_height"])
    p800_videos_data["duration_s"].append(specs["duration"])

  for frame in os.listdir(os.path.join(p800_frames_dir, dir)):
    if frame.startswith("."): continue

    frame_video, frame_id = frame.split(".")[0].split("#")
    
    p800_frames_data["id"].append(int(frame_id))
    p800_frames_data["video"].append(frame_video)

  for segment in os.listdir(os.path.join(p800_segments_dir, dir)):
    if segment.startswith("."): continue

    specs = get_video_specs(os.path.join(p800_segments_dir, dir, segment))
    segment_video, segment_id = specs["fname"].split("#")

    p800_segments_data["id"].append(int(segment_id))
    p800_segments_data["video"].append(segment_video)
    p800_segments_data["duration_s"].append(specs["duration"])

p800_videos = pd.DataFrame(p800_videos_data)
p800_frames = pd.DataFrame(p800_frames_data)
p800_segments = pd.DataFrame(p800_segments_data)

In [36]:
p800_videos["duration_m"] = p800_videos["duration_s"] / 60
p800_videos["duration_h"] = p800_videos["duration_s"] / 3600

p800_segments["duration_m"] = p800_segments["duration_s"] / 60
p800_segments["duration_h"] = p800_segments["duration_s"] / 3600

In [37]:
p800_videos

Unnamed: 0,fname,label,frame_count,frame_width,frame_height,duration_s,duration_m,duration_h
0,vNonPorn1076,non-porn,12170,320,240,406.072333,6.767872,0.112798
1,vNonPorn1079,non-porn,1587,480,360,52.900000,0.881667,0.014694
2,vNonPorn1112,non-porn,727,320,240,48.466667,0.807778,0.013463
3,vNonPorn1039,non-porn,7278,320,240,243.275766,4.054596,0.067577
4,vNonPorn789,non-porn,18250,320,238,610.027855,10.167131,0.169452
...,...,...,...,...,...,...,...,...
795,vPorn264,porn,7980,584,448,532.532000,8.875533,0.147926
796,vPorn68,porn,3721,520,390,124.157367,2.069289,0.034488
797,vPorn79,porn,33051,480,360,1322.040000,22.034000,0.367233
798,vPorn370,porn,3837,584,336,128.027900,2.133798,0.035563


In [79]:
print("Are there any duplicate 'fname' values in p800_videos?", "NO" if p800_videos["fname"].is_unique else "YES")

Are there any duplicate 'fname' values in p800_videos? NO


In [57]:
p800_frames = p800_frames.sort_values(by=["video","id"])
p800_frames

Unnamed: 0,id,video
4431,1,vNonPorn1
8964,2,vNonPorn1
10305,3,vNonPorn1
4972,4,vNonPorn1
7421,5,vNonPorn1
...,...,...
16693,3,vPorn98
16600,4,vPorn98
15973,5,vPorn98
10676,6,vPorn98


In [78]:
print("Total number of videos in p800_videos: ", p800_videos.nunique()["fname"])
print("Total number of videos in p800_frames: ", p800_frames.nunique()["video"])

# TODO: number is not equal, check difference

Total number of videos in p800_videos:  800
Total number of videos in p800_frames:  802


In [58]:
video_frames = p800_frames.groupby("video")["id"].apply(list)
video_frames

video
vNonPorn1                                      [1, 2, 3, 4, 5, 6]
vNonPorn10                                                    [1]
vNonPorn100     [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...
vNonPorn1006    [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...
vNonPorn1007          [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
                                      ...                        
vPorn94                                              [1, 2, 3, 4]
vPorn95                                  [1, 2, 3, 4, 5, 6, 7, 8]
vPorn96                                     [1, 2, 3, 4, 5, 6, 7]
vPorn97                                           [1, 2, 3, 4, 5]
vPorn98                                     [1, 2, 3, 4, 5, 6, 7]
Name: id, Length: 802, dtype: object

In [63]:
# TODO: only after checking why the number of videos in both dfs is not the same

aux = p800_videos.merge(video_frames, how="left", left_on="fname", right_on="video")

In [39]:
p800_segments

Unnamed: 0,id,video,duration_s,duration_m,duration_h
0,21,vNonPorn1010,10.920000,0.182000,0.003033
1,39,vNonPorn1010,2.280000,0.038000,0.000633
2,40,vNonPorn1010,32.280000,0.538000,0.008967
3,42,vNonPorn1013,1.080000,0.018000,0.000300
4,27,vNonPorn1025,2.520000,0.042000,0.000700
...,...,...,...,...,...
16722,30,vPorn111,4.240000,0.070667,0.001178
16723,5,vPorn422,5.160000,0.086000,0.001433
16724,1,vPorn168,7.280000,0.121333,0.002022
16725,7,vPorn231,2.920000,0.048667,0.000811


In [40]:
p800_videos.describe()

Unnamed: 0,frame_count,frame_width,frame_height,duration_s,duration_m,duration_h
count,800.0,800.0,800.0,800.0,800.0,800.0
mean,9914.8775,428.42,308.41125,348.483919,5.808065,0.096801
std,11170.35545,128.234038,92.478551,378.036673,6.300611,0.10501
min,44.0,128.0,88.0,7.8078,0.13013,0.002169
25%,2731.25,320.0,240.0,104.042758,1.734046,0.028901
50%,5818.0,400.0,240.0,213.44,3.557333,0.059289
75%,12132.0,520.0,390.0,420.595,7.009917,0.116832
max,60450.0,1280.0,720.0,2020.612813,33.67688,0.561281


In [42]:
p800_segments.describe()

Unnamed: 0,id,duration_s,duration_m,duration_h
count,16727.0,16727.0,16727.0,16727.0
mean,34.408561,19.43833,0.323972,0.0054
std,39.828583,61.849805,1.03083,0.017181
min,1.0,0.04,0.000667,1.1e-05
25%,8.0,2.16,0.036,0.0006
50%,21.0,4.76,0.079333,0.001322
75%,46.0,12.5,0.208333,0.003472
max,311.0,2087.12,34.785333,0.579756


In [43]:
p800_porn = p800_videos[p800_videos["label"] == "porn"]
p800_porn.describe()

Unnamed: 0,frame_count,frame_width,frame_height,duration_s,duration_m,duration_h
count,400.0,400.0,400.0,400.0,400.0,400.0
mean,14790.88,513.21,369.8025,511.50804,8.525134,0.142086
std,13456.506298,103.086872,76.603418,454.697606,7.578293,0.126305
min,253.0,256.0,208.0,10.12,0.168667,0.002811
25%,4942.75,480.0,336.0,179.695,2.994917,0.049915
50%,8758.0,520.0,390.0,301.101393,5.018357,0.083639
75%,22483.25,584.0,438.0,750.685759,12.511429,0.208524
max,60450.0,720.0,576.0,2020.612813,33.67688,0.561281


In [44]:
p800_non_porn = p800_videos[p800_videos["label"] == "non-porn"]
p800_non_porn.describe()

Unnamed: 0,frame_count,frame_width,frame_height,duration_s,duration_m,duration_h
count,400.0,400.0,400.0,400.0,400.0,400.0
mean,5038.875,343.63,247.02,185.459799,3.090997,0.051517
std,4595.526979,88.811722,60.837922,161.693444,2.694891,0.044915
min,44.0,128.0,88.0,7.8078,0.13013,0.002169
25%,1611.0,320.0,233.5,63.165,1.05275,0.017546
50%,3761.0,320.0,240.0,145.77785,2.429631,0.040494
75%,7004.0,320.0,240.0,258.612755,4.310213,0.071837
max,29612.0,1280.0,720.0,1216.791667,20.279861,0.337998


In [50]:
print(f"{p800_porn['duration_h'].sum():.2f} hours ({p800_porn['duration_m'].sum():.2f} minutes) of pornographic content")
print(f"{p800_non_porn['duration_h'].sum():.2f} hours ({p800_non_porn['duration_m'].sum():.2f} minutes) of non-pornographic content")

56.83 hours (3410.05 minutes) of pornographic content
20.61 hours (1236.40 minutes) of non-pornographic content


## Pornography-2k Dataset

In [None]:
video_paths = glob.glob(f"{PORN_2K_DIR}/*.mp4")

p2k_data = {
  "fname": [],
  "label": [],
  "frame_count": [],
  "frame_width": [],
  "frame_height": [],
  "duration_s": [],
}

for video_path in video_paths:
  specs = get_video_specs(video_path)

  p2k_data["fname"].append(specs["fname"])
  p2k_data["label"].append(specs["label"])
  p2k_data["frame_count"].append(specs["frame_count"])
  p2k_data["frame_width"].append(specs["frame_width"])
  p2k_data["frame_height"].append(specs["frame_height"])
  p2k_data["duration_s"].append(specs["duration"])

p2k = pd.DataFrame(p2k_data)

In [26]:
p2k["duration_m"] = p2k["duration_s"] / 60
p2k["duration_h"] = p2k["duration_s"] / 3600

In [None]:
p2k.describe()

In [None]:
print("Total number of videos in p2k: ", p2k.nunique()["fname"])
print("Are there any duplicate 'fname' values in p2k?", "NO" if p2k["fname"].is_unique else "YES")

In [None]:
p2k_porn = p2k[p2k["label"] == "porn"]
p2k_porn.describe()

In [None]:
p2k_non_porn = p2k[p2k["label"] == "non-porn"]
p2k_non_porn.describe()

In [None]:
print(f"{p2k_porn['duration_h'].sum():.2f} hours ({p2k_porn['duration_m']:.2f} minutes) of pornographic content")
print(f"{p2k_non_porn['duration_h'].sum():.2f} hours ({p2k_non_porn['duration_m']:.2f} minutes) of non-pornographic content")